From 138885ca44882168bb8def38d52b3b404c7363a2 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Sat, 1 Apr 2017 19:33:18 +0200 Subject: [PATCH] 4.9-stable patches added patches: xfs-allow-unwritten-extents-in-the-cow-fork.patch xfs-check-for-obviously-bad-level-values-in-the-bmbt-root.patch xfs-don-t-fail-xfs_extent_busy-allocation.patch xfs-don-t-reserve-blocks-for-right-shift-transactions.patch xfs-fail-_dir_open-when-readahead-fails.patch xfs-filter-out-obviously-bad-btree-pointers.patch xfs-fix-and-streamline-error-handling-in-xfs_end_io.patch xfs-fix-eofblocks-race-with-file-extending-async-dio-writes.patch xfs-fix-toctou-race-when-locking-an-inode-to-access-the-data-map.patch xfs-fix-uninitialized-variable-in-_reflink_convert_cow.patch xfs-handle-indlen-shortage-on-delalloc-extent-merge.patch xfs-mark-speculative-prealloc-cow-fork-extents-unwritten.patch xfs-only-reclaim-unwritten-cow-extents-periodically.patch xfs-reject-all-unaligned-direct-writes-to-reflinked-files.patch xfs-reset-b_first_retry_time-when-clear-the-retry-status-of-xfs_buf_t.patch xfs-split-indlen-reservations-fairly-when-under-reserved.patch xfs-try-any-ag-when-allocating-the-first-btree-block-when-reflinking.patch xfs-tune-down-agno-asserts-in-the-bmap-code.patch xfs-update-ctime-and-mtime-on-clone-destinatation-inodes.patch xfs-use-iomap-new-flag-for-newly-allocated-delalloc-blocks.patch xfs-use-xfs_icluster_size_fsb-to-calculate-inode-alignment-mask.patch xfs-use-xfs_icluster_size_fsb-to-calculate-inode-chunk-alignment.patch xfs-verify-free-block-header-fields.patch --- queue-4.9/series | 23 ++ ...ow-unwritten-extents-in-the-cow-fork.patch | 278 ++++++++++++++ ...ly-bad-level-values-in-the-bmbt-root.patch | 55 +++ ...on-t-fail-xfs_extent_busy-allocation.patch | 45 +++ ...-blocks-for-right-shift-transactions.patch | 94 +++++ ...-fail-_dir_open-when-readahead-fails.patch | 75 ++++ ...ter-out-obviously-bad-btree-pointers.patch | 66 ++++ ...eamline-error-handling-in-xfs_end_io.patch | 
111 ++++++ ...with-file-extending-async-dio-writes.patch | 61 ++++ ...king-an-inode-to-access-the-data-map.patch | 48 +++ ...zed-variable-in-_reflink_convert_cow.patch | 31 ++ ...en-shortage-on-delalloc-extent-merge.patch | 76 ++++ ...-prealloc-cow-fork-extents-unwritten.patch | 339 ++++++++++++++++++ ...m-unwritten-cow-extents-periodically.patch | 159 ++++++++ ...ned-direct-writes-to-reflinked-files.patch | 125 +++++++ ...-clear-the-retry-status-of-xfs_buf_t.patch | 32 ++ ...ervations-fairly-when-under-reserved.patch | 118 ++++++ ...he-first-btree-block-when-reflinking.patch | 97 +++++ ...e-down-agno-asserts-in-the-bmap-code.patch | 83 +++++ ...-mtime-on-clone-destinatation-inodes.patch | 66 ++++ ...-for-newly-allocated-delalloc-blocks.patch | 150 ++++++++ ...sb-to-calculate-inode-alignment-mask.patch | 44 +++ ...b-to-calculate-inode-chunk-alignment.patch | 91 +++++ .../xfs-verify-free-block-header-fields.patch | 93 +++++ 24 files changed, 2360 insertions(+) create mode 100644 queue-4.9/xfs-allow-unwritten-extents-in-the-cow-fork.patch create mode 100644 queue-4.9/xfs-check-for-obviously-bad-level-values-in-the-bmbt-root.patch create mode 100644 queue-4.9/xfs-don-t-fail-xfs_extent_busy-allocation.patch create mode 100644 queue-4.9/xfs-don-t-reserve-blocks-for-right-shift-transactions.patch create mode 100644 queue-4.9/xfs-fail-_dir_open-when-readahead-fails.patch create mode 100644 queue-4.9/xfs-filter-out-obviously-bad-btree-pointers.patch create mode 100644 queue-4.9/xfs-fix-and-streamline-error-handling-in-xfs_end_io.patch create mode 100644 queue-4.9/xfs-fix-eofblocks-race-with-file-extending-async-dio-writes.patch create mode 100644 queue-4.9/xfs-fix-toctou-race-when-locking-an-inode-to-access-the-data-map.patch create mode 100644 queue-4.9/xfs-fix-uninitialized-variable-in-_reflink_convert_cow.patch create mode 100644 queue-4.9/xfs-handle-indlen-shortage-on-delalloc-extent-merge.patch create mode 100644 
queue-4.9/xfs-mark-speculative-prealloc-cow-fork-extents-unwritten.patch create mode 100644 queue-4.9/xfs-only-reclaim-unwritten-cow-extents-periodically.patch create mode 100644 queue-4.9/xfs-reject-all-unaligned-direct-writes-to-reflinked-files.patch create mode 100644 queue-4.9/xfs-reset-b_first_retry_time-when-clear-the-retry-status-of-xfs_buf_t.patch create mode 100644 queue-4.9/xfs-split-indlen-reservations-fairly-when-under-reserved.patch create mode 100644 queue-4.9/xfs-try-any-ag-when-allocating-the-first-btree-block-when-reflinking.patch create mode 100644 queue-4.9/xfs-tune-down-agno-asserts-in-the-bmap-code.patch create mode 100644 queue-4.9/xfs-update-ctime-and-mtime-on-clone-destinatation-inodes.patch create mode 100644 queue-4.9/xfs-use-iomap-new-flag-for-newly-allocated-delalloc-blocks.patch create mode 100644 queue-4.9/xfs-use-xfs_icluster_size_fsb-to-calculate-inode-alignment-mask.patch create mode 100644 queue-4.9/xfs-use-xfs_icluster_size_fsb-to-calculate-inode-chunk-alignment.patch create mode 100644 queue-4.9/xfs-verify-free-block-header-fields.patch diff --git a/queue-4.9/series b/queue-4.9/series index db553e76a6f..1c97673490f 100644 --- a/queue-4.9/series +++ b/queue-4.9/series @@ -4,3 +4,26 @@ xfs-only-update-mount-resv-fields-on-success-in-__xfs_ag_resv_init.patch xfs-use-per-ag-reservations-for-the-finobt.patch xfs-pull-up-iolock-from-xfs_free_eofblocks.patch xfs-sync-eofblocks-scans-under-iolock-are-livelock-prone.patch +xfs-fix-eofblocks-race-with-file-extending-async-dio-writes.patch +xfs-fix-toctou-race-when-locking-an-inode-to-access-the-data-map.patch +xfs-fail-_dir_open-when-readahead-fails.patch +xfs-filter-out-obviously-bad-btree-pointers.patch +xfs-check-for-obviously-bad-level-values-in-the-bmbt-root.patch +xfs-verify-free-block-header-fields.patch +xfs-allow-unwritten-extents-in-the-cow-fork.patch +xfs-mark-speculative-prealloc-cow-fork-extents-unwritten.patch 
+xfs-reset-b_first_retry_time-when-clear-the-retry-status-of-xfs_buf_t.patch +xfs-update-ctime-and-mtime-on-clone-destinatation-inodes.patch +xfs-reject-all-unaligned-direct-writes-to-reflinked-files.patch +xfs-don-t-fail-xfs_extent_busy-allocation.patch +xfs-handle-indlen-shortage-on-delalloc-extent-merge.patch +xfs-split-indlen-reservations-fairly-when-under-reserved.patch +xfs-fix-uninitialized-variable-in-_reflink_convert_cow.patch +xfs-don-t-reserve-blocks-for-right-shift-transactions.patch +xfs-use-xfs_icluster_size_fsb-to-calculate-inode-chunk-alignment.patch +xfs-tune-down-agno-asserts-in-the-bmap-code.patch +xfs-only-reclaim-unwritten-cow-extents-periodically.patch +xfs-fix-and-streamline-error-handling-in-xfs_end_io.patch +xfs-use-xfs_icluster_size_fsb-to-calculate-inode-alignment-mask.patch +xfs-use-iomap-new-flag-for-newly-allocated-delalloc-blocks.patch +xfs-try-any-ag-when-allocating-the-first-btree-block-when-reflinking.patch diff --git a/queue-4.9/xfs-allow-unwritten-extents-in-the-cow-fork.patch b/queue-4.9/xfs-allow-unwritten-extents-in-the-cow-fork.patch new file mode 100644 index 00000000000..900ce34164d --- /dev/null +++ b/queue-4.9/xfs-allow-unwritten-extents-in-the-cow-fork.patch @@ -0,0 +1,278 @@ +From 05a630d76bd3f39baf0eecfa305bed2820796dee Mon Sep 17 00:00:00 2001 +From: "Darrick J. Wong" +Date: Thu, 2 Feb 2017 15:14:01 -0800 +Subject: xfs: allow unwritten extents in the CoW fork + +From: Darrick J. Wong + +commit 05a630d76bd3f39baf0eecfa305bed2820796dee upstream. + +In the data fork, we only allow extents to perform the following state +transitions: + +delay -> real <-> unwritten + +There's no way to move directly from a delalloc reservation to an +/unwritten/ allocated extent. 
However, for the CoW fork we want to be +able to do the following to each extent: + +delalloc -> unwritten -> written -> remapped to data fork + +This will help us to avoid a race in the speculative CoW preallocation +code between a first thread that is allocating a CoW extent and a second +thread that is remapping part of a file after a write. In order to do +this, however, we need two things: first, we have to be able to +transition from da to unwritten, and second the function that converts +between real and unwritten has to be made aware of the cow fork. Do +both of those things. + +Signed-off-by: Darrick J. Wong +Reviewed-by: Christoph Hellwig +Signed-off-by: Greg Kroah-Hartman + +--- + fs/xfs/libxfs/xfs_bmap.c | 80 +++++++++++++++++++++++++++++------------------ + 1 file changed, 50 insertions(+), 30 deletions(-) + +--- a/fs/xfs/libxfs/xfs_bmap.c ++++ b/fs/xfs/libxfs/xfs_bmap.c +@@ -1952,6 +1952,7 @@ xfs_bmap_add_extent_delay_real( + */ + trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_); + xfs_bmbt_set_startblock(ep, new->br_startblock); ++ xfs_bmbt_set_state(ep, new->br_state); + trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_); + + (*nextents)++; +@@ -2290,6 +2291,7 @@ STATIC int /* error */ + xfs_bmap_add_extent_unwritten_real( + struct xfs_trans *tp, + xfs_inode_t *ip, /* incore inode pointer */ ++ int whichfork, + xfs_extnum_t *idx, /* extent number to update/insert */ + xfs_btree_cur_t **curp, /* if *curp is null, not a btree */ + xfs_bmbt_irec_t *new, /* new data to add to file extents */ +@@ -2309,12 +2311,14 @@ xfs_bmap_add_extent_unwritten_real( + /* left is 0, right is 1, prev is 2 */ + int rval=0; /* return value (logging flags) */ + int state = 0;/* state bits, accessed thru macros */ +- struct xfs_mount *mp = tp->t_mountp; ++ struct xfs_mount *mp = ip->i_mount; + + *logflagsp = 0; + + cur = *curp; +- ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); ++ ifp = XFS_IFORK_PTR(ip, whichfork); ++ if (whichfork == XFS_COW_FORK) ++ 
state |= BMAP_COWFORK; + + ASSERT(*idx >= 0); + ASSERT(*idx <= xfs_iext_count(ifp)); +@@ -2373,7 +2377,7 @@ xfs_bmap_add_extent_unwritten_real( + * Don't set contiguous if the combined extent would be too large. + * Also check for all-three-contiguous being too large. + */ +- if (*idx < xfs_iext_count(&ip->i_df) - 1) { ++ if (*idx < xfs_iext_count(ifp) - 1) { + state |= BMAP_RIGHT_VALID; + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx + 1), &RIGHT); + if (isnullstartblock(RIGHT.br_startblock)) +@@ -2413,7 +2417,8 @@ xfs_bmap_add_extent_unwritten_real( + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + + xfs_iext_remove(ip, *idx + 1, 2, state); +- ip->i_d.di_nextents -= 2; ++ XFS_IFORK_NEXT_SET(ip, whichfork, ++ XFS_IFORK_NEXTENTS(ip, whichfork) - 2); + if (cur == NULL) + rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; + else { +@@ -2456,7 +2461,8 @@ xfs_bmap_add_extent_unwritten_real( + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + + xfs_iext_remove(ip, *idx + 1, 1, state); +- ip->i_d.di_nextents--; ++ XFS_IFORK_NEXT_SET(ip, whichfork, ++ XFS_IFORK_NEXTENTS(ip, whichfork) - 1); + if (cur == NULL) + rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; + else { +@@ -2491,7 +2497,8 @@ xfs_bmap_add_extent_unwritten_real( + xfs_bmbt_set_state(ep, newext); + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + xfs_iext_remove(ip, *idx + 1, 1, state); +- ip->i_d.di_nextents--; ++ XFS_IFORK_NEXT_SET(ip, whichfork, ++ XFS_IFORK_NEXTENTS(ip, whichfork) - 1); + if (cur == NULL) + rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; + else { +@@ -2603,7 +2610,8 @@ xfs_bmap_add_extent_unwritten_real( + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + + xfs_iext_insert(ip, *idx, 1, new, state); +- ip->i_d.di_nextents++; ++ XFS_IFORK_NEXT_SET(ip, whichfork, ++ XFS_IFORK_NEXTENTS(ip, whichfork) + 1); + if (cur == NULL) + rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; + else { +@@ -2681,7 +2689,8 @@ xfs_bmap_add_extent_unwritten_real( + ++*idx; + xfs_iext_insert(ip, *idx, 1, new, state); + +- 
ip->i_d.di_nextents++; ++ XFS_IFORK_NEXT_SET(ip, whichfork, ++ XFS_IFORK_NEXTENTS(ip, whichfork) + 1); + if (cur == NULL) + rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; + else { +@@ -2729,7 +2738,8 @@ xfs_bmap_add_extent_unwritten_real( + ++*idx; + xfs_iext_insert(ip, *idx, 2, &r[0], state); + +- ip->i_d.di_nextents += 2; ++ XFS_IFORK_NEXT_SET(ip, whichfork, ++ XFS_IFORK_NEXTENTS(ip, whichfork) + 2); + if (cur == NULL) + rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; + else { +@@ -2783,17 +2793,17 @@ xfs_bmap_add_extent_unwritten_real( + } + + /* update reverse mappings */ +- error = xfs_rmap_convert_extent(mp, dfops, ip, XFS_DATA_FORK, new); ++ error = xfs_rmap_convert_extent(mp, dfops, ip, whichfork, new); + if (error) + goto done; + + /* convert to a btree if necessary */ +- if (xfs_bmap_needs_btree(ip, XFS_DATA_FORK)) { ++ if (xfs_bmap_needs_btree(ip, whichfork)) { + int tmp_logflags; /* partial log flag return val */ + + ASSERT(cur == NULL); + error = xfs_bmap_extents_to_btree(tp, ip, first, dfops, &cur, +- 0, &tmp_logflags, XFS_DATA_FORK); ++ 0, &tmp_logflags, whichfork); + *logflagsp |= tmp_logflags; + if (error) + goto done; +@@ -2805,7 +2815,7 @@ xfs_bmap_add_extent_unwritten_real( + *curp = cur; + } + +- xfs_bmap_check_leaf_extents(*curp, ip, XFS_DATA_FORK); ++ xfs_bmap_check_leaf_extents(*curp, ip, whichfork); + done: + *logflagsp |= rval; + return error; +@@ -4458,10 +4468,16 @@ xfs_bmapi_allocate( + bma->got.br_state = XFS_EXT_NORM; + + /* +- * A wasdelay extent has been initialized, so shouldn't be flagged +- * as unwritten. ++ * In the data fork, a wasdelay extent has been initialized, so ++ * shouldn't be flagged as unwritten. ++ * ++ * For the cow fork, however, we convert delalloc reservations ++ * (extents allocated for speculative preallocation) to ++ * allocated unwritten extents, and only convert the unwritten ++ * extents to real extents when we're about to write the data. 
+ */ +- if (!bma->wasdel && (bma->flags & XFS_BMAPI_PREALLOC) && ++ if ((!bma->wasdel || (bma->flags & XFS_BMAPI_COWFORK)) && ++ (bma->flags & XFS_BMAPI_PREALLOC) && + xfs_sb_version_hasextflgbit(&mp->m_sb)) + bma->got.br_state = XFS_EXT_UNWRITTEN; + +@@ -4512,8 +4528,6 @@ xfs_bmapi_convert_unwritten( + (XFS_BMAPI_PREALLOC | XFS_BMAPI_CONVERT)) + return 0; + +- ASSERT(whichfork != XFS_COW_FORK); +- + /* + * Modify (by adding) the state flag, if writing. + */ +@@ -4538,8 +4552,8 @@ xfs_bmapi_convert_unwritten( + return error; + } + +- error = xfs_bmap_add_extent_unwritten_real(bma->tp, bma->ip, &bma->idx, +- &bma->cur, mval, bma->firstblock, bma->dfops, ++ error = xfs_bmap_add_extent_unwritten_real(bma->tp, bma->ip, whichfork, ++ &bma->idx, &bma->cur, mval, bma->firstblock, bma->dfops, + &tmp_logflags); + /* + * Log the inode core unconditionally in the unwritten extent conversion +@@ -4548,8 +4562,12 @@ xfs_bmapi_convert_unwritten( + * in the transaction for the sake of fsync(), even if nothing has + * changed, because fsync() will not force the log for this transaction + * unless it sees the inode pinned. ++ * ++ * Note: If we're only converting cow fork extents, there aren't ++ * any on-disk updates to make, so we don't need to log anything. 
+ */ +- bma->logflags |= tmp_logflags | XFS_ILOG_CORE; ++ if (whichfork != XFS_COW_FORK) ++ bma->logflags |= tmp_logflags | XFS_ILOG_CORE; + if (error) + return error; + +@@ -4623,15 +4641,15 @@ xfs_bmapi_write( + ASSERT(*nmap >= 1); + ASSERT(*nmap <= XFS_BMAP_MAX_NMAP); + ASSERT(!(flags & XFS_BMAPI_IGSTATE)); +- ASSERT(tp != NULL); ++ ASSERT(tp != NULL || ++ (flags & (XFS_BMAPI_CONVERT | XFS_BMAPI_COWFORK)) == ++ (XFS_BMAPI_CONVERT | XFS_BMAPI_COWFORK)); + ASSERT(len > 0); + ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + ASSERT(!(flags & XFS_BMAPI_REMAP) || whichfork == XFS_DATA_FORK); + ASSERT(!(flags & XFS_BMAPI_PREALLOC) || !(flags & XFS_BMAPI_REMAP)); + ASSERT(!(flags & XFS_BMAPI_CONVERT) || !(flags & XFS_BMAPI_REMAP)); +- ASSERT(!(flags & XFS_BMAPI_PREALLOC) || whichfork != XFS_COW_FORK); +- ASSERT(!(flags & XFS_BMAPI_CONVERT) || whichfork != XFS_COW_FORK); + + /* zeroing is for currently only for data extents, not metadata */ + ASSERT((flags & (XFS_BMAPI_METADATA | XFS_BMAPI_ZERO)) != +@@ -5653,8 +5671,8 @@ __xfs_bunmapi( + } + del.br_state = XFS_EXT_UNWRITTEN; + error = xfs_bmap_add_extent_unwritten_real(tp, ip, +- &lastx, &cur, &del, firstblock, dfops, +- &logflags); ++ whichfork, &lastx, &cur, &del, ++ firstblock, dfops, &logflags); + if (error) + goto error0; + goto nodelete; +@@ -5711,8 +5729,9 @@ __xfs_bunmapi( + prev.br_state = XFS_EXT_UNWRITTEN; + lastx--; + error = xfs_bmap_add_extent_unwritten_real(tp, +- ip, &lastx, &cur, &prev, +- firstblock, dfops, &logflags); ++ ip, whichfork, &lastx, &cur, ++ &prev, firstblock, dfops, ++ &logflags); + if (error) + goto error0; + goto nodelete; +@@ -5720,8 +5739,9 @@ __xfs_bunmapi( + ASSERT(del.br_state == XFS_EXT_NORM); + del.br_state = XFS_EXT_UNWRITTEN; + error = xfs_bmap_add_extent_unwritten_real(tp, +- ip, &lastx, &cur, &del, +- firstblock, dfops, &logflags); ++ ip, whichfork, &lastx, &cur, ++ &del, firstblock, dfops, ++ &logflags); + if 
(error) + goto error0; + goto nodelete; diff --git a/queue-4.9/xfs-check-for-obviously-bad-level-values-in-the-bmbt-root.patch b/queue-4.9/xfs-check-for-obviously-bad-level-values-in-the-bmbt-root.patch new file mode 100644 index 00000000000..5d1c3147e6e --- /dev/null +++ b/queue-4.9/xfs-check-for-obviously-bad-level-values-in-the-bmbt-root.patch @@ -0,0 +1,55 @@ +From b3bf607d58520ea8c0666aeb4be60dbb724cd3a2 Mon Sep 17 00:00:00 2001 +From: "Darrick J. Wong" +Date: Thu, 2 Feb 2017 15:13:59 -0800 +Subject: xfs: check for obviously bad level values in the bmbt root + +From: Darrick J. Wong + +commit b3bf607d58520ea8c0666aeb4be60dbb724cd3a2 upstream. + +We can't handle a bmbt that's taller than BTREE_MAXLEVELS, and there's +no such thing as a zero-level bmbt (for that we have extents format), +so if we see this, send back an error code. + +Signed-off-by: Darrick J. Wong +Reviewed-by: Christoph Hellwig +Signed-off-by: Greg Kroah-Hartman + +--- + fs/xfs/libxfs/xfs_inode_fork.c | 6 +++++- + 1 file changed, 5 insertions(+), 1 deletion(-) + +--- a/fs/xfs/libxfs/xfs_inode_fork.c ++++ b/fs/xfs/libxfs/xfs_inode_fork.c +@@ -26,6 +26,7 @@ + #include "xfs_inode.h" + #include "xfs_trans.h" + #include "xfs_inode_item.h" ++#include "xfs_btree.h" + #include "xfs_bmap_btree.h" + #include "xfs_bmap.h" + #include "xfs_error.h" +@@ -429,11 +430,13 @@ xfs_iformat_btree( + /* REFERENCED */ + int nrecs; + int size; ++ int level; + + ifp = XFS_IFORK_PTR(ip, whichfork); + dfp = (xfs_bmdr_block_t *)XFS_DFORK_PTR(dip, whichfork); + size = XFS_BMAP_BROOT_SPACE(mp, dfp); + nrecs = be16_to_cpu(dfp->bb_numrecs); ++ level = be16_to_cpu(dfp->bb_level); + + /* + * blow out if -- fork has less extents than can fit in +@@ -446,7 +449,8 @@ xfs_iformat_btree( + XFS_IFORK_MAXEXT(ip, whichfork) || + XFS_BMDR_SPACE_CALC(nrecs) > + XFS_DFORK_SIZE(dip, mp, whichfork) || +- XFS_IFORK_NEXTENTS(ip, whichfork) > ip->i_d.di_nblocks)) { ++ XFS_IFORK_NEXTENTS(ip, whichfork) > ip->i_d.di_nblocks) || ++ level == 0 || 
level > XFS_BTREE_MAXLEVELS) { + xfs_warn(mp, "corrupt inode %Lu (btree).", + (unsigned long long) ip->i_ino); + XFS_CORRUPTION_ERROR("xfs_iformat_btree", XFS_ERRLEVEL_LOW, diff --git a/queue-4.9/xfs-don-t-fail-xfs_extent_busy-allocation.patch b/queue-4.9/xfs-don-t-fail-xfs_extent_busy-allocation.patch new file mode 100644 index 00000000000..4d007c3ef7e --- /dev/null +++ b/queue-4.9/xfs-don-t-fail-xfs_extent_busy-allocation.patch @@ -0,0 +1,45 @@ +From 5e30c23d13919a718b22d4921dc5c0accc59da27 Mon Sep 17 00:00:00 2001 +From: Christoph Hellwig +Date: Tue, 7 Feb 2017 14:06:46 -0800 +Subject: xfs: don't fail xfs_extent_busy allocation + +From: Christoph Hellwig + +commit 5e30c23d13919a718b22d4921dc5c0accc59da27 upstream. + +We don't just need the structure to track busy extents which can be +avoided with a synchronous transaction, but also to keep track of +pending discard. + +Signed-off-by: Christoph Hellwig +Reviewed-by: Brian Foster +Reviewed-by: Darrick J. Wong +Signed-off-by: Darrick J. Wong +Signed-off-by: Greg Kroah-Hartman + +--- + fs/xfs/xfs_extent_busy.c | 13 +------------ + 1 file changed, 1 insertion(+), 12 deletions(-) + +--- a/fs/xfs/xfs_extent_busy.c ++++ b/fs/xfs/xfs_extent_busy.c +@@ -45,18 +45,7 @@ xfs_extent_busy_insert( + struct rb_node **rbp; + struct rb_node *parent = NULL; + +- new = kmem_zalloc(sizeof(struct xfs_extent_busy), KM_MAYFAIL); +- if (!new) { +- /* +- * No Memory! Since it is now not possible to track the free +- * block, make this a synchronous transaction to insure that +- * the block is not reused before this transaction commits. 
+- */ +- trace_xfs_extent_busy_enomem(tp->t_mountp, agno, bno, len); +- xfs_trans_set_sync(tp); +- return; +- } +- ++ new = kmem_zalloc(sizeof(struct xfs_extent_busy), KM_SLEEP); + new->agno = agno; + new->bno = bno; + new->length = len; diff --git a/queue-4.9/xfs-don-t-reserve-blocks-for-right-shift-transactions.patch b/queue-4.9/xfs-don-t-reserve-blocks-for-right-shift-transactions.patch new file mode 100644 index 00000000000..a7aa9fdfacc --- /dev/null +++ b/queue-4.9/xfs-don-t-reserve-blocks-for-right-shift-transactions.patch @@ -0,0 +1,94 @@ +From 48af96ab92bc68fb645068b978ce36df2379e076 Mon Sep 17 00:00:00 2001 +From: Brian Foster +Date: Wed, 15 Feb 2017 10:18:10 -0800 +Subject: xfs: don't reserve blocks for right shift transactions + +From: Brian Foster + +commit 48af96ab92bc68fb645068b978ce36df2379e076 upstream. + +The block reservation for the transaction allocated in +xfs_shift_file_space() is an artifact of the original collapse range +support. It exists to handle the case where a collapse range occurs, +the initial extent is left shifted into a location that forms a +contiguous boundary with the previous extent and thus the extents +are merged. This code was subsequently refactored and reused for +insert range (right shift) support. + +If an insert range occurs under low free space conditions, the +extent at the starting offset is split before the first shift +transaction is allocated. If the block reservation fails, this +leaves separate, but contiguous extents around in the inode. While +not a fatal problem, this is unexpected and will flag a warning on +subsequent insert range operations on the inode. This problem has +been reproduce intermittently by generic/270 running against a +ramdisk device. + +Since right shift does not create new extent boundaries in the +inode, a block reservation for extent merge is unnecessary. Update +xfs_shift_file_space() to conditionally reserve fs blocks for left +shift transactions only. 
This avoids the warning reproduced by +generic/270. + +Reported-by: Ross Zwisler +Signed-off-by: Brian Foster +Reviewed-by: Darrick J. Wong +Signed-off-by: Darrick J. Wong +Signed-off-by: Greg Kroah-Hartman + +--- + fs/xfs/xfs_bmap_util.c | 20 ++++++++++---------- + 1 file changed, 10 insertions(+), 10 deletions(-) + +--- a/fs/xfs/xfs_bmap_util.c ++++ b/fs/xfs/xfs_bmap_util.c +@@ -1387,10 +1387,16 @@ xfs_shift_file_space( + xfs_fileoff_t stop_fsb; + xfs_fileoff_t next_fsb; + xfs_fileoff_t shift_fsb; ++ uint resblks; + + ASSERT(direction == SHIFT_LEFT || direction == SHIFT_RIGHT); + + if (direction == SHIFT_LEFT) { ++ /* ++ * Reserve blocks to cover potential extent merges after left ++ * shift operations. ++ */ ++ resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0); + next_fsb = XFS_B_TO_FSB(mp, offset + len); + stop_fsb = XFS_B_TO_FSB(mp, VFS_I(ip)->i_size); + } else { +@@ -1398,6 +1404,7 @@ xfs_shift_file_space( + * If right shift, delegate the work of initialization of + * next_fsb to xfs_bmap_shift_extent as it has ilock held. + */ ++ resblks = 0; + next_fsb = NULLFSBLOCK; + stop_fsb = XFS_B_TO_FSB(mp, offset); + } +@@ -1439,21 +1446,14 @@ xfs_shift_file_space( + } + + while (!error && !done) { +- /* +- * We would need to reserve permanent block for transaction. +- * This will come into picture when after shifting extent into +- * hole we found that adjacent extents can be merged which +- * may lead to freeing of a block during record update. 
+- */ +- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, +- XFS_DIOSTRAT_SPACE_RES(mp, 0), 0, 0, &tp); ++ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, ++ &tp); + if (error) + break; + + xfs_ilock(ip, XFS_ILOCK_EXCL); + error = xfs_trans_reserve_quota(tp, mp, ip->i_udquot, +- ip->i_gdquot, ip->i_pdquot, +- XFS_DIOSTRAT_SPACE_RES(mp, 0), 0, ++ ip->i_gdquot, ip->i_pdquot, resblks, 0, + XFS_QMOPT_RES_REGBLKS); + if (error) + goto out_trans_cancel; diff --git a/queue-4.9/xfs-fail-_dir_open-when-readahead-fails.patch b/queue-4.9/xfs-fail-_dir_open-when-readahead-fails.patch new file mode 100644 index 00000000000..2e5501f482f --- /dev/null +++ b/queue-4.9/xfs-fail-_dir_open-when-readahead-fails.patch @@ -0,0 +1,75 @@ +From 7a652bbe366464267190c2792a32ce4fff5595ef Mon Sep 17 00:00:00 2001 +From: "Darrick J. Wong" +Date: Thu, 2 Feb 2017 15:13:58 -0800 +Subject: xfs: fail _dir_open when readahead fails + +From: Darrick J. Wong + +commit 7a652bbe366464267190c2792a32ce4fff5595ef upstream. + +When we open a directory, we try to readahead block 0 of the directory +on the assumption that we're going to need it soon. If the bmbt is +corrupt, the directory will never be usable and the readahead fails +immediately, so we might as well prevent the directory from being opened +at all. This prevents a subsequent read or modify operation from +hitting it and taking the fs offline. + +NOTE: We're only checking for early failures in the block mapping, not +the readahead directory block itself. + +Signed-off-by: Darrick J. Wong +Reviewed-by: Eric Sandeen +Reviewed-by: Christoph Hellwig +Signed-off-by: Greg Kroah-Hartman + +--- + fs/xfs/libxfs/xfs_da_btree.c | 6 ++---- + fs/xfs/libxfs/xfs_da_btree.h | 2 +- + fs/xfs/xfs_file.c | 4 ++-- + 3 files changed, 5 insertions(+), 7 deletions(-) + +--- a/fs/xfs/libxfs/xfs_da_btree.c ++++ b/fs/xfs/libxfs/xfs_da_btree.c +@@ -2633,7 +2633,7 @@ out_free: + /* + * Readahead the dir/attr block. 
+ */ +-xfs_daddr_t ++int + xfs_da_reada_buf( + struct xfs_inode *dp, + xfs_dablk_t bno, +@@ -2664,7 +2664,5 @@ out_free: + if (mapp != &map) + kmem_free(mapp); + +- if (error) +- return -1; +- return mappedbno; ++ return error; + } +--- a/fs/xfs/libxfs/xfs_da_btree.h ++++ b/fs/xfs/libxfs/xfs_da_btree.h +@@ -201,7 +201,7 @@ int xfs_da_read_buf(struct xfs_trans *tr + xfs_dablk_t bno, xfs_daddr_t mappedbno, + struct xfs_buf **bpp, int whichfork, + const struct xfs_buf_ops *ops); +-xfs_daddr_t xfs_da_reada_buf(struct xfs_inode *dp, xfs_dablk_t bno, ++int xfs_da_reada_buf(struct xfs_inode *dp, xfs_dablk_t bno, + xfs_daddr_t mapped_bno, int whichfork, + const struct xfs_buf_ops *ops); + int xfs_da_shrink_inode(xfs_da_args_t *args, xfs_dablk_t dead_blkno, +--- a/fs/xfs/xfs_file.c ++++ b/fs/xfs/xfs_file.c +@@ -992,9 +992,9 @@ xfs_dir_open( + */ + mode = xfs_ilock_data_map_shared(ip); + if (ip->i_d.di_nextents > 0) +- xfs_dir3_data_readahead(ip, 0, -1); ++ error = xfs_dir3_data_readahead(ip, 0, -1); + xfs_iunlock(ip, mode); +- return 0; ++ return error; + } + + STATIC int diff --git a/queue-4.9/xfs-filter-out-obviously-bad-btree-pointers.patch b/queue-4.9/xfs-filter-out-obviously-bad-btree-pointers.patch new file mode 100644 index 00000000000..593fc6f95f7 --- /dev/null +++ b/queue-4.9/xfs-filter-out-obviously-bad-btree-pointers.patch @@ -0,0 +1,66 @@ +From d5a91baeb6033c3392121e4d5c011cdc08dfa9f7 Mon Sep 17 00:00:00 2001 +From: "Darrick J. Wong" +Date: Thu, 2 Feb 2017 15:13:58 -0800 +Subject: xfs: filter out obviously bad btree pointers + +From: Darrick J. Wong + +commit d5a91baeb6033c3392121e4d5c011cdc08dfa9f7 upstream. + +Don't let anybody load an obviously bad btree pointer. Since the values +come from disk, we must return an error, not just ASSERT. + +Signed-off-by: Darrick J. 
Wong +Reviewed-by: Eric Sandeen +Signed-off-by: Greg Kroah-Hartman + +--- + fs/xfs/libxfs/xfs_bmap.c | 5 +---- + fs/xfs/libxfs/xfs_btree.c | 3 ++- + fs/xfs/libxfs/xfs_btree.h | 2 +- + 3 files changed, 4 insertions(+), 6 deletions(-) + +--- a/fs/xfs/libxfs/xfs_bmap.c ++++ b/fs/xfs/libxfs/xfs_bmap.c +@@ -1278,7 +1278,6 @@ xfs_bmap_read_extents( + /* REFERENCED */ + xfs_extnum_t room; /* number of entries there's room for */ + +- bno = NULLFSBLOCK; + mp = ip->i_mount; + ifp = XFS_IFORK_PTR(ip, whichfork); + exntf = (whichfork != XFS_DATA_FORK) ? XFS_EXTFMT_NOSTATE : +@@ -1291,9 +1290,7 @@ xfs_bmap_read_extents( + ASSERT(level > 0); + pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes); + bno = be64_to_cpu(*pp); +- ASSERT(bno != NULLFSBLOCK); +- ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount); +- ASSERT(XFS_FSB_TO_AGBNO(mp, bno) < mp->m_sb.sb_agblocks); ++ + /* + * Go down the tree until leaf level is reached, following the first + * pointer (leftmost) at each level. +--- a/fs/xfs/libxfs/xfs_btree.c ++++ b/fs/xfs/libxfs/xfs_btree.c +@@ -810,7 +810,8 @@ xfs_btree_read_bufl( + xfs_daddr_t d; /* real disk block address */ + int error; + +- ASSERT(fsbno != NULLFSBLOCK); ++ if (!XFS_FSB_SANITY_CHECK(mp, fsbno)) ++ return -EFSCORRUPTED; + d = XFS_FSB_TO_DADDR(mp, fsbno); + error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, d, + mp->m_bsize, lock, &bp, ops); +--- a/fs/xfs/libxfs/xfs_btree.h ++++ b/fs/xfs/libxfs/xfs_btree.h +@@ -491,7 +491,7 @@ static inline int xfs_btree_get_level(st + #define XFS_FILBLKS_MAX(a,b) max_t(xfs_filblks_t, (a), (b)) + + #define XFS_FSB_SANITY_CHECK(mp,fsb) \ +- (XFS_FSB_TO_AGNO(mp, fsb) < mp->m_sb.sb_agcount && \ ++ (fsb && XFS_FSB_TO_AGNO(mp, fsb) < mp->m_sb.sb_agcount && \ + XFS_FSB_TO_AGBNO(mp, fsb) < mp->m_sb.sb_agblocks) + + /* diff --git a/queue-4.9/xfs-fix-and-streamline-error-handling-in-xfs_end_io.patch b/queue-4.9/xfs-fix-and-streamline-error-handling-in-xfs_end_io.patch new file mode 100644 index 
00000000000..92637ba6d41 --- /dev/null +++ b/queue-4.9/xfs-fix-and-streamline-error-handling-in-xfs_end_io.patch @@ -0,0 +1,111 @@ +From 787eb485509f9d58962bd8b4dbc6a5ac6e2034fe Mon Sep 17 00:00:00 2001 +From: Christoph Hellwig +Date: Thu, 2 Mar 2017 15:02:51 -0800 +Subject: xfs: fix and streamline error handling in xfs_end_io + +From: Christoph Hellwig + +commit 787eb485509f9d58962bd8b4dbc6a5ac6e2034fe upstream. + +There are two different cases of buffered I/O errors: + + - first we can have an already shutdown fs. In that case we should skip + any on-disk operations and just clean up the appen transaction if + present and destroy the ioend + - a real I/O error. In that case we should cleanup any lingering COW + blocks. This gets skipped in the current code and is fixed by this + patch. + +Signed-off-by: Christoph Hellwig +Reviewed-by: Darrick J. Wong +Signed-off-by: Darrick J. Wong +Signed-off-by: Greg Kroah-Hartman + +--- + fs/xfs/xfs_aops.c | 59 ++++++++++++++++++++++++------------------------------ + 1 file changed, 27 insertions(+), 32 deletions(-) + +--- a/fs/xfs/xfs_aops.c ++++ b/fs/xfs/xfs_aops.c +@@ -279,54 +279,49 @@ xfs_end_io( + struct xfs_ioend *ioend = + container_of(work, struct xfs_ioend, io_work); + struct xfs_inode *ip = XFS_I(ioend->io_inode); ++ xfs_off_t offset = ioend->io_offset; ++ size_t size = ioend->io_size; + int error = ioend->io_bio->bi_error; + + /* +- * Set an error if the mount has shut down and proceed with end I/O +- * processing so it can perform whatever cleanups are necessary. ++ * Just clean up the in-memory strutures if the fs has been shut down. + */ +- if (XFS_FORCED_SHUTDOWN(ip->i_mount)) ++ if (XFS_FORCED_SHUTDOWN(ip->i_mount)) { + error = -EIO; ++ goto done; ++ } + + /* +- * For a CoW extent, we need to move the mapping from the CoW fork +- * to the data fork. If instead an error happened, just dump the +- * new blocks. ++ * Clean up any COW blocks on an I/O error. 
+ */ +- if (ioend->io_type == XFS_IO_COW) { +- if (error) +- goto done; +- if (ioend->io_bio->bi_error) { +- error = xfs_reflink_cancel_cow_range(ip, +- ioend->io_offset, ioend->io_size, true); +- goto done; ++ if (unlikely(error)) { ++ switch (ioend->io_type) { ++ case XFS_IO_COW: ++ xfs_reflink_cancel_cow_range(ip, offset, size, true); ++ break; + } +- error = xfs_reflink_end_cow(ip, ioend->io_offset, +- ioend->io_size); +- if (error) +- goto done; ++ ++ goto done; + } + + /* +- * For unwritten extents we need to issue transactions to convert a +- * range to normal written extens after the data I/O has finished. +- * Detecting and handling completion IO errors is done individually +- * for each case as different cleanup operations need to be performed +- * on error. ++ * Success: commit the COW or unwritten blocks if needed. + */ +- if (ioend->io_type == XFS_IO_UNWRITTEN) { +- if (error) +- goto done; +- error = xfs_iomap_write_unwritten(ip, ioend->io_offset, +- ioend->io_size); +- } else if (ioend->io_append_trans) { +- error = xfs_setfilesize_ioend(ioend, error); +- } else { +- ASSERT(!xfs_ioend_is_append(ioend) || +- ioend->io_type == XFS_IO_COW); ++ switch (ioend->io_type) { ++ case XFS_IO_COW: ++ error = xfs_reflink_end_cow(ip, offset, size); ++ break; ++ case XFS_IO_UNWRITTEN: ++ error = xfs_iomap_write_unwritten(ip, offset, size); ++ break; ++ default: ++ ASSERT(!xfs_ioend_is_append(ioend) || ioend->io_append_trans); ++ break; + } + + done: ++ if (ioend->io_append_trans) ++ error = xfs_setfilesize_ioend(ioend, error); + xfs_destroy_ioend(ioend, error); + } + diff --git a/queue-4.9/xfs-fix-eofblocks-race-with-file-extending-async-dio-writes.patch b/queue-4.9/xfs-fix-eofblocks-race-with-file-extending-async-dio-writes.patch new file mode 100644 index 00000000000..b67003b4eeb --- /dev/null +++ b/queue-4.9/xfs-fix-eofblocks-race-with-file-extending-async-dio-writes.patch @@ -0,0 +1,61 @@ +From e4229d6b0bc9280f29624faf170cf76a9f1ca60e Mon Sep 17 00:00:00 2001 
+From: Brian Foster +Date: Fri, 27 Jan 2017 23:22:57 -0800 +Subject: xfs: fix eofblocks race with file extending async dio writes + +From: Brian Foster + +commit e4229d6b0bc9280f29624faf170cf76a9f1ca60e upstream. + +It's possible for post-eof blocks to end up being used for direct I/O +writes. dio write performs an upfront unwritten extent allocation, sends +the dio and then updates the inode size (if necessary) on write +completion. If a file release occurs while a file extending dio write is +in flight, it is possible to mistake the post-eof blocks for speculative +preallocation and incorrectly truncate them from the inode. This means +that the resulting dio write completion can discover a hole and allocate +new blocks rather than perform unwritten extent conversion. + +This requires a strange mix of I/O and is thus not likely to reproduce +in real world workloads. It is intermittently reproduced by generic/299. +The error manifests as an assert failure due to transaction overrun +because the aforementioned write completion transaction has only +reserved enough blocks for btree operations: + + XFS: Assertion failed: tp->t_blk_res_used <= tp->t_blk_res, \ + file: fs/xfs//xfs_trans.c, line: 309 + +The root cause is that xfs_free_eofblocks() uses i_size to truncate +post-eof blocks from the inode, but async, file extending direct writes +do not update i_size until write completion, long after inode locks are +dropped. Therefore, xfs_free_eofblocks() effectively truncates the inode +to the incorrect size. + +Update xfs_free_eofblocks() to serialize against dio similar to how +extending writes are serialized against i_size updates before post-eof +block zeroing. Specifically, wait on dio while under the iolock. This +ensures that dio write completions have updated i_size before post-eof +blocks are processed. + +Signed-off-by: Brian Foster +Reviewed-by: Christoph Hellwig +Reviewed-by: Darrick J. Wong +Signed-off-by: Darrick J. 
Wong +Signed-off-by: Greg Kroah-Hartman + +--- + fs/xfs/xfs_bmap_util.c | 3 +++ + 1 file changed, 3 insertions(+) + +--- a/fs/xfs/xfs_bmap_util.c ++++ b/fs/xfs/xfs_bmap_util.c +@@ -959,6 +959,9 @@ xfs_free_eofblocks( + if (error) + return error; + ++ /* wait on dio to ensure i_size has settled */ ++ inode_dio_wait(VFS_I(ip)); ++ + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, + &tp); + if (error) { diff --git a/queue-4.9/xfs-fix-toctou-race-when-locking-an-inode-to-access-the-data-map.patch b/queue-4.9/xfs-fix-toctou-race-when-locking-an-inode-to-access-the-data-map.patch new file mode 100644 index 00000000000..a35bb54161f --- /dev/null +++ b/queue-4.9/xfs-fix-toctou-race-when-locking-an-inode-to-access-the-data-map.patch @@ -0,0 +1,48 @@ +From 4b5bd5bf3fb182dc504b1b64e0331300f156e756 Mon Sep 17 00:00:00 2001 +From: "Darrick J. Wong" +Date: Thu, 2 Feb 2017 15:13:57 -0800 +Subject: xfs: fix toctou race when locking an inode to access the data map + +From: Darrick J. Wong + +commit 4b5bd5bf3fb182dc504b1b64e0331300f156e756 upstream. + +We use di_format and if_flags to decide whether we're grabbing the ilock +in btree mode (btree extents not loaded) or shared mode (anything else), +but the state of those fields can be changed by other threads that are +also trying to load the btree extents -- IFEXTENTS gets set before the +_bmap_read_extents call and cleared if it fails. + +We don't actually need to have IFEXTENTS set until after the bmbt +records are successfully loaded and validated, which will fix the race +between multiple threads trying to read the same directory. The next +patch strengthens directory bmbt validation by refusing to open the +directory if reading the bmbt to start directory readahead fails. + +Signed-off-by: Darrick J. 
Wong +Reviewed-by: Christoph Hellwig +Signed-off-by: Greg Kroah-Hartman + +--- + fs/xfs/libxfs/xfs_inode_fork.c | 3 +-- + 1 file changed, 1 insertion(+), 2 deletions(-) + +--- a/fs/xfs/libxfs/xfs_inode_fork.c ++++ b/fs/xfs/libxfs/xfs_inode_fork.c +@@ -497,15 +497,14 @@ xfs_iread_extents( + * We know that the size is valid (it's checked in iformat_btree) + */ + ifp->if_bytes = ifp->if_real_bytes = 0; +- ifp->if_flags |= XFS_IFEXTENTS; + xfs_iext_add(ifp, 0, nextents); + error = xfs_bmap_read_extents(tp, ip, whichfork); + if (error) { + xfs_iext_destroy(ifp); +- ifp->if_flags &= ~XFS_IFEXTENTS; + return error; + } + xfs_validate_extents(ifp, nextents, XFS_EXTFMT_INODE(ip)); ++ ifp->if_flags |= XFS_IFEXTENTS; + return 0; + } + /* diff --git a/queue-4.9/xfs-fix-uninitialized-variable-in-_reflink_convert_cow.patch b/queue-4.9/xfs-fix-uninitialized-variable-in-_reflink_convert_cow.patch new file mode 100644 index 00000000000..aab604126c2 --- /dev/null +++ b/queue-4.9/xfs-fix-uninitialized-variable-in-_reflink_convert_cow.patch @@ -0,0 +1,31 @@ +From 93aaead52a9eebdc20dc8fa673c350e592a06949 Mon Sep 17 00:00:00 2001 +From: "Darrick J. Wong" +Date: Mon, 13 Feb 2017 22:52:27 -0800 +Subject: xfs: fix uninitialized variable in _reflink_convert_cow + +From: Darrick J. Wong + +commit 93aaead52a9eebdc20dc8fa673c350e592a06949 upstream. + +Fix an uninitialize variable. + +Reported-by: Dan Carpenter +Reviewed-by: Brian Foster +Signed-off-by: Darrick J. 
Wong +Signed-off-by: Greg Kroah-Hartman + +--- + fs/xfs/xfs_reflink.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/fs/xfs/xfs_reflink.c ++++ b/fs/xfs/xfs_reflink.c +@@ -363,7 +363,7 @@ xfs_reflink_convert_cow( + xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + count); + xfs_extnum_t idx; + bool found; +- int error; ++ int error = 0; + + xfs_ilock(ip, XFS_ILOCK_EXCL); + diff --git a/queue-4.9/xfs-handle-indlen-shortage-on-delalloc-extent-merge.patch b/queue-4.9/xfs-handle-indlen-shortage-on-delalloc-extent-merge.patch new file mode 100644 index 00000000000..d9817ff2c4c --- /dev/null +++ b/queue-4.9/xfs-handle-indlen-shortage-on-delalloc-extent-merge.patch @@ -0,0 +1,76 @@ +From 0e339ef8556d9e567aa7925f8892c263d79430d9 Mon Sep 17 00:00:00 2001 +From: Brian Foster +Date: Mon, 13 Feb 2017 22:48:18 -0800 +Subject: xfs: handle indlen shortage on delalloc extent merge + +From: Brian Foster + +commit 0e339ef8556d9e567aa7925f8892c263d79430d9 upstream. + +When a delalloc extent is created, it can be merged with pre-existing, +contiguous, delalloc extents. When this occurs, +xfs_bmap_add_extent_hole_delay() merges the extents along with the +associated indirect block reservations. The expectation here is that the +combined worst case indlen reservation is always less than or equal to +the indlen reservation for the individual extents. + +This is not always the case, however, as existing extents can less than +the expected indlen reservation if the extent was previously split due +to a hole punch. If a new extent merges with such an extent, the total +indlen requirement may be larger than the sum of the indlen reservations +held by both extents. + +xfs_bmap_add_extent_hole_delay() assumes that the worst case indlen +reservation is always available and assigns it to the merged extent +without consideration for the indlen held by the pre-existing extent. 
As +a result, the subsequent xfs_mod_fdblocks() call can attempt an +unintentional allocation rather than a free (indicated by an ASSERT() +failure). Further, if the allocation happens to fail in this context, +the failure goes unhandled and creates a filesystem wide block +accounting inconsistency. + +Fix xfs_bmap_add_extent_hole_delay() to function as designed. Cap the +indlen reservation assigned to the merged extent to the sum of the +indlen reservations held by each of the individual extents. + +Signed-off-by: Brian Foster +Reviewed-by: Darrick J. Wong +Signed-off-by: Darrick J. Wong +Signed-off-by: Greg Kroah-Hartman + +--- + fs/xfs/libxfs/xfs_bmap.c | 9 ++++++--- + 1 file changed, 6 insertions(+), 3 deletions(-) + +--- a/fs/xfs/libxfs/xfs_bmap.c ++++ b/fs/xfs/libxfs/xfs_bmap.c +@@ -2907,7 +2907,8 @@ xfs_bmap_add_extent_hole_delay( + oldlen = startblockval(left.br_startblock) + + startblockval(new->br_startblock) + + startblockval(right.br_startblock); +- newlen = xfs_bmap_worst_indlen(ip, temp); ++ newlen = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), ++ oldlen); + xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, *idx), + nullstartblock((int)newlen)); + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); +@@ -2928,7 +2929,8 @@ xfs_bmap_add_extent_hole_delay( + xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx), temp); + oldlen = startblockval(left.br_startblock) + + startblockval(new->br_startblock); +- newlen = xfs_bmap_worst_indlen(ip, temp); ++ newlen = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), ++ oldlen); + xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, *idx), + nullstartblock((int)newlen)); + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); +@@ -2944,7 +2946,8 @@ xfs_bmap_add_extent_hole_delay( + temp = new->br_blockcount + right.br_blockcount; + oldlen = startblockval(new->br_startblock) + + startblockval(right.br_startblock); +- newlen = xfs_bmap_worst_indlen(ip, temp); ++ newlen = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), ++ 
oldlen); + xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, *idx), + new->br_startoff, + nullstartblock((int)newlen), temp, right.br_state); diff --git a/queue-4.9/xfs-mark-speculative-prealloc-cow-fork-extents-unwritten.patch b/queue-4.9/xfs-mark-speculative-prealloc-cow-fork-extents-unwritten.patch new file mode 100644 index 00000000000..6ecba09fe89 --- /dev/null +++ b/queue-4.9/xfs-mark-speculative-prealloc-cow-fork-extents-unwritten.patch @@ -0,0 +1,339 @@ +From 5eda43000064a69a39fb7869cc63c9571535ad29 Mon Sep 17 00:00:00 2001 +From: "Darrick J. Wong" +Date: Thu, 2 Feb 2017 15:14:02 -0800 +Subject: xfs: mark speculative prealloc CoW fork extents unwritten + +From: Darrick J. Wong + +commit 5eda43000064a69a39fb7869cc63c9571535ad29 upstream. + +Christoph Hellwig pointed out that there's a potentially nasty race when +performing simultaneous nearby directio cow writes: + +"Thread 1 writes a range from B to c + +" B --------- C + p + +"a little later thread 2 writes from A to B + +" A --------- B + p + +[editor's note: the 'p' denote cowextsize boundaries, which I added to +make this more clear] + +"but the code preallocates beyond B into the range where thread +"1 has just written, but ->end_io hasn't been called yet. +"But once ->end_io is called thread 2 has already allocated +"up to the extent size hint into the write range of thread 1, +"so the end_io handler will splice the unintialized blocks from +"that preallocation back into the file right after B." + +We can avoid this race by ensuring that thread 1 cannot accidentally +remap the blocks that thread 2 allocated (as part of speculative +preallocation) as part of t2's write preparation in t1's end_io handler. +The way we make this happen is by taking advantage of the unwritten +extent flag as an intermediate step. 
+ +Recall that when we begin the process of writing data to shared blocks, +we create a delayed allocation extent in the CoW fork: + +D: --RRRRRRSSSRRRRRRRR--- +C: ------DDDDDDD--------- + +When a thread prepares to CoW some dirty data out to disk, it will now +convert the delalloc reservation into an /unwritten/ allocated extent in +the cow fork. The da conversion code tries to opportunistically +allocate as much of a (speculatively prealloc'd) extent as possible, so +we may end up allocating a larger extent than we're actually writing +out: + +D: --RRRRRRSSSRRRRRRRR--- +U: ------UUUUUUU--------- + +Next, we convert only the part of the extent that we're actively +planning to write to normal (i.e. not unwritten) status: + +D: --RRRRRRSSSRRRRRRRR--- +U: ------UURRUUU--------- + +If the write succeeds, the end_cow function will now scan the relevant +range of the CoW fork for real extents and remap only the real extents +into the data fork: + +D: --RRRRRRRRSRRRRRRRR--- +U: ------UU--UUU--------- + +This ensures that we never obliterate valid data fork extents with +unwritten blocks from the CoW fork. + +Signed-off-by: Darrick J. Wong +Reviewed-by: Christoph Hellwig +Signed-off-by: Greg Kroah-Hartman + +--- + fs/xfs/xfs_aops.c | 6 ++ + fs/xfs/xfs_iomap.c | 2 + fs/xfs/xfs_reflink.c | 116 +++++++++++++++++++++++++++++++++++++++++++++++---- + fs/xfs/xfs_reflink.h | 2 + fs/xfs/xfs_trace.h | 8 ++- + 5 files changed, 123 insertions(+), 11 deletions(-) + +--- a/fs/xfs/xfs_aops.c ++++ b/fs/xfs/xfs_aops.c +@@ -486,6 +486,12 @@ xfs_submit_ioend( + struct xfs_ioend *ioend, + int status) + { ++ /* Convert CoW extents to regular */ ++ if (!status && ioend->io_type == XFS_IO_COW) { ++ status = xfs_reflink_convert_cow(XFS_I(ioend->io_inode), ++ ioend->io_offset, ioend->io_size); ++ } ++ + /* Reserve log space if we might write beyond the on-disk inode size. 
*/ + if (!status && + ioend->io_type != XFS_IO_UNWRITTEN && +--- a/fs/xfs/xfs_iomap.c ++++ b/fs/xfs/xfs_iomap.c +@@ -685,7 +685,7 @@ xfs_iomap_write_allocate( + int nres; + + if (whichfork == XFS_COW_FORK) +- flags |= XFS_BMAPI_COWFORK; ++ flags |= XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC; + + /* + * Make sure that the dquots are there. +--- a/fs/xfs/xfs_reflink.c ++++ b/fs/xfs/xfs_reflink.c +@@ -82,11 +82,22 @@ + * mappings are a reservation against the free space in the filesystem; + * adjacent mappings can also be combined into fewer larger mappings. + * ++ * As an optimization, the CoW extent size hint (cowextsz) creates ++ * outsized aligned delalloc reservations in the hope of landing out of ++ * order nearby CoW writes in a single extent on disk, thereby reducing ++ * fragmentation and improving future performance. ++ * ++ * D: --RRRRRRSSSRRRRRRRR--- (data fork) ++ * C: ------DDDDDDD--------- (CoW fork) ++ * + * When dirty pages are being written out (typically in writepage), the +- * delalloc reservations are converted into real mappings by allocating +- * blocks and replacing the delalloc mapping with real ones. A delalloc +- * mapping can be replaced by several real ones if the free space is +- * fragmented. ++ * delalloc reservations are converted into unwritten mappings by ++ * allocating blocks and replacing the delalloc mapping with real ones. ++ * A delalloc mapping can be replaced by several unwritten ones if the ++ * free space is fragmented. ++ * ++ * D: --RRRRRRSSSRRRRRRRR--- ++ * C: ------UUUUUUU--------- + * + * We want to adapt the delalloc mechanism for copy-on-write, since the + * write paths are similar. The first two steps (creating the reservation +@@ -101,13 +112,29 @@ + * Block-aligned directio writes will use the same mechanism as buffered + * writes. 
+ * ++ * Just prior to submitting the actual disk write requests, we convert ++ * the extents representing the range of the file actually being written ++ * (as opposed to extra pieces created for the cowextsize hint) to real ++ * extents. This will become important in the next step: ++ * ++ * D: --RRRRRRSSSRRRRRRRR--- ++ * C: ------UUrrUUU--------- ++ * + * CoW remapping must be done after the data block write completes, + * because we don't want to destroy the old data fork map until we're sure + * the new block has been written. Since the new mappings are kept in a + * separate fork, we can simply iterate these mappings to find the ones + * that cover the file blocks that we just CoW'd. For each extent, simply + * unmap the corresponding range in the data fork, map the new range into +- * the data fork, and remove the extent from the CoW fork. ++ * the data fork, and remove the extent from the CoW fork. Because of ++ * the presence of the cowextsize hint, however, we must be careful ++ * only to remap the blocks that we've actually written out -- we must ++ * never remap delalloc reservations nor CoW staging blocks that have ++ * yet to be written. This corresponds exactly to the real extents in ++ * the CoW fork: ++ * ++ * D: --RRRRRRrrSRRRRRRRR--- ++ * C: ------UU--UUU--------- + * + * Since the remapping operation can be applied to an arbitrary file + * range, we record the need for the remap step as a flag in the ioend +@@ -296,6 +323,65 @@ xfs_reflink_reserve_cow( + return 0; + } + ++/* Convert part of an unwritten CoW extent to a real one. 
*/ ++STATIC int ++xfs_reflink_convert_cow_extent( ++ struct xfs_inode *ip, ++ struct xfs_bmbt_irec *imap, ++ xfs_fileoff_t offset_fsb, ++ xfs_filblks_t count_fsb, ++ struct xfs_defer_ops *dfops) ++{ ++ struct xfs_bmbt_irec irec = *imap; ++ xfs_fsblock_t first_block; ++ int nimaps = 1; ++ ++ if (imap->br_state == XFS_EXT_NORM) ++ return 0; ++ ++ xfs_trim_extent(&irec, offset_fsb, count_fsb); ++ trace_xfs_reflink_convert_cow(ip, &irec); ++ if (irec.br_blockcount == 0) ++ return 0; ++ return xfs_bmapi_write(NULL, ip, irec.br_startoff, irec.br_blockcount, ++ XFS_BMAPI_COWFORK | XFS_BMAPI_CONVERT, &first_block, ++ 0, &irec, &nimaps, dfops); ++} ++ ++/* Convert all of the unwritten CoW extents in a file's range to real ones. */ ++int ++xfs_reflink_convert_cow( ++ struct xfs_inode *ip, ++ xfs_off_t offset, ++ xfs_off_t count) ++{ ++ struct xfs_bmbt_irec got; ++ struct xfs_defer_ops dfops; ++ struct xfs_mount *mp = ip->i_mount; ++ struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); ++ xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset); ++ xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + count); ++ xfs_extnum_t idx; ++ bool found; ++ int error; ++ ++ xfs_ilock(ip, XFS_ILOCK_EXCL); ++ ++ /* Convert all the extents to real from unwritten. */ ++ for (found = xfs_iext_lookup_extent(ip, ifp, offset_fsb, &idx, &got); ++ found && got.br_startoff < end_fsb; ++ found = xfs_iext_get_extent(ifp, ++idx, &got)) { ++ error = xfs_reflink_convert_cow_extent(ip, &got, offset_fsb, ++ end_fsb - offset_fsb, &dfops); ++ if (error) ++ break; ++ } ++ ++ /* Finish up. */ ++ xfs_iunlock(ip, XFS_ILOCK_EXCL); ++ return error; ++} ++ + /* Allocate all CoW reservations covering a range of blocks in a file. */ + static int + __xfs_reflink_allocate_cow( +@@ -328,6 +414,7 @@ __xfs_reflink_allocate_cow( + goto out_unlock; + ASSERT(nimaps == 1); + ++ /* Make sure there's a CoW reservation for it. 
*/ + error = xfs_reflink_reserve_cow(ip, &imap, &shared); + if (error) + goto out_trans_cancel; +@@ -337,14 +424,16 @@ __xfs_reflink_allocate_cow( + goto out_trans_cancel; + } + ++ /* Allocate the entire reservation as unwritten blocks. */ + xfs_trans_ijoin(tp, ip, 0); + error = xfs_bmapi_write(tp, ip, imap.br_startoff, imap.br_blockcount, +- XFS_BMAPI_COWFORK, &first_block, ++ XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC, &first_block, + XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK), + &imap, &nimaps, &dfops); + if (error) + goto out_trans_cancel; + ++ /* Finish up. */ + error = xfs_defer_finish(&tp, &dfops, NULL); + if (error) + goto out_trans_cancel; +@@ -389,11 +478,12 @@ xfs_reflink_allocate_cow_range( + if (error) { + trace_xfs_reflink_allocate_cow_range_error(ip, error, + _RET_IP_); +- break; ++ return error; + } + } + +- return error; ++ /* Convert the CoW extents to regular. */ ++ return xfs_reflink_convert_cow(ip, offset, count); + } + + /* +@@ -669,6 +759,16 @@ xfs_reflink_end_cow( + + ASSERT(!isnullstartblock(got.br_startblock)); + ++ /* ++ * Don't remap unwritten extents; these are ++ * speculatively preallocated CoW extents that have been ++ * allocated but have not yet been involved in a write. ++ */ ++ if (got.br_state == XFS_EXT_UNWRITTEN) { ++ idx--; ++ goto next_extent; ++ } ++ + /* Unmap the old blocks in the data fork. 
*/ + xfs_defer_init(&dfops, &firstfsb); + rlen = del.br_blockcount; +--- a/fs/xfs/xfs_reflink.h ++++ b/fs/xfs/xfs_reflink.h +@@ -30,6 +30,8 @@ extern int xfs_reflink_reserve_cow(struc + struct xfs_bmbt_irec *imap, bool *shared); + extern int xfs_reflink_allocate_cow_range(struct xfs_inode *ip, + xfs_off_t offset, xfs_off_t count); ++extern int xfs_reflink_convert_cow(struct xfs_inode *ip, xfs_off_t offset, ++ xfs_off_t count); + extern bool xfs_reflink_find_cow_mapping(struct xfs_inode *ip, xfs_off_t offset, + struct xfs_bmbt_irec *imap, bool *need_alloc); + extern int xfs_reflink_trim_irec_to_next_cow(struct xfs_inode *ip, +--- a/fs/xfs/xfs_trace.h ++++ b/fs/xfs/xfs_trace.h +@@ -3183,6 +3183,7 @@ DECLARE_EVENT_CLASS(xfs_inode_irec_class + __field(xfs_fileoff_t, lblk) + __field(xfs_extlen_t, len) + __field(xfs_fsblock_t, pblk) ++ __field(int, state) + ), + TP_fast_assign( + __entry->dev = VFS_I(ip)->i_sb->s_dev; +@@ -3190,13 +3191,15 @@ DECLARE_EVENT_CLASS(xfs_inode_irec_class + __entry->lblk = irec->br_startoff; + __entry->len = irec->br_blockcount; + __entry->pblk = irec->br_startblock; ++ __entry->state = irec->br_state; + ), +- TP_printk("dev %d:%d ino 0x%llx lblk 0x%llx len 0x%x pblk %llu", ++ TP_printk("dev %d:%d ino 0x%llx lblk 0x%llx len 0x%x pblk %llu st %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->lblk, + __entry->len, +- __entry->pblk) ++ __entry->pblk, ++ __entry->state) + ); + #define DEFINE_INODE_IREC_EVENT(name) \ + DEFINE_EVENT(xfs_inode_irec_class, name, \ +@@ -3345,6 +3348,7 @@ DEFINE_INODE_IREC_EVENT(xfs_reflink_trim + DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_alloc); + DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_found); + DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_enospc); ++DEFINE_INODE_IREC_EVENT(xfs_reflink_convert_cow); + + DEFINE_RW_EVENT(xfs_reflink_reserve_cow); + DEFINE_RW_EVENT(xfs_reflink_allocate_cow_range); diff --git a/queue-4.9/xfs-only-reclaim-unwritten-cow-extents-periodically.patch 
b/queue-4.9/xfs-only-reclaim-unwritten-cow-extents-periodically.patch new file mode 100644 index 00000000000..96dbfceafa9 --- /dev/null +++ b/queue-4.9/xfs-only-reclaim-unwritten-cow-extents-periodically.patch @@ -0,0 +1,159 @@ +From 3802a345321a08093ba2ddb1849e736f84e8d450 Mon Sep 17 00:00:00 2001 +From: Christoph Hellwig +Date: Tue, 7 Mar 2017 16:45:58 -0800 +Subject: xfs: only reclaim unwritten COW extents periodically + +From: Christoph Hellwig + +commit 3802a345321a08093ba2ddb1849e736f84e8d450 upstream. + +We only want to reclaim preallocations from our periodic work item. +Currently this is archived by looking for a dirty inode, but that check +is rather fragile. Instead add a flag to xfs_reflink_cancel_cow_* so +that the caller can ask for just cancelling unwritten extents in the COW +fork. + +Signed-off-by: Christoph Hellwig +Reviewed-by: Darrick J. Wong +[darrick: fix typos in commit message] +Signed-off-by: Darrick J. Wong +Signed-off-by: Greg Kroah-Hartman + +--- + fs/xfs/xfs_aops.c | 2 +- + fs/xfs/xfs_icache.c | 2 +- + fs/xfs/xfs_inode.c | 2 +- + fs/xfs/xfs_reflink.c | 23 ++++++++++++++++------- + fs/xfs/xfs_reflink.h | 4 ++-- + fs/xfs/xfs_super.c | 2 +- + 6 files changed, 22 insertions(+), 13 deletions(-) + +--- a/fs/xfs/xfs_aops.c ++++ b/fs/xfs/xfs_aops.c +@@ -298,7 +298,7 @@ xfs_end_io( + goto done; + if (ioend->io_bio->bi_error) { + error = xfs_reflink_cancel_cow_range(ip, +- ioend->io_offset, ioend->io_size); ++ ioend->io_offset, ioend->io_size, true); + goto done; + } + error = xfs_reflink_end_cow(ip, ioend->io_offset, +--- a/fs/xfs/xfs_icache.c ++++ b/fs/xfs/xfs_icache.c +@@ -1610,7 +1610,7 @@ xfs_inode_free_cowblocks( + xfs_ilock(ip, XFS_IOLOCK_EXCL); + xfs_ilock(ip, XFS_MMAPLOCK_EXCL); + +- ret = xfs_reflink_cancel_cow_range(ip, 0, NULLFILEOFF); ++ ret = xfs_reflink_cancel_cow_range(ip, 0, NULLFILEOFF, false); + + xfs_iunlock(ip, XFS_MMAPLOCK_EXCL); + xfs_iunlock(ip, XFS_IOLOCK_EXCL); +--- a/fs/xfs/xfs_inode.c ++++ b/fs/xfs/xfs_inode.c +@@ 
-1624,7 +1624,7 @@ xfs_itruncate_extents( + + /* Remove all pending CoW reservations. */ + error = xfs_reflink_cancel_cow_blocks(ip, &tp, first_unmap_block, +- last_block); ++ last_block, true); + if (error) + goto out; + +--- a/fs/xfs/xfs_reflink.c ++++ b/fs/xfs/xfs_reflink.c +@@ -571,14 +571,18 @@ xfs_reflink_trim_irec_to_next_cow( + } + + /* +- * Cancel all pending CoW reservations for some block range of an inode. ++ * Cancel CoW reservations for some block range of an inode. ++ * ++ * If cancel_real is true this function cancels all COW fork extents for the ++ * inode; if cancel_real is false, real extents are not cleared. + */ + int + xfs_reflink_cancel_cow_blocks( + struct xfs_inode *ip, + struct xfs_trans **tpp, + xfs_fileoff_t offset_fsb, +- xfs_fileoff_t end_fsb) ++ xfs_fileoff_t end_fsb, ++ bool cancel_real) + { + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); + struct xfs_bmbt_irec got, prev, del; +@@ -605,7 +609,7 @@ xfs_reflink_cancel_cow_blocks( + &idx, &got, &del); + if (error) + break; +- } else { ++ } else if (del.br_state == XFS_EXT_UNWRITTEN || cancel_real) { + xfs_trans_ijoin(*tpp, ip, 0); + xfs_defer_init(&dfops, &firstfsb); + +@@ -648,13 +652,17 @@ xfs_reflink_cancel_cow_blocks( + } + + /* +- * Cancel all pending CoW reservations for some byte range of an inode. ++ * Cancel CoW reservations for some byte range of an inode. ++ * ++ * If cancel_real is true this function cancels all COW fork extents for the ++ * inode; if cancel_real is false, real extents are not cleared. 
+ */ + int + xfs_reflink_cancel_cow_range( + struct xfs_inode *ip, + xfs_off_t offset, +- xfs_off_t count) ++ xfs_off_t count, ++ bool cancel_real) + { + struct xfs_trans *tp; + xfs_fileoff_t offset_fsb; +@@ -680,7 +688,8 @@ xfs_reflink_cancel_cow_range( + xfs_trans_ijoin(tp, ip, 0); + + /* Scrape out the old CoW reservations */ +- error = xfs_reflink_cancel_cow_blocks(ip, &tp, offset_fsb, end_fsb); ++ error = xfs_reflink_cancel_cow_blocks(ip, &tp, offset_fsb, end_fsb, ++ cancel_real); + if (error) + goto out_cancel; + +@@ -1686,7 +1695,7 @@ next: + * We didn't find any shared blocks so turn off the reflink flag. + * First, get rid of any leftover CoW mappings. + */ +- error = xfs_reflink_cancel_cow_blocks(ip, tpp, 0, NULLFILEOFF); ++ error = xfs_reflink_cancel_cow_blocks(ip, tpp, 0, NULLFILEOFF, true); + if (error) + return error; + +--- a/fs/xfs/xfs_reflink.h ++++ b/fs/xfs/xfs_reflink.h +@@ -39,9 +39,9 @@ extern int xfs_reflink_trim_irec_to_next + + extern int xfs_reflink_cancel_cow_blocks(struct xfs_inode *ip, + struct xfs_trans **tpp, xfs_fileoff_t offset_fsb, +- xfs_fileoff_t end_fsb); ++ xfs_fileoff_t end_fsb, bool cancel_real); + extern int xfs_reflink_cancel_cow_range(struct xfs_inode *ip, xfs_off_t offset, +- xfs_off_t count); ++ xfs_off_t count, bool cancel_real); + extern int xfs_reflink_end_cow(struct xfs_inode *ip, xfs_off_t offset, + xfs_off_t count); + extern int xfs_reflink_recover_cow(struct xfs_mount *mp); +--- a/fs/xfs/xfs_super.c ++++ b/fs/xfs/xfs_super.c +@@ -948,7 +948,7 @@ xfs_fs_destroy_inode( + XFS_STATS_INC(ip->i_mount, vn_remove); + + if (xfs_is_reflink_inode(ip)) { +- error = xfs_reflink_cancel_cow_range(ip, 0, NULLFILEOFF); ++ error = xfs_reflink_cancel_cow_range(ip, 0, NULLFILEOFF, true); + if (error && !XFS_FORCED_SHUTDOWN(ip->i_mount)) + xfs_warn(ip->i_mount, + "Error %d while evicting CoW blocks for inode %llu.", diff --git a/queue-4.9/xfs-reject-all-unaligned-direct-writes-to-reflinked-files.patch 
b/queue-4.9/xfs-reject-all-unaligned-direct-writes-to-reflinked-files.patch new file mode 100644 index 00000000000..0b5b9a17bae --- /dev/null +++ b/queue-4.9/xfs-reject-all-unaligned-direct-writes-to-reflinked-files.patch @@ -0,0 +1,125 @@ +From 54a4ef8af4e0dc5c983d17fcb9cf5fd25666d94e Mon Sep 17 00:00:00 2001 +From: Christoph Hellwig +Date: Mon, 6 Feb 2017 13:00:54 -0800 +Subject: xfs: reject all unaligned direct writes to reflinked files + +From: Christoph Hellwig + +commit 54a4ef8af4e0dc5c983d17fcb9cf5fd25666d94e upstream. + +We currently fall back from direct to buffered writes if we detect a +remaining shared extent in the iomap_begin callback. But by the time +iomap_begin is called for the potentially unaligned end block we might +have already written most of the data to disk, which we'd now write +again using buffered I/O. To avoid this reject all writes to reflinked +files before starting I/O so that we are guaranteed to only write the +data once. + +The alternative would be to unshare the unaligned start and/or end block +before doing the I/O. I think that's doable, and will actually be +required to support reflinks on DAX file system. But it will take a +little more time and I'd rather get rid of the double write ASAP. + +Signed-off-by: Christoph Hellwig +Reviewed-by: Brian Foster +Reviewed-by: Darrick J. Wong +Signed-off-by: Darrick J. Wong +[slight changes in context due to the new direct I/O code in 4.10+] +Signed-off-by: Greg Kroah-Hartman + +--- + fs/xfs/xfs_aops.c | 45 --------------------------------------------- + fs/xfs/xfs_file.c | 9 +++++++++ + fs/xfs/xfs_trace.h | 2 +- + 3 files changed, 10 insertions(+), 46 deletions(-) + +--- a/fs/xfs/xfs_aops.c ++++ b/fs/xfs/xfs_aops.c +@@ -1263,44 +1263,6 @@ xfs_map_trim_size( + bh_result->b_size = mapping_size; + } + +-/* Bounce unaligned directio writes to the page cache. 
*/ +-static int +-xfs_bounce_unaligned_dio_write( +- struct xfs_inode *ip, +- xfs_fileoff_t offset_fsb, +- struct xfs_bmbt_irec *imap) +-{ +- struct xfs_bmbt_irec irec; +- xfs_fileoff_t delta; +- bool shared; +- bool x; +- int error; +- +- irec = *imap; +- if (offset_fsb > irec.br_startoff) { +- delta = offset_fsb - irec.br_startoff; +- irec.br_blockcount -= delta; +- irec.br_startblock += delta; +- irec.br_startoff = offset_fsb; +- } +- error = xfs_reflink_trim_around_shared(ip, &irec, &shared, &x); +- if (error) +- return error; +- +- /* +- * We're here because we're trying to do a directio write to a +- * region that isn't aligned to a filesystem block. If any part +- * of the extent is shared, fall back to buffered mode to handle +- * the RMW. This is done by returning -EREMCHG ("remote addr +- * changed"), which is caught further up the call stack. +- */ +- if (shared) { +- trace_xfs_reflink_bounce_dio_write(ip, imap); +- return -EREMCHG; +- } +- return 0; +-} +- + STATIC int + __xfs_get_blocks( + struct inode *inode, +@@ -1438,13 +1400,6 @@ __xfs_get_blocks( + if (imap.br_startblock != HOLESTARTBLOCK && + imap.br_startblock != DELAYSTARTBLOCK && + (create || !ISUNWRITTEN(&imap))) { +- if (create && direct && !is_cow) { +- error = xfs_bounce_unaligned_dio_write(ip, offset_fsb, +- &imap); +- if (error) +- return error; +- } +- + xfs_map_buffer(inode, bh_result, &imap, offset); + if (ISUNWRITTEN(&imap)) + set_buffer_unwritten(bh_result); +--- a/fs/xfs/xfs_file.c ++++ b/fs/xfs/xfs_file.c +@@ -554,6 +554,15 @@ xfs_file_dio_aio_write( + if ((iocb->ki_pos & mp->m_blockmask) || + ((iocb->ki_pos + count) & mp->m_blockmask)) { + unaligned_io = 1; ++ ++ /* ++ * We can't properly handle unaligned direct I/O to reflink ++ * files yet, as we can't unshare a partial block. 
++ */ ++ if (xfs_is_reflink_inode(ip)) { ++ trace_xfs_reflink_bounce_dio_write(ip, iocb->ki_pos, count); ++ return -EREMCHG; ++ } + iolock = XFS_IOLOCK_EXCL; + } else { + iolock = XFS_IOLOCK_SHARED; +--- a/fs/xfs/xfs_trace.h ++++ b/fs/xfs/xfs_trace.h +@@ -3353,7 +3353,7 @@ DEFINE_INODE_IREC_EVENT(xfs_reflink_conv + DEFINE_RW_EVENT(xfs_reflink_reserve_cow); + DEFINE_RW_EVENT(xfs_reflink_allocate_cow_range); + +-DEFINE_INODE_IREC_EVENT(xfs_reflink_bounce_dio_write); ++DEFINE_SIMPLE_IO_EVENT(xfs_reflink_bounce_dio_write); + DEFINE_IOMAP_EVENT(xfs_reflink_find_cow_mapping); + DEFINE_INODE_IREC_EVENT(xfs_reflink_trim_irec); + diff --git a/queue-4.9/xfs-reset-b_first_retry_time-when-clear-the-retry-status-of-xfs_buf_t.patch b/queue-4.9/xfs-reset-b_first_retry_time-when-clear-the-retry-status-of-xfs_buf_t.patch new file mode 100644 index 00000000000..970cc93a618 --- /dev/null +++ b/queue-4.9/xfs-reset-b_first_retry_time-when-clear-the-retry-status-of-xfs_buf_t.patch @@ -0,0 +1,32 @@ +From 4dd2eb633598cb6a5a0be2fd9a2be0819f5eeb5f Mon Sep 17 00:00:00 2001 +From: Hou Tao +Date: Fri, 3 Feb 2017 14:39:07 -0800 +Subject: xfs: reset b_first_retry_time when clear the retry status of xfs_buf_t + +From: Hou Tao + +commit 4dd2eb633598cb6a5a0be2fd9a2be0819f5eeb5f upstream. + +After successful IO or permanent error, b_first_retry_time also +needs to be cleared, else the invalid first retry time will be +used by the next retry check. + +Signed-off-by: Hou Tao +Reviewed-by: Darrick J. Wong +Signed-off-by: Darrick J. 
Wong +Signed-off-by: Greg Kroah-Hartman + +--- + fs/xfs/xfs_buf_item.c | 1 + + 1 file changed, 1 insertion(+) + +--- a/fs/xfs/xfs_buf_item.c ++++ b/fs/xfs/xfs_buf_item.c +@@ -1162,6 +1162,7 @@ xfs_buf_iodone_callbacks( + */ + bp->b_last_error = 0; + bp->b_retries = 0; ++ bp->b_first_retry_time = 0; + + xfs_buf_do_callbacks(bp); + bp->b_fspriv = NULL; diff --git a/queue-4.9/xfs-split-indlen-reservations-fairly-when-under-reserved.patch b/queue-4.9/xfs-split-indlen-reservations-fairly-when-under-reserved.patch new file mode 100644 index 00000000000..b7b4fdb4128 --- /dev/null +++ b/queue-4.9/xfs-split-indlen-reservations-fairly-when-under-reserved.patch @@ -0,0 +1,118 @@ +From 75d65361cf3c0dae2af970c305e19c727b28a510 Mon Sep 17 00:00:00 2001 +From: Brian Foster +Date: Mon, 13 Feb 2017 22:48:30 -0800 +Subject: xfs: split indlen reservations fairly when under reserved + +From: Brian Foster + +commit 75d65361cf3c0dae2af970c305e19c727b28a510 upstream. + +Certain workoads that punch holes into speculative preallocation can +cause delalloc indirect reservation splits when the delalloc extent is +split in two. If further splits occur, an already short-handed extent +can be split into two in a manner that leaves zero indirect blocks for +one of the two new extents. This occurs because the shortage is large +enough that the xfs_bmap_split_indlen() algorithm completely drains the +requested indlen of one of the extents before it honors the existing +reservation. + +This ultimately results in a warning from xfs_bmap_del_extent(). This +has been observed during file copies of large, sparse files using 'cp +--sparse=always.' + +To avoid this problem, update xfs_bmap_split_indlen() to explicitly +apply the reservation shortage fairly between both extents. This smooths +out the overall indlen shortage and defers the situation where we end up +with a delalloc extent with zero indlen reservation to extreme +circumstances. 
+ +Reported-by: Patrick Dung +Signed-off-by: Brian Foster +Reviewed-by: Darrick J. Wong +Signed-off-by: Darrick J. Wong +Signed-off-by: Greg Kroah-Hartman + +--- + fs/xfs/libxfs/xfs_bmap.c | 61 +++++++++++++++++++++++++++++++++-------------- + 1 file changed, 43 insertions(+), 18 deletions(-) + +--- a/fs/xfs/libxfs/xfs_bmap.c ++++ b/fs/xfs/libxfs/xfs_bmap.c +@@ -4899,34 +4899,59 @@ xfs_bmap_split_indlen( + xfs_filblks_t len2 = *indlen2; + xfs_filblks_t nres = len1 + len2; /* new total res. */ + xfs_filblks_t stolen = 0; ++ xfs_filblks_t resfactor; + + /* + * Steal as many blocks as we can to try and satisfy the worst case + * indlen for both new extents. + */ +- while (nres > ores && avail) { +- nres--; +- avail--; +- stolen++; +- } ++ if (ores < nres && avail) ++ stolen = XFS_FILBLKS_MIN(nres - ores, avail); ++ ores += stolen; ++ ++ /* nothing else to do if we've satisfied the new reservation */ ++ if (ores >= nres) ++ return stolen; ++ ++ /* ++ * We can't meet the total required reservation for the two extents. ++ * Calculate the percent of the overall shortage between both extents ++ * and apply this percentage to each of the requested indlen values. ++ * This distributes the shortage fairly and reduces the chances that one ++ * of the two extents is left with nothing when extents are repeatedly ++ * split. ++ */ ++ resfactor = (ores * 100); ++ do_div(resfactor, nres); ++ len1 *= resfactor; ++ do_div(len1, 100); ++ len2 *= resfactor; ++ do_div(len2, 100); ++ ASSERT(len1 + len2 <= ores); ++ ASSERT(len1 < *indlen1 && len2 < *indlen2); + + /* +- * The only blocks available are those reserved for the original +- * extent and what we can steal from the extent being removed. +- * If this still isn't enough to satisfy the combined +- * requirements for the two new extents, skim blocks off of each +- * of the new reservations until they match what is available. ++ * Hand out the remainder to each extent. 
If one of the two reservations ++ * is zero, we want to make sure that one gets a block first. The loop ++ * below starts with len1, so hand len2 a block right off the bat if it ++ * is zero. + */ +- while (nres > ores) { +- if (len1) { +- len1--; +- nres--; ++ ores -= (len1 + len2); ++ ASSERT((*indlen1 - len1) + (*indlen2 - len2) >= ores); ++ if (ores && !len2 && *indlen2) { ++ len2++; ++ ores--; ++ } ++ while (ores) { ++ if (len1 < *indlen1) { ++ len1++; ++ ores--; + } +- if (nres == ores) ++ if (!ores) + break; +- if (len2) { +- len2--; +- nres--; ++ if (len2 < *indlen2) { ++ len2++; ++ ores--; + } + } + diff --git a/queue-4.9/xfs-try-any-ag-when-allocating-the-first-btree-block-when-reflinking.patch b/queue-4.9/xfs-try-any-ag-when-allocating-the-first-btree-block-when-reflinking.patch new file mode 100644 index 00000000000..2cd8d041a81 --- /dev/null +++ b/queue-4.9/xfs-try-any-ag-when-allocating-the-first-btree-block-when-reflinking.patch @@ -0,0 +1,97 @@ +From 2fcc319d2467a5f5b78f35f79fd6e22741a31b1e Mon Sep 17 00:00:00 2001 +From: Christoph Hellwig +Date: Wed, 8 Mar 2017 10:38:53 -0800 +Subject: xfs: try any AG when allocating the first btree block when reflinking + +From: Christoph Hellwig + +commit 2fcc319d2467a5f5b78f35f79fd6e22741a31b1e upstream. + +When a reflink operation causes the bmap code to allocate a btree block +we're currently doing single-AG allocations due to having ->firstblock +set and then try any higher AG due a little reflink quirk we've put in +when adding the reflink code. But given that we do not have a minleft +reservation of any kind in this AG we can still not have any space in +the same or higher AG even if the file system has enough free space. +To fix this use a XFS_ALLOCTYPE_FIRST_AG allocation in this fall back +path instead. + +[And yes, we need to redo this properly instead of piling hacks over + hacks. I'm working on that, but it's not going to be a small series. 
+ In the meantime this fixes the customer reported issue] + +Also add a warning for failing allocations to make it easier to debug. + +Signed-off-by: Christoph Hellwig +Reviewed-by: Darrick J. Wong +Signed-off-by: Darrick J. Wong +Signed-off-by: Greg Kroah-Hartman + +--- + fs/xfs/libxfs/xfs_bmap.c | 10 +++++++--- + fs/xfs/libxfs/xfs_bmap_btree.c | 6 +++--- + 2 files changed, 10 insertions(+), 6 deletions(-) + +--- a/fs/xfs/libxfs/xfs_bmap.c ++++ b/fs/xfs/libxfs/xfs_bmap.c +@@ -769,8 +769,8 @@ xfs_bmap_extents_to_btree( + args.type = XFS_ALLOCTYPE_START_BNO; + args.fsbno = XFS_INO_TO_FSB(mp, ip->i_ino); + } else if (dfops->dop_low) { +-try_another_ag: + args.type = XFS_ALLOCTYPE_START_BNO; ++try_another_ag: + args.fsbno = *firstblock; + } else { + args.type = XFS_ALLOCTYPE_NEAR_BNO; +@@ -796,13 +796,17 @@ try_another_ag: + if (xfs_sb_version_hasreflink(&cur->bc_mp->m_sb) && + args.fsbno == NULLFSBLOCK && + args.type == XFS_ALLOCTYPE_NEAR_BNO) { +- dfops->dop_low = true; ++ args.type = XFS_ALLOCTYPE_FIRST_AG; + goto try_another_ag; + } ++ if (WARN_ON_ONCE(args.fsbno == NULLFSBLOCK)) { ++ xfs_iroot_realloc(ip, -1, whichfork); ++ xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); ++ return -ENOSPC; ++ } + /* + * Allocation can't fail, the space was reserved. + */ +- ASSERT(args.fsbno != NULLFSBLOCK); + ASSERT(*firstblock == NULLFSBLOCK || + args.agno >= XFS_FSB_TO_AGNO(mp, *firstblock)); + *firstblock = cur->bc_private.b.firstblock = args.fsbno; +--- a/fs/xfs/libxfs/xfs_bmap_btree.c ++++ b/fs/xfs/libxfs/xfs_bmap_btree.c +@@ -453,8 +453,8 @@ xfs_bmbt_alloc_block( + + if (args.fsbno == NULLFSBLOCK) { + args.fsbno = be64_to_cpu(start->l); +-try_another_ag: + args.type = XFS_ALLOCTYPE_START_BNO; ++try_another_ag: + /* + * Make sure there is sufficient room left in the AG to + * complete a full tree split for an extent insert. 
If +@@ -494,8 +494,8 @@ try_another_ag: + if (xfs_sb_version_hasreflink(&cur->bc_mp->m_sb) && + args.fsbno == NULLFSBLOCK && + args.type == XFS_ALLOCTYPE_NEAR_BNO) { +- cur->bc_private.b.dfops->dop_low = true; + args.fsbno = cur->bc_private.b.firstblock; ++ args.type = XFS_ALLOCTYPE_FIRST_AG; + goto try_another_ag; + } + +@@ -512,7 +512,7 @@ try_another_ag: + goto error0; + cur->bc_private.b.dfops->dop_low = true; + } +- if (args.fsbno == NULLFSBLOCK) { ++ if (WARN_ON_ONCE(args.fsbno == NULLFSBLOCK)) { + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + *stat = 0; + return 0; diff --git a/queue-4.9/xfs-tune-down-agno-asserts-in-the-bmap-code.patch b/queue-4.9/xfs-tune-down-agno-asserts-in-the-bmap-code.patch new file mode 100644 index 00000000000..60b561ed819 --- /dev/null +++ b/queue-4.9/xfs-tune-down-agno-asserts-in-the-bmap-code.patch @@ -0,0 +1,83 @@ +From 410d17f67e583559be3a922f8b6cc336331893f3 Mon Sep 17 00:00:00 2001 +From: Christoph Hellwig +Date: Thu, 16 Feb 2017 17:12:51 -0800 +Subject: xfs: tune down agno asserts in the bmap code + +From: Christoph Hellwig + +commit 410d17f67e583559be3a922f8b6cc336331893f3 upstream. + +In various places we currently assert that xfs_bmap_btalloc allocates +from the same as the firstblock value passed in, unless it's either +NULLAGNO or the dop_low flag is set. But the reflink code does not +fully follow this convention as it passes in firstblock purely as +a hint for the allocator without actually having previous allocations +in the transaction, and without having a minleft check on the current +AG, leading to the assert firing on a very full and heavily used +file system. As even the reflink code only allocates from equal or +higher AGs for now we can simply the check to always allow for equal +or higher AGs. + +Note that we need to eventually split the two meanings of the firstblock +value. At that point we can also allow the reflink code to allocate +from any AG instead of limiting it in any way. 
+ +Signed-off-by: Christoph Hellwig +Reviewed-by: Darrick J. Wong +Signed-off-by: Darrick J. Wong +Signed-off-by: Greg Kroah-Hartman + +--- + fs/xfs/libxfs/xfs_bmap.c | 22 ++++++---------------- + 1 file changed, 6 insertions(+), 16 deletions(-) + +--- a/fs/xfs/libxfs/xfs_bmap.c ++++ b/fs/xfs/libxfs/xfs_bmap.c +@@ -804,9 +804,7 @@ try_another_ag: + */ + ASSERT(args.fsbno != NULLFSBLOCK); + ASSERT(*firstblock == NULLFSBLOCK || +- args.agno == XFS_FSB_TO_AGNO(mp, *firstblock) || +- (dfops->dop_low && +- args.agno > XFS_FSB_TO_AGNO(mp, *firstblock))); ++ args.agno >= XFS_FSB_TO_AGNO(mp, *firstblock)); + *firstblock = cur->bc_private.b.firstblock = args.fsbno; + cur->bc_private.b.allocated++; + ip->i_d.di_nblocks++; +@@ -3923,17 +3921,13 @@ xfs_bmap_btalloc( + * the first block that was allocated. + */ + ASSERT(*ap->firstblock == NULLFSBLOCK || +- XFS_FSB_TO_AGNO(mp, *ap->firstblock) == +- XFS_FSB_TO_AGNO(mp, args.fsbno) || +- (ap->dfops->dop_low && +- XFS_FSB_TO_AGNO(mp, *ap->firstblock) < +- XFS_FSB_TO_AGNO(mp, args.fsbno))); ++ XFS_FSB_TO_AGNO(mp, *ap->firstblock) <= ++ XFS_FSB_TO_AGNO(mp, args.fsbno)); + + ap->blkno = args.fsbno; + if (*ap->firstblock == NULLFSBLOCK) + *ap->firstblock = args.fsbno; +- ASSERT(nullfb || fb_agno == args.agno || +- (ap->dfops->dop_low && fb_agno < args.agno)); ++ ASSERT(nullfb || fb_agno <= args.agno); + ap->length = args.len; + if (!(ap->flags & XFS_BMAPI_COWFORK)) + ap->ip->i_d.di_nblocks += args.len; +@@ -4858,13 +4852,9 @@ error0: + if (bma.cur) { + if (!error) { + ASSERT(*firstblock == NULLFSBLOCK || +- XFS_FSB_TO_AGNO(mp, *firstblock) == ++ XFS_FSB_TO_AGNO(mp, *firstblock) <= + XFS_FSB_TO_AGNO(mp, +- bma.cur->bc_private.b.firstblock) || +- (dfops->dop_low && +- XFS_FSB_TO_AGNO(mp, *firstblock) < +- XFS_FSB_TO_AGNO(mp, +- bma.cur->bc_private.b.firstblock))); ++ bma.cur->bc_private.b.firstblock)); + *firstblock = bma.cur->bc_private.b.firstblock; + } + xfs_btree_del_cursor(bma.cur, diff --git 
a/queue-4.9/xfs-update-ctime-and-mtime-on-clone-destinatation-inodes.patch b/queue-4.9/xfs-update-ctime-and-mtime-on-clone-destinatation-inodes.patch new file mode 100644 index 00000000000..0097104f8c3 --- /dev/null +++ b/queue-4.9/xfs-update-ctime-and-mtime-on-clone-destinatation-inodes.patch @@ -0,0 +1,66 @@ +From c5ecb42342852892f978572ddc6dca703460f25a Mon Sep 17 00:00:00 2001 +From: Christoph Hellwig +Date: Mon, 6 Feb 2017 17:45:51 -0800 +Subject: xfs: update ctime and mtime on clone destinatation inodes + +From: Christoph Hellwig + +commit c5ecb42342852892f978572ddc6dca703460f25a upstream. + +We're changing both metadata and data, so we need to update the +timestamps for clone operations. Dedupe on the other hand does +not change file data, and only changes invisible metadata so the +timestamps should not be updated. + +This follows existing btrfs behavior. + +Signed-off-by: Christoph Hellwig +Reviewed-by: Darrick J. Wong +[darrick: remove redundant is_dedupe test] +Signed-off-by: Darrick J. 
Wong +Signed-off-by: Greg Kroah-Hartman + +--- + fs/xfs/xfs_reflink.c | 12 +++++++++--- + 1 file changed, 9 insertions(+), 3 deletions(-) + +--- a/fs/xfs/xfs_reflink.c ++++ b/fs/xfs/xfs_reflink.c +@@ -985,13 +985,14 @@ STATIC int + xfs_reflink_update_dest( + struct xfs_inode *dest, + xfs_off_t newlen, +- xfs_extlen_t cowextsize) ++ xfs_extlen_t cowextsize, ++ bool is_dedupe) + { + struct xfs_mount *mp = dest->i_mount; + struct xfs_trans *tp; + int error; + +- if (newlen <= i_size_read(VFS_I(dest)) && cowextsize == 0) ++ if (is_dedupe && newlen <= i_size_read(VFS_I(dest)) && cowextsize == 0) + return 0; + + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp); +@@ -1012,6 +1013,10 @@ xfs_reflink_update_dest( + dest->i_d.di_flags2 |= XFS_DIFLAG2_COWEXTSIZE; + } + ++ if (!is_dedupe) { ++ xfs_trans_ichgtime(tp, dest, ++ XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); ++ } + xfs_trans_log_inode(tp, dest, XFS_ILOG_CORE); + + error = xfs_trans_commit(tp); +@@ -1528,7 +1533,8 @@ xfs_reflink_remap_range( + !(dest->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE)) + cowextsize = src->i_d.di_cowextsize; + +- ret = xfs_reflink_update_dest(dest, pos_out + len, cowextsize); ++ ret = xfs_reflink_update_dest(dest, pos_out + len, cowextsize, ++ is_dedupe); + + out_unlock: + xfs_iunlock(src, XFS_MMAPLOCK_EXCL); diff --git a/queue-4.9/xfs-use-iomap-new-flag-for-newly-allocated-delalloc-blocks.patch b/queue-4.9/xfs-use-iomap-new-flag-for-newly-allocated-delalloc-blocks.patch new file mode 100644 index 00000000000..f9c2b770391 --- /dev/null +++ b/queue-4.9/xfs-use-iomap-new-flag-for-newly-allocated-delalloc-blocks.patch @@ -0,0 +1,150 @@ +From f65e6fad293b3a5793b7fa2044800506490e7a2e Mon Sep 17 00:00:00 2001 +From: Brian Foster +Date: Wed, 8 Mar 2017 09:58:08 -0800 +Subject: xfs: use iomap new flag for newly allocated delalloc blocks + +From: Brian Foster + +commit f65e6fad293b3a5793b7fa2044800506490e7a2e upstream. 
+ +Commit fa7f138 ("xfs: clear delalloc and cache on buffered write +failure") fixed one regression in the iomap error handling code and +exposed another. The fundamental problem is that if a buffered write +is a rewrite of preexisting delalloc blocks and the write fails, the +failure handling code can punch out preexisting blocks with valid +file data. + +This was reproduced directly by sub-block writes in the LTP +kernel/syscalls/write/write03 test. A first 100 byte write allocates +a single block in a file. A subsequent 100 byte write fails and +punches out the block, including the data successfully written by +the previous write. + +To address this problem, update the ->iomap_begin() handler to +distinguish newly allocated delalloc blocks from preexisting +delalloc blocks via the IOMAP_F_NEW flag. Use this flag in the +->iomap_end() handler to decide when a failed or short write should +punch out delalloc blocks. + +This introduces the subtle requirement that ->iomap_begin() should +never combine newly allocated delalloc blocks with existing blocks +in the resulting iomap descriptor. This can occur when a new +delalloc reservation merges with a neighboring extent that is part +of the current write, for example. Therefore, drop the +post-allocation extent lookup from xfs_bmapi_reserve_delalloc() and +just return the record inserted into the fork. This ensures only new +blocks are returned and thus that preexisting delalloc blocks are +always handled as "found" blocks and not punched out on a failed +rewrite. + +Reported-by: Xiong Zhou +Signed-off-by: Brian Foster +Reviewed-by: Christoph Hellwig +Reviewed-by: Darrick J. Wong +Signed-off-by: Darrick J. 
Wong +Signed-off-by: Greg Kroah-Hartman + +--- + fs/xfs/libxfs/xfs_bmap.c | 24 ++++++++++++++---------- + fs/xfs/xfs_iomap.c | 16 +++++++++++----- + 2 files changed, 25 insertions(+), 15 deletions(-) + +--- a/fs/xfs/libxfs/xfs_bmap.c ++++ b/fs/xfs/libxfs/xfs_bmap.c +@@ -4253,6 +4253,19 @@ xfs_bmapi_read( + return 0; + } + ++/* ++ * Add a delayed allocation extent to an inode. Blocks are reserved from the ++ * global pool and the extent inserted into the inode in-core extent tree. ++ * ++ * On entry, got refers to the first extent beyond the offset of the extent to ++ * allocate or eof is specified if no such extent exists. On return, got refers ++ * to the extent record that was inserted to the inode fork. ++ * ++ * Note that the allocated extent may have been merged with contiguous extents ++ * during insertion into the inode fork. Thus, got does not reflect the current ++ * state of the inode fork on return. If necessary, the caller can use lastx to ++ * look up the updated record in the inode fork. ++ */ + int + xfs_bmapi_reserve_delalloc( + struct xfs_inode *ip, +@@ -4339,13 +4352,8 @@ xfs_bmapi_reserve_delalloc( + got->br_startblock = nullstartblock(indlen); + got->br_blockcount = alen; + got->br_state = XFS_EXT_NORM; +- xfs_bmap_add_extent_hole_delay(ip, whichfork, lastx, got); + +- /* +- * Update our extent pointer, given that xfs_bmap_add_extent_hole_delay +- * might have merged it into one of the neighbouring ones. +- */ +- xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *lastx), got); ++ xfs_bmap_add_extent_hole_delay(ip, whichfork, lastx, got); + + /* + * Tag the inode if blocks were preallocated. 
Note that COW fork +@@ -4357,10 +4365,6 @@ xfs_bmapi_reserve_delalloc( + if (whichfork == XFS_COW_FORK && (prealloc || aoff < off || alen > len)) + xfs_inode_set_cowblocks_tag(ip); + +- ASSERT(got->br_startoff <= aoff); +- ASSERT(got->br_startoff + got->br_blockcount >= aoff + alen); +- ASSERT(isnullstartblock(got->br_startblock)); +- ASSERT(got->br_state == XFS_EXT_NORM); + return 0; + + out_unreserve_blocks: +--- a/fs/xfs/xfs_iomap.c ++++ b/fs/xfs/xfs_iomap.c +@@ -637,6 +637,11 @@ retry: + goto out_unlock; + } + ++ /* ++ * Flag newly allocated delalloc blocks with IOMAP_F_NEW so we punch ++ * them out if the write happens to fail. ++ */ ++ iomap->flags = IOMAP_F_NEW; + trace_xfs_iomap_alloc(ip, offset, count, 0, &got); + done: + if (isnullstartblock(got.br_startblock)) +@@ -1061,7 +1066,8 @@ xfs_file_iomap_end_delalloc( + struct xfs_inode *ip, + loff_t offset, + loff_t length, +- ssize_t written) ++ ssize_t written, ++ struct iomap *iomap) + { + struct xfs_mount *mp = ip->i_mount; + xfs_fileoff_t start_fsb; +@@ -1080,14 +1086,14 @@ xfs_file_iomap_end_delalloc( + end_fsb = XFS_B_TO_FSB(mp, offset + length); + + /* +- * Trim back delalloc blocks if we didn't manage to write the whole +- * range reserved. ++ * Trim delalloc blocks if they were allocated by this write and we ++ * didn't manage to write the whole range. + * + * We don't need to care about racing delalloc as we hold i_mutex + * across the reserve/allocate/unreserve calls. If there are delalloc + * blocks in the range, they are ours. 
+ */ +- if (start_fsb < end_fsb) { ++ if ((iomap->flags & IOMAP_F_NEW) && start_fsb < end_fsb) { + truncate_pagecache_range(VFS_I(ip), XFS_FSB_TO_B(mp, start_fsb), + XFS_FSB_TO_B(mp, end_fsb) - 1); + +@@ -1117,7 +1123,7 @@ xfs_file_iomap_end( + { + if ((flags & IOMAP_WRITE) && iomap->type == IOMAP_DELALLOC) + return xfs_file_iomap_end_delalloc(XFS_I(inode), offset, +- length, written); ++ length, written, iomap); + return 0; + } + diff --git a/queue-4.9/xfs-use-xfs_icluster_size_fsb-to-calculate-inode-alignment-mask.patch b/queue-4.9/xfs-use-xfs_icluster_size_fsb-to-calculate-inode-alignment-mask.patch new file mode 100644 index 00000000000..adfd695e2b2 --- /dev/null +++ b/queue-4.9/xfs-use-xfs_icluster_size_fsb-to-calculate-inode-alignment-mask.patch @@ -0,0 +1,44 @@ +From d5825712ee98d68a2c17bc89dad2c30276894cba Mon Sep 17 00:00:00 2001 +From: Chandan Rajendra +Date: Thu, 2 Mar 2017 15:06:33 -0800 +Subject: xfs: Use xfs_icluster_size_fsb() to calculate inode alignment mask + +From: Chandan Rajendra + +commit d5825712ee98d68a2c17bc89dad2c30276894cba upstream. + +When block size is larger than inode cluster size, the call to +XFS_B_TO_FSBT(mp, mp->m_inode_cluster_size) returns 0. Also, mkfs.xfs +would have set xfs_sb->sb_inoalignmt to 0. Hence in +xfs_set_inoalignment(), xfs_mount->m_inoalign_mask gets initialized to +-1 instead of 0. However, xfs_mount->m_sinoalign would get correctly +intialized to 0 because for every positive value of xfs_mount->m_dalign, +the condition "!(mp->m_dalign & mp->m_inoalign_mask)" would evaluate to +false. + +Also, xfs_imap() worked fine even with xfs_mount->m_inoalign_mask having +-1 as the value because blks_per_cluster variable would have the value 1 +and hence we would never have a need to use xfs_mount->m_inoalign_mask +to compute the inode chunk's agbno and offset within the chunk. + +Signed-off-by: Chandan Rajendra +Reviewed-by: Darrick J. Wong +Signed-off-by: Darrick J. 
Wong +Signed-off-by: Greg Kroah-Hartman + +--- + fs/xfs/xfs_mount.c | 3 +-- + 1 file changed, 1 insertion(+), 2 deletions(-) + +--- a/fs/xfs/xfs_mount.c ++++ b/fs/xfs/xfs_mount.c +@@ -502,8 +502,7 @@ STATIC void + xfs_set_inoalignment(xfs_mount_t *mp) + { + if (xfs_sb_version_hasalign(&mp->m_sb) && +- mp->m_sb.sb_inoalignmt >= +- XFS_B_TO_FSBT(mp, mp->m_inode_cluster_size)) ++ mp->m_sb.sb_inoalignmt >= xfs_icluster_size_fsb(mp)) + mp->m_inoalign_mask = mp->m_sb.sb_inoalignmt - 1; + else + mp->m_inoalign_mask = 0; diff --git a/queue-4.9/xfs-use-xfs_icluster_size_fsb-to-calculate-inode-chunk-alignment.patch b/queue-4.9/xfs-use-xfs_icluster_size_fsb-to-calculate-inode-chunk-alignment.patch new file mode 100644 index 00000000000..6bde120baa6 --- /dev/null +++ b/queue-4.9/xfs-use-xfs_icluster_size_fsb-to-calculate-inode-chunk-alignment.patch @@ -0,0 +1,91 @@ +From 8ee9fdbebc84b39f1d1c201c5e32277c61d034aa Mon Sep 17 00:00:00 2001 +From: Chandan Rajendra +Date: Thu, 16 Feb 2017 17:12:16 -0800 +Subject: xfs: Use xfs_icluster_size_fsb() to calculate inode chunk alignment + +From: Chandan Rajendra + +commit 8ee9fdbebc84b39f1d1c201c5e32277c61d034aa upstream. + +On a ppc64 system, executing generic/256 test with 32k block size gives the following call trace, + +XFS: Assertion failed: args->maxlen > 0, file: /root/repos/linux/fs/xfs/libxfs/xfs_alloc.c, line: 2026 + +kernel BUG at /root/repos/linux/fs/xfs/xfs_message.c:113! 
+Oops: Exception in kernel mode, sig: 5 [#1] +SMP NR_CPUS=2048 +DEBUG_PAGEALLOC +NUMA +pSeries +Modules linked in: +CPU: 2 PID: 19361 Comm: mkdir Not tainted 4.10.0-rc5 #58 +task: c000000102606d80 task.stack: c0000001026b8000 +NIP: c0000000004ef798 LR: c0000000004ef798 CTR: c00000000082b290 +REGS: c0000001026bb090 TRAP: 0700 Not tainted (4.10.0-rc5) +MSR: 8000000000029032 +CR: 28004428 XER: 00000000 +CFAR: c0000000004ef180 SOFTE: 1 +GPR00: c0000000004ef798 c0000001026bb310 c000000001157300 ffffffffffffffea +GPR04: 000000000000000a c0000001026bb130 0000000000000000 ffffffffffffffc0 +GPR08: 00000000000000d1 0000000000000021 00000000ffffffd1 c000000000dd4990 +GPR12: 0000000022004444 c00000000fe00800 0000000020000000 0000000000000000 +GPR16: 0000000000000000 0000000043a606fc 0000000043a76c08 0000000043a1b3d0 +GPR20: 000001002a35cd60 c0000001026bbb80 0000000000000000 0000000000000001 +GPR24: 0000000000000240 0000000000000004 c00000062dc55000 0000000000000000 +GPR28: 0000000000000004 c00000062ecd9200 0000000000000000 c0000001026bb6c0 +NIP [c0000000004ef798] .assfail+0x28/0x30 +LR [c0000000004ef798] .assfail+0x28/0x30 +Call Trace: +[c0000001026bb310] [c0000000004ef798] .assfail+0x28/0x30 (unreliable) +[c0000001026bb380] [c000000000455d74] .xfs_alloc_space_available+0x194/0x1b0 +[c0000001026bb410] [c00000000045b914] .xfs_alloc_fix_freelist+0x144/0x480 +[c0000001026bb580] [c00000000045c368] .xfs_alloc_vextent+0x698/0xa90 +[c0000001026bb650] [c0000000004a6200] .xfs_ialloc_ag_alloc+0x170/0x820 +[c0000001026bb7c0] [c0000000004a9098] .xfs_dialloc+0x158/0x320 +[c0000001026bb8a0] [c0000000004e628c] .xfs_ialloc+0x7c/0x610 +[c0000001026bb990] [c0000000004e8138] .xfs_dir_ialloc+0xa8/0x2f0 +[c0000001026bbaa0] [c0000000004e8814] .xfs_create+0x494/0x790 +[c0000001026bbbf0] [c0000000004e5ebc] .xfs_generic_create+0x2bc/0x410 +[c0000001026bbce0] [c0000000002b4a34] .vfs_mkdir+0x154/0x230 +[c0000001026bbd70] [c0000000002bc444] .SyS_mkdirat+0x94/0x120 +[c0000001026bbe30] [c00000000000b760] 
system_call+0x38/0xfc +Instruction dump: +4e800020 60000000 7c0802a6 7c862378 3c82ffca 7ca72b78 38841c18 7c651b78 +38600000 f8010010 f821ff91 4bfff94d <0fe00000> 60000000 7c0802a6 7c892378 + +When block size is larger than inode cluster size, the call to +XFS_B_TO_FSBT(mp, mp->m_inode_cluster_size) returns 0. Also, mkfs.xfs +would have set xfs_sb->sb_inoalignmt to 0. This causes +xfs_ialloc_cluster_alignment() to return 0. Due to this +args.minalignslop (in xfs_ialloc_ag_alloc()) gets the unsigned +equivalent of -1 assigned to it. This later causes alloc_len in +xfs_alloc_space_available() to have a value of 0. In such a scenario +when args.total is also 0, the assert statement "ASSERT(args->maxlen > +0);" fails. + +This commit fixes the bug by replacing the call to XFS_B_TO_FSBT() in +xfs_ialloc_cluster_alignment() with a call to xfs_icluster_size_fsb(). + +Suggested-by: Darrick J. Wong +Signed-off-by: Chandan Rajendra +Reviewed-by: Christoph Hellwig +Reviewed-by: Darrick J. Wong +Signed-off-by: Darrick J. Wong +Signed-off-by: Greg Kroah-Hartman + +--- + fs/xfs/libxfs/xfs_ialloc.c | 3 +-- + 1 file changed, 1 insertion(+), 2 deletions(-) + +--- a/fs/xfs/libxfs/xfs_ialloc.c ++++ b/fs/xfs/libxfs/xfs_ialloc.c +@@ -51,8 +51,7 @@ xfs_ialloc_cluster_alignment( + struct xfs_mount *mp) + { + if (xfs_sb_version_hasalign(&mp->m_sb) && +- mp->m_sb.sb_inoalignmt >= +- XFS_B_TO_FSBT(mp, mp->m_inode_cluster_size)) ++ mp->m_sb.sb_inoalignmt >= xfs_icluster_size_fsb(mp)) + return mp->m_sb.sb_inoalignmt; + return 1; + } diff --git a/queue-4.9/xfs-verify-free-block-header-fields.patch b/queue-4.9/xfs-verify-free-block-header-fields.patch new file mode 100644 index 00000000000..7e995279e21 --- /dev/null +++ b/queue-4.9/xfs-verify-free-block-header-fields.patch @@ -0,0 +1,93 @@ +From de14c5f541e78c59006bee56f6c5c2ef1ca07272 Mon Sep 17 00:00:00 2001 +From: "Darrick J. Wong" +Date: Thu, 2 Feb 2017 15:14:00 -0800 +Subject: xfs: verify free block header fields + +From: Darrick J. 
Wong + +commit de14c5f541e78c59006bee56f6c5c2ef1ca07272 upstream. + +Perform basic sanity checking of the directory free block header +fields so that we avoid hanging the system on invalid data. + +(Granted that just means that now we shutdown on directory write, +but that seems better than hanging...) + +Signed-off-by: Darrick J. Wong +Reviewed-by: Christoph Hellwig +Signed-off-by: Greg Kroah-Hartman + +--- + fs/xfs/libxfs/xfs_dir2_node.c | 51 ++++++++++++++++++++++++++++++++++++++++-- + 1 file changed, 49 insertions(+), 2 deletions(-) + +--- a/fs/xfs/libxfs/xfs_dir2_node.c ++++ b/fs/xfs/libxfs/xfs_dir2_node.c +@@ -155,6 +155,42 @@ const struct xfs_buf_ops xfs_dir3_free_b + .verify_write = xfs_dir3_free_write_verify, + }; + ++/* Everything ok in the free block header? */ ++static bool ++xfs_dir3_free_header_check( ++ struct xfs_inode *dp, ++ xfs_dablk_t fbno, ++ struct xfs_buf *bp) ++{ ++ struct xfs_mount *mp = dp->i_mount; ++ unsigned int firstdb; ++ int maxbests; ++ ++ maxbests = dp->d_ops->free_max_bests(mp->m_dir_geo); ++ firstdb = (xfs_dir2_da_to_db(mp->m_dir_geo, fbno) - ++ xfs_dir2_byte_to_db(mp->m_dir_geo, XFS_DIR2_FREE_OFFSET)) * ++ maxbests; ++ if (xfs_sb_version_hascrc(&mp->m_sb)) { ++ struct xfs_dir3_free_hdr *hdr3 = bp->b_addr; ++ ++ if (be32_to_cpu(hdr3->firstdb) != firstdb) ++ return false; ++ if (be32_to_cpu(hdr3->nvalid) > maxbests) ++ return false; ++ if (be32_to_cpu(hdr3->nvalid) < be32_to_cpu(hdr3->nused)) ++ return false; ++ } else { ++ struct xfs_dir2_free_hdr *hdr = bp->b_addr; ++ ++ if (be32_to_cpu(hdr->firstdb) != firstdb) ++ return false; ++ if (be32_to_cpu(hdr->nvalid) > maxbests) ++ return false; ++ if (be32_to_cpu(hdr->nvalid) < be32_to_cpu(hdr->nused)) ++ return false; ++ } ++ return true; ++} + + static int + __xfs_dir3_free_read( +@@ -168,11 +204,22 @@ __xfs_dir3_free_read( + + err = xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp, + XFS_DATA_FORK, &xfs_dir3_free_buf_ops); ++ if (err || !*bpp) ++ return err; ++ ++ /* Check things that 
we can't do in the verifier. */ ++ if (!xfs_dir3_free_header_check(dp, fbno, *bpp)) { ++ xfs_buf_ioerror(*bpp, -EFSCORRUPTED); ++ xfs_verifier_error(*bpp); ++ xfs_trans_brelse(tp, *bpp); ++ return -EFSCORRUPTED; ++ } + + /* try read returns without an error or *bpp if it lands in a hole */ +- if (!err && tp && *bpp) ++ if (tp) + xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_FREE_BUF); +- return err; ++ ++ return 0; + } + + int -- 2.47.3