From: Greg Kroah-Hartman Date: Mon, 18 Sep 2017 08:11:55 +0000 (+0200) Subject: 4.9-stable patches X-Git-Tag: v4.9.51~12 X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=60a196c617841e32455d50264a7b72ab74480489;p=thirdparty%2Fkernel%2Fstable-queue.git 4.9-stable patches added patches: xfs-check-_alloc_read_agf-buffer-pointer-before-using.patch xfs-check-_btree_check_block-value.patch xfs-don-t-allow-bmap-on-rt-files.patch xfs-don-t-crash-on-unexpected-holes-in-dir-attr-btrees.patch xfs-fix-inobt-inode-allocation-search-optimization.patch xfs-fix-multi-ag-deadlock-in-xfs_bunmapi.patch xfs-fix-per-inode-dax-flag-inheritance.patch xfs-fix-quotacheck-dquot-id-overflow-infinite-loop.patch xfs-fix-spurious-spin_is_locked-assert-failures-on-non-smp-kernels.patch xfs-free-cowblocks-and-retry-on-buffered-write-enospc.patch xfs-free-uncommitted-transactions-during-log-recovery.patch xfs-move-handling-of-missing-page-into-one-place-in-xfs_find_get_desired_pgoff.patch xfs-push-buffer-of-flush-locked-dquot-to-avoid-quotacheck-deadlock.patch xfs-release-bli-from-transaction-properly-on-fs-shutdown.patch xfs-remove-bli-from-ail-before-release-on-transaction-abort.patch xfs-set-firstfsb-to-nullfsblock-before-feeding-it-to-_bmapi_write.patch xfs-try-to-avoid-blowing-out-the-transaction-reservation-when-bunmaping-a-shared-extent.patch --- diff --git a/queue-4.9/series b/queue-4.9/series index e5d0b0047c4..cc556c6deea 100644 --- a/queue-4.9/series +++ b/queue-4.9/series @@ -28,3 +28,20 @@ f2fs-check-hot_data-for-roll-forward-recovery.patch x86-fsgsbase-64-fully-initialize-fs-and-gs-state-in-start_thread_common.patch x86-fsgsbase-64-report-fsbase-and-gsbase-correctly-in-core-dumps.patch x86-switch_to-64-rewrite-fs-gs-switching-yet-again-to-fix-amd-cpus.patch +xfs-move-handling-of-missing-page-into-one-place-in-xfs_find_get_desired_pgoff.patch +xfs-fix-spurious-spin_is_locked-assert-failures-on-non-smp-kernels.patch +xfs-push-buffer-of-flush-locked-dquot-to-avoid-quotacheck-deadlock.patch +xfs-try-to-avoid-blowing-out-the-transaction-reservation-when-bunmaping-a-shared-extent.patch +xfs-release-bli-from-transaction-properly-on-fs-shutdown.patch +xfs-remove-bli-from-ail-before-release-on-transaction-abort.patch +xfs-don-t-allow-bmap-on-rt-files.patch +xfs-free-uncommitted-transactions-during-log-recovery.patch +xfs-free-cowblocks-and-retry-on-buffered-write-enospc.patch +xfs-don-t-crash-on-unexpected-holes-in-dir-attr-btrees.patch +xfs-check-_btree_check_block-value.patch +xfs-set-firstfsb-to-nullfsblock-before-feeding-it-to-_bmapi_write.patch +xfs-check-_alloc_read_agf-buffer-pointer-before-using.patch +xfs-fix-quotacheck-dquot-id-overflow-infinite-loop.patch +xfs-fix-multi-ag-deadlock-in-xfs_bunmapi.patch +xfs-fix-per-inode-dax-flag-inheritance.patch +xfs-fix-inobt-inode-allocation-search-optimization.patch diff --git a/queue-4.9/xfs-check-_alloc_read_agf-buffer-pointer-before-using.patch b/queue-4.9/xfs-check-_alloc_read_agf-buffer-pointer-before-using.patch new file mode 100644 index 00000000000..1947cb9447d --- /dev/null +++ b/queue-4.9/xfs-check-_alloc_read_agf-buffer-pointer-before-using.patch @@ -0,0 +1,50 @@ +From hch@lst.de Mon Sep 18 10:08:45 2017 +From: Christoph Hellwig +Date: Sun, 17 Sep 2017 14:06:38 -0700 +Subject: xfs: check _alloc_read_agf buffer pointer before using +To: stable@vger.kernel.org +Cc: linux-xfs@vger.kernel.org, "Darrick J. Wong" +Message-ID: <20170917210712.10804-14-hch@lst.de> + + +From: "Darrick J. 
Wong" + +commit 10479e2dea83d4c421ad05dfc55d918aa8dfc0cd upstream. + +In some circumstances, _alloc_read_agf can return an error code of zero +but also a null AGF buffer pointer. Check for this and jump out. + +Fixes-coverity-id: 1415250 +Fixes-coverity-id: 1415320 +Signed-off-by: Darrick J. Wong +Reviewed-by: Brian Foster +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/libxfs/xfs_refcount.c | 4 ++++ + fs/xfs/xfs_reflink.c | 2 ++ + 2 files changed, 6 insertions(+) + +--- a/fs/xfs/libxfs/xfs_refcount.c ++++ b/fs/xfs/libxfs/xfs_refcount.c +@@ -1640,6 +1640,10 @@ xfs_refcount_recover_cow_leftovers( + error = xfs_alloc_read_agf(mp, tp, agno, 0, &agbp); + if (error) + goto out_trans; ++ if (!agbp) { ++ error = -ENOMEM; ++ goto out_trans; ++ } + cur = xfs_refcountbt_init_cursor(mp, tp, agbp, agno, NULL); + + /* Find all the leftover CoW staging extents. */ +--- a/fs/xfs/xfs_reflink.c ++++ b/fs/xfs/xfs_reflink.c +@@ -169,6 +169,8 @@ xfs_reflink_find_shared( + error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agbp); + if (error) + return error; ++ if (!agbp) ++ return -ENOMEM; + + cur = xfs_refcountbt_init_cursor(mp, NULL, agbp, agno, NULL); + diff --git a/queue-4.9/xfs-check-_btree_check_block-value.patch b/queue-4.9/xfs-check-_btree_check_block-value.patch new file mode 100644 index 00000000000..f4e3ccc74da --- /dev/null +++ b/queue-4.9/xfs-check-_btree_check_block-value.patch @@ -0,0 +1,48 @@ +From hch@lst.de Mon Sep 18 10:08:11 2017 +From: Christoph Hellwig +Date: Sun, 17 Sep 2017 14:06:36 -0700 +Subject: xfs: check _btree_check_block value +To: stable@vger.kernel.org +Cc: linux-xfs@vger.kernel.org, "Darrick J. Wong" +Message-ID: <20170917210712.10804-12-hch@lst.de> + + +From: "Darrick J. Wong" + +commit 1e86eabe73b73c82e1110c746ed3ec6d5e1c0a0d upstream. + +Check the _btree_check_block return value for the firstrec and lastrec +functions, since we have the ability to signal that the repositioning +did not succeed. + +Fixes-coverity-id: 114067 +Fixes-coverity-id: 114068 +Signed-off-by: Darrick J. Wong +Reviewed-by: Brian Foster +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/libxfs/xfs_btree.c | 6 ++++-- + 1 file changed, 4 insertions(+), 2 deletions(-) + +--- a/fs/xfs/libxfs/xfs_btree.c ++++ b/fs/xfs/libxfs/xfs_btree.c +@@ -714,7 +714,8 @@ xfs_btree_firstrec( + * Get the block pointer for this level. + */ + block = xfs_btree_get_block(cur, level, &bp); +- xfs_btree_check_block(cur, block, level, bp); ++ if (xfs_btree_check_block(cur, block, level, bp)) ++ return 0; + /* + * It's empty, there is no such record. + */ +@@ -743,7 +744,8 @@ xfs_btree_lastrec( + * Get the block pointer for this level. + */ + block = xfs_btree_get_block(cur, level, &bp); +- xfs_btree_check_block(cur, block, level, bp); ++ if (xfs_btree_check_block(cur, block, level, bp)) ++ return 0; + /* + * It's empty, there is no such record. + */ diff --git a/queue-4.9/xfs-don-t-allow-bmap-on-rt-files.patch b/queue-4.9/xfs-don-t-allow-bmap-on-rt-files.patch new file mode 100644 index 00000000000..b0d1f4dac3b --- /dev/null +++ b/queue-4.9/xfs-don-t-allow-bmap-on-rt-files.patch @@ -0,0 +1,42 @@ +From 61d819e7bcb7f33da710bf3f5dcb2bcf1e48203c Mon Sep 17 00:00:00 2001 +From: "Darrick J. Wong" +Date: Mon, 19 Jun 2017 13:19:08 -0700 +Subject: xfs: don't allow bmap on rt files + +From: Darrick J. Wong + +commit 61d819e7bcb7f33da710bf3f5dcb2bcf1e48203c upstream. + +bmap returns a dumb LBA address but not the block device that goes with +that LBA. 
Swapfiles don't care about this and will blindly assume that +the data volume is the correct blockdev, which is totally bogus for +files on the rt subvolume. This results in the swap code doing IOs to +arbitrary locations on the data device(!) if the passed in mapping is a +realtime file, so just turn off bmap for rt files. + +Signed-off-by: Darrick J. Wong +Reviewed-by: Christoph Hellwig +Signed-off-by: Greg Kroah-Hartman + + +--- + fs/xfs/xfs_aops.c | 7 +++++-- + 1 file changed, 5 insertions(+), 2 deletions(-) + +--- a/fs/xfs/xfs_aops.c ++++ b/fs/xfs/xfs_aops.c +@@ -1566,9 +1566,12 @@ xfs_vm_bmap( + * The swap code (ab-)uses ->bmap to get a block mapping and then + * bypasseѕ the file system for actual I/O. We really can't allow + * that on reflinks inodes, so we have to skip out here. And yes, +- * 0 is the magic code for a bmap error.. ++ * 0 is the magic code for a bmap error. ++ * ++ * Since we don't pass back blockdev info, we can't return bmap ++ * information for rt files either. + */ +- if (xfs_is_reflink_inode(ip)) { ++ if (xfs_is_reflink_inode(ip) || XFS_IS_REALTIME_INODE(ip)) { + xfs_iunlock(ip, XFS_IOLOCK_SHARED); + return 0; + } diff --git a/queue-4.9/xfs-don-t-crash-on-unexpected-holes-in-dir-attr-btrees.patch b/queue-4.9/xfs-don-t-crash-on-unexpected-holes-in-dir-attr-btrees.patch new file mode 100644 index 00000000000..f3bf13baced --- /dev/null +++ b/queue-4.9/xfs-don-t-crash-on-unexpected-holes-in-dir-attr-btrees.patch @@ -0,0 +1,87 @@ +From hch@lst.de Mon Sep 18 10:07:57 2017 +From: Christoph Hellwig +Date: Sun, 17 Sep 2017 14:06:35 -0700 +Subject: xfs: don't crash on unexpected holes in dir/attr btrees +To: stable@vger.kernel.org +Cc: linux-xfs@vger.kernel.org, "Darrick J. Wong" +Message-ID: <20170917210712.10804-11-hch@lst.de> + + +From: "Darrick J. Wong" + +commit cd87d867920155911d0d2e6485b769d853547750 upstream. + +In quite a few places we call xfs_da_read_buf with a mappedbno that we +don't control, then assume that the function passes back either an error +code or a buffer pointer. Unfortunately, if mappedbno == -2 and bno +maps to a hole, we get a return code of zero and a NULL buffer, which +means that we crash if we actually try to use that buffer pointer. This +happens immediately when we set the buffer type for transaction context. + +Therefore, check that we have no error code and a non-NULL bp before +trying to use bp. This patch is a follow-up to an incomplete fix in +96a3aefb8ffde231 ("xfs: don't crash if reading a directory results in an +unexpected hole"). + +Signed-off-by: Darrick J. 
Wong +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/libxfs/xfs_attr_leaf.c | 2 +- + fs/xfs/libxfs/xfs_da_btree.c | 2 +- + fs/xfs/libxfs/xfs_dir2_block.c | 2 +- + fs/xfs/libxfs/xfs_dir2_leaf.c | 4 ++-- + 4 files changed, 5 insertions(+), 5 deletions(-) + +--- a/fs/xfs/libxfs/xfs_attr_leaf.c ++++ b/fs/xfs/libxfs/xfs_attr_leaf.c +@@ -351,7 +351,7 @@ xfs_attr3_leaf_read( + + err = xfs_da_read_buf(tp, dp, bno, mappedbno, bpp, + XFS_ATTR_FORK, &xfs_attr3_leaf_buf_ops); +- if (!err && tp) ++ if (!err && tp && *bpp) + xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_ATTR_LEAF_BUF); + return err; + } +--- a/fs/xfs/libxfs/xfs_da_btree.c ++++ b/fs/xfs/libxfs/xfs_da_btree.c +@@ -263,7 +263,7 @@ xfs_da3_node_read( + + err = xfs_da_read_buf(tp, dp, bno, mappedbno, bpp, + which_fork, &xfs_da3_node_buf_ops); +- if (!err && tp) { ++ if (!err && tp && *bpp) { + struct xfs_da_blkinfo *info = (*bpp)->b_addr; + int type; + +--- a/fs/xfs/libxfs/xfs_dir2_block.c ++++ b/fs/xfs/libxfs/xfs_dir2_block.c +@@ -139,7 +139,7 @@ xfs_dir3_block_read( + + err = xfs_da_read_buf(tp, dp, mp->m_dir_geo->datablk, -1, bpp, + XFS_DATA_FORK, &xfs_dir3_block_buf_ops); +- if (!err && tp) ++ if (!err && tp && *bpp) + xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_BLOCK_BUF); + return err; + } +--- a/fs/xfs/libxfs/xfs_dir2_leaf.c ++++ b/fs/xfs/libxfs/xfs_dir2_leaf.c +@@ -268,7 +268,7 @@ xfs_dir3_leaf_read( + + err = xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp, + XFS_DATA_FORK, &xfs_dir3_leaf1_buf_ops); +- if (!err && tp) ++ if (!err && tp && *bpp) + xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_LEAF1_BUF); + return err; + } +@@ -285,7 +285,7 @@ xfs_dir3_leafn_read( + + err = xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp, + XFS_DATA_FORK, &xfs_dir3_leafn_buf_ops); +- if (!err && tp) ++ if (!err && tp && *bpp) + xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_LEAFN_BUF); + return err; + } diff --git a/queue-4.9/xfs-fix-inobt-inode-allocation-search-optimization.patch b/queue-4.9/xfs-fix-inobt-inode-allocation-search-optimization.patch new file mode 100644 index 00000000000..9b8eea399aa --- /dev/null +++ b/queue-4.9/xfs-fix-inobt-inode-allocation-search-optimization.patch @@ -0,0 +1,58 @@ +From hch@lst.de Mon Sep 18 10:09:48 2017 +From: Christoph Hellwig +Date: Sun, 17 Sep 2017 14:06:42 -0700 +Subject: xfs: fix inobt inode allocation search optimization +To: stable@vger.kernel.org +Cc: linux-xfs@vger.kernel.org, Omar Sandoval , "Darrick J . Wong" +Message-ID: <20170917210712.10804-18-hch@lst.de> + + +From: Omar Sandoval + +commit c44245b3d5435f533ca8346ece65918f84c057f9 upstream. + +When we try to allocate a free inode by searching the inobt, we try to +find the inode nearest the parent inode by searching chunks both left +and right of the chunk containing the parent. As an optimization, we +cache the leftmost and rightmost records that we previously searched; if +we do another allocation with the same parent inode, we'll pick up the +search where it last left off. + +There's a bug in the case where we found a free inode to the left of the +parent's chunk: we need to update the cached left and right records, but +because we already reassigned the right record to point to the left, we +end up assigning the left record to both the cached left and right +records. + +This isn't a correctness problem strictly, but it can result in the next +allocation rechecking chunks unnecessarily or allocating inodes further +away from the parent than it needs to. Fix it by swapping the record +pointer after we update the cached left and right records. 
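As a rough standalone illustration of the ordering hazard (plain C, with a
simplified stand-in for the inobt record and the cached fields; none of the
xfs types or locking are modeled):

	#include <assert.h>

	struct irec { unsigned int ir_startino; };

	int main(void)
	{
		struct irec rec  = { .ir_startino = 200 };	/* right record */
		struct irec trec = { .ir_startino = 100 };	/* left record  */
		unsigned int pagl_leftrec, pagl_rightrec;

		/* Buggy order: rec is overwritten before the caches are
		 * read, so both caches end up naming the left record. */
		rec = trec;
		pagl_leftrec  = trec.ir_startino;
		pagl_rightrec = rec.ir_startino;	/* 100, should be 200 */
		assert(pagl_leftrec == pagl_rightrec);

		/* Fixed order: fill the caches first, then swap. */
		rec.ir_startino = 200;			/* reset for the demo */
		pagl_leftrec  = trec.ir_startino;
		pagl_rightrec = rec.ir_startino;	/* correctly 200 */
		rec = trec;
		assert(pagl_leftrec == 100 && pagl_rightrec == 200);
		return 0;
	}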
+ +Fixes: bd169565993b ("xfs: speed up free inode search") +Signed-off-by: Omar Sandoval +Reviewed-by: Christoph Hellwig +Reviewed-by: Darrick J. Wong +Signed-off-by: Darrick J. Wong +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/libxfs/xfs_ialloc.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/fs/xfs/libxfs/xfs_ialloc.c ++++ b/fs/xfs/libxfs/xfs_ialloc.c +@@ -1236,13 +1236,13 @@ xfs_dialloc_ag_inobt( + + /* free inodes to the left? */ + if (useleft && trec.ir_freecount) { +- rec = trec; + xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); + cur = tcur; + + pag->pagl_leftrec = trec.ir_startino; + pag->pagl_rightrec = rec.ir_startino; + pag->pagl_pagino = pagino; ++ rec = trec; + goto alloc_inode; + } + diff --git a/queue-4.9/xfs-fix-multi-ag-deadlock-in-xfs_bunmapi.patch b/queue-4.9/xfs-fix-multi-ag-deadlock-in-xfs_bunmapi.patch new file mode 100644 index 00000000000..6014e22db42 --- /dev/null +++ b/queue-4.9/xfs-fix-multi-ag-deadlock-in-xfs_bunmapi.patch @@ -0,0 +1,52 @@ +From hch@lst.de Mon Sep 18 10:09:24 2017 +From: Christoph Hellwig +Date: Sun, 17 Sep 2017 14:06:40 -0700 +Subject: xfs: fix multi-AG deadlock in xfs_bunmapi +To: stable@vger.kernel.org +Cc: linux-xfs@vger.kernel.org, "Darrick J . Wong" +Message-ID: <20170917210712.10804-16-hch@lst.de> + + +commit 5b094d6dac0451ad89b1dc088395c7b399b7e9e8 upstream. + +Just like in the allocator we must avoid touching multiple AGs out of +order when freeing blocks, as freeing still locks the AGF and can cause +the same AB-BA deadlocks as in the allocation path. + +Signed-off-by: Christoph Hellwig +Reported-by: Nikolay Borisov +Reviewed-by: Darrick J. Wong +Signed-off-by: Darrick J. Wong +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/libxfs/xfs_bmap.c | 12 ++++++++++++ + 1 file changed, 12 insertions(+) + +--- a/fs/xfs/libxfs/xfs_bmap.c ++++ b/fs/xfs/libxfs/xfs_bmap.c +@@ -5556,6 +5556,7 @@ __xfs_bunmapi( + xfs_fsblock_t sum; + xfs_filblks_t len = *rlen; /* length to unmap in file */ + xfs_fileoff_t max_len; ++ xfs_agnumber_t prev_agno = NULLAGNUMBER, agno; + + trace_xfs_bunmap(ip, bno, len, flags, _RET_IP_); + +@@ -5658,6 +5659,17 @@ __xfs_bunmapi( + ASSERT(ep != NULL); + del = got; + wasdel = isnullstartblock(del.br_startblock); ++ ++ /* ++ * Make sure we don't touch multiple AGF headers out of order ++ * in a single transaction, as that could cause AB-BA deadlocks. ++ */ ++ if (!wasdel) { ++ agno = XFS_FSB_TO_AGNO(mp, del.br_startblock); ++ if (prev_agno != NULLAGNUMBER && prev_agno > agno) ++ break; ++ prev_agno = agno; ++ } + if (got.br_startoff < start) { + del.br_startoff = start; + del.br_blockcount -= start - got.br_startoff; diff --git a/queue-4.9/xfs-fix-per-inode-dax-flag-inheritance.patch b/queue-4.9/xfs-fix-per-inode-dax-flag-inheritance.patch new file mode 100644 index 00000000000..018c7fda13e --- /dev/null +++ b/queue-4.9/xfs-fix-per-inode-dax-flag-inheritance.patch @@ -0,0 +1,72 @@ +From hch@lst.de Mon Sep 18 10:09:34 2017 +From: Christoph Hellwig +Date: Sun, 17 Sep 2017 14:06:41 -0700 +Subject: xfs: Fix per-inode DAX flag inheritance +To: stable@vger.kernel.org +Cc: linux-xfs@vger.kernel.org, Lukas Czerner , "Darrick J . Wong" +Message-ID: <20170917210712.10804-17-hch@lst.de> + + +From: Lukas Czerner + +commit 56bdf855e676f1f2ed7033f288f57dfd315725ba upstream. + +According to the commit that implemented per-inode DAX flag: +commit 58f88ca2df72 ("xfs: introduce per-inode DAX enablement") +the flag is supposed to act as "inherit flag". 
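In this context an "inherit flag" is a bit set on a directory that is copied
onto newly created children. A toy sketch of that behaviour (the flag bit
value and the helper are invented for the demo):

	#include <assert.h>
	#include <stdint.h>

	#define DIFLAG2_DAX	(1u << 0)	/* made-up bit for the demo */

	/* On inode creation, inheritable di_flags2 bits propagate from
	 * the parent directory to the child. */
	static uint64_t inherit_flags2(uint64_t parent_flags2)
	{
		uint64_t child_flags2 = 0;

		if (parent_flags2 & DIFLAG2_DAX)
			child_flags2 |= DIFLAG2_DAX;
		return child_flags2;
	}

	int main(void)
	{
		assert(inherit_flags2(DIFLAG2_DAX) == DIFLAG2_DAX);
		assert(inherit_flags2(0) == 0);
		return 0;
	}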
+ +Currently this only works in the situations where parent directory +already has a flag in di_flags set, otherwise inheritance does not +work. This is because setting the XFS_DIFLAG2_DAX flag is done in a +wrong branch designated for di_flags, not di_flags2. + +Fix this by moving the code to branch designated for setting di_flags2, +which does test for flags in di_flags2. + +Fixes: 58f88ca2df72 ("xfs: introduce per-inode DAX enablement") +Signed-off-by: Lukas Czerner +Reviewed-by: Darrick J. Wong +Signed-off-by: Darrick J. Wong +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/xfs_inode.c | 12 +++++++----- + 1 file changed, 7 insertions(+), 5 deletions(-) + +--- a/fs/xfs/xfs_inode.c ++++ b/fs/xfs/xfs_inode.c +@@ -881,7 +881,6 @@ xfs_ialloc( + case S_IFREG: + case S_IFDIR: + if (pip && (pip->i_d.di_flags & XFS_DIFLAG_ANY)) { +- uint64_t di_flags2 = 0; + uint di_flags = 0; + + if (S_ISDIR(mode)) { +@@ -918,20 +917,23 @@ xfs_ialloc( + di_flags |= XFS_DIFLAG_NODEFRAG; + if (pip->i_d.di_flags & XFS_DIFLAG_FILESTREAM) + di_flags |= XFS_DIFLAG_FILESTREAM; +- if (pip->i_d.di_flags2 & XFS_DIFLAG2_DAX) +- di_flags2 |= XFS_DIFLAG2_DAX; + + ip->i_d.di_flags |= di_flags; +- ip->i_d.di_flags2 |= di_flags2; + } + if (pip && + (pip->i_d.di_flags2 & XFS_DIFLAG2_ANY) && + pip->i_d.di_version == 3 && + ip->i_d.di_version == 3) { ++ uint64_t di_flags2 = 0; ++ + if (pip->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE) { +- ip->i_d.di_flags2 |= XFS_DIFLAG2_COWEXTSIZE; ++ di_flags2 |= XFS_DIFLAG2_COWEXTSIZE; + ip->i_d.di_cowextsize = pip->i_d.di_cowextsize; + } ++ if (pip->i_d.di_flags2 & XFS_DIFLAG2_DAX) ++ di_flags2 |= XFS_DIFLAG2_DAX; ++ ++ ip->i_d.di_flags2 |= di_flags2; + } + /* FALLTHROUGH */ + case S_IFLNK: diff --git a/queue-4.9/xfs-fix-quotacheck-dquot-id-overflow-infinite-loop.patch b/queue-4.9/xfs-fix-quotacheck-dquot-id-overflow-infinite-loop.patch new file mode 100644 index 00000000000..5888b8cf152 --- /dev/null +++ b/queue-4.9/xfs-fix-quotacheck-dquot-id-overflow-infinite-loop.patch @@ -0,0 +1,41 @@ +From hch@lst.de Mon Sep 18 10:09:04 2017 +From: Christoph Hellwig +Date: Sun, 17 Sep 2017 14:06:39 -0700 +Subject: xfs: fix quotacheck dquot id overflow infinite loop +To: stable@vger.kernel.org +Cc: linux-xfs@vger.kernel.org, Brian Foster , "Darrick J . Wong" +Message-ID: <20170917210712.10804-15-hch@lst.de> + + +From: Brian Foster + +commit cfaf2d034360166e569a4929dd83ae9698bed856 upstream. + +If a dquot has an id of U32_MAX, the next lookup index increment +overflows the uint32_t back to 0. This starts the lookup sequence +over from the beginning, repeats indefinitely and results in a +livelock. + +Update xfs_qm_dquot_walk() to explicitly check for the lookup +overflow and exit the loop. + +Signed-off-by: Brian Foster +Reviewed-by: Darrick J. Wong +Signed-off-by: Darrick J. 
Wong +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/xfs_qm.c | 3 +++ + 1 file changed, 3 insertions(+) + +--- a/fs/xfs/xfs_qm.c ++++ b/fs/xfs/xfs_qm.c +@@ -111,6 +111,9 @@ restart: + skipped = 0; + break; + } ++ /* we're done if id overflows back to zero */ ++ if (!next_index) ++ break; + } + + if (skipped) { diff --git a/queue-4.9/xfs-fix-spurious-spin_is_locked-assert-failures-on-non-smp-kernels.patch b/queue-4.9/xfs-fix-spurious-spin_is_locked-assert-failures-on-non-smp-kernels.patch new file mode 100644 index 00000000000..c93869c496d --- /dev/null +++ b/queue-4.9/xfs-fix-spurious-spin_is_locked-assert-failures-on-non-smp-kernels.patch @@ -0,0 +1,73 @@ +From 95989c46d2a156365867b1d795fdefce71bce378 Mon Sep 17 00:00:00 2001 +From: Brian Foster +Date: Thu, 8 Jun 2017 08:23:07 -0700 +Subject: xfs: fix spurious spin_is_locked() assert failures on non-smp kernels + +From: Brian Foster + +commit 95989c46d2a156365867b1d795fdefce71bce378 upstream. + +The 0-day kernel test robot reports assertion failures on +!CONFIG_SMP kernels due to failed spin_is_locked() checks. As it +turns out, spin_is_locked() is hardcoded to return zero on +!CONFIG_SMP kernels and so this function cannot be relied on to +verify spinlock state in this configuration. + +To avoid this problem, replace the associated asserts with lockdep +variants that do the right thing regardless of kernel configuration. +Drop the one assert that checks for an unlocked lock as there is no +suitable lockdep variant for that case. This moves the spinlock +checks from XFS debug code to lockdep, but generally provides the +same level of protection. + +Reported-by: kbuild test robot +Signed-off-by: Brian Foster +Reviewed-by: Christoph Hellwig +Reviewed-by: Darrick J. Wong +Signed-off-by: Darrick J. Wong +Signed-off-by: Greg Kroah-Hartman + +--- + fs/xfs/xfs_buf.c | 2 +- + fs/xfs/xfs_icache.c | 5 ++--- + 2 files changed, 3 insertions(+), 4 deletions(-) + +--- a/fs/xfs/xfs_buf.c ++++ b/fs/xfs/xfs_buf.c +@@ -116,7 +116,7 @@ static inline void + __xfs_buf_ioacct_dec( + struct xfs_buf *bp) + { +- ASSERT(spin_is_locked(&bp->b_lock)); ++ lockdep_assert_held(&bp->b_lock); + + if (bp->b_state & XFS_BSTATE_IN_FLIGHT) { + bp->b_state &= ~XFS_BSTATE_IN_FLIGHT; +--- a/fs/xfs/xfs_icache.c ++++ b/fs/xfs/xfs_icache.c +@@ -66,7 +66,6 @@ xfs_inode_alloc( + + XFS_STATS_INC(mp, vn_active); + ASSERT(atomic_read(&ip->i_pincount) == 0); +- ASSERT(!spin_is_locked(&ip->i_flags_lock)); + ASSERT(!xfs_isiflocked(ip)); + ASSERT(ip->i_ino == 0); + +@@ -192,7 +191,7 @@ xfs_perag_set_reclaim_tag( + { + struct xfs_mount *mp = pag->pag_mount; + +- ASSERT(spin_is_locked(&pag->pag_ici_lock)); ++ lockdep_assert_held(&pag->pag_ici_lock); + if (pag->pag_ici_reclaimable++) + return; + +@@ -214,7 +213,7 @@ xfs_perag_clear_reclaim_tag( + { + struct xfs_mount *mp = pag->pag_mount; + +- ASSERT(spin_is_locked(&pag->pag_ici_lock)); ++ lockdep_assert_held(&pag->pag_ici_lock); + if (--pag->pag_ici_reclaimable) + return; + diff --git a/queue-4.9/xfs-free-cowblocks-and-retry-on-buffered-write-enospc.patch b/queue-4.9/xfs-free-cowblocks-and-retry-on-buffered-write-enospc.patch new file mode 100644 index 00000000000..201a257d9c0 --- /dev/null +++ b/queue-4.9/xfs-free-cowblocks-and-retry-on-buffered-write-enospc.patch @@ -0,0 +1,39 @@ +From hch@lst.de Mon Sep 18 10:07:40 2017 +From: Christoph Hellwig +Date: Sun, 17 Sep 2017 14:06:34 -0700 +Subject: xfs: free cowblocks and retry on buffered write ENOSPC +To: stable@vger.kernel.org +Cc: linux-xfs@vger.kernel.org, Brian Foster , "Darrick J . 
Wong" +Message-ID: <20170917210712.10804-10-hch@lst.de> + + +From: Brian Foster + +commit cf2cb7845d6e101cb17bd62f8aa08cd514fc8988 upstream. + +XFS runs an eofblocks reclaim scan before returning an ENOSPC error to +userspace for buffered writes. This facilitates aggressive speculative +preallocation without causing user visible side effects such as +premature ENOSPC. + +Run a cowblocks scan in the same situation to reclaim lingering COW fork +preallocation throughout the filesystem. + +Signed-off-by: Brian Foster +Reviewed-by: Darrick J. Wong +Signed-off-by: Darrick J. Wong +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/xfs_file.c | 1 + + 1 file changed, 1 insertion(+) + +--- a/fs/xfs/xfs_file.c ++++ b/fs/xfs/xfs_file.c +@@ -729,6 +729,7 @@ write_retry: + xfs_rw_iunlock(ip, iolock); + eofb.eof_flags = XFS_EOF_FLAGS_SYNC; + xfs_icache_free_eofblocks(ip->i_mount, &eofb); ++ xfs_icache_free_cowblocks(ip->i_mount, &eofb); + goto write_retry; + } + diff --git a/queue-4.9/xfs-free-uncommitted-transactions-during-log-recovery.patch b/queue-4.9/xfs-free-uncommitted-transactions-during-log-recovery.patch new file mode 100644 index 00000000000..e820bd04e9b --- /dev/null +++ b/queue-4.9/xfs-free-uncommitted-transactions-during-log-recovery.patch @@ -0,0 +1,102 @@ +From hch@lst.de Mon Sep 18 10:07:17 2017 +From: Christoph Hellwig +Date: Sun, 17 Sep 2017 14:06:33 -0700 +Subject: xfs: free uncommitted transactions during log recovery +To: stable@vger.kernel.org +Cc: linux-xfs@vger.kernel.org, Brian Foster , "Darrick J . Wong" + +From: Brian Foster + +commit 39775431f82f890f4aaa08860a30883d081bffc7 upstream. + +Log recovery allocates in-core transaction and member item data +structures on-demand as it processes the on-disk log. Transactions +are allocated on first encounter on-disk and stored in a hash table +structure where they are easily accessible for subsequent lookups. +Transaction items are also allocated on demand and are attached to +the associated transactions. + +When a commit record is encountered in the log, the transaction is +committed to the fs and the in-core structures are freed. If a +filesystem crashes or shuts down before all in-core log buffers are +flushed to the log, however, not all transactions may have commit +records in the log. As expected, the modifications in such an +incomplete transaction are not replayed to the fs. The in-core data +structures for the partial transaction are never freed, however, +resulting in a memory leak. + +Update xlog_do_recovery_pass() to first correctly initialize the +hash table array so empty lists can be distinguished from populated +lists on function exit. Update xlog_recover_free_trans() to always +remove the transaction from the list prior to freeing the associated +memory. Finally, walk the hash table of transaction lists as the +last step before it goes out of scope and free any transactions that +may remain on the lists. This prevents a memory leak of partial +transactions in the log. + +Signed-off-by: Brian Foster +Reviewed-by: Christoph Hellwig +Reviewed-by: Darrick J. Wong +Signed-off-by: Darrick J. 
Wong +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/xfs_log_recover.c | 21 ++++++++++++++++++++- + 1 file changed, 20 insertions(+), 1 deletion(-) + +--- a/fs/xfs/xfs_log_recover.c ++++ b/fs/xfs/xfs_log_recover.c +@@ -4152,7 +4152,7 @@ xlog_recover_commit_trans( + + #define XLOG_RECOVER_COMMIT_QUEUE_MAX 100 + +- hlist_del(&trans->r_list); ++ hlist_del_init(&trans->r_list); + + error = xlog_recover_reorder_trans(log, trans, pass); + if (error) +@@ -4354,6 +4354,8 @@ xlog_recover_free_trans( + xlog_recover_item_t *item, *n; + int i; + ++ hlist_del_init(&trans->r_list); ++ + list_for_each_entry_safe(item, n, &trans->r_itemq, ri_list) { + /* Free the regions in the item. */ + list_del(&item->ri_list); +@@ -5222,12 +5224,16 @@ xlog_do_recovery_pass( + int error2 = 0; + int bblks, split_bblks; + int hblks, split_hblks, wrapped_hblks; ++ int i; + struct hlist_head rhash[XLOG_RHASH_SIZE]; + LIST_HEAD (buffer_list); + + ASSERT(head_blk != tail_blk); + rhead_blk = 0; + ++ for (i = 0; i < XLOG_RHASH_SIZE; i++) ++ INIT_HLIST_HEAD(&rhash[i]); ++ + /* + * Read the header of the tail block and get the iclog buffer size from + * h_size. Use this to tell how many sectors make up the log header. +@@ -5464,6 +5470,19 @@ xlog_do_recovery_pass( + if (error && first_bad) + *first_bad = rhead_blk; + ++ /* ++ * Transactions are freed at commit time but transactions without commit ++ * records on disk are never committed. Free any that may be left in the ++ * hash table. ++ */ ++ for (i = 0; i < XLOG_RHASH_SIZE; i++) { ++ struct hlist_node *tmp; ++ struct xlog_recover *trans; ++ ++ hlist_for_each_entry_safe(trans, tmp, &rhash[i], r_list) ++ xlog_recover_free_trans(trans); ++ } ++ + return error ? error : error2; + } + diff --git a/queue-4.9/xfs-move-handling-of-missing-page-into-one-place-in-xfs_find_get_desired_pgoff.patch b/queue-4.9/xfs-move-handling-of-missing-page-into-one-place-in-xfs_find_get_desired_pgoff.patch new file mode 100644 index 00000000000..ac55c579047 --- /dev/null +++ b/queue-4.9/xfs-move-handling-of-missing-page-into-one-place-in-xfs_find_get_desired_pgoff.patch @@ -0,0 +1,85 @@ +From a54fba8f5a0dc36161cacdf2aa90f007f702ec1a Mon Sep 17 00:00:00 2001 +From: Jan Kara +Date: Thu, 18 May 2017 16:36:24 -0700 +Subject: xfs: Move handling of missing page into one place in xfs_find_get_desired_pgoff() + +From: Jan Kara + +commit a54fba8f5a0dc36161cacdf2aa90f007f702ec1a upstream. + +Currently several places in xfs_find_get_desired_pgoff() handle the case +of a missing page. Make them all handled in one place after the loop has +terminated. + +Signed-off-by: Jan Kara +Reviewed-by: Brian Foster +Reviewed-by: Darrick J. Wong +Signed-off-by: Darrick J. Wong +Signed-off-by: Greg Kroah-Hartman + +--- + fs/xfs/xfs_file.c | 38 ++++++++------------------------------ + 1 file changed, 8 insertions(+), 30 deletions(-) + +--- a/fs/xfs/xfs_file.c ++++ b/fs/xfs/xfs_file.c +@@ -1139,29 +1139,8 @@ xfs_find_get_desired_pgoff( + want = min_t(pgoff_t, end - index, PAGEVEC_SIZE - 1) + 1; + nr_pages = pagevec_lookup(&pvec, inode->i_mapping, index, + want); +- /* +- * No page mapped into given range. If we are searching holes +- * and if this is the first time we got into the loop, it means +- * that the given offset is landed in a hole, return it. +- * +- * If we have already stepped through some block buffers to find +- * holes but they all contains data. 
In this case, the last +- * offset is already updated and pointed to the end of the last +- * mapped page, if it does not reach the endpoint to search, +- * that means there should be a hole between them. +- */ +- if (nr_pages == 0) { +- /* Data search found nothing */ +- if (type == DATA_OFF) +- break; +- +- ASSERT(type == HOLE_OFF); +- if (lastoff == startoff || lastoff < endoff) { +- found = true; +- *offset = lastoff; +- } ++ if (nr_pages == 0) + break; +- } + + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; +@@ -1227,21 +1206,20 @@ xfs_find_get_desired_pgoff( + + /* + * The number of returned pages less than our desired, search +- * done. In this case, nothing was found for searching data, +- * but we found a hole behind the last offset. ++ * done. + */ +- if (nr_pages < want) { +- if (type == HOLE_OFF) { +- *offset = lastoff; +- found = true; +- } ++ if (nr_pages < want) + break; +- } + + index = pvec.pages[i - 1]->index + 1; + pagevec_release(&pvec); + } while (index <= end); + ++ /* No page at lastoff and we are not done - we found a hole. */ ++ if (type == HOLE_OFF && lastoff < endoff) { ++ *offset = lastoff; ++ found = true; ++ } + out: + pagevec_release(&pvec); + return found; diff --git a/queue-4.9/xfs-push-buffer-of-flush-locked-dquot-to-avoid-quotacheck-deadlock.patch b/queue-4.9/xfs-push-buffer-of-flush-locked-dquot-to-avoid-quotacheck-deadlock.patch new file mode 100644 index 00000000000..8972473bf64 --- /dev/null +++ b/queue-4.9/xfs-push-buffer-of-flush-locked-dquot-to-avoid-quotacheck-deadlock.patch @@ -0,0 +1,197 @@ +From 7912e7fef2aebe577f0b46d3cba261f2783c5695 Mon Sep 17 00:00:00 2001 +From: Brian Foster +Date: Wed, 14 Jun 2017 21:21:45 -0700 +Subject: xfs: push buffer of flush locked dquot to avoid quotacheck deadlock + +From: Brian Foster + +commit 7912e7fef2aebe577f0b46d3cba261f2783c5695 upstream. + +Reclaim during quotacheck can lead to deadlocks on the dquot flush +lock: + + - Quotacheck populates a local delwri queue with the physical dquot + buffers. + - Quotacheck performs the xfs_qm_dqusage_adjust() bulkstat and + dirties all of the dquots. + - Reclaim kicks in and attempts to flush a dquot whose buffer is + already queud on the quotacheck queue. The flush succeeds but + queueing to the reclaim delwri queue fails as the backing buffer is + already queued. The flush unlock is now deferred to I/O completion + of the buffer from the quotacheck queue. + - The dqadjust bulkstat continues and dirties the recently flushed + dquot once again. + - Quotacheck proceeds to the xfs_qm_flush_one() walk which requires + the flush lock to update the backing buffers with the in-core + recalculated values. It deadlocks on the redirtied dquot as the + flush lock was already acquired by reclaim, but the buffer resides + on the local delwri queue which isn't submitted until the end of + quotacheck. + +This is reproduced by running quotacheck on a filesystem with a +couple million inodes in low memory (512MB-1GB) situations. This is +a regression as of commit 43ff2122e6 ("xfs: on-stack delayed write +buffer lists"), which removed a trylock and buffer I/O submission +from the quotacheck dquot flush sequence. + +Quotacheck first resets and collects the physical dquot buffers in a +delwri queue. Then, it traverses the filesystem inodes via bulkstat, +updates the in-core dquots, flushes the corrected dquots to the +backing buffers and finally submits the delwri queue for I/O. 
Since +the backing buffers are queued across the entire quotacheck +operation, dquot reclaim cannot possibly complete a dquot flush +before quotacheck completes. + +Therefore, quotacheck must submit the buffer for I/O in order to +cycle the flush lock and flush the dirty in-core dquot to the +buffer. Add a delwri queue buffer push mechanism to submit an +individual buffer for I/O without losing the delwri queue status and +use it from quotacheck to avoid the deadlock. This restores +quotacheck behavior to as before the regression was introduced. + +Reported-by: Martin Svec +Signed-off-by: Brian Foster +Reviewed-by: Darrick J. Wong +Signed-off-by: Darrick J. Wong +Signed-off-by: Greg Kroah-Hartman + +--- + fs/xfs/xfs_buf.c | 60 +++++++++++++++++++++++++++++++++++++++++++++++++++++ + fs/xfs/xfs_buf.h | 1 + fs/xfs/xfs_qm.c | 28 +++++++++++++++++++++++- + fs/xfs/xfs_trace.h | 1 + 4 files changed, 89 insertions(+), 1 deletion(-) + +--- a/fs/xfs/xfs_buf.c ++++ b/fs/xfs/xfs_buf.c +@@ -2022,6 +2022,66 @@ xfs_buf_delwri_submit( + return error; + } + ++/* ++ * Push a single buffer on a delwri queue. ++ * ++ * The purpose of this function is to submit a single buffer of a delwri queue ++ * and return with the buffer still on the original queue. The waiting delwri ++ * buffer submission infrastructure guarantees transfer of the delwri queue ++ * buffer reference to a temporary wait list. We reuse this infrastructure to ++ * transfer the buffer back to the original queue. ++ * ++ * Note the buffer transitions from the queued state, to the submitted and wait ++ * listed state and back to the queued state during this call. The buffer ++ * locking and queue management logic between _delwri_pushbuf() and ++ * _delwri_queue() guarantee that the buffer cannot be queued to another list ++ * before returning. ++ */ ++int ++xfs_buf_delwri_pushbuf( ++ struct xfs_buf *bp, ++ struct list_head *buffer_list) ++{ ++ LIST_HEAD (submit_list); ++ int error; ++ ++ ASSERT(bp->b_flags & _XBF_DELWRI_Q); ++ ++ trace_xfs_buf_delwri_pushbuf(bp, _RET_IP_); ++ ++ /* ++ * Isolate the buffer to a new local list so we can submit it for I/O ++ * independently from the rest of the original list. ++ */ ++ xfs_buf_lock(bp); ++ list_move(&bp->b_list, &submit_list); ++ xfs_buf_unlock(bp); ++ ++ /* ++ * Delwri submission clears the DELWRI_Q buffer flag and returns with ++ * the buffer on the wait list with an associated reference. Rather than ++ * bounce the buffer from a local wait list back to the original list ++ * after I/O completion, reuse the original list as the wait list. ++ */ ++ xfs_buf_delwri_submit_buffers(&submit_list, buffer_list); ++ ++ /* ++ * The buffer is now under I/O and wait listed as during typical delwri ++ * submission. Lock the buffer to wait for I/O completion. Rather than ++ * remove the buffer from the wait list and release the reference, we ++ * want to return with the buffer queued to the original list. The ++ * buffer already sits on the original list with a wait list reference, ++ * however. If we let the queue inherit that wait list reference, all we ++ * need to do is reset the DELWRI_Q flag. 
++ */ ++ xfs_buf_lock(bp); ++ error = bp->b_error; ++ bp->b_flags |= _XBF_DELWRI_Q; ++ xfs_buf_unlock(bp); ++ ++ return error; ++} ++ + int __init + xfs_buf_init(void) + { +--- a/fs/xfs/xfs_buf.h ++++ b/fs/xfs/xfs_buf.h +@@ -333,6 +333,7 @@ extern void xfs_buf_delwri_cancel(struct + extern bool xfs_buf_delwri_queue(struct xfs_buf *, struct list_head *); + extern int xfs_buf_delwri_submit(struct list_head *); + extern int xfs_buf_delwri_submit_nowait(struct list_head *); ++extern int xfs_buf_delwri_pushbuf(struct xfs_buf *, struct list_head *); + + /* Buffer Daemon Setup Routines */ + extern int xfs_buf_init(void); +--- a/fs/xfs/xfs_qm.c ++++ b/fs/xfs/xfs_qm.c +@@ -1247,6 +1247,7 @@ xfs_qm_flush_one( + struct xfs_dquot *dqp, + void *data) + { ++ struct xfs_mount *mp = dqp->q_mount; + struct list_head *buffer_list = data; + struct xfs_buf *bp = NULL; + int error = 0; +@@ -1257,7 +1258,32 @@ xfs_qm_flush_one( + if (!XFS_DQ_IS_DIRTY(dqp)) + goto out_unlock; + +- xfs_dqflock(dqp); ++ /* ++ * The only way the dquot is already flush locked by the time quotacheck ++ * gets here is if reclaim flushed it before the dqadjust walk dirtied ++ * it for the final time. Quotacheck collects all dquot bufs in the ++ * local delwri queue before dquots are dirtied, so reclaim can't have ++ * possibly queued it for I/O. The only way out is to push the buffer to ++ * cycle the flush lock. ++ */ ++ if (!xfs_dqflock_nowait(dqp)) { ++ /* buf is pinned in-core by delwri list */ ++ DEFINE_SINGLE_BUF_MAP(map, dqp->q_blkno, ++ mp->m_quotainfo->qi_dqchunklen); ++ bp = _xfs_buf_find(mp->m_ddev_targp, &map, 1, 0, NULL); ++ if (!bp) { ++ error = -EINVAL; ++ goto out_unlock; ++ } ++ xfs_buf_unlock(bp); ++ ++ xfs_buf_delwri_pushbuf(bp, buffer_list); ++ xfs_buf_rele(bp); ++ ++ error = -EAGAIN; ++ goto out_unlock; ++ } ++ + error = xfs_qm_dqflush(dqp, &bp); + if (error) + goto out_unlock; +--- a/fs/xfs/xfs_trace.h ++++ b/fs/xfs/xfs_trace.h +@@ -366,6 +366,7 @@ DEFINE_BUF_EVENT(xfs_buf_iowait_done); + DEFINE_BUF_EVENT(xfs_buf_delwri_queue); + DEFINE_BUF_EVENT(xfs_buf_delwri_queued); + DEFINE_BUF_EVENT(xfs_buf_delwri_split); ++DEFINE_BUF_EVENT(xfs_buf_delwri_pushbuf); + DEFINE_BUF_EVENT(xfs_buf_get_uncached); + DEFINE_BUF_EVENT(xfs_bdstrat_shut); + DEFINE_BUF_EVENT(xfs_buf_item_relse); diff --git a/queue-4.9/xfs-release-bli-from-transaction-properly-on-fs-shutdown.patch b/queue-4.9/xfs-release-bli-from-transaction-properly-on-fs-shutdown.patch new file mode 100644 index 00000000000..2cd5655f883 --- /dev/null +++ b/queue-4.9/xfs-release-bli-from-transaction-properly-on-fs-shutdown.patch @@ -0,0 +1,87 @@ +From 79e641ce29cfae5b8fc55fb77ac62d11d2d849c0 Mon Sep 17 00:00:00 2001 +From: Brian Foster +Date: Wed, 14 Jun 2017 21:35:35 -0700 +Subject: xfs: release bli from transaction properly on fs shutdown + +From: Brian Foster + +commit 79e641ce29cfae5b8fc55fb77ac62d11d2d849c0 upstream. + +If a filesystem shutdown occurs with a buffer log item in the CIL +and a log force occurs, the ->iop_unpin() handler is generally +expected to tear down the bli properly. This entails freeing the bli +memory and releasing the associated hold on the buffer so it can be +released and the filesystem unmounted. + +If this sequence occurs while ->bli_refcount is elevated (i.e., +another transaction is open and attempting to modify the buffer), +however, ->iop_unpin() may not be responsible for releasing the bli. +Instead, the transaction may release the final ->bli_refcount +reference and thus xfs_trans_brelse() is responsible for tearing +down the bli. 
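The underlying pattern is the usual "last reference out performs the
teardown" discipline; a minimal standalone sketch using C11 atomics in place
of the kernel's atomic_t (the ref_put() helper is invented for the demo):

	#include <assert.h>
	#include <stdatomic.h>
	#include <stdbool.h>

	/* Returns true only for the caller that dropped the last
	 * reference, i.e. the caller that must do the teardown. */
	static bool ref_put(atomic_int *refcount)
	{
		return atomic_fetch_sub(refcount, 1) == 1;
	}

	int main(void)
	{
		atomic_int bli_refcount = 2;	/* e.g. CIL pin + transaction */

		assert(!ref_put(&bli_refcount));	/* not the last holder */
		assert(ref_put(&bli_refcount));		/* last holder: tear down */
		return 0;
	}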
+ +While xfs_trans_brelse() does drop the reference count, it only +attempts to release the bli if it is clean (i.e., not in the +CIL/AIL). If the filesystem is shutdown and the bli is sitting dirty +in the CIL as noted above, this ends up skipping the last +opportunity to release the bli. In turn, this leaves the hold on the +buffer and causes an unmount hang. This can be reproduced by running +generic/388 in repetition. + +Update xfs_trans_brelse() to handle this shutdown corner case +correctly. If the final bli reference is dropped and the filesystem +is shutdown, remove the bli from the AIL (if necessary) and release +the bli to drop the buffer hold and ensure an unmount does not hang. + +Signed-off-by: Brian Foster +Reviewed-by: Christoph Hellwig +Reviewed-by: Carlos Maiolino +Reviewed-by: Darrick J. Wong +Signed-off-by: Darrick J. Wong +Signed-off-by: Greg Kroah-Hartman + +--- + fs/xfs/xfs_trans_buf.c | 23 +++++++++++++++-------- + 1 file changed, 15 insertions(+), 8 deletions(-) + +--- a/fs/xfs/xfs_trans_buf.c ++++ b/fs/xfs/xfs_trans_buf.c +@@ -356,6 +356,7 @@ xfs_trans_brelse(xfs_trans_t *tp, + xfs_buf_t *bp) + { + xfs_buf_log_item_t *bip; ++ int freed; + + /* + * Default to a normal brelse() call if the tp is NULL. +@@ -419,16 +420,22 @@ xfs_trans_brelse(xfs_trans_t *tp, + /* + * Drop our reference to the buf log item. + */ +- atomic_dec(&bip->bli_refcount); ++ freed = atomic_dec_and_test(&bip->bli_refcount); + + /* +- * If the buf item is not tracking data in the log, then +- * we must free it before releasing the buffer back to the +- * free pool. Before releasing the buffer to the free pool, +- * clear the transaction pointer in b_fsprivate2 to dissolve +- * its relation to this transaction. +- */ +- if (!xfs_buf_item_dirty(bip)) { ++ * If the buf item is not tracking data in the log, then we must free it ++ * before releasing the buffer back to the free pool. ++ * ++ * If the fs has shutdown and we dropped the last reference, it may fall ++ * on us to release a (possibly dirty) bli if it never made it to the ++ * AIL (e.g., the aborted unpin already happened and didn't release it ++ * due to our reference). Since we're already shutdown and need xa_lock, ++ * just force remove from the AIL and release the bli here. ++ */ ++ if (XFS_FORCED_SHUTDOWN(tp->t_mountp) && freed) { ++ xfs_trans_ail_remove(&bip->bli_item, SHUTDOWN_LOG_IO_ERROR); ++ xfs_buf_item_relse(bp); ++ } else if (!xfs_buf_item_dirty(bip)) { + /*** + ASSERT(bp->b_pincount == 0); + ***/ diff --git a/queue-4.9/xfs-remove-bli-from-ail-before-release-on-transaction-abort.patch b/queue-4.9/xfs-remove-bli-from-ail-before-release-on-transaction-abort.patch new file mode 100644 index 00000000000..0c8d3ca7b4a --- /dev/null +++ b/queue-4.9/xfs-remove-bli-from-ail-before-release-on-transaction-abort.patch @@ -0,0 +1,79 @@ +From 3d4b4a3e30ae7a949c31e1e10268a3da4723d290 Mon Sep 17 00:00:00 2001 +From: Brian Foster +Date: Wed, 14 Jun 2017 21:35:35 -0700 +Subject: xfs: remove bli from AIL before release on transaction abort + +From: Brian Foster + +commit 3d4b4a3e30ae7a949c31e1e10268a3da4723d290 upstream. + +When a buffer is modified, logged and committed, it ultimately ends +up sitting on the AIL with a dirty bli waiting for metadata +writeback. If another transaction locks and invalidates the buffer +(freeing an inode chunk, for example) in the meantime, the bli is +flagged as stale, the dirty state is cleared and the bli remains in +the AIL. 
+ +If a shutdown occurs before the transaction that has invalidated the +buffer is committed, the transaction is ultimately aborted. The log +items are flagged as such and ->iop_unlock() handles the aborted +items. Because the bli is clean (due to the invalidation), +->iop_unlock() unconditionally releases it. The log item may still +reside in the AIL, however, which means the I/O completion handler +may still run and attempt to access it. This results in assert +failure due to the release of the bli while still present in the AIL +and a subsequent NULL dereference and panic in the buffer I/O +completion handling. This can be reproduced by running generic/388 +in repetition. + +To avoid this problem, update xfs_buf_item_unlock() to first check +whether the bli is aborted and if so, remove it from the AIL before +it is released. This ensures that the bli is no longer accessed +during the shutdown sequence after it has been freed. + +Signed-off-by: Brian Foster +Reviewed-by: Christoph Hellwig +Reviewed-by: Carlos Maiolino +Reviewed-by: Darrick J. Wong +Signed-off-by: Darrick J. Wong +Signed-off-by: Greg Kroah-Hartman + +--- + fs/xfs/xfs_buf_item.c | 21 ++++++++++++--------- + 1 file changed, 12 insertions(+), 9 deletions(-) + +--- a/fs/xfs/xfs_buf_item.c ++++ b/fs/xfs/xfs_buf_item.c +@@ -636,20 +636,23 @@ xfs_buf_item_unlock( + + /* + * Clean buffers, by definition, cannot be in the AIL. However, aborted +- * buffers may be dirty and hence in the AIL. Therefore if we are +- * aborting a buffer and we've just taken the last refernce away, we +- * have to check if it is in the AIL before freeing it. We need to free +- * it in this case, because an aborted transaction has already shut the +- * filesystem down and this is the last chance we will have to do so. ++ * buffers may be in the AIL regardless of dirty state. An aborted ++ * transaction that invalidates a buffer already in the AIL may have ++ * marked it stale and cleared the dirty state, for example. ++ * ++ * Therefore if we are aborting a buffer and we've just taken the last ++ * reference away, we have to check if it is in the AIL before freeing ++ * it. We need to free it in this case, because an aborted transaction ++ * has already shut the filesystem down and this is the last chance we ++ * will have to do so. + */ + if (atomic_dec_and_test(&bip->bli_refcount)) { +- if (clean) +- xfs_buf_item_relse(bp); +- else if (aborted) { ++ if (aborted) { + ASSERT(XFS_FORCED_SHUTDOWN(lip->li_mountp)); + xfs_trans_ail_remove(lip, SHUTDOWN_LOG_IO_ERROR); + xfs_buf_item_relse(bp); +- } ++ } else if (clean) ++ xfs_buf_item_relse(bp); + } + + if (!(flags & XFS_BLI_HOLD)) diff --git a/queue-4.9/xfs-set-firstfsb-to-nullfsblock-before-feeding-it-to-_bmapi_write.patch b/queue-4.9/xfs-set-firstfsb-to-nullfsblock-before-feeding-it-to-_bmapi_write.patch new file mode 100644 index 00000000000..dc8ed599aef --- /dev/null +++ b/queue-4.9/xfs-set-firstfsb-to-nullfsblock-before-feeding-it-to-_bmapi_write.patch @@ -0,0 +1,56 @@ +From hch@lst.de Mon Sep 18 10:08:30 2017 +From: Christoph Hellwig +Date: Sun, 17 Sep 2017 14:06:37 -0700 +Subject: xfs: set firstfsb to NULLFSBLOCK before feeding it to _bmapi_write +To: stable@vger.kernel.org +Cc: linux-xfs@vger.kernel.org, "Darrick J. Wong" +Message-ID: <20170917210712.10804-13-hch@lst.de> + + +From: "Darrick J. Wong" + +commit 4c1a67bd3606540b9b42caff34a1d5cd94b1cf65 upstream. 
+ +We must initialize the firstfsb parameter to _bmapi_write so that it +doesn't incorrectly treat stack garbage as a restriction on which AGs +it can search for free space. + +Fixes-coverity-id: 1402025 +Fixes-coverity-id: 1415167 +Signed-off-by: Darrick J. Wong +Reviewed-by: Brian Foster +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/libxfs/xfs_bmap.c | 9 +++++++++ + fs/xfs/xfs_reflink.c | 2 +- + 2 files changed, 10 insertions(+), 1 deletion(-) + +--- a/fs/xfs/libxfs/xfs_bmap.c ++++ b/fs/xfs/libxfs/xfs_bmap.c +@@ -6639,6 +6639,15 @@ xfs_bmap_finish_one( + bmap.br_blockcount = *blockcount; + bmap.br_state = state; + ++ /* ++ * firstfsb is tied to the transaction lifetime and is used to ++ * ensure correct AG locking order and schedule work item ++ * continuations. XFS_BUI_MAX_FAST_EXTENTS (== 1) restricts us ++ * to only making one bmap call per transaction, so it should ++ * be safe to have it as a local variable here. ++ */ ++ firstfsb = NULLFSBLOCK; ++ + trace_xfs_bmap_deferred(tp->t_mountp, + XFS_FSB_TO_AGNO(tp->t_mountp, startblock), type, + XFS_FSB_TO_AGBNO(tp->t_mountp, startblock), +--- a/fs/xfs/xfs_reflink.c ++++ b/fs/xfs/xfs_reflink.c +@@ -333,7 +333,7 @@ xfs_reflink_convert_cow_extent( + struct xfs_defer_ops *dfops) + { + struct xfs_bmbt_irec irec = *imap; +- xfs_fsblock_t first_block; ++ xfs_fsblock_t first_block = NULLFSBLOCK; + int nimaps = 1; + + if (imap->br_state == XFS_EXT_NORM) diff --git a/queue-4.9/xfs-try-to-avoid-blowing-out-the-transaction-reservation-when-bunmaping-a-shared-extent.patch b/queue-4.9/xfs-try-to-avoid-blowing-out-the-transaction-reservation-when-bunmaping-a-shared-extent.patch new file mode 100644 index 00000000000..3015622b147 --- /dev/null +++ b/queue-4.9/xfs-try-to-avoid-blowing-out-the-transaction-reservation-when-bunmaping-a-shared-extent.patch @@ -0,0 +1,294 @@ +From e1a4e37cc7b665b6804fba812aca2f4d7402c249 Mon Sep 17 00:00:00 2001 +From: "Darrick J. Wong" +Date: Wed, 14 Jun 2017 21:25:57 -0700 +Subject: xfs: try to avoid blowing out the transaction reservation when bunmaping a shared extent + +From: Darrick J. Wong + +commit e1a4e37cc7b665b6804fba812aca2f4d7402c249 upstream. + +In a pathological scenario where we are trying to bunmapi a single +extent in which every other block is shared, it's possible that trying +to unmap the entire large extent in a single transaction can generate so +many EFIs that we overflow the transaction reservation. + +Therefore, use a heuristic to guess at the number of blocks we can +safely unmap from a reflink file's data fork in an single transaction. +This should prevent problems such as the log head slamming into the tail +and ASSERTs that trigger because we've exceeded the transaction +reservation. + +Note that since bunmapi can fail to unmap the entire range, we must also +teach the deferred unmap code to roll into a new transaction whenever we +get low on reservation. + +Signed-off-by: Darrick J. 
Wong +[hch: random edits, all bugs are my fault] +Signed-off-by: Christoph Hellwig +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/libxfs/xfs_bmap.c | 37 ++++++++++++++++++++++++++++--------- + fs/xfs/libxfs/xfs_bmap.h | 2 +- + fs/xfs/libxfs/xfs_refcount.c | 10 +--------- + fs/xfs/libxfs/xfs_refcount.h | 16 ++++++++++++++++ + fs/xfs/xfs_bmap_item.c | 17 +++++++++++++++-- + fs/xfs/xfs_trans.h | 2 +- + fs/xfs/xfs_trans_bmap.c | 11 +++++++++-- + 7 files changed, 71 insertions(+), 24 deletions(-) + +--- a/fs/xfs/libxfs/xfs_bmap.c ++++ b/fs/xfs/libxfs/xfs_bmap.c +@@ -5555,6 +5555,7 @@ __xfs_bunmapi( + int whichfork; /* data or attribute fork */ + xfs_fsblock_t sum; + xfs_filblks_t len = *rlen; /* length to unmap in file */ ++ xfs_fileoff_t max_len; + + trace_xfs_bunmap(ip, bno, len, flags, _RET_IP_); + +@@ -5576,6 +5577,16 @@ __xfs_bunmapi( + ASSERT(len > 0); + ASSERT(nexts >= 0); + ++ /* ++ * Guesstimate how many blocks we can unmap without running the risk of ++ * blowing out the transaction with a mix of EFIs and reflink ++ * adjustments. ++ */ ++ if (xfs_is_reflink_inode(ip) && whichfork == XFS_DATA_FORK) ++ max_len = min(len, xfs_refcount_max_unmap(tp->t_log_res)); ++ else ++ max_len = len; ++ + if (!(ifp->if_flags & XFS_IFEXTENTS) && + (error = xfs_iread_extents(tp, ip, whichfork))) + return error; +@@ -5621,7 +5632,7 @@ __xfs_bunmapi( + + extno = 0; + while (bno != (xfs_fileoff_t)-1 && bno >= start && lastx >= 0 && +- (nexts == 0 || extno < nexts)) { ++ (nexts == 0 || extno < nexts) && max_len > 0) { + /* + * Is the found extent after a hole in which bno lives? + * Just back up to the previous extent, if so. +@@ -5655,6 +5666,15 @@ __xfs_bunmapi( + } + if (del.br_startoff + del.br_blockcount > bno + 1) + del.br_blockcount = bno + 1 - del.br_startoff; ++ ++ /* How much can we safely unmap? 
*/ ++ if (max_len < del.br_blockcount) { ++ del.br_startoff += del.br_blockcount - max_len; ++ if (!wasdel) ++ del.br_startblock += del.br_blockcount - max_len; ++ del.br_blockcount = max_len; ++ } ++ + sum = del.br_startblock + del.br_blockcount; + if (isrt && + (mod = do_mod(sum, mp->m_sb.sb_rextsize))) { +@@ -5835,6 +5855,7 @@ __xfs_bunmapi( + if (!isrt && wasdel) + xfs_mod_fdblocks(mp, (int64_t)del.br_blockcount, false); + ++ max_len -= del.br_blockcount; + bno = del.br_startoff - 1; + nodelete: + /* +@@ -6604,25 +6625,24 @@ xfs_bmap_finish_one( + int whichfork, + xfs_fileoff_t startoff, + xfs_fsblock_t startblock, +- xfs_filblks_t blockcount, ++ xfs_filblks_t *blockcount, + xfs_exntst_t state) + { + struct xfs_bmbt_irec bmap; + int nimaps = 1; + xfs_fsblock_t firstfsb; + int flags = XFS_BMAPI_REMAP; +- int done; + int error = 0; + + bmap.br_startblock = startblock; + bmap.br_startoff = startoff; +- bmap.br_blockcount = blockcount; ++ bmap.br_blockcount = *blockcount; + bmap.br_state = state; + + trace_xfs_bmap_deferred(tp->t_mountp, + XFS_FSB_TO_AGNO(tp->t_mountp, startblock), type, + XFS_FSB_TO_AGBNO(tp->t_mountp, startblock), +- ip->i_ino, whichfork, startoff, blockcount, state); ++ ip->i_ino, whichfork, startoff, *blockcount, state); + + if (whichfork != XFS_DATA_FORK && whichfork != XFS_ATTR_FORK) + return -EFSCORRUPTED; +@@ -6641,12 +6661,11 @@ xfs_bmap_finish_one( + bmap.br_blockcount, flags, &firstfsb, + bmap.br_blockcount, &bmap, &nimaps, + dfops); ++ *blockcount = 0; + break; + case XFS_BMAP_UNMAP: +- error = xfs_bunmapi(tp, ip, bmap.br_startoff, +- bmap.br_blockcount, flags, 1, &firstfsb, +- dfops, &done); +- ASSERT(done); ++ error = __xfs_bunmapi(tp, ip, startoff, blockcount, ++ XFS_BMAPI_REMAP, 1, &firstfsb, dfops); + break; + default: + ASSERT(0); +--- a/fs/xfs/libxfs/xfs_bmap.h ++++ b/fs/xfs/libxfs/xfs_bmap.h +@@ -265,7 +265,7 @@ struct xfs_bmap_intent { + int xfs_bmap_finish_one(struct xfs_trans *tp, struct xfs_defer_ops *dfops, + struct xfs_inode *ip, enum xfs_bmap_intent_type type, + int whichfork, xfs_fileoff_t startoff, xfs_fsblock_t startblock, +- xfs_filblks_t blockcount, xfs_exntst_t state); ++ xfs_filblks_t *blockcount, xfs_exntst_t state); + int xfs_bmap_map_extent(struct xfs_mount *mp, struct xfs_defer_ops *dfops, + struct xfs_inode *ip, struct xfs_bmbt_irec *imap); + int xfs_bmap_unmap_extent(struct xfs_mount *mp, struct xfs_defer_ops *dfops, +--- a/fs/xfs/libxfs/xfs_refcount.c ++++ b/fs/xfs/libxfs/xfs_refcount.c +@@ -784,14 +784,6 @@ xfs_refcount_merge_extents( + } + + /* +- * While we're adjusting the refcounts records of an extent, we have +- * to keep an eye on the number of extents we're dirtying -- run too +- * many in a single transaction and we'll exceed the transaction's +- * reservation and crash the fs. Each record adds 12 bytes to the +- * log (plus any key updates) so we'll conservatively assume 24 bytes +- * per record. We must also leave space for btree splits on both ends +- * of the range and space for the CUD and a new CUI. +- * + * XXX: This is a pretty hand-wavy estimate. The penalty for guessing + * true incorrectly is a shutdown FS; the penalty for guessing false + * incorrectly is more transaction rolls than might be necessary. 
+@@ -822,7 +814,7 @@ xfs_refcount_still_have_space( + else if (overhead > cur->bc_tp->t_log_res) + return false; + return cur->bc_tp->t_log_res - overhead > +- cur->bc_private.a.priv.refc.nr_ops * 32; ++ cur->bc_private.a.priv.refc.nr_ops * XFS_REFCOUNT_ITEM_OVERHEAD; + } + + /* +--- a/fs/xfs/libxfs/xfs_refcount.h ++++ b/fs/xfs/libxfs/xfs_refcount.h +@@ -67,4 +67,20 @@ extern int xfs_refcount_free_cow_extent( + extern int xfs_refcount_recover_cow_leftovers(struct xfs_mount *mp, + xfs_agnumber_t agno); + ++/* ++ * While we're adjusting the refcounts records of an extent, we have ++ * to keep an eye on the number of extents we're dirtying -- run too ++ * many in a single transaction and we'll exceed the transaction's ++ * reservation and crash the fs. Each record adds 12 bytes to the ++ * log (plus any key updates) so we'll conservatively assume 32 bytes ++ * per record. We must also leave space for btree splits on both ends ++ * of the range and space for the CUD and a new CUI. ++ */ ++#define XFS_REFCOUNT_ITEM_OVERHEAD 32 ++ ++static inline xfs_fileoff_t xfs_refcount_max_unmap(int log_res) ++{ ++ return (log_res * 3 / 4) / XFS_REFCOUNT_ITEM_OVERHEAD; ++} ++ + #endif /* __XFS_REFCOUNT_H__ */ +--- a/fs/xfs/xfs_bmap_item.c ++++ b/fs/xfs/xfs_bmap_item.c +@@ -395,6 +395,7 @@ xfs_bui_recover( + struct xfs_map_extent *bmap; + xfs_fsblock_t startblock_fsb; + xfs_fsblock_t inode_fsb; ++ xfs_filblks_t count; + bool op_ok; + struct xfs_bud_log_item *budp; + enum xfs_bmap_intent_type type; +@@ -403,6 +404,7 @@ xfs_bui_recover( + struct xfs_trans *tp; + struct xfs_inode *ip = NULL; + struct xfs_defer_ops dfops; ++ struct xfs_bmbt_irec irec; + xfs_fsblock_t firstfsb; + + ASSERT(!test_bit(XFS_BUI_RECOVERED, &buip->bui_flags)); +@@ -480,13 +482,24 @@ xfs_bui_recover( + } + xfs_trans_ijoin(tp, ip, 0); + ++ count = bmap->me_len; + error = xfs_trans_log_finish_bmap_update(tp, budp, &dfops, type, + ip, whichfork, bmap->me_startoff, +- bmap->me_startblock, bmap->me_len, +- state); ++ bmap->me_startblock, &count, state); + if (error) + goto err_dfops; + ++ if (count > 0) { ++ ASSERT(type == XFS_BMAP_UNMAP); ++ irec.br_startblock = bmap->me_startblock; ++ irec.br_blockcount = count; ++ irec.br_startoff = bmap->me_startoff; ++ irec.br_state = state; ++ error = xfs_bmap_unmap_extent(tp->t_mountp, &dfops, ip, &irec); ++ if (error) ++ goto err_dfops; ++ } ++ + /* Finish transaction, free inodes. 
*/ + error = xfs_defer_finish(&tp, &dfops, NULL); + if (error) +--- a/fs/xfs/xfs_trans.h ++++ b/fs/xfs/xfs_trans.h +@@ -277,6 +277,6 @@ int xfs_trans_log_finish_bmap_update(str + struct xfs_bud_log_item *rudp, struct xfs_defer_ops *dfops, + enum xfs_bmap_intent_type type, struct xfs_inode *ip, + int whichfork, xfs_fileoff_t startoff, xfs_fsblock_t startblock, +- xfs_filblks_t blockcount, xfs_exntst_t state); ++ xfs_filblks_t *blockcount, xfs_exntst_t state); + + #endif /* __XFS_TRANS_H__ */ +--- a/fs/xfs/xfs_trans_bmap.c ++++ b/fs/xfs/xfs_trans_bmap.c +@@ -63,7 +63,7 @@ xfs_trans_log_finish_bmap_update( + int whichfork, + xfs_fileoff_t startoff, + xfs_fsblock_t startblock, +- xfs_filblks_t blockcount, ++ xfs_filblks_t *blockcount, + xfs_exntst_t state) + { + int error; +@@ -196,16 +196,23 @@ xfs_bmap_update_finish_item( + void **state) + { + struct xfs_bmap_intent *bmap; ++ xfs_filblks_t count; + int error; + + bmap = container_of(item, struct xfs_bmap_intent, bi_list); ++ count = bmap->bi_bmap.br_blockcount; + error = xfs_trans_log_finish_bmap_update(tp, done_item, dop, + bmap->bi_type, + bmap->bi_owner, bmap->bi_whichfork, + bmap->bi_bmap.br_startoff, + bmap->bi_bmap.br_startblock, +- bmap->bi_bmap.br_blockcount, ++ &count, + bmap->bi_bmap.br_state); ++ if (!error && count > 0) { ++ ASSERT(bmap->bi_type == XFS_BMAP_UNMAP); ++ bmap->bi_bmap.br_blockcount = count; ++ return -EAGAIN; ++ } + kmem_free(bmap); + return error; + }