From f3e817d4a59122622d92eb623d064002b74e8cf9 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Sat, 1 Apr 2017 18:41:03 +0200
Subject: [PATCH] 4.9-stable patches

added patches:
	xfs-only-update-mount-resv-fields-on-success-in-__xfs_ag_resv_init.patch
	xfs-pull-up-iolock-from-xfs_free_eofblocks.patch
	xfs-sync-eofblocks-scans-under-iolock-are-livelock-prone.patch
	xfs-use-per-ag-reservations-for-the-finobt.patch
---
 queue-4.9/series                              |   4 +
 ...lds-on-success-in-__xfs_ag_resv_init.patch |  61 ++++
 ...ll-up-iolock-from-xfs_free_eofblocks.patch | 246 ++++++++++++++
 ...cans-under-iolock-are-livelock-prone.patch | 229 +++++++++++++
 ...e-per-ag-reservations-for-the-finobt.patch | 321 ++++++++++++++++++
 5 files changed, 861 insertions(+)
 create mode 100644 queue-4.9/xfs-only-update-mount-resv-fields-on-success-in-__xfs_ag_resv_init.patch
 create mode 100644 queue-4.9/xfs-pull-up-iolock-from-xfs_free_eofblocks.patch
 create mode 100644 queue-4.9/xfs-sync-eofblocks-scans-under-iolock-are-livelock-prone.patch
 create mode 100644 queue-4.9/xfs-use-per-ag-reservations-for-the-finobt.patch

diff --git a/queue-4.9/series b/queue-4.9/series
index 76be2244cf8..db553e76a6f 100644
--- a/queue-4.9/series
+++ b/queue-4.9/series
@@ -1,2 +1,6 @@
 libceph-force-gfp_noio-for-socket-allocations.patch
 xen-setup-don-t-relocate-p2m-over-existing-one.patch
+xfs-only-update-mount-resv-fields-on-success-in-__xfs_ag_resv_init.patch
+xfs-use-per-ag-reservations-for-the-finobt.patch
+xfs-pull-up-iolock-from-xfs_free_eofblocks.patch
+xfs-sync-eofblocks-scans-under-iolock-are-livelock-prone.patch
diff --git a/queue-4.9/xfs-only-update-mount-resv-fields-on-success-in-__xfs_ag_resv_init.patch b/queue-4.9/xfs-only-update-mount-resv-fields-on-success-in-__xfs_ag_resv_init.patch
new file mode 100644
index 00000000000..38738f50eb2
--- /dev/null
+++ b/queue-4.9/xfs-only-update-mount-resv-fields-on-success-in-__xfs_ag_resv_init.patch
@@ -0,0 +1,61 @@
+From 4dfa2b84118fd6c95202ae87e62adf5000ccd4d0 Mon Sep 17 00:00:00 2001
+From: Christoph Hellwig <hch@lst.de>
+Date: Wed, 25 Jan 2017 07:49:34 -0800
+Subject: xfs: only update mount/resv fields on success in __xfs_ag_resv_init
+
+From: Christoph Hellwig <hch@lst.de>
+
+commit 4dfa2b84118fd6c95202ae87e62adf5000ccd4d0 upstream.
+
+Try to reserve the blocks first and only then update the fields in
+or hanging off the mount structure.  This way we can call __xfs_ag_resv_init
+again after a previous failure.
+
+Signed-off-by: Christoph Hellwig <hch@lst.de>
+Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/xfs/libxfs/xfs_ag_resv.c |   23 ++++++++++++++---------
+ 1 file changed, 14 insertions(+), 9 deletions(-)
+
+--- a/fs/xfs/libxfs/xfs_ag_resv.c
++++ b/fs/xfs/libxfs/xfs_ag_resv.c
+@@ -200,22 +200,27 @@ __xfs_ag_resv_init(
+ 	struct xfs_mount		*mp = pag->pag_mount;
+ 	struct xfs_ag_resv		*resv;
+ 	int				error;
++	xfs_extlen_t			reserved;
+ 
+-	resv = xfs_perag_resv(pag, type);
+ 	if (used > ask)
+ 		ask = used;
+-	resv->ar_asked = ask;
+-	resv->ar_reserved = resv->ar_orig_reserved = ask - used;
+-	mp->m_ag_max_usable -= ask;
++	reserved = ask - used;
+ 
+-	trace_xfs_ag_resv_init(pag, type, ask);
+-
+-	error = xfs_mod_fdblocks(mp, -(int64_t)resv->ar_reserved, true);
+-	if (error)
++	error = xfs_mod_fdblocks(mp, -(int64_t)reserved, true);
++	if (error) {
+ 		trace_xfs_ag_resv_init_error(pag->pag_mount, pag->pag_agno,
+ 				error, _RET_IP_);
++		return error;
++	}
+ 
+-	return error;
++	mp->m_ag_max_usable -= ask;
++
++	resv = xfs_perag_resv(pag, type);
++	resv->ar_asked = ask;
++	resv->ar_reserved = resv->ar_orig_reserved = reserved;
++
++	trace_xfs_ag_resv_init(pag, type, ask);
++	return 0;
+ }
+ 
+ /* Create a per-AG block reservation. */
diff --git a/queue-4.9/xfs-pull-up-iolock-from-xfs_free_eofblocks.patch b/queue-4.9/xfs-pull-up-iolock-from-xfs_free_eofblocks.patch
new file mode 100644
index 00000000000..0ac5c2d117a
--- /dev/null
+++ b/queue-4.9/xfs-pull-up-iolock-from-xfs_free_eofblocks.patch
@@ -0,0 +1,246 @@
+From a36b926180cda375ac2ec89e1748b47137cfc51c Mon Sep 17 00:00:00 2001
+From: Brian Foster <bfoster@redhat.com>
+Date: Fri, 27 Jan 2017 23:22:55 -0800
+Subject: xfs: pull up iolock from xfs_free_eofblocks()
+
+From: Brian Foster <bfoster@redhat.com>
+
+commit a36b926180cda375ac2ec89e1748b47137cfc51c upstream.
+
+xfs_free_eofblocks() requires the IOLOCK_EXCL lock, but is called from
+different contexts where the lock may or may not be held. The
+need_iolock parameter exists for this reason, to indicate whether
+xfs_free_eofblocks() must acquire the iolock itself before it can
+proceed.
+
+This is ugly and confusing. Simplify the semantics of
+xfs_free_eofblocks() to require the caller to acquire the iolock
+appropriately and kill the need_iolock parameter. While here, the mp
+param can be removed as well as the xfs_mount is accessible from the
+xfs_inode structure. This patch does not change behavior.
+
+Signed-off-by: Brian Foster <bfoster@redhat.com>
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/xfs/xfs_bmap_util.c |   41 +++++++++++++++------------------------
+ fs/xfs/xfs_bmap_util.h |    3 --
+ fs/xfs/xfs_icache.c    |   24 ++++++++++++++---------
+ fs/xfs/xfs_inode.c     |   51 ++++++++++++++++++++++++++-----------------------
+ 4 files changed, 60 insertions(+), 59 deletions(-)
+
+--- a/fs/xfs/xfs_bmap_util.c
++++ b/fs/xfs/xfs_bmap_util.c
+@@ -917,17 +917,18 @@ xfs_can_free_eofblocks(struct xfs_inode
+  */
+ int
+ xfs_free_eofblocks(
+-	xfs_mount_t	*mp,
+-	xfs_inode_t	*ip,
+-	bool		need_iolock)
++	struct xfs_inode	*ip)
+ {
+-	xfs_trans_t	*tp;
+-	int		error;
+-	xfs_fileoff_t	end_fsb;
+-	xfs_fileoff_t	last_fsb;
+-	xfs_filblks_t	map_len;
+-	int		nimaps;
+-	xfs_bmbt_irec_t	imap;
++	struct xfs_trans	*tp;
++	int			error;
++	xfs_fileoff_t		end_fsb;
++	xfs_fileoff_t		last_fsb;
++	xfs_filblks_t		map_len;
++	int			nimaps;
++	struct xfs_bmbt_irec	imap;
++	struct xfs_mount	*mp = ip->i_mount;
++
++	ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
+ 
+ 	/*
+ 	 * Figure out if there are any blocks beyond the end
+@@ -944,6 +945,10 @@ xfs_free_eofblocks(
+ 	error = xfs_bmapi_read(ip, end_fsb, map_len, &imap, &nimaps, 0);
+ 	xfs_iunlock(ip, XFS_ILOCK_SHARED);
+ 
++	/*
++	 * If there are blocks after the end of file, truncate the file to its
++	 * current size to free them up.
++	 */
+ 	if (!error && (nimaps != 0) &&
+ 	    (imap.br_startblock != HOLESTARTBLOCK ||
+ 	     ip->i_delayed_blks)) {
+@@ -954,22 +959,10 @@ xfs_free_eofblocks(
+ 		if (error)
+ 			return error;
+ 
+-		/*
+-		 * There are blocks after the end of file.
+-		 * Free them up now by truncating the file to
+-		 * its current size.
+-		 */
+-		if (need_iolock) {
+-			if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL))
+-				return -EAGAIN;
+-		}
+-
+ 		error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0,
+ 				&tp);
+ 		if (error) {
+ 			ASSERT(XFS_FORCED_SHUTDOWN(mp));
+-			if (need_iolock)
+-				xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+ 			return error;
+ 		}
+ 
+@@ -997,8 +990,6 @@ xfs_free_eofblocks(
+ 		}
+ 
+ 		xfs_iunlock(ip, XFS_ILOCK_EXCL);
+-		if (need_iolock)
+-			xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+ 	}
+ 	return error;
+ }
+@@ -1415,7 +1406,7 @@ xfs_shift_file_space(
+ 	 * into the accessible region of the file.
+ 	 */
+ 	if (xfs_can_free_eofblocks(ip, true)) {
+-		error = xfs_free_eofblocks(mp, ip, false);
++		error = xfs_free_eofblocks(ip);
+ 		if (error)
+ 			return error;
+ 	}
+--- a/fs/xfs/xfs_bmap_util.h
++++ b/fs/xfs/xfs_bmap_util.h
+@@ -63,8 +63,7 @@ int	xfs_insert_file_space(struct xfs_ino
+ 
+ /* EOF block manipulation functions */
+ bool	xfs_can_free_eofblocks(struct xfs_inode *ip, bool force);
+-int	xfs_free_eofblocks(struct xfs_mount *mp, struct xfs_inode *ip,
+-			   bool need_iolock);
++int	xfs_free_eofblocks(struct xfs_inode *ip);
+ 
+ int	xfs_swap_extents(struct xfs_inode *ip, struct xfs_inode *tip,
+ 			 struct xfs_swapext *sx);
+--- a/fs/xfs/xfs_icache.c
++++ b/fs/xfs/xfs_icache.c
+@@ -1324,7 +1324,7 @@ xfs_inode_free_eofblocks(
+ 	int			flags,
+ 	void			*args)
+ {
+-	int ret;
++	int ret = 0;
+ 	struct xfs_eofblocks *eofb = args;
+ 	bool need_iolock = true;
+ 	int match;
+@@ -1360,19 +1360,25 @@ xfs_inode_free_eofblocks(
+ 			return 0;
+ 
+ 		/*
+-		 * A scan owner implies we already hold the iolock. Skip it in
+-		 * xfs_free_eofblocks() to avoid deadlock. This also eliminates
+-		 * the possibility of EAGAIN being returned.
++		 * A scan owner implies we already hold the iolock. Skip it here
++		 * to avoid deadlock.
+ 		 */
+ 		if (eofb->eof_scan_owner == ip->i_ino)
+ 			need_iolock = false;
+ 	}
+ 
+-	ret = xfs_free_eofblocks(ip->i_mount, ip, need_iolock);
+-
+-	/* don't revisit the inode if we're not waiting */
+-	if (ret == -EAGAIN && !(flags & SYNC_WAIT))
+-		ret = 0;
++	/*
++	 * If the caller is waiting, return -EAGAIN to keep the background
++	 * scanner moving and revisit the inode in a subsequent pass.
++	 */
++	if (need_iolock && !xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) {
++		if (flags & SYNC_WAIT)
++			ret = -EAGAIN;
++		return ret;
++	}
++	ret = xfs_free_eofblocks(ip);
++	if (need_iolock)
++		xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+ 
+ 	return ret;
+ }
+--- a/fs/xfs/xfs_inode.c
++++ b/fs/xfs/xfs_inode.c
+@@ -1701,32 +1701,34 @@ xfs_release(
+ 	if (xfs_can_free_eofblocks(ip, false)) {
+ 
+ 		/*
++		 * Check if the inode is being opened, written and closed
++		 * frequently and we have delayed allocation blocks outstanding
++		 * (e.g. streaming writes from the NFS server), truncating the
++		 * blocks past EOF will cause fragmentation to occur.
++		 *
++		 * In this case don't do the truncation, but we have to be
++		 * careful how we detect this case. Blocks beyond EOF show up as
++		 * i_delayed_blks even when the inode is clean, so we need to
++		 * truncate them away first before checking for a dirty release.
++		 * Hence on the first dirty close we will still remove the
++		 * speculative allocation, but after that we will leave it in
++		 * place.
++		 */
++		if (xfs_iflags_test(ip, XFS_IDIRTY_RELEASE))
++			return 0;
++		/*
+ 		 * If we can't get the iolock just skip truncating the blocks
+ 		 * past EOF because we could deadlock with the mmap_sem
+-		 * otherwise.  We'll get another chance to drop them once the
++		 * otherwise. We'll get another chance to drop them once the
+ 		 * last reference to the inode is dropped, so we'll never leak
+ 		 * blocks permanently.
+-		 *
+-		 * Further, check if the inode is being opened, written and
+-		 * closed frequently and we have delayed allocation blocks
+-		 * outstanding (e.g. streaming writes from the NFS server),
+-		 * truncating the blocks past EOF will cause fragmentation to
+-		 * occur.
+-		 *
+-		 * In this case don't do the truncation, either, but we have to
+-		 * be careful how we detect this case. Blocks beyond EOF show
+-		 * up as i_delayed_blks even when the inode is clean, so we
+-		 * need to truncate them away first before checking for a dirty
+-		 * release. Hence on the first dirty close we will still remove
+-		 * the speculative allocation, but after that we will leave it
+-		 * in place.
+ 		 */
+-		if (xfs_iflags_test(ip, XFS_IDIRTY_RELEASE))
+-			return 0;
+-
+-		error = xfs_free_eofblocks(mp, ip, true);
+-		if (error && error != -EAGAIN)
+-			return error;
++		if (xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) {
++			error = xfs_free_eofblocks(ip);
++			xfs_iunlock(ip, XFS_IOLOCK_EXCL);
++			if (error)
++				return error;
++		}
+ 
+ 		/* delalloc blocks after truncation means it really is dirty */
+ 		if (ip->i_delayed_blks)
+@@ -1913,8 +1915,11 @@ xfs_inactive(
+ 		 * cache. Post-eof blocks must be freed, lest we end up with
+ 		 * broken free space accounting.
+ 		 */
+-		if (xfs_can_free_eofblocks(ip, true))
+-			xfs_free_eofblocks(mp, ip, false);
++		if (xfs_can_free_eofblocks(ip, true)) {
++			xfs_ilock(ip, XFS_IOLOCK_EXCL);
++			xfs_free_eofblocks(ip);
++			xfs_iunlock(ip, XFS_IOLOCK_EXCL);
++		}
+ 
+ 		return;
+ 	}
diff --git a/queue-4.9/xfs-sync-eofblocks-scans-under-iolock-are-livelock-prone.patch b/queue-4.9/xfs-sync-eofblocks-scans-under-iolock-are-livelock-prone.patch
new file mode 100644
index 00000000000..ec427ea3bfa
--- /dev/null
+++ b/queue-4.9/xfs-sync-eofblocks-scans-under-iolock-are-livelock-prone.patch
@@ -0,0 +1,229 @@
+From c3155097ad89a956579bc305856a1f2878494e52 Mon Sep 17 00:00:00 2001
+From: Brian Foster <bfoster@redhat.com>
+Date: Fri, 27 Jan 2017 23:22:56 -0800
+Subject: xfs: sync eofblocks scans under iolock are livelock prone
+
+From: Brian Foster <bfoster@redhat.com>
+
+commit c3155097ad89a956579bc305856a1f2878494e52 upstream.
+
+The xfs_eofblocks.eof_scan_owner field is an internal field to
+facilitate invoking eofb scans from the kernel while under the iolock.
+This is necessary because the eofb scan acquires the iolock of each
+inode. Synchronous scans are invoked on certain buffered write failures
+while under iolock. In such cases, the scan owner indicates that the
+context for the scan already owns the particular iolock and prevents a
+double lock deadlock.
+
+eofblocks scans while under iolock are still livelock prone in the event
+of multiple parallel scans, however. If multiple buffered writes to
+different inodes fail and invoke eofblocks scans at the same time, each
+scan avoids a deadlock with its own inode by virtue of the
+eof_scan_owner field, but will never be able to acquire the iolock of
+the inode from the parallel scan. Because the low free space scans are
+invoked with SYNC_WAIT, the scan will not return until it has processed
+every tagged inode and thus both scans will spin indefinitely on the
+iolock being held across the opposite scan. This problem can be
+reproduced reliably by generic/224 on systems with higher cpu counts
+(x16).
+
+To avoid this problem, simplify the semantics of eofblocks scans to
+never invoke a scan while under iolock. This means that the buffered
+write context must drop the iolock before the scan. It must reacquire
+the lock before the write retry and also repeat the initial write
+checks, as the original state might no longer be valid once the iolock
+was dropped.
+
+Signed-off-by: Brian Foster <bfoster@redhat.com>
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+
+---
+ fs/xfs/xfs_file.c   |   13 +++++++++----
+ fs/xfs/xfs_icache.c |   45 +++++++--------------------------------------
+ fs/xfs/xfs_icache.h |    2 --
+ 3 files changed, 16 insertions(+), 44 deletions(-)
+
+--- a/fs/xfs/xfs_file.c
++++ b/fs/xfs/xfs_file.c
+@@ -675,8 +675,10 @@ xfs_file_buffered_aio_write(
+ 	struct xfs_inode	*ip = XFS_I(inode);
+ 	ssize_t			ret;
+ 	int			enospc = 0;
+-	int			iolock = XFS_IOLOCK_EXCL;
++	int			iolock;
+ 
++write_retry:
++	iolock = XFS_IOLOCK_EXCL;
+ 	xfs_rw_ilock(ip, iolock);
+ 
+ 	ret = xfs_file_aio_write_checks(iocb, from, &iolock);
+@@ -686,7 +688,6 @@ xfs_file_buffered_aio_write(
+ 	/* We can write back this queue in page reclaim */
+ 	current->backing_dev_info = inode_to_bdi(inode);
+ 
+-write_retry:
+ 	trace_xfs_file_buffered_write(ip, iov_iter_count(from), iocb->ki_pos);
+ 	ret = iomap_file_buffered_write(iocb, from, &xfs_iomap_ops);
+ 	if (likely(ret >= 0))
+@@ -702,18 +703,21 @@ write_retry:
+ 	 * running at the same time.
+ 	 */
+ 	if (ret == -EDQUOT && !enospc) {
++		xfs_rw_iunlock(ip, iolock);
+ 		enospc = xfs_inode_free_quota_eofblocks(ip);
+ 		if (enospc)
+ 			goto write_retry;
+ 		enospc = xfs_inode_free_quota_cowblocks(ip);
+ 		if (enospc)
+ 			goto write_retry;
++		iolock = 0;
+ 	} else if (ret == -ENOSPC && !enospc) {
+ 		struct xfs_eofblocks eofb = {0};
+ 
+ 		enospc = 1;
+ 		xfs_flush_inodes(ip->i_mount);
+-		eofb.eof_scan_owner = ip->i_ino; /* for locking */
++
++		xfs_rw_iunlock(ip, iolock);
+ 		eofb.eof_flags = XFS_EOF_FLAGS_SYNC;
+ 		xfs_icache_free_eofblocks(ip->i_mount, &eofb);
+ 		goto write_retry;
+@@ -721,7 +725,8 @@ write_retry:
+ 
+ 	current->backing_dev_info = NULL;
+ out:
+-	xfs_rw_iunlock(ip, iolock);
++	if (iolock)
++		xfs_rw_iunlock(ip, iolock);
+ 	return ret;
+ }
+ 
+--- a/fs/xfs/xfs_icache.c
++++ b/fs/xfs/xfs_icache.c
+@@ -1326,11 +1326,8 @@ xfs_inode_free_eofblocks(
+ {
+ 	int ret = 0;
+ 	struct xfs_eofblocks *eofb = args;
+-	bool need_iolock = true;
+ 	int match;
+ 
+-	ASSERT(!eofb || (eofb && eofb->eof_scan_owner != 0));
+-
+ 	if (!xfs_can_free_eofblocks(ip, false)) {
+ 		/* inode could be preallocated or append-only */
+ 		trace_xfs_inode_free_eofblocks_invalid(ip);
+@@ -1358,27 +1355,19 @@ xfs_inode_free_eofblocks(
+ 		if (eofb->eof_flags & XFS_EOF_FLAGS_MINFILESIZE &&
+ 		    XFS_ISIZE(ip) < eofb->eof_min_file_size)
+ 			return 0;
+-
+-		/*
+-		 * A scan owner implies we already hold the iolock. Skip it here
+-		 * to avoid deadlock.
+-		 */
+-		if (eofb->eof_scan_owner == ip->i_ino)
+-			need_iolock = false;
+ 	}
+ 
+ 	/*
+ 	 * If the caller is waiting, return -EAGAIN to keep the background
+ 	 * scanner moving and revisit the inode in a subsequent pass.
+ 	 */
+-	if (need_iolock && !xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) {
++	if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) {
+ 		if (flags & SYNC_WAIT)
+ 			ret = -EAGAIN;
+ 		return ret;
+ 	}
+ 	ret = xfs_free_eofblocks(ip);
+-	if (need_iolock)
+-		xfs_iunlock(ip, XFS_IOLOCK_EXCL);
++	xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+ 
+ 	return ret;
+ }
+@@ -1425,15 +1414,10 @@ __xfs_inode_free_quota_eofblocks(
+ 	struct xfs_eofblocks eofb = {0};
+ 	struct xfs_dquot *dq;
+ 
+-	ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
+-
+ 	/*
+-	 * Set the scan owner to avoid a potential livelock. Otherwise, the scan
+-	 * can repeatedly trylock on the inode we're currently processing. We
+-	 * run a sync scan to increase effectiveness and use the union filter to
++	 * Run a sync scan to increase effectiveness and use the union filter to
+ 	 * cover all applicable quotas in a single scan.
+ 	 */
+-	eofb.eof_scan_owner = ip->i_ino;
+ 	eofb.eof_flags = XFS_EOF_FLAGS_UNION|XFS_EOF_FLAGS_SYNC;
+ 
+ 	if (XFS_IS_UQUOTA_ENFORCED(ip->i_mount)) {
+@@ -1585,12 +1569,9 @@ xfs_inode_free_cowblocks(
+ {
+ 	int ret;
+ 	struct xfs_eofblocks *eofb = args;
+-	bool need_iolock = true;
+ 	int match;
+ 	struct xfs_ifork	*ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
+ 
+-	ASSERT(!eofb || (eofb && eofb->eof_scan_owner != 0));
+-
+ 	/*
+ 	 * Just clear the tag if we have an empty cow fork or none at all. It's
+ 	 * possible the inode was fully unshared since it was originally tagged.
+@@ -1623,28 +1604,16 @@ xfs_inode_free_cowblocks(
+ 		if (eofb->eof_flags & XFS_EOF_FLAGS_MINFILESIZE &&
+ 		    XFS_ISIZE(ip) < eofb->eof_min_file_size)
+ 			return 0;
+-
+-		/*
+-		 * A scan owner implies we already hold the iolock. Skip it in
+-		 * xfs_free_eofblocks() to avoid deadlock. This also eliminates
+-		 * the possibility of EAGAIN being returned.
+-		 */
+-		if (eofb->eof_scan_owner == ip->i_ino)
+-			need_iolock = false;
+ 	}
+ 
+ 	/* Free the CoW blocks */
+-	if (need_iolock) {
+-		xfs_ilock(ip, XFS_IOLOCK_EXCL);
+-		xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
+-	}
++	xfs_ilock(ip, XFS_IOLOCK_EXCL);
++	xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
+ 
+ 	ret = xfs_reflink_cancel_cow_range(ip, 0, NULLFILEOFF);
+ 
+-	if (need_iolock) {
+-		xfs_iunlock(ip, XFS_MMAPLOCK_EXCL);
+-		xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+-	}
++	xfs_iunlock(ip, XFS_MMAPLOCK_EXCL);
++	xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+ 
+ 	return ret;
+ }
+--- a/fs/xfs/xfs_icache.h
++++ b/fs/xfs/xfs_icache.h
+@@ -27,7 +27,6 @@ struct xfs_eofblocks {
+ 	kgid_t		eof_gid;
+ 	prid_t		eof_prid;
+ 	__u64		eof_min_file_size;
+-	xfs_ino_t	eof_scan_owner;
+ };
+ 
+ #define SYNC_WAIT		0x0001	/* wait for i/o to complete */
+@@ -102,7 +101,6 @@ xfs_fs_eofblocks_from_user(
+ 	dst->eof_flags = src->eof_flags;
+ 	dst->eof_prid = src->eof_prid;
+ 	dst->eof_min_file_size = src->eof_min_file_size;
+-	dst->eof_scan_owner = NULLFSINO;
+ 
+ 	dst->eof_uid = INVALID_UID;
+ 	if (src->eof_flags & XFS_EOF_FLAGS_UID) {
diff --git a/queue-4.9/xfs-use-per-ag-reservations-for-the-finobt.patch b/queue-4.9/xfs-use-per-ag-reservations-for-the-finobt.patch
new file mode 100644
index 00000000000..7a17043901d
--- /dev/null
+++ b/queue-4.9/xfs-use-per-ag-reservations-for-the-finobt.patch
@@ -0,0 +1,321 @@
+From 76d771b4cbe33c581bd6ca2710c120be51172440 Mon Sep 17 00:00:00 2001
+From: Christoph Hellwig <hch@lst.de>
+Date: Wed, 25 Jan 2017 07:49:35 -0800
+Subject: xfs: use per-AG reservations for the finobt
+
+From: Christoph Hellwig <hch@lst.de>
+
+commit 76d771b4cbe33c581bd6ca2710c120be51172440 upstream.
+
+Currently we try to rely on the global reserved block pool for block
+allocations for the free inode btree, but I have customer reports
+(fairly complex workload, need to find an easier reproducer) where that
+is not enough as the AG where we free an inode that requires a new
+finobt block is entirely full.  This causes us to cancel a dirty
+transaction and thus a file system shutdown.
+
+I think the right way to guard against this is to treat the finot the same
+way as the refcount btree and have a per-AG reservations for the possible
+worst case size of it, and the patch below implements that.
+
+Note that this could increase mount times with large finobt trees.  In
+an ideal world we would have added a field for the number of finobt
+fields to the AGI, similar to what we did for the refcount blocks.
+We should do add it next time we rev the AGI or AGF format by adding
+new fields.
+
+Signed-off-by: Christoph Hellwig <hch@lst.de>
+Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/xfs/libxfs/xfs_ag_resv.c      |   47 +++++++++++++++++---
+ fs/xfs/libxfs/xfs_ialloc_btree.c |   90 +++++++++++++++++++++++++++++++++++++--
+ fs/xfs/libxfs/xfs_ialloc_btree.h |    3 +
+ fs/xfs/xfs_inode.c               |   23 +++++----
+ fs/xfs/xfs_mount.h               |    1 
+ 5 files changed, 144 insertions(+), 20 deletions(-)
+
+--- a/fs/xfs/libxfs/xfs_ag_resv.c
++++ b/fs/xfs/libxfs/xfs_ag_resv.c
+@@ -39,6 +39,7 @@
+ #include "xfs_rmap_btree.h"
+ #include "xfs_btree.h"
+ #include "xfs_refcount_btree.h"
++#include "xfs_ialloc_btree.h"
+ 
+ /*
+  * Per-AG Block Reservations
+@@ -210,6 +211,9 @@ __xfs_ag_resv_init(
+ 	if (error) {
+ 		trace_xfs_ag_resv_init_error(pag->pag_mount, pag->pag_agno,
+ 				error, _RET_IP_);
++		xfs_warn(mp,
++"Per-AG reservation for AG %u failed.  Filesystem may run out of space.",
++				pag->pag_agno);
+ 		return error;
+ 	}
+ 
+@@ -228,6 +232,8 @@ int
+ xfs_ag_resv_init(
+ 	struct xfs_perag		*pag)
+ {
++	struct xfs_mount		*mp = pag->pag_mount;
++	xfs_agnumber_t			agno = pag->pag_agno;
+ 	xfs_extlen_t			ask;
+ 	xfs_extlen_t			used;
+ 	int				error = 0;
+@@ -236,23 +242,45 @@ xfs_ag_resv_init(
+ 	if (pag->pag_meta_resv.ar_asked == 0) {
+ 		ask = used = 0;
+ 
+-		error = xfs_refcountbt_calc_reserves(pag->pag_mount,
+-				pag->pag_agno, &ask, &used);
++		error = xfs_refcountbt_calc_reserves(mp, agno, &ask, &used);
+ 		if (error)
+ 			goto out;
+ 
+-		error = __xfs_ag_resv_init(pag, XFS_AG_RESV_METADATA,
+-				ask, used);
++		error = xfs_finobt_calc_reserves(mp, agno, &ask, &used);
+ 		if (error)
+ 			goto out;
++
++		error = __xfs_ag_resv_init(pag, XFS_AG_RESV_METADATA,
++				ask, used);
++		if (error) {
++			/*
++			 * Because we didn't have per-AG reservations when the
++			 * finobt feature was added we might not be able to
++			 * reserve all needed blocks.  Warn and fall back to the
++			 * old and potentially buggy code in that case, but
++			 * ensure we do have the reservation for the refcountbt.
++			 */
++			ask = used = 0;
++
++			mp->m_inotbt_nores = true;
++
++			error = xfs_refcountbt_calc_reserves(mp, agno, &ask,
++					&used);
++			if (error)
++				goto out;
++
++			error = __xfs_ag_resv_init(pag, XFS_AG_RESV_METADATA,
++					ask, used);
++			if (error)
++				goto out;
++		}
+ 	}
+ 
+ 	/* Create the AGFL metadata reservation */
+ 	if (pag->pag_agfl_resv.ar_asked == 0) {
+ 		ask = used = 0;
+ 
+-		error = xfs_rmapbt_calc_reserves(pag->pag_mount, pag->pag_agno,
+-				&ask, &used);
++		error = xfs_rmapbt_calc_reserves(mp, agno, &ask, &used);
+ 		if (error)
+ 			goto out;
+ 
+@@ -261,9 +289,16 @@ xfs_ag_resv_init(
+ 			goto out;
+ 	}
+ 
++#ifdef DEBUG
++	/* need to read in the AGF for the ASSERT below to work */
++	error = xfs_alloc_pagf_init(pag->pag_mount, NULL, pag->pag_agno, 0);
++	if (error)
++		return error;
++
+ 	ASSERT(xfs_perag_resv(pag, XFS_AG_RESV_METADATA)->ar_reserved +
+ 	       xfs_perag_resv(pag, XFS_AG_RESV_AGFL)->ar_reserved <=
+ 	       pag->pagf_freeblks + pag->pagf_flcount);
++#endif
+ out:
+ 	return error;
+ }
+--- a/fs/xfs/libxfs/xfs_ialloc_btree.c
++++ b/fs/xfs/libxfs/xfs_ialloc_btree.c
+@@ -82,11 +82,12 @@ xfs_finobt_set_root(
+ }
+ 
+ STATIC int
+-xfs_inobt_alloc_block(
++__xfs_inobt_alloc_block(
+ 	struct xfs_btree_cur	*cur,
+ 	union xfs_btree_ptr	*start,
+ 	union xfs_btree_ptr	*new,
+-	int			*stat)
++	int			*stat,
++	enum xfs_ag_resv_type	resv)
+ {
+ 	xfs_alloc_arg_t		args;		/* block allocation args */
+ 	int			error;		/* error return value */
+@@ -103,6 +104,7 @@ xfs_inobt_alloc_block(
+ 	args.maxlen = 1;
+ 	args.prod = 1;
+ 	args.type = XFS_ALLOCTYPE_NEAR_BNO;
++	args.resv = resv;
+ 
+ 	error = xfs_alloc_vextent(&args);
+ 	if (error) {
+@@ -123,6 +125,27 @@ xfs_inobt_alloc_block(
+ }
+ 
+ STATIC int
++xfs_inobt_alloc_block(
++	struct xfs_btree_cur	*cur,
++	union xfs_btree_ptr	*start,
++	union xfs_btree_ptr	*new,
++	int			*stat)
++{
++	return __xfs_inobt_alloc_block(cur, start, new, stat, XFS_AG_RESV_NONE);
++}
++
++STATIC int
++xfs_finobt_alloc_block(
++	struct xfs_btree_cur	*cur,
++	union xfs_btree_ptr	*start,
++	union xfs_btree_ptr	*new,
++	int			*stat)
++{
++	return __xfs_inobt_alloc_block(cur, start, new, stat,
++			XFS_AG_RESV_METADATA);
++}
++
++STATIC int
+ xfs_inobt_free_block(
+ 	struct xfs_btree_cur	*cur,
+ 	struct xfs_buf		*bp)
+@@ -328,7 +351,7 @@ static const struct xfs_btree_ops xfs_fi
+ 
+ 	.dup_cursor		= xfs_inobt_dup_cursor,
+ 	.set_root		= xfs_finobt_set_root,
+-	.alloc_block		= xfs_inobt_alloc_block,
++	.alloc_block		= xfs_finobt_alloc_block,
+ 	.free_block		= xfs_inobt_free_block,
+ 	.get_minrecs		= xfs_inobt_get_minrecs,
+ 	.get_maxrecs		= xfs_inobt_get_maxrecs,
+@@ -478,3 +501,64 @@ xfs_inobt_rec_check_count(
+ 	return 0;
+ }
+ #endif	/* DEBUG */
++
++static xfs_extlen_t
++xfs_inobt_max_size(
++	struct xfs_mount	*mp)
++{
++	/* Bail out if we're uninitialized, which can happen in mkfs. */
++	if (mp->m_inobt_mxr[0] == 0)
++		return 0;
++
++	return xfs_btree_calc_size(mp, mp->m_inobt_mnr,
++		(uint64_t)mp->m_sb.sb_agblocks * mp->m_sb.sb_inopblock /
++				XFS_INODES_PER_CHUNK);
++}
++
++static int
++xfs_inobt_count_blocks(
++	struct xfs_mount	*mp,
++	xfs_agnumber_t		agno,
++	xfs_btnum_t		btnum,
++	xfs_extlen_t		*tree_blocks)
++{
++	struct xfs_buf		*agbp;
++	struct xfs_btree_cur	*cur;
++	int			error;
++
++	error = xfs_ialloc_read_agi(mp, NULL, agno, &agbp);
++	if (error)
++		return error;
++
++	cur = xfs_inobt_init_cursor(mp, NULL, agbp, agno, btnum);
++	error = xfs_btree_count_blocks(cur, tree_blocks);
++	xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
++	xfs_buf_relse(agbp);
++
++	return error;
++}
++
++/*
++ * Figure out how many blocks to reserve and how many are used by this btree.
++ */
++int
++xfs_finobt_calc_reserves(
++	struct xfs_mount	*mp,
++	xfs_agnumber_t		agno,
++	xfs_extlen_t		*ask,
++	xfs_extlen_t		*used)
++{
++	xfs_extlen_t		tree_len = 0;
++	int			error;
++
++	if (!xfs_sb_version_hasfinobt(&mp->m_sb))
++		return 0;
++
++	error = xfs_inobt_count_blocks(mp, agno, XFS_BTNUM_FINO, &tree_len);
++	if (error)
++		return error;
++
++	*ask += xfs_inobt_max_size(mp);
++	*used += tree_len;
++	return 0;
++}
+--- a/fs/xfs/libxfs/xfs_ialloc_btree.h
++++ b/fs/xfs/libxfs/xfs_ialloc_btree.h
+@@ -72,4 +72,7 @@ int xfs_inobt_rec_check_count(struct xfs
+ #define xfs_inobt_rec_check_count(mp, rec)	0
+ #endif	/* DEBUG */
+ 
++int xfs_finobt_calc_reserves(struct xfs_mount *mp, xfs_agnumber_t agno,
++		xfs_extlen_t *ask, xfs_extlen_t *used);
++
+ #endif	/* __XFS_IALLOC_BTREE_H__ */
+--- a/fs/xfs/xfs_inode.c
++++ b/fs/xfs/xfs_inode.c
+@@ -1801,22 +1801,23 @@ xfs_inactive_ifree(
+ 	int			error;
+ 
+ 	/*
+-	 * The ifree transaction might need to allocate blocks for record
+-	 * insertion to the finobt. We don't want to fail here at ENOSPC, so
+-	 * allow ifree to dip into the reserved block pool if necessary.
+-	 *
+-	 * Freeing large sets of inodes generally means freeing inode chunks,
+-	 * directory and file data blocks, so this should be relatively safe.
+-	 * Only under severe circumstances should it be possible to free enough
+-	 * inodes to exhaust the reserve block pool via finobt expansion while
+-	 * at the same time not creating free space in the filesystem.
++	 * We try to use a per-AG reservation for any block needed by the finobt
++	 * tree, but as the finobt feature predates the per-AG reservation
++	 * support a degraded file system might not have enough space for the
++	 * reservation at mount time.  In that case try to dip into the reserved
++	 * pool and pray.
+ 	 *
+ 	 * Send a warning if the reservation does happen to fail, as the inode
+ 	 * now remains allocated and sits on the unlinked list until the fs is
+ 	 * repaired.
+ 	 */
+-	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ifree,
+-			XFS_IFREE_SPACE_RES(mp), 0, XFS_TRANS_RESERVE, &tp);
++	if (unlikely(mp->m_inotbt_nores)) {
++		error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ifree,
++				XFS_IFREE_SPACE_RES(mp), 0, XFS_TRANS_RESERVE,
++				&tp);
++	} else {
++		error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ifree, 0, 0, 0, &tp);
++	}
+ 	if (error) {
+ 		if (error == -ENOSPC) {
+ 			xfs_warn_ratelimited(mp,
+--- a/fs/xfs/xfs_mount.h
++++ b/fs/xfs/xfs_mount.h
+@@ -140,6 +140,7 @@ typedef struct xfs_mount {
+ 	int			m_fixedfsid[2];	/* unchanged for life of FS */
+ 	uint			m_dmevmask;	/* DMI events for this FS */
+ 	__uint64_t		m_flags;	/* global mount flags */
++	bool			m_inotbt_nores; /* no per-AG finobt resv. */
+ 	int			m_ialloc_inos;	/* inodes in inode allocation */
+ 	int			m_ialloc_blks;	/* blocks in inode allocation */
+ 	int			m_ialloc_min_blks;/* min blocks in sparse inode
-- 
2.47.3