From: Greg Kroah-Hartman Date: Mon, 18 Sep 2017 08:21:23 +0000 (+0200) Subject: 4.9-stable patches X-Git-Tag: v4.9.51~11 X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=4246a018c3711e7099b65c6a1a674e6e7127fbc3;p=thirdparty%2Fkernel%2Fstable-queue.git 4.9-stable patches added patches: iomap-fix-integer-truncation-issues-in-the-zeroing-and-dirtying-helpers.patch xfs-add-infrastructure-needed-for-error-propagation-during-buffer-io-failure.patch xfs-add-log-recovery-tracepoint-for-head-tail.patch xfs-always-verify-the-log-tail-during-recovery.patch xfs-check-for-race-with-xfs_reclaim_inode-in-xfs_ifree_cluster.patch xfs-clear-ms_active-after-finishing-log-recovery.patch xfs-disable-per-inode-dax-flag.patch xfs-disallow-marking-previously-dirty-buffers-as-ordered.patch xfs-don-t-leak-quotacheck-dquots-when-cow-recovery.patch xfs-don-t-log-dirty-ranges-for-ordered-buffers.patch xfs-don-t-set-v3-xflags-for-v2-inodes.patch xfs-evict-all-inodes-involved-with-log-redo-item.patch xfs-fix-incorrect-log_flushed-on-fsync.patch xfs-fix-log-recovery-corruption-error-due-to-tail-overwrite.patch xfs-fix-recovery-failure-when-log-record-header-wraps-log-end.patch xfs-handle-efscorrupted-during-head-tail-verification.patch xfs-move-bmbt-owner-change-to-last-step-of-extent-swap.patch xfs-open-code-end_buffer_async_write-in-xfs_finish_page_writeback.patch xfs-open-code-xfs_buf_item_dirty.patch xfs-ordered-buffer-log-items-are-never-formatted.patch xfs-properly-retry-failed-inode-items-in-case-of-error-during-buffer-writeback.patch xfs-refactor-buffer-logging-into-buffer-dirtying-helper.patch xfs-relog-dirty-buffers-during-swapext-bmbt-owner-change.patch xfs-remove-unnecessary-dirty-bli-format-check-for-ordered-bufs.patch xfs-remove-xfs_trans_ail_delete_bulk.patch xfs-skip-bmbt-block-ino-validation-during-owner-change.patch xfs-stop-searching-for-free-slots-in-an-inode-chunk-when-there-are-none.patch xfs-toggle-readonly-state-around-xfs_log_mount_finish.patch xfs-use-kmem_free-to-free-return-value-of-kmem_zalloc.patch xfs-write-unmount-record-for-ro-mounts.patch --- diff --git a/queue-4.9/iomap-fix-integer-truncation-issues-in-the-zeroing-and-dirtying-helpers.patch b/queue-4.9/iomap-fix-integer-truncation-issues-in-the-zeroing-and-dirtying-helpers.patch new file mode 100644 index 00000000000..fff65d91932 --- /dev/null +++ b/queue-4.9/iomap-fix-integer-truncation-issues-in-the-zeroing-and-dirtying-helpers.patch @@ -0,0 +1,49 @@ +From foo@baz Mon Sep 18 10:16:36 CEST 2017 +From: Christoph Hellwig +Date: Sun, 17 Sep 2017 14:06:45 -0700 +Subject: iomap: fix integer truncation issues in the zeroing and dirtying helpers +To: stable@vger.kernel.org +Cc: linux-xfs@vger.kernel.org, "Darrick J . Wong" +Message-ID: <20170917210712.10804-21-hch@lst.de> + +From: Christoph Hellwig + +commit e28ae8e428fefe2facd72cea9f29906ecb9c861d upstream. + +Fix the min_t calls in the zeroing and dirtying helpers to perform the +comparisons on 64-bit types, which prevents them from incorrectly +being truncated, and larger zeroing operations being stuck in a never +ending loop. + +Special thanks to Markus Stockhausen for spotting the bug. + +Reported-by: Paul Menzel +Tested-by: Paul Menzel +Signed-off-by: Christoph Hellwig +Reviewed-by: Darrick J. Wong +Signed-off-by: Darrick J.
Wong +Signed-off-by: Greg Kroah-Hartman +--- + fs/iomap.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +--- a/fs/iomap.c ++++ b/fs/iomap.c +@@ -281,7 +281,7 @@ iomap_dirty_actor(struct inode *inode, l + unsigned long bytes; /* Bytes to write to page */ + + offset = (pos & (PAGE_SIZE - 1)); +- bytes = min_t(unsigned long, PAGE_SIZE - offset, length); ++ bytes = min_t(loff_t, PAGE_SIZE - offset, length); + + rpage = __iomap_read_page(inode, pos); + if (IS_ERR(rpage)) +@@ -376,7 +376,7 @@ iomap_zero_range_actor(struct inode *ino + unsigned offset, bytes; + + offset = pos & (PAGE_SIZE - 1); /* Within page */ +- bytes = min_t(unsigned, PAGE_SIZE - offset, count); ++ bytes = min_t(loff_t, PAGE_SIZE - offset, count); + + if (IS_DAX(inode)) + status = iomap_dax_zero(pos, offset, bytes, iomap); diff --git a/queue-4.9/series b/queue-4.9/series index cc556c6deea..5c530ede7c1 100644 --- a/queue-4.9/series +++ b/queue-4.9/series @@ -45,3 +45,33 @@ xfs-fix-quotacheck-dquot-id-overflow-infinite-loop.patch xfs-fix-multi-ag-deadlock-in-xfs_bunmapi.patch xfs-fix-per-inode-dax-flag-inheritance.patch xfs-fix-inobt-inode-allocation-search-optimization.patch +xfs-clear-ms_active-after-finishing-log-recovery.patch +xfs-don-t-leak-quotacheck-dquots-when-cow-recovery.patch +iomap-fix-integer-truncation-issues-in-the-zeroing-and-dirtying-helpers.patch +xfs-write-unmount-record-for-ro-mounts.patch +xfs-toggle-readonly-state-around-xfs_log_mount_finish.patch +xfs-remove-xfs_trans_ail_delete_bulk.patch +xfs-add-infrastructure-needed-for-error-propagation-during-buffer-io-failure.patch +xfs-properly-retry-failed-inode-items-in-case-of-error-during-buffer-writeback.patch +xfs-fix-recovery-failure-when-log-record-header-wraps-log-end.patch +xfs-always-verify-the-log-tail-during-recovery.patch +xfs-fix-log-recovery-corruption-error-due-to-tail-overwrite.patch +xfs-handle-efscorrupted-during-head-tail-verification.patch +xfs-add-log-recovery-tracepoint-for-head-tail.patch +xfs-stop-searching-for-free-slots-in-an-inode-chunk-when-there-are-none.patch +xfs-evict-all-inodes-involved-with-log-redo-item.patch +xfs-check-for-race-with-xfs_reclaim_inode-in-xfs_ifree_cluster.patch +xfs-open-code-xfs_buf_item_dirty.patch +xfs-remove-unnecessary-dirty-bli-format-check-for-ordered-bufs.patch +xfs-ordered-buffer-log-items-are-never-formatted.patch +xfs-refactor-buffer-logging-into-buffer-dirtying-helper.patch +xfs-don-t-log-dirty-ranges-for-ordered-buffers.patch +xfs-skip-bmbt-block-ino-validation-during-owner-change.patch +xfs-move-bmbt-owner-change-to-last-step-of-extent-swap.patch +xfs-disallow-marking-previously-dirty-buffers-as-ordered.patch +xfs-relog-dirty-buffers-during-swapext-bmbt-owner-change.patch +xfs-disable-per-inode-dax-flag.patch +xfs-fix-incorrect-log_flushed-on-fsync.patch +xfs-don-t-set-v3-xflags-for-v2-inodes.patch +xfs-open-code-end_buffer_async_write-in-xfs_finish_page_writeback.patch +xfs-use-kmem_free-to-free-return-value-of-kmem_zalloc.patch diff --git a/queue-4.9/xfs-add-infrastructure-needed-for-error-propagation-during-buffer-io-failure.patch b/queue-4.9/xfs-add-infrastructure-needed-for-error-propagation-during-buffer-io-failure.patch new file mode 100644 index 00000000000..662ed8f9c16 --- /dev/null +++ b/queue-4.9/xfs-add-infrastructure-needed-for-error-propagation-during-buffer-io-failure.patch @@ -0,0 +1,115 @@ +From foo@baz Mon Sep 18 10:16:36 CEST 2017 +From: Christoph Hellwig +Date: Sun, 17 Sep 2017 14:06:49 -0700 +Subject: xfs: Add infrastructure needed for error propagation during 
buffer IO failure +To: stable@vger.kernel.org +Cc: linux-xfs@vger.kernel.org, Carlos Maiolino , "Darrick J . Wong" +Message-ID: <20170917210712.10804-25-hch@lst.de> + +From: Carlos Maiolino + +commit 0b80ae6ed13169bd3a244e71169f2cc020b0c57a upstream. + +With the current code, XFS never re-submits a failed buffer for IO, +because the failed item in the buffer is kept in the flush locked state +forever. + +To be able to resubmit a log item for IO, we need a way to mark an item +as failed, if, for any reason, the buffer which the item belonged to +failed during writeback. + +Add a new log item callback to be used after an IO completion failure +and make the needed cleanups. + +Reviewed-by: Brian Foster +Signed-off-by: Carlos Maiolino +Reviewed-by: Christoph Hellwig +Reviewed-by: Darrick J. Wong +Signed-off-by: Darrick J. Wong +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/xfs_buf_item.c | 32 +++++++++++++++++++++++++++++++- + fs/xfs/xfs_trans.h | 7 +++++-- + 2 files changed, 36 insertions(+), 3 deletions(-) + +--- a/fs/xfs/xfs_buf_item.c ++++ b/fs/xfs/xfs_buf_item.c +@@ -29,6 +29,7 @@ + #include "xfs_error.h" + #include "xfs_trace.h" + #include "xfs_log.h" ++ #include "xfs_inode.h" + + + kmem_zone_t *xfs_buf_item_zone; +@@ -1054,6 +1055,31 @@ xfs_buf_do_callbacks( + } + } + ++/* ++ * Invoke the error state callback for each log item affected by the failed I/O. ++ * ++ * If a metadata buffer write fails with a non-permanent error, the buffer is ++ * eventually resubmitted and so the completion callbacks are not run. The error ++ * state may need to be propagated to the log items attached to the buffer, ++ * however, so the next AIL push of the item knows how to handle it correctly. ++ */ ++STATIC void ++xfs_buf_do_callbacks_fail( ++ struct xfs_buf *bp) ++{ ++ struct xfs_log_item *next; ++ struct xfs_log_item *lip = bp->b_fspriv; ++ struct xfs_ail *ailp = lip->li_ailp; ++ ++ spin_lock(&ailp->xa_lock); ++ for (; lip; lip = next) { ++ next = lip->li_bio_list; ++ if (lip->li_ops->iop_error) ++ lip->li_ops->iop_error(lip, bp); ++ } ++ spin_unlock(&ailp->xa_lock); ++} ++ + static bool + xfs_buf_iodone_callback_error( + struct xfs_buf *bp) +@@ -1123,7 +1149,11 @@ xfs_buf_iodone_callback_error( + if ((mp->m_flags & XFS_MOUNT_UNMOUNTING) && mp->m_fail_unmount) + goto permanent_error; + +- /* still a transient error, higher layers will retry */ ++ /* ++ * Still a transient error, run IO completion failure callbacks and let ++ * the higher layers retry the buffer.
++ */ ++ xfs_buf_do_callbacks_fail(bp); + xfs_buf_ioerror(bp, 0); + xfs_buf_relse(bp); + return true; +--- a/fs/xfs/xfs_trans.h ++++ b/fs/xfs/xfs_trans.h +@@ -65,11 +65,13 @@ typedef struct xfs_log_item { + } xfs_log_item_t; + + #define XFS_LI_IN_AIL 0x1 +-#define XFS_LI_ABORTED 0x2 ++#define XFS_LI_ABORTED 0x2 ++#define XFS_LI_FAILED 0x4 + + #define XFS_LI_FLAGS \ + { XFS_LI_IN_AIL, "IN_AIL" }, \ +- { XFS_LI_ABORTED, "ABORTED" } ++ { XFS_LI_ABORTED, "ABORTED" }, \ ++ { XFS_LI_FAILED, "FAILED" } + + struct xfs_item_ops { + void (*iop_size)(xfs_log_item_t *, int *, int *); +@@ -80,6 +82,7 @@ struct xfs_item_ops { + void (*iop_unlock)(xfs_log_item_t *); + xfs_lsn_t (*iop_committed)(xfs_log_item_t *, xfs_lsn_t); + void (*iop_committing)(xfs_log_item_t *, xfs_lsn_t); ++ void (*iop_error)(xfs_log_item_t *, xfs_buf_t *); + }; + + void xfs_log_item_init(struct xfs_mount *mp, struct xfs_log_item *item, diff --git a/queue-4.9/xfs-add-log-recovery-tracepoint-for-head-tail.patch b/queue-4.9/xfs-add-log-recovery-tracepoint-for-head-tail.patch new file mode 100644 index 00000000000..98aa89c42d6 --- /dev/null +++ b/queue-4.9/xfs-add-log-recovery-tracepoint-for-head-tail.patch @@ -0,0 +1,65 @@ +From foo@baz Mon Sep 18 10:16:36 CEST 2017 +From: Christoph Hellwig +Date: Sun, 17 Sep 2017 14:06:55 -0700 +Subject: xfs: add log recovery tracepoint for head/tail +To: stable@vger.kernel.org +Cc: linux-xfs@vger.kernel.org, Brian Foster , "Darrick J . Wong" +Message-ID: <20170917210712.10804-31-hch@lst.de> + +From: Brian Foster + +commit e67d3d4246e5fbb0c7c700426d11241ca9c6f473 upstream. + +Torn write detection and tail overwrite detection can shift the log +head and tail respectively in the event of CRC mismatch or +corruption errors. Add a high-level log recovery tracepoint to dump +the final log head/tail and make those values easily attainable in +debug/diagnostic situations. + +Signed-off-by: Brian Foster +Reviewed-by: Darrick J. Wong +Signed-off-by: Darrick J. Wong +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/xfs_log_recover.c | 2 ++ + fs/xfs/xfs_trace.h | 18 ++++++++++++++++++ + 2 files changed, 20 insertions(+) + +--- a/fs/xfs/xfs_log_recover.c ++++ b/fs/xfs/xfs_log_recover.c +@@ -5596,6 +5596,8 @@ xlog_do_recover( + xfs_buf_t *bp; + xfs_sb_t *sbp; + ++ trace_xfs_log_recover(log, head_blk, tail_blk); ++ + /* + * First replay the images in the log. 
+ */ +--- a/fs/xfs/xfs_trace.h ++++ b/fs/xfs/xfs_trace.h +@@ -1991,6 +1991,24 @@ DEFINE_EVENT(xfs_swap_extent_class, name + DEFINE_SWAPEXT_EVENT(xfs_swap_extent_before); + DEFINE_SWAPEXT_EVENT(xfs_swap_extent_after); + ++TRACE_EVENT(xfs_log_recover, ++ TP_PROTO(struct xlog *log, xfs_daddr_t headblk, xfs_daddr_t tailblk), ++ TP_ARGS(log, headblk, tailblk), ++ TP_STRUCT__entry( ++ __field(dev_t, dev) ++ __field(xfs_daddr_t, headblk) ++ __field(xfs_daddr_t, tailblk) ++ ), ++ TP_fast_assign( ++ __entry->dev = log->l_mp->m_super->s_dev; ++ __entry->headblk = headblk; ++ __entry->tailblk = tailblk; ++ ), ++ TP_printk("dev %d:%d headblk 0x%llx tailblk 0x%llx", ++ MAJOR(__entry->dev), MINOR(__entry->dev), __entry->headblk, ++ __entry->tailblk) ++) ++ + TRACE_EVENT(xfs_log_recover_record, + TP_PROTO(struct xlog *log, struct xlog_rec_header *rhead, int pass), + TP_ARGS(log, rhead, pass), diff --git a/queue-4.9/xfs-always-verify-the-log-tail-during-recovery.patch b/queue-4.9/xfs-always-verify-the-log-tail-during-recovery.patch new file mode 100644 index 00000000000..76fa9b1e7af --- /dev/null +++ b/queue-4.9/xfs-always-verify-the-log-tail-during-recovery.patch @@ -0,0 +1,76 @@ +From foo@baz Mon Sep 18 10:16:36 CEST 2017 +From: Christoph Hellwig +Date: Sun, 17 Sep 2017 14:06:52 -0700 +Subject: xfs: always verify the log tail during recovery +To: stable@vger.kernel.org +Cc: linux-xfs@vger.kernel.org, Brian Foster , "Darrick J . Wong" +Message-ID: <20170917210712.10804-28-hch@lst.de> + +From: Brian Foster + +commit 5297ac1f6d7cbf45464a49b9558831f271dfc559 upstream. + +Log tail verification currently only occurs when torn writes are +detected at the head of the log. This was introduced because a +change in the head block due to torn writes can lead to a change in +the tail block (each log record header references the current tail) +and the tail block should be verified before log recovery proceeds. + +Tail corruption is possible outside of torn write scenarios, +however. For example, partial log writes can be detected and cleared +during the initial head/tail block discovery process. If the partial +write coincides with a tail overwrite, the log tail is corrupted and +recovery fails. + +To facilitate correct handling of log tail overwrites, update log +recovery to always perform tail verification. This is necessary to +detect potential tail overwrite conditions when torn writes may not +have occurred. This changes normal (i.e., no torn writes) recovery +behavior slightly to detect and return CRC related errors near the +tail before actual recovery starts. + +Signed-off-by: Brian Foster +Reviewed-by: Darrick J. Wong +Signed-off-by: Darrick J. Wong +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/xfs_log_recover.c | 26 +++----------------------- + 1 file changed, 3 insertions(+), 23 deletions(-) + +--- a/fs/xfs/xfs_log_recover.c ++++ b/fs/xfs/xfs_log_recover.c +@@ -1183,31 +1183,11 @@ xlog_verify_head( + ASSERT(0); + return 0; + } +- +- /* +- * Now verify the tail based on the updated head. This is +- * required because the torn writes trimmed from the head could +- * have been written over the tail of a previous record. Return +- * any errors since recovery cannot proceed if the tail is +- * corrupt. +- * +- * XXX: This leaves a gap in truly robust protection from torn +- * writes in the log. If the head is behind the tail, the tail +- * pushes forward to create some space and then a crash occurs +- * causing the writes into the previous record's tail region to +- * tear, log recovery isn't able to recover.
+- * +- * How likely is this to occur? If possible, can we do something +- * more intelligent here? Is it safe to push the tail forward if +- * we can determine that the tail is within the range of the +- * torn write (e.g., the kernel can only overwrite the tail if +- * it has actually been pushed forward)? Alternatively, could we +- * somehow prevent this condition at runtime? +- */ +- error = xlog_verify_tail(log, *head_blk, *tail_blk); + } ++ if (error) ++ return error; + +- return error; ++ return xlog_verify_tail(log, *head_blk, *tail_blk); + } + + /* diff --git a/queue-4.9/xfs-check-for-race-with-xfs_reclaim_inode-in-xfs_ifree_cluster.patch b/queue-4.9/xfs-check-for-race-with-xfs_reclaim_inode-in-xfs_ifree_cluster.patch new file mode 100644 index 00000000000..a4c13aa8197 --- /dev/null +++ b/queue-4.9/xfs-check-for-race-with-xfs_reclaim_inode-in-xfs_ifree_cluster.patch @@ -0,0 +1,85 @@ +From foo@baz Mon Sep 18 10:16:36 CEST 2017 +From: Christoph Hellwig +Date: Sun, 17 Sep 2017 14:06:58 -0700 +Subject: xfs: check for race with xfs_reclaim_inode() in xfs_ifree_cluster() +To: stable@vger.kernel.org +Cc: linux-xfs@vger.kernel.org, Omar Sandoval , "Darrick J . Wong" +Message-ID: <20170917210712.10804-34-hch@lst.de> + +From: Omar Sandoval + +commit f2e9ad212def50bcf4c098c6288779dd97fff0f0 upstream. + +After xfs_ifree_cluster() finds an inode in the radix tree and verifies +that the inode number is what it expected, xfs_reclaim_inode() can swoop +in and free it. xfs_ifree_cluster() will then happily continue working +on the freed inode. Most importantly, it will mark the inode stale, +which will probably be overwritten when the inode slab object is +reallocated, but if it has already been reallocated then we can end up +with an inode spuriously marked stale. + +In 8a17d7ddedb4 ("xfs: mark reclaimed inodes invalid earlier") we added +a second check to xfs_iflush_cluster() to detect this race, but the +similar RCU lookup in xfs_ifree_cluster() needs the same treatment. + +Signed-off-by: Omar Sandoval +Reviewed-by: Brian Foster +Reviewed-by: Darrick J. Wong +Signed-off-by: Darrick J. Wong +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/xfs_icache.c | 10 +++++----- + fs/xfs/xfs_inode.c | 23 ++++++++++++++++++----- + 2 files changed, 23 insertions(+), 10 deletions(-) + +--- a/fs/xfs/xfs_icache.c ++++ b/fs/xfs/xfs_icache.c +@@ -1078,11 +1078,11 @@ reclaim: + * Because we use RCU freeing we need to ensure the inode always appears + * to be reclaimed with an invalid inode number when in the free state. + * We do this as early as possible under the ILOCK so that +- * xfs_iflush_cluster() can be guaranteed to detect races with us here. +- * By doing this, we guarantee that once xfs_iflush_cluster has locked +- * XFS_ILOCK that it will see either a valid, flushable inode that will +- * serialise correctly, or it will see a clean (and invalid) inode that +- * it can skip. ++ * xfs_iflush_cluster() and xfs_ifree_cluster() can be guaranteed to ++ * detect races with us here. By doing this, we guarantee that once ++ * xfs_iflush_cluster() or xfs_ifree_cluster() has locked XFS_ILOCK that ++ * it will see either a valid inode that will serialise correctly, or it ++ * will see an invalid inode that it can skip. + */ + spin_lock(&ip->i_flags_lock); + ip->i_flags = XFS_IRECLAIM; +--- a/fs/xfs/xfs_inode.c ++++ b/fs/xfs/xfs_inode.c +@@ -2368,11 +2368,24 @@ retry: + * already marked stale. If we can't lock it, back off + * and retry. 
+ */ +- if (ip != free_ip && +- !xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) { +- rcu_read_unlock(); +- delay(1); +- goto retry; ++ if (ip != free_ip) { ++ if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) { ++ rcu_read_unlock(); ++ delay(1); ++ goto retry; ++ } ++ ++ /* ++ * Check the inode number again in case we're ++ * racing with freeing in xfs_reclaim_inode(). ++ * See the comments in that function for more ++ * information as to why the initial check is ++ * not sufficient. ++ */ ++ if (ip->i_ino != inum + i) { ++ xfs_iunlock(ip, XFS_ILOCK_EXCL); ++ continue; ++ } + } + rcu_read_unlock(); + diff --git a/queue-4.9/xfs-clear-ms_active-after-finishing-log-recovery.patch b/queue-4.9/xfs-clear-ms_active-after-finishing-log-recovery.patch new file mode 100644 index 00000000000..72835e6f244 --- /dev/null +++ b/queue-4.9/xfs-clear-ms_active-after-finishing-log-recovery.patch @@ -0,0 +1,85 @@ +From foo@baz Mon Sep 18 10:16:36 CEST 2017 +From: Christoph Hellwig +Date: Sun, 17 Sep 2017 14:06:43 -0700 +Subject: xfs: clear MS_ACTIVE after finishing log recovery +To: stable@vger.kernel.org +Cc: linux-xfs@vger.kernel.org, "Darrick J. Wong" +Message-ID: <20170917210712.10804-19-hch@lst.de> + +From: "Darrick J. Wong" + +commit 8204f8ddaafafcae074746fcf2a05a45e6827603 upstream. + +Way back when we established inode block-map redo log items, it was +discovered that we needed to prevent the VFS from evicting inodes during +log recovery because any given inode might have bmap redo items to +replay even if the inode has no link count and is ultimately deleted, +and any eviction of an unlinked inode causes the inode to be truncated +and freed too early. + +To make this possible, we set MS_ACTIVE so that inodes would not be torn +down immediately upon release. Unfortunately, this also results in the +quota inodes not being released at all if a later part of the mount +process should fail, because we never reclaim the inodes. So, set +MS_ACTIVE right before we do the last part of log recovery and clear it +immediately after we finish the log recovery so that everything +will be torn down properly if we abort the mount. + +Fixes: 17c12bcd30 ("xfs: when replaying bmap operations, don't let unlinked inodes get reaped") +Signed-off-by: Darrick J. Wong +Reviewed-by: Brian Foster +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/xfs_log.c | 11 +++++++++++ + fs/xfs/xfs_mount.c | 10 ---------- + 2 files changed, 11 insertions(+), 10 deletions(-) + +--- a/fs/xfs/xfs_log.c ++++ b/fs/xfs/xfs_log.c +@@ -749,9 +749,20 @@ xfs_log_mount_finish( + return 0; + } + ++ /* ++ * During the second phase of log recovery, we need iget and ++ * iput to behave like they do for an active filesystem. ++ * xfs_fs_drop_inode needs to be able to prevent the deletion ++ * of inodes before we're done replaying log items on those ++ * inodes. Turn it off immediately after recovery finishes ++ * so that we don't leak the quota inodes if subsequent mount ++ * activities fail. ++ */ ++ mp->m_super->s_flags |= MS_ACTIVE; + error = xlog_recover_finish(mp->m_log); + if (!error) + xfs_log_work_queue(mp); ++ mp->m_super->s_flags &= ~MS_ACTIVE; + + return error; + } +--- a/fs/xfs/xfs_mount.c ++++ b/fs/xfs/xfs_mount.c +@@ -925,15 +925,6 @@ xfs_mountfs( + } + + /* +- * During the second phase of log recovery, we need iget and +- * iput to behave like they do for an active filesystem. +- * xfs_fs_drop_inode needs to be able to prevent the deletion +- * of inodes before we're done replaying log items on those +- * inodes.
+- */ +- mp->m_super->s_flags |= MS_ACTIVE; +- +- /* + * Finish recovering the file system. This part needed to be delayed + * until after the root and real-time bitmap inodes were consistently + * read in. +@@ -1008,7 +999,6 @@ xfs_mountfs( + out_quota: + xfs_qm_unmount_quotas(mp); + out_rtunmount: +- mp->m_super->s_flags &= ~MS_ACTIVE; + xfs_rtunmount_inodes(mp); + out_rele_rip: + IRELE(rip); diff --git a/queue-4.9/xfs-disable-per-inode-dax-flag.patch b/queue-4.9/xfs-disable-per-inode-dax-flag.patch new file mode 100644 index 00000000000..d4b60e3f025 --- /dev/null +++ b/queue-4.9/xfs-disable-per-inode-dax-flag.patch @@ -0,0 +1,39 @@ +From foo@baz Mon Sep 18 10:16:36 CEST 2017 +From: Christoph Hellwig +Date: Sun, 17 Sep 2017 14:07:08 -0700 +Subject: xfs: disable per-inode DAX flag +To: stable@vger.kernel.org +Cc: linux-xfs@vger.kernel.org, "Darrick J . Wong" +Message-ID: <20170917210712.10804-44-hch@lst.de> + +From: Christoph Hellwig + +commit 742d84290739ae908f1b61b7d17ea382c8c0073a upstream. + +Currently flag switching can be used to easily crash the kernel. Disable +the per-inode DAX flag until that is sorted out. + +Signed-off-by: Christoph Hellwig +Reviewed-by: Darrick J. Wong +Signed-off-by: Darrick J. Wong +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/xfs_ioctl.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +--- a/fs/xfs/xfs_ioctl.c ++++ b/fs/xfs/xfs_ioctl.c +@@ -1005,11 +1005,12 @@ xfs_diflags_to_linux( + inode->i_flags |= S_NOATIME; + else + inode->i_flags &= ~S_NOATIME; ++#if 0 /* disabled until the flag switching races are sorted out */ + if (xflags & FS_XFLAG_DAX) + inode->i_flags |= S_DAX; + else + inode->i_flags &= ~S_DAX; +- ++#endif + } + + static int diff --git a/queue-4.9/xfs-disallow-marking-previously-dirty-buffers-as-ordered.patch b/queue-4.9/xfs-disallow-marking-previously-dirty-buffers-as-ordered.patch new file mode 100644 index 00000000000..17b9b5c1a5a --- /dev/null +++ b/queue-4.9/xfs-disallow-marking-previously-dirty-buffers-as-ordered.patch @@ -0,0 +1,97 @@ +From foo@baz Mon Sep 18 10:16:36 CEST 2017 +From: Christoph Hellwig +Date: Sun, 17 Sep 2017 14:07:06 -0700 +Subject: xfs: disallow marking previously dirty buffers as ordered +To: stable@vger.kernel.org +Cc: linux-xfs@vger.kernel.org, Brian Foster , "Darrick J . Wong" +Message-ID: <20170917210712.10804-42-hch@lst.de> + +From: Brian Foster + +commit a5814bceea48ee1c57c4db2bd54b0c0246daf54a upstream. + +Ordered buffers are used in situations where the buffer is not +physically logged but must pass through the transaction/logging +pipeline for a particular transaction. As a result, ordered buffers +are not unpinned and written back until the transaction commits to +the log. Ordered buffers have a strict requirement that the target +buffer must not be currently dirty and resident in the log pipeline +at the time it is marked ordered. If a dirty+ordered buffer is +committed, the buffer is reinserted into the AIL but not physically +relogged at the LSN of the associated checkpoint. The buffer log +item is assigned the LSN of the latest checkpoint and the AIL +effectively releases the previously logged buffer content from the +active log before the buffer has been written back. If the tail +pushes forward and a filesystem crash occurs while in this state, an +inconsistent filesystem could result. + +It is currently the caller's +responsibility to ensure an ordered +buffer is not already dirty from a previous modification.
This is +unclear and error prone when not used in situations where it is +guaranteed a buffer has not been previously modified (such as new +metadata allocations). + +To facilitate general purpose use of ordered buffers, update +xfs_trans_ordered_buf() to conditionally order the buffer based on +state of the log item and return the status of the result. If the +bli is dirty, do not order the buffer and return false. The caller +must either physically log the buffer (having acquired the +appropriate log reservation) or push it from the AIL to clean it +before it can be marked ordered in the current transaction. + +Note that ordered buffers are currently only used in two situations: +1.) inode chunk allocation where previously logged buffers are not +possible and 2.) extent swap which will be updated to handle ordered +buffer failures in a separate patch. + +Signed-off-by: Brian Foster +Reviewed-by: Darrick J. Wong +Reviewed-by: Christoph Hellwig +Signed-off-by: Darrick J. Wong +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/xfs_trans.h | 2 +- + fs/xfs/xfs_trans_buf.c | 7 +++++-- + 2 files changed, 6 insertions(+), 3 deletions(-) + +--- a/fs/xfs/xfs_trans.h ++++ b/fs/xfs/xfs_trans.h +@@ -217,7 +217,7 @@ void xfs_trans_bhold_release(xfs_trans_ + void xfs_trans_binval(xfs_trans_t *, struct xfs_buf *); + void xfs_trans_inode_buf(xfs_trans_t *, struct xfs_buf *); + void xfs_trans_stale_inode_buf(xfs_trans_t *, struct xfs_buf *); +-void xfs_trans_ordered_buf(xfs_trans_t *, struct xfs_buf *); ++bool xfs_trans_ordered_buf(xfs_trans_t *, struct xfs_buf *); + void xfs_trans_dquot_buf(xfs_trans_t *, struct xfs_buf *, uint); + void xfs_trans_inode_alloc_buf(xfs_trans_t *, struct xfs_buf *); + void xfs_trans_ichgtime(struct xfs_trans *, struct xfs_inode *, int); +--- a/fs/xfs/xfs_trans_buf.c ++++ b/fs/xfs/xfs_trans_buf.c +@@ -724,7 +724,7 @@ xfs_trans_inode_alloc_buf( + * transactions rather than the physical changes we make to the buffer without + * changing writeback ordering constraints of metadata buffers. + */ +-void ++bool + xfs_trans_ordered_buf( + struct xfs_trans *tp, + struct xfs_buf *bp) +@@ -734,7 +734,9 @@ xfs_trans_ordered_buf( + ASSERT(bp->b_transp == tp); + ASSERT(bip != NULL); + ASSERT(atomic_read(&bip->bli_refcount) > 0); +- ASSERT(!xfs_buf_item_dirty_format(bip)); ++ ++ if (xfs_buf_item_dirty_format(bip)) ++ return false; + + bip->bli_flags |= XFS_BLI_ORDERED; + trace_xfs_buf_item_ordered(bip); +@@ -744,6 +746,7 @@ xfs_trans_ordered_buf( + * to be marked dirty and that it has been logged. + */ + xfs_trans_dirty_buf(tp, bp); ++ return true; + } + + /* diff --git a/queue-4.9/xfs-don-t-leak-quotacheck-dquots-when-cow-recovery.patch b/queue-4.9/xfs-don-t-leak-quotacheck-dquots-when-cow-recovery.patch new file mode 100644 index 00000000000..c1fe019eeda --- /dev/null +++ b/queue-4.9/xfs-don-t-leak-quotacheck-dquots-when-cow-recovery.patch @@ -0,0 +1,34 @@ +From foo@baz Mon Sep 18 10:16:36 CEST 2017 +From: Christoph Hellwig +Date: Sun, 17 Sep 2017 14:06:44 -0700 +Subject: xfs: don't leak quotacheck dquots when cow recovery +To: stable@vger.kernel.org +Cc: linux-xfs@vger.kernel.org, "Darrick J. Wong" +Message-ID: <20170917210712.10804-20-hch@lst.de> + +From: "Darrick J. Wong" + +commit 77aff8c76425c8f49b50d0b9009915066739e7d2 upstream. + +If we fail a mount on account of cow recovery errors, it's possible that +a previous quotacheck left some dquots in memory. The bailout clause of +xfs_mountfs forgets to purge these, and so we leak them. Fix that. + +Signed-off-by: Darrick J. 
Wong +Reviewed-by: Brian Foster +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/xfs_mount.c | 2 ++ + 1 file changed, 2 insertions(+) + +--- a/fs/xfs/xfs_mount.c ++++ b/fs/xfs/xfs_mount.c +@@ -1004,6 +1004,8 @@ xfs_mountfs( + IRELE(rip); + cancel_delayed_work_sync(&mp->m_reclaim_work); + xfs_reclaim_inodes(mp, SYNC_WAIT); ++ /* Clean out dquots that might be in memory after quotacheck. */ ++ xfs_qm_unmount(mp); + out_log_dealloc: + mp->m_flags |= XFS_MOUNT_UNMOUNTING; + xfs_log_mount_cancel(mp); diff --git a/queue-4.9/xfs-don-t-log-dirty-ranges-for-ordered-buffers.patch b/queue-4.9/xfs-don-t-log-dirty-ranges-for-ordered-buffers.patch new file mode 100644 index 00000000000..1aa95814834 --- /dev/null +++ b/queue-4.9/xfs-don-t-log-dirty-ranges-for-ordered-buffers.patch @@ -0,0 +1,121 @@ +From foo@baz Mon Sep 18 10:16:36 CEST 2017 +From: Christoph Hellwig +Date: Sun, 17 Sep 2017 14:07:03 -0700 +Subject: xfs: don't log dirty ranges for ordered buffers +To: stable@vger.kernel.org +Cc: linux-xfs@vger.kernel.org, Brian Foster , "Darrick J . Wong" +Message-ID: <20170917210712.10804-39-hch@lst.de> + +From: Brian Foster + +commit 8dc518dfa7dbd079581269e51074b3c55a65a880 upstream. + +Ordered buffers are attached to transactions and pushed through the +logging infrastructure just like normal buffers with the exception +that they are not actually written to the log. Therefore, we don't +need to log dirty ranges of ordered buffers. xfs_trans_log_buf() is +called on ordered buffers to set up all of the dirty state on the +transaction, buffer and log item and prepare the buffer for I/O. + +Now that xfs_trans_dirty_buf() is available, call it from +xfs_trans_ordered_buf() so the latter is now mutually exclusive with +xfs_trans_log_buf(). This reflects the implementation of ordered +buffers and helps eliminate confusion over the need to log ranges of +ordered buffers just to set up internal log state. + +Signed-off-by: Brian Foster +Reviewed-by: Allison Henderson +Reviewed-by: Darrick J. Wong +Reviewed-by: Christoph Hellwig +Signed-off-by: Darrick J. Wong +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/libxfs/xfs_btree.c | 6 ++---- + fs/xfs/libxfs/xfs_ialloc.c | 2 -- + fs/xfs/xfs_trans_buf.c | 26 ++++++++++++++------------ + 3 files changed, 16 insertions(+), 18 deletions(-) + +--- a/fs/xfs/libxfs/xfs_btree.c ++++ b/fs/xfs/libxfs/xfs_btree.c +@@ -4447,12 +4447,10 @@ xfs_btree_block_change_owner( + * though, so everything is consistent in memory. + */ + if (bp) { +- if (cur->bc_tp) { ++ if (cur->bc_tp) + xfs_trans_ordered_buf(cur->bc_tp, bp); +- xfs_btree_log_block(cur, bp, XFS_BB_OWNER); +- } else { ++ else + xfs_buf_delwri_queue(bp, bbcoi->buffer_list); +- } + } else { + ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE); + ASSERT(level == cur->bc_nlevels - 1); +--- a/fs/xfs/libxfs/xfs_ialloc.c ++++ b/fs/xfs/libxfs/xfs_ialloc.c +@@ -368,8 +368,6 @@ xfs_ialloc_inode_init( + * transaction and pin the log appropriately. + */ + xfs_trans_ordered_buf(tp, fbuf); +- xfs_trans_log_buf(tp, fbuf, 0, +- BBTOB(fbuf->b_length) - 1); + } + } else { + fbuf->b_flags |= XBF_DONE; +--- a/fs/xfs/xfs_trans_buf.c ++++ b/fs/xfs/xfs_trans_buf.c +@@ -560,16 +560,12 @@ xfs_trans_log_buf( + struct xfs_buf_log_item *bip = bp->b_fspriv; + + ASSERT(first <= last && last < BBTOB(bp->b_length)); ++ ASSERT(!(bip->bli_flags & XFS_BLI_ORDERED)); + + xfs_trans_dirty_buf(tp, bp); + +- /* +- * If we have an ordered buffer we are not logging any dirty range but +- * it still needs to be marked dirty and that it has been logged. 
+- */ + trace_xfs_trans_log_buf(bip); +- if (!(bip->bli_flags & XFS_BLI_ORDERED)) +- xfs_buf_item_log(bip, first, last); ++ xfs_buf_item_log(bip, first, last); + } + + +@@ -722,12 +718,11 @@ xfs_trans_inode_alloc_buf( + } + + /* +- * Mark the buffer as ordered for this transaction. This means +- * that the contents of the buffer are not recorded in the transaction +- * but it is tracked in the AIL as though it was. This allows us +- * to record logical changes in transactions rather than the physical +- * changes we make to the buffer without changing writeback ordering +- * constraints of metadata buffers. ++ * Mark the buffer as ordered for this transaction. This means that the contents ++ * of the buffer are not recorded in the transaction but it is tracked in the ++ * AIL as though it was. This allows us to record logical changes in ++ * transactions rather than the physical changes we make to the buffer without ++ * changing writeback ordering constraints of metadata buffers. + */ + void + xfs_trans_ordered_buf( +@@ -739,9 +734,16 @@ xfs_trans_ordered_buf( + ASSERT(bp->b_transp == tp); + ASSERT(bip != NULL); + ASSERT(atomic_read(&bip->bli_refcount) > 0); ++ ASSERT(!xfs_buf_item_dirty_format(bip)); + + bip->bli_flags |= XFS_BLI_ORDERED; + trace_xfs_buf_item_ordered(bip); ++ ++ /* ++ * We don't log a dirty range of an ordered buffer but it still needs ++ * to be marked dirty and that it has been logged. ++ */ ++ xfs_trans_dirty_buf(tp, bp); + } + + /* diff --git a/queue-4.9/xfs-don-t-set-v3-xflags-for-v2-inodes.patch b/queue-4.9/xfs-don-t-set-v3-xflags-for-v2-inodes.patch new file mode 100644 index 00000000000..9d8f073e5eb --- /dev/null +++ b/queue-4.9/xfs-don-t-set-v3-xflags-for-v2-inodes.patch @@ -0,0 +1,102 @@ +From foo@baz Mon Sep 18 10:16:36 CEST 2017 +From: Christoph Hellwig +Date: Sun, 17 Sep 2017 14:07:10 -0700 +Subject: xfs: don't set v3 xflags for v2 inodes +To: stable@vger.kernel.org +Cc: linux-xfs@vger.kernel.org, "Darrick J . Wong" +Message-ID: <20170917210712.10804-46-hch@lst.de> + +From: Christoph Hellwig + +commit dd60687ee541ca3f6df8758f38e6f22f57c42a37 upstream. + +Reject attempts to set XFLAGS that correspond to di_flags2 inode flags +if the inode isn't a v3 inode, because di_flags2 only exists on v3. + +Signed-off-by: Christoph Hellwig +Reviewed-by: Darrick J. Wong +Signed-off-by: Darrick J. Wong +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/xfs_ioctl.c | 38 +++++++++++++++++++++++++------------- + 1 file changed, 25 insertions(+), 13 deletions(-) + +--- a/fs/xfs/xfs_ioctl.c ++++ b/fs/xfs/xfs_ioctl.c +@@ -928,16 +928,15 @@ xfs_ioc_fsgetxattr( + return 0; + } + +-STATIC void +-xfs_set_diflags( ++STATIC uint16_t ++xfs_flags2diflags( + struct xfs_inode *ip, + unsigned int xflags) + { +- unsigned int di_flags; +- uint64_t di_flags2; +- + /* can't set PREALLOC this way, just preserve it */ +- di_flags = (ip->i_d.di_flags & XFS_DIFLAG_PREALLOC); ++ uint16_t di_flags = ++ (ip->i_d.di_flags & XFS_DIFLAG_PREALLOC); ++ + if (xflags & FS_XFLAG_IMMUTABLE) + di_flags |= XFS_DIFLAG_IMMUTABLE; + if (xflags & FS_XFLAG_APPEND) +@@ -967,19 +966,24 @@ xfs_set_diflags( + if (xflags & FS_XFLAG_EXTSIZE) + di_flags |= XFS_DIFLAG_EXTSIZE; + } +- ip->i_d.di_flags = di_flags; + +- /* diflags2 only valid for v3 inodes. 
*/ +- if (ip->i_d.di_version < 3) +- return; ++ return di_flags; ++} ++ ++STATIC uint64_t ++xfs_flags2diflags2( ++ struct xfs_inode *ip, ++ unsigned int xflags) ++{ ++ uint64_t di_flags2 = ++ (ip->i_d.di_flags2 & XFS_DIFLAG2_REFLINK); + +- di_flags2 = (ip->i_d.di_flags2 & XFS_DIFLAG2_REFLINK); + if (xflags & FS_XFLAG_DAX) + di_flags2 |= XFS_DIFLAG2_DAX; + if (xflags & FS_XFLAG_COWEXTSIZE) + di_flags2 |= XFS_DIFLAG2_COWEXTSIZE; + +- ip->i_d.di_flags2 = di_flags2; ++ return di_flags2; + } + + STATIC void +@@ -1020,6 +1024,7 @@ xfs_ioctl_setattr_xflags( + struct fsxattr *fa) + { + struct xfs_mount *mp = ip->i_mount; ++ uint64_t di_flags2; + + /* Can't change realtime flag if any extents are allocated. */ + if ((ip->i_d.di_nextents || ip->i_delayed_blks) && +@@ -1050,7 +1055,14 @@ xfs_ioctl_setattr_xflags( + !capable(CAP_LINUX_IMMUTABLE)) + return -EPERM; + +- xfs_set_diflags(ip, fa->fsx_xflags); ++ /* diflags2 only valid for v3 inodes. */ ++ di_flags2 = xfs_flags2diflags2(ip, fa->fsx_xflags); ++ if (di_flags2 && ip->i_d.di_version < 3) ++ return -EINVAL; ++ ++ ip->i_d.di_flags = xfs_flags2diflags(ip, fa->fsx_xflags); ++ ip->i_d.di_flags2 = di_flags2; ++ + xfs_diflags_to_linux(ip); + xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG); + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); diff --git a/queue-4.9/xfs-evict-all-inodes-involved-with-log-redo-item.patch b/queue-4.9/xfs-evict-all-inodes-involved-with-log-redo-item.patch new file mode 100644 index 00000000000..2f310bc21d7 --- /dev/null +++ b/queue-4.9/xfs-evict-all-inodes-involved-with-log-redo-item.patch @@ -0,0 +1,94 @@ +From foo@baz Mon Sep 18 10:16:36 CEST 2017 +From: Christoph Hellwig +Date: Sun, 17 Sep 2017 14:06:57 -0700 +Subject: xfs: evict all inodes involved with log redo item +To: stable@vger.kernel.org +Cc: linux-xfs@vger.kernel.org, "Darrick J. Wong" , viro@ZenIV.linux.org.uk +Message-ID: <20170917210712.10804-33-hch@lst.de> + +From: "Darrick J. Wong" + +commit 799ea9e9c59949008770aab4e1da87f10e99dbe4 upstream. + +When we introduced the bmap redo log items, we set MS_ACTIVE on the +mountpoint and XFS_IRECOVERY on the inode to prevent unlinked inodes +from being truncated prematurely during log recovery. This also had the +effect of putting linked inodes on the lru instead of evicting them. + +Unfortunately, we neglected to find all those unreferenced lru inodes +and evict them after finishing log recovery, which means that we leak +them if anything goes wrong in the rest of xfs_mountfs, because the lru +is only cleaned out on unmount. + +Therefore, evict unreferenced inodes in the lru list immediately +after clearing MS_ACTIVE. + +Fixes: 17c12bcd30 ("xfs: when replaying bmap operations, don't let unlinked inodes get reaped") +Signed-off-by: Darrick J. 
Wong +Cc: viro@ZenIV.linux.org.uk +Reviewed-by: Brian Foster +Signed-off-by: Greg Kroah-Hartman +--- + fs/inode.c | 1 + + fs/internal.h | 1 - + fs/xfs/xfs_log.c | 12 ++++++++++++ + include/linux/fs.h | 1 + + 4 files changed, 14 insertions(+), 1 deletion(-) + +--- a/fs/inode.c ++++ b/fs/inode.c +@@ -637,6 +637,7 @@ again: + + dispose_list(&dispose); + } ++EXPORT_SYMBOL_GPL(evict_inodes); + + /** + * invalidate_inodes - attempt to free all inodes on a superblock +--- a/fs/internal.h ++++ b/fs/internal.h +@@ -136,7 +136,6 @@ extern bool atime_needs_update_rcu(const + extern void inode_io_list_del(struct inode *inode); + + extern long get_nr_dirty_inodes(void); +-extern void evict_inodes(struct super_block *); + extern int invalidate_inodes(struct super_block *, bool); + + /* +--- a/fs/xfs/xfs_log.c ++++ b/fs/xfs/xfs_log.c +@@ -761,12 +761,24 @@ xfs_log_mount_finish( + * inodes. Turn it off immediately after recovery finishes + * so that we don't leak the quota inodes if subsequent mount + * activities fail. ++ * ++ * We let all inodes involved in redo item processing end up on ++ * the LRU instead of being evicted immediately so that if we do ++ * something to an unlinked inode, the irele won't cause ++ * premature truncation and freeing of the inode, which results ++ * in log recovery failure. We have to evict the unreferenced ++ * lru inodes after clearing MS_ACTIVE because we don't ++ * otherwise clean up the lru if there's a subsequent failure in ++ * xfs_mountfs, which leads to us leaking the inodes if nothing ++ * else (e.g. quotacheck) references the inodes before the ++ * mount failure occurs. + */ + mp->m_super->s_flags |= MS_ACTIVE; + error = xlog_recover_finish(mp->m_log); + if (!error) + xfs_log_work_queue(mp); + mp->m_super->s_flags &= ~MS_ACTIVE; ++ evict_inodes(mp->m_super); + + if (readonly) + mp->m_flags |= XFS_MOUNT_RDONLY; +--- a/include/linux/fs.h ++++ b/include/linux/fs.h +@@ -2760,6 +2760,7 @@ static inline void lockdep_annotate_inod + #endif + extern void unlock_new_inode(struct inode *); + extern unsigned int get_next_ino(void); ++extern void evict_inodes(struct super_block *sb); + + extern void __iget(struct inode * inode); + extern void iget_failed(struct inode *); diff --git a/queue-4.9/xfs-fix-incorrect-log_flushed-on-fsync.patch b/queue-4.9/xfs-fix-incorrect-log_flushed-on-fsync.patch new file mode 100644 index 00000000000..2bf5e667d57 --- /dev/null +++ b/queue-4.9/xfs-fix-incorrect-log_flushed-on-fsync.patch @@ -0,0 +1,98 @@ +From foo@baz Mon Sep 18 10:16:36 CEST 2017 +From: Christoph Hellwig +Date: Sun, 17 Sep 2017 14:07:09 -0700 +Subject: xfs: fix incorrect log_flushed on fsync +To: stable@vger.kernel.org +Cc: linux-xfs@vger.kernel.org, Amir Goldstein , Josef Bacik , "Darrick J . Wong" +Message-ID: <20170917210712.10804-45-hch@lst.de> + +From: Amir Goldstein + +commit 47c7d0b19502583120c3f396c7559e7a77288a68 upstream. + +When calling into _xfs_log_force{,_lsn}() with a pointer +to a log_flushed variable, log_flushed will be set to 1 if: +1. xlog_sync() is called to flush the active log buffer +AND/OR +2. xlog_wait() is called to wait on a syncing log buffer + +xfs_file_fsync() checks the value of log_flushed after +the _xfs_log_force_lsn() call to optimize away an explicit +PREFLUSH request to the data block device after writing +out all the file's pages to disk.
+ +This optimization is incorrect in the following sequence of events: + + Task A Task B + ------------------------------------------------------- + xfs_file_fsync() + _xfs_log_force_lsn() + xlog_sync() + [submit PREFLUSH] + xfs_file_fsync() + file_write_and_wait_range() + [submit WRITE X] + [endio WRITE X] + _xfs_log_force_lsn() + xlog_wait() + [endio PREFLUSH] + +The write X is not guaranteed to be on persistent storage +when the PREFLUSH request is completed, because write X was submitted +after the PREFLUSH request, but xfs_file_fsync() of task B will +be notified of log_flushed=1 and will skip the explicit flush. + +If the system crashes after the fsync of task B, write X may not be +present on disk after reboot. + +This bug was discovered and demonstrated using Josef Bacik's +dm-log-writes target, which can be used to record block io operations +and then replay a subset of these operations onto the target device. +The test goes something like this: +- Use fsx to execute ops of a file and record ops on log device +- Every now and then fsync the file, store md5 of file and mark + the location in the log +- Then replay log onto device for each mark, mount fs and compare + md5 of file to stored value + +Cc: Christoph Hellwig +Cc: Josef Bacik +Cc: +Signed-off-by: Amir Goldstein +Reviewed-by: Darrick J. Wong +Signed-off-by: Darrick J. Wong +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/xfs_log.c | 7 ------- + 1 file changed, 7 deletions(-) + +--- a/fs/xfs/xfs_log.c ++++ b/fs/xfs/xfs_log.c +@@ -3337,8 +3337,6 @@ maybe_sleep: + */ + if (iclog->ic_state & XLOG_STATE_IOERROR) + return -EIO; +- if (log_flushed) +- *log_flushed = 1; + } else { + + no_sleep: +@@ -3442,8 +3440,6 @@ try_again: + + xlog_wait(&iclog->ic_prev->ic_write_wait, + &log->l_icloglock); +- if (log_flushed) +- *log_flushed = 1; + already_slept = 1; + goto try_again; + } +@@ -3477,9 +3473,6 @@ try_again: + */ + if (iclog->ic_state & XLOG_STATE_IOERROR) + return -EIO; +- +- if (log_flushed) +- *log_flushed = 1; + } else { /* just return */ + spin_unlock(&log->l_icloglock); + } diff --git a/queue-4.9/xfs-fix-log-recovery-corruption-error-due-to-tail-overwrite.patch b/queue-4.9/xfs-fix-log-recovery-corruption-error-due-to-tail-overwrite.patch new file mode 100644 index 00000000000..8ccc26b5a5f --- /dev/null +++ b/queue-4.9/xfs-fix-log-recovery-corruption-error-due-to-tail-overwrite.patch @@ -0,0 +1,201 @@ +From foo@baz Mon Sep 18 10:16:36 CEST 2017 +From: Christoph Hellwig +Date: Sun, 17 Sep 2017 14:06:53 -0700 +Subject: xfs: fix log recovery corruption error due to tail overwrite +To: stable@vger.kernel.org +Cc: linux-xfs@vger.kernel.org, Brian Foster , "Darrick J . Wong" +Message-ID: <20170917210712.10804-29-hch@lst.de> + +From: Brian Foster + +commit 4a4f66eac4681378996a1837ad1ffec3a2e2981f upstream. + +If we consider the case where the tail (T) of the log is pinned long +enough for the head (H) to push and block behind the tail, we can +end up blocked in the following state without enough free space (f) +in the log to satisfy a transaction reservation: + + 0 phys. log N + [-------HffT---H'--T'---] + +The last good record in the log (before H) refers to T. The tail +eventually pushes forward (T') leaving more free space in the log +for writes to H. At this point, suppose space frees up in the log +for the maximum of 8 in-core log buffers to start flushing out to +the log. If this pushes the head from H to H', these next writes +overwrite the previous tail T.
This is safe because the items logged +from T to T' have been written back and removed from the AIL. + +If the next log writes (H -> H') happen to fail and result in +partial records in the log, the filesystem shuts down having +overwritten T with invalid data. Log recovery correctly locates H on +the subsequent mount, but H still refers to the now corrupted tail +T. This results in log corruption errors and recovery failure. + +Since the tail overwrite results from otherwise correct runtime +behavior, it is up to log recovery to try and deal with this +situation. Update log recovery tail verification to run a CRC pass +from the first record past the tail to the head. This facilitates +error detection at T and moves the recovery tail to the first good +record past H' (similar to truncating the head on torn write +detection). If corruption is detected beyond the range possibly +affected by the max number of iclogs, the log is legitimately +corrupted and log recovery failure is expected. + +Signed-off-by: Brian Foster +Reviewed-by: Darrick J. Wong +Signed-off-by: Darrick J. Wong +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/xfs_log_recover.c | 108 +++++++++++++++++++++++++++++++++-------------- + 1 file changed, 77 insertions(+), 31 deletions(-) + +--- a/fs/xfs/xfs_log_recover.c ++++ b/fs/xfs/xfs_log_recover.c +@@ -1029,61 +1029,106 @@ out_error: + } + + /* +- * Check the log tail for torn writes. This is required when torn writes are +- * detected at the head and the head had to be walked back to a previous record. +- * The tail of the previous record must now be verified to ensure the torn +- * writes didn't corrupt the previous tail. ++ * Calculate distance from head to tail (i.e., unused space in the log). ++ */ ++static inline int ++xlog_tail_distance( ++ struct xlog *log, ++ xfs_daddr_t head_blk, ++ xfs_daddr_t tail_blk) ++{ ++ if (head_blk < tail_blk) ++ return tail_blk - head_blk; ++ ++ return tail_blk + (log->l_logBBsize - head_blk); ++} ++ ++/* ++ * Verify the log tail. This is particularly important when torn or incomplete ++ * writes have been detected near the front of the log and the head has been ++ * walked back accordingly. ++ * ++ * We also have to handle the case where the tail was pinned and the head ++ * blocked behind the tail right before a crash. If the tail had been pushed ++ * immediately prior to the crash and the subsequent checkpoint was only ++ * partially written, it's possible it overwrote the last referenced tail in the ++ * log with garbage. This is not a coherency problem because the tail must have ++ * been pushed before it can be overwritten, but appears as log corruption to ++ * recovery because we have no way to know the tail was updated if the ++ * subsequent checkpoint didn't write successfully. + * +- * Return an error if CRC verification fails as recovery cannot proceed. ++ * Therefore, CRC check the log from tail to head. If a failure occurs and the ++ * offending record is within max iclog bufs from the head, walk the tail ++ * forward and retry until a valid tail is found or corruption is detected out ++ * of the range of a possible overwrite. 
+ */ + STATIC int + xlog_verify_tail( + struct xlog *log, + xfs_daddr_t head_blk, +- xfs_daddr_t tail_blk) ++ xfs_daddr_t *tail_blk, ++ int hsize) + { + struct xlog_rec_header *thead; + struct xfs_buf *bp; + xfs_daddr_t first_bad; +- int count; + int error = 0; + bool wrapped; +- xfs_daddr_t tmp_head; ++ xfs_daddr_t tmp_tail; ++ xfs_daddr_t orig_tail = *tail_blk; + + bp = xlog_get_bp(log, 1); + if (!bp) + return -ENOMEM; + + /* +- * Seek XLOG_MAX_ICLOGS + 1 records past the current tail record to get +- * a temporary head block that points after the last possible +- * concurrently written record of the tail. ++ * Make sure the tail points to a record (returns positive count on ++ * success). + */ +- count = xlog_seek_logrec_hdr(log, head_blk, tail_blk, +- XLOG_MAX_ICLOGS + 1, bp, &tmp_head, &thead, +- &wrapped); +- if (count < 0) { +- error = count; ++ error = xlog_seek_logrec_hdr(log, head_blk, *tail_blk, 1, bp, ++ &tmp_tail, &thead, &wrapped); ++ if (error < 0) + goto out; +- } ++ if (*tail_blk != tmp_tail) ++ *tail_blk = tmp_tail; + + /* +- * If the call above didn't find XLOG_MAX_ICLOGS + 1 records, we ran +- * into the actual log head. tmp_head points to the start of the record +- * so update it to the actual head block. ++ * Run a CRC check from the tail to the head. We can't just check ++ * MAX_ICLOGS records past the tail because the tail may point to stale ++ * blocks cleared during the search for the head/tail. These blocks are ++ * overwritten with zero-length records and thus record count is not a ++ * reliable indicator of the iclog state before a crash. + */ +- if (count < XLOG_MAX_ICLOGS + 1) +- tmp_head = head_blk; +- +- /* +- * We now have a tail and temporary head block that covers at least +- * XLOG_MAX_ICLOGS records from the tail. We need to verify that these +- * records were completely written. Run a CRC verification pass from +- * tail to head and return the result. +- */ +- error = xlog_do_recovery_pass(log, tmp_head, tail_blk, ++ first_bad = 0; ++ error = xlog_do_recovery_pass(log, head_blk, *tail_blk, + XLOG_RECOVER_CRCPASS, &first_bad); ++ while (error == -EFSBADCRC && first_bad) { ++ int tail_distance; ++ ++ /* ++ * Is corruption within range of the head? If so, retry from ++ * the next record. Otherwise return an error. ++ */ ++ tail_distance = xlog_tail_distance(log, head_blk, first_bad); ++ if (tail_distance > BTOBB(XLOG_MAX_ICLOGS * hsize)) ++ break; ++ ++ /* skip to the next record; returns positive count on success */ ++ error = xlog_seek_logrec_hdr(log, head_blk, first_bad, 2, bp, ++ &tmp_tail, &thead, &wrapped); ++ if (error < 0) ++ goto out; ++ ++ *tail_blk = tmp_tail; ++ first_bad = 0; ++ error = xlog_do_recovery_pass(log, head_blk, *tail_blk, ++ XLOG_RECOVER_CRCPASS, &first_bad); ++ } + ++ if (!error && *tail_blk != orig_tail) ++ xfs_warn(log->l_mp, ++ "Tail block (0x%llx) overwrite detected. 
Updated to 0x%llx", ++ orig_tail, *tail_blk); + out: + xlog_put_bp(bp); + return error; +@@ -1187,7 +1232,8 @@ xlog_verify_head( + if (error) + return error; + +- return xlog_verify_tail(log, *head_blk, *tail_blk); ++ return xlog_verify_tail(log, *head_blk, tail_blk, ++ be32_to_cpu((*rhead)->h_size)); + } + + /* diff --git a/queue-4.9/xfs-fix-recovery-failure-when-log-record-header-wraps-log-end.patch b/queue-4.9/xfs-fix-recovery-failure-when-log-record-header-wraps-log-end.patch new file mode 100644 index 00000000000..1f74f453b9e --- /dev/null +++ b/queue-4.9/xfs-fix-recovery-failure-when-log-record-header-wraps-log-end.patch @@ -0,0 +1,90 @@ +From foo@baz Mon Sep 18 10:16:36 CEST 2017 +From: Christoph Hellwig +Date: Sun, 17 Sep 2017 14:06:51 -0700 +Subject: xfs: fix recovery failure when log record header wraps log end +To: stable@vger.kernel.org +Cc: linux-xfs@vger.kernel.org, Brian Foster , "Darrick J . Wong" +Message-ID: <20170917210712.10804-27-hch@lst.de> + +From: Brian Foster + +commit 284f1c2c9bebf871861184b0e2c40fa921dd380b upstream. + +The high-level log recovery algorithm consists of two loops that +walk the physical log and process log records from the tail to the +head. The first loop handles the case where the tail is beyond the +head and processes records up to the end of the physical log. The +subsequent loop processes records from the beginning of the physical +log to the head. + +Because log records can wrap around the end of the physical log, the +first loop mentioned above must handle this case appropriately. +Records are processed from in-core buffers, which means that this +algorithm must split the reads of such records into two partial +I/Os: 1.) from the beginning of the record to the end of the log and +2.) from the beginning of the log to the end of the record. This is +further complicated by the fact that the log record header and log +record data are read into independent buffers. + +The current handling of each buffer correctly splits the reads when +either the header or data starts before the end of the log and wraps +around the end. The data read does not correctly handle the case +where the prior header read wrapped or ends on the physical log end +boundary. blk_no is incremented to or beyond the log end after the +header read to point to the record data, but the split data read +logic triggers, attempts to read from an invalid log block and +ultimately causes log recovery to fail. This can be reproduced +fairly reliably via xfstests tests generic/047 and generic/388 with +large iclog sizes (256k) and small (10M) logs. + +If the record header read has pushed beyond the end of the physical +log, the subsequent data read is actually contiguous. Update the +data read logic to detect the case where blk_no has wrapped, mod it +against the log size to read from the correct address and issue one +contiguous read for the log data buffer. The log record is processed +as normal from the buffer(s), the loop exits after the current +iteration and the subsequent loop picks up with the first new record +after the start of the log. + +Signed-off-by: Brian Foster +Reviewed-by: Darrick J. Wong +Signed-off-by: Darrick J. 
Wong +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/xfs_log_recover.c | 18 ++++++++++++++---- + 1 file changed, 14 insertions(+), 4 deletions(-) + +--- a/fs/xfs/xfs_log_recover.c ++++ b/fs/xfs/xfs_log_recover.c +@@ -5216,7 +5216,7 @@ xlog_do_recovery_pass( + xfs_daddr_t *first_bad) /* out: first bad log rec */ + { + xlog_rec_header_t *rhead; +- xfs_daddr_t blk_no; ++ xfs_daddr_t blk_no, rblk_no; + xfs_daddr_t rhead_blk; + char *offset; + xfs_buf_t *hbp, *dbp; +@@ -5369,9 +5369,19 @@ xlog_do_recovery_pass( + bblks = (int)BTOBB(be32_to_cpu(rhead->h_len)); + blk_no += hblks; + +- /* Read in data for log record */ +- if (blk_no + bblks <= log->l_logBBsize) { +- error = xlog_bread(log, blk_no, bblks, dbp, ++ /* ++ * Read the log record data in multiple reads if it ++ * wraps around the end of the log. Note that if the ++ * header already wrapped, blk_no could point past the ++ * end of the log. The record data is contiguous in ++ * that case. ++ */ ++ if (blk_no + bblks <= log->l_logBBsize || ++ blk_no >= log->l_logBBsize) { ++ /* mod blk_no in case the header wrapped and ++ * pushed it beyond the end of the log */ ++ rblk_no = do_mod(blk_no, log->l_logBBsize); ++ error = xlog_bread(log, rblk_no, bblks, dbp, + &offset); + if (error) + goto bread_err2; diff --git a/queue-4.9/xfs-handle-efscorrupted-during-head-tail-verification.patch b/queue-4.9/xfs-handle-efscorrupted-during-head-tail-verification.patch new file mode 100644 index 00000000000..dab3e542742 --- /dev/null +++ b/queue-4.9/xfs-handle-efscorrupted-during-head-tail-verification.patch @@ -0,0 +1,72 @@ +From foo@baz Mon Sep 18 10:16:36 CEST 2017 +From: Christoph Hellwig +Date: Sun, 17 Sep 2017 14:06:54 -0700 +Subject: xfs: handle -EFSCORRUPTED during head/tail verification +To: stable@vger.kernel.org +Cc: linux-xfs@vger.kernel.org, Brian Foster , "Darrick J . Wong" +Message-ID: <20170917210712.10804-30-hch@lst.de> + +From: Brian Foster + +commit a4c9b34d6a17081005ec459b57b8effc08f4c731 upstream. + +Torn write and tail overwrite detection both trigger only on +-EFSBADCRC errors. While this is the most likely failure scenario +for each condition, -EFSCORRUPTED is still possible in certain cases +depending on what ends up on disk when a torn write or partial tail +overwrite occurs. For example, an invalid log record h_len can lead +to an -EFSCORRUPTED error when running the log recovery CRC pass. + +Therefore, update log head and tail verification to trigger the +associated head/tail fixups in the event of -EFSCORRUPTED errors +along with -EFSBADCRC. Also, -EFSCORRUPTED can currently be returned +from xlog_do_recovery_pass() before rhead_blk is initialized if the +first record encountered happens to be corrupted. This leads to an +incorrect 'first_bad' return value. Initialize rhead_blk earlier in +the function to address that problem as well. + +Signed-off-by: Brian Foster +Reviewed-by: Darrick J. Wong +Signed-off-by: Darrick J. 
Wong +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/xfs_log_recover.c | 7 +++---- + 1 file changed, 3 insertions(+), 4 deletions(-) + +--- a/fs/xfs/xfs_log_recover.c ++++ b/fs/xfs/xfs_log_recover.c +@@ -1102,7 +1102,7 @@ xlog_verify_tail( + first_bad = 0; + error = xlog_do_recovery_pass(log, head_blk, *tail_blk, + XLOG_RECOVER_CRCPASS, &first_bad); +- while (error == -EFSBADCRC && first_bad) { ++ while ((error == -EFSBADCRC || error == -EFSCORRUPTED) && first_bad) { + int tail_distance; + + /* +@@ -1188,7 +1188,7 @@ xlog_verify_head( + */ + error = xlog_do_recovery_pass(log, *head_blk, tmp_rhead_blk, + XLOG_RECOVER_CRCPASS, &first_bad); +- if (error == -EFSBADCRC) { ++ if ((error == -EFSBADCRC || error == -EFSCORRUPTED) && first_bad) { + /* + * We've hit a potential torn write. Reset the error and warn + * about it. +@@ -5255,7 +5255,7 @@ xlog_do_recovery_pass( + LIST_HEAD (buffer_list); + + ASSERT(head_blk != tail_blk); +- rhead_blk = 0; ++ blk_no = rhead_blk = tail_blk; + + for (i = 0; i < XLOG_RHASH_SIZE; i++) + INIT_HLIST_HEAD(&rhash[i]); +@@ -5333,7 +5333,6 @@ xlog_do_recovery_pass( + } + + memset(rhash, 0, sizeof(rhash)); +- blk_no = rhead_blk = tail_blk; + if (tail_blk > head_blk) { + /* + * Perform recovery around the end of the physical log. diff --git a/queue-4.9/xfs-move-bmbt-owner-change-to-last-step-of-extent-swap.patch b/queue-4.9/xfs-move-bmbt-owner-change-to-last-step-of-extent-swap.patch new file mode 100644 index 00000000000..c2283bb6941 --- /dev/null +++ b/queue-4.9/xfs-move-bmbt-owner-change-to-last-step-of-extent-swap.patch @@ -0,0 +1,109 @@ +From foo@baz Mon Sep 18 10:16:36 CEST 2017 +From: Christoph Hellwig +Date: Sun, 17 Sep 2017 14:07:05 -0700 +Subject: xfs: move bmbt owner change to last step of extent swap +To: stable@vger.kernel.org +Cc: linux-xfs@vger.kernel.org, Brian Foster , "Darrick J . Wong" +Message-ID: <20170917210712.10804-41-hch@lst.de> + +From: Brian Foster + +commit 6fb10d6d22094bc4062f92b9ccbcee2f54033d04 upstream. + +The extent swap operation currently resets bmbt block owners before +the inode forks are swapped. The bmbt buffers are marked as ordered +so they do not have to be physically logged in the transaction. + +This use of ordered buffers is not safe as bmbt buffers may have +been previously physically logged. The bmbt owner change algorithm +needs to be updated to physically log buffers that are already dirty +when/if they are encountered. This means that an extent swap will +eventually require multiple rolling transactions to handle large +btrees. In addition, all inode related changes must be logged before +the bmbt owner change scan begins and can roll the transaction for +the first time to preserve fs consistency via log recovery. + +In preparation for such fixes to the bmbt owner change algorithm, +refactor the bmbt scan out of the extent fork swap code to the last +operation before the transaction is committed. Update +xfs_swap_extent_forks() to only set the inode log flags when an +owner change scan is necessary. Update xfs_swap_extents() to trigger +the owner change based on the inode log flags. Note that since the +owner change now occurs after the extent fork swap, the inode btrees +must be fixed up with the inode number of the current inode (similar +to log recovery). + +Signed-off-by: Brian Foster +Reviewed-by: Darrick J. Wong +Reviewed-by: Christoph Hellwig +Signed-off-by: Darrick J. 
Wong +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/xfs_bmap_util.c | 44 ++++++++++++++++++++++++++------------------ + 1 file changed, 26 insertions(+), 18 deletions(-) + +--- a/fs/xfs/xfs_bmap_util.c ++++ b/fs/xfs/xfs_bmap_util.c +@@ -1825,29 +1825,18 @@ xfs_swap_extent_forks( + } + + /* +- * Before we've swapped the forks, lets set the owners of the forks +- * appropriately. We have to do this as we are demand paging the btree +- * buffers, and so the validation done on read will expect the owner +- * field to be correctly set. Once we change the owners, we can swap the +- * inode forks. ++ * Btree format (v3) inodes have the inode number stamped in the bmbt ++ * block headers. We can't start changing the bmbt blocks until the ++ * inode owner change is logged so recovery does the right thing in the ++ * event of a crash. Set the owner change log flags now and leave the ++ * bmbt scan as the last step. + */ + if (ip->i_d.di_version == 3 && +- ip->i_d.di_format == XFS_DINODE_FMT_BTREE) { ++ ip->i_d.di_format == XFS_DINODE_FMT_BTREE) + (*target_log_flags) |= XFS_ILOG_DOWNER; +- error = xfs_bmbt_change_owner(tp, ip, XFS_DATA_FORK, +- tip->i_ino, NULL); +- if (error) +- return error; +- } +- + if (tip->i_d.di_version == 3 && +- tip->i_d.di_format == XFS_DINODE_FMT_BTREE) { ++ tip->i_d.di_format == XFS_DINODE_FMT_BTREE) + (*src_log_flags) |= XFS_ILOG_DOWNER; +- error = xfs_bmbt_change_owner(tp, tip, XFS_DATA_FORK, +- ip->i_ino, NULL); +- if (error) +- return error; +- } + + /* + * Swap the data forks of the inodes +@@ -2077,6 +2066,25 @@ xfs_swap_extents( + xfs_trans_log_inode(tp, tip, target_log_flags); + + /* ++ * The extent forks have been swapped, but crc=1,rmapbt=0 filesystems ++ * have inode number owner values in the bmbt blocks that still refer to ++ * the old inode. Scan each bmbt to fix up the owner values with the ++ * inode number of the current inode. ++ */ ++ if (src_log_flags & XFS_ILOG_DOWNER) { ++ error = xfs_bmbt_change_owner(tp, ip, XFS_DATA_FORK, ++ ip->i_ino, NULL); ++ if (error) ++ goto out_trans_cancel; ++ } ++ if (target_log_flags & XFS_ILOG_DOWNER) { ++ error = xfs_bmbt_change_owner(tp, tip, XFS_DATA_FORK, ++ tip->i_ino, NULL); ++ if (error) ++ goto out_trans_cancel; ++ } ++ ++ /* + * If this is a synchronous mount, make sure that the + * transaction goes to disk before returning to the user. + */ diff --git a/queue-4.9/xfs-open-code-end_buffer_async_write-in-xfs_finish_page_writeback.patch b/queue-4.9/xfs-open-code-end_buffer_async_write-in-xfs_finish_page_writeback.patch new file mode 100644 index 00000000000..ee6eafd110d --- /dev/null +++ b/queue-4.9/xfs-open-code-end_buffer_async_write-in-xfs_finish_page_writeback.patch @@ -0,0 +1,146 @@ +From foo@baz Mon Sep 18 10:16:36 CEST 2017 +From: Christoph Hellwig +Date: Sun, 17 Sep 2017 14:07:11 -0700 +Subject: xfs: open code end_buffer_async_write in xfs_finish_page_writeback +To: stable@vger.kernel.org +Cc: linux-xfs@vger.kernel.org, "Darrick J . Wong" +Message-ID: <20170917210712.10804-47-hch@lst.de> + +From: Christoph Hellwig + +commit 8353a814f2518dcfa79a5bb77afd0e7dfa391bb1 upstream. + +Our loop in xfs_finish_page_writeback, which iterates over all buffer +heads in a page and then calls end_buffer_async_write, which also +iterates over all buffers in the page to check if any I/O is in flight +is not only inefficient, but also potentially dangerous as +end_buffer_async_write can cause the page and all buffers to be freed. 
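+
+As a rough sketch (simplified from the old 4.9 code that the hunk below
+removes), the problematic pattern was:
+
+	bh = head = page_buffers(bvec->bv_page);
+	do {
+		next = bh->b_this_page;
+		bh->b_end_io(bh, !error);	/* end_buffer_async_write() */
+		/*
+		 * b_end_io() rescans every buffer on the page and, once
+		 * none are under I/O, calls end_page_writeback().  After
+		 * that, reclaim may free the page and its buffer chain,
+		 * so even 'next' may point to freed memory here.
+		 */
+	} while ((bh = next) != head);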
+ +Replace it with a single loop that does the work of end_buffer_async_write +on a per-page basis. + +Signed-off-by: Christoph Hellwig +Reviewed-by: Brian Foster +Reviewed-by: Darrick J. Wong +Signed-off-by: Darrick J. Wong +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/xfs_aops.c | 72 ++++++++++++++++++++++++++++++++++++------------------ + 1 file changed, 48 insertions(+), 24 deletions(-) + +--- a/fs/xfs/xfs_aops.c ++++ b/fs/xfs/xfs_aops.c +@@ -90,11 +90,11 @@ xfs_find_bdev_for_inode( + * associated buffer_heads, paying attention to the start and end offsets that + * we need to process on the page. + * +- * Landmine Warning: bh->b_end_io() will call end_page_writeback() on the last +- * buffer in the IO. Once it does this, it is unsafe to access the bufferhead or +- * the page at all, as we may be racing with memory reclaim and it can free both +- * the bufferhead chain and the page as it will see the page as clean and +- * unused. ++ * Note that we open code the action in end_buffer_async_write here so that we ++ * only have to iterate over the buffers attached to the page once. This is not ++ * only more efficient, but also ensures that we only calls end_page_writeback ++ * at the end of the iteration, and thus avoids the pitfall of having the page ++ * and buffers potentially freed after every call to end_buffer_async_write. + */ + static void + xfs_finish_page_writeback( +@@ -102,29 +102,45 @@ xfs_finish_page_writeback( + struct bio_vec *bvec, + int error) + { +- unsigned int end = bvec->bv_offset + bvec->bv_len - 1; +- struct buffer_head *head, *bh, *next; ++ struct buffer_head *head = page_buffers(bvec->bv_page), *bh = head; ++ bool busy = false; + unsigned int off = 0; +- unsigned int bsize; ++ unsigned long flags; + + ASSERT(bvec->bv_offset < PAGE_SIZE); + ASSERT((bvec->bv_offset & (i_blocksize(inode) - 1)) == 0); +- ASSERT(end < PAGE_SIZE); ++ ASSERT(bvec->bv_offset + bvec->bv_len <= PAGE_SIZE); + ASSERT((bvec->bv_len & (i_blocksize(inode) - 1)) == 0); + +- bh = head = page_buffers(bvec->bv_page); +- +- bsize = bh->b_size; ++ local_irq_save(flags); ++ bit_spin_lock(BH_Uptodate_Lock, &head->b_state); + do { +- if (off > end) +- break; +- next = bh->b_this_page; +- if (off < bvec->bv_offset) +- goto next_bh; +- bh->b_end_io(bh, !error); +-next_bh: +- off += bsize; +- } while ((bh = next) != head); ++ if (off >= bvec->bv_offset && ++ off < bvec->bv_offset + bvec->bv_len) { ++ ASSERT(buffer_async_write(bh)); ++ ASSERT(bh->b_end_io == NULL); ++ ++ if (error) { ++ mapping_set_error(bvec->bv_page->mapping, -EIO); ++ set_buffer_write_io_error(bh); ++ clear_buffer_uptodate(bh); ++ SetPageError(bvec->bv_page); ++ } else { ++ set_buffer_uptodate(bh); ++ } ++ clear_buffer_async_write(bh); ++ unlock_buffer(bh); ++ } else if (buffer_async_write(bh)) { ++ ASSERT(buffer_locked(bh)); ++ busy = true; ++ } ++ off += bh->b_size; ++ } while ((bh = bh->b_this_page) != head); ++ bit_spin_unlock(BH_Uptodate_Lock, &head->b_state); ++ local_irq_restore(flags); ++ ++ if (!busy) ++ end_page_writeback(bvec->bv_page); + } + + /* +@@ -138,8 +154,10 @@ xfs_destroy_ioend( + int error) + { + struct inode *inode = ioend->io_inode; +- struct bio *last = ioend->io_bio; +- struct bio *bio, *next; ++ struct bio *bio = &ioend->io_inline_bio; ++ struct bio *last = ioend->io_bio, *next; ++ u64 start = bio->bi_iter.bi_sector; ++ bool quiet = bio_flagged(bio, BIO_QUIET); + + for (bio = &ioend->io_inline_bio; bio; bio = next) { + struct bio_vec *bvec; +@@ -160,6 +178,11 @@ xfs_destroy_ioend( + + bio_put(bio); + } ++ ++ 
if (unlikely(error && !quiet)) { ++ xfs_err_ratelimited(XFS_I(inode)->i_mount, ++ "writeback error on sector %llu", start); ++ } + } + + /* +@@ -427,7 +450,8 @@ xfs_start_buffer_writeback( + ASSERT(!buffer_delay(bh)); + ASSERT(!buffer_unwritten(bh)); + +- mark_buffer_async_write(bh); ++ bh->b_end_io = NULL; ++ set_buffer_async_write(bh); + set_buffer_uptodate(bh); + clear_buffer_dirty(bh); + } diff --git a/queue-4.9/xfs-open-code-xfs_buf_item_dirty.patch b/queue-4.9/xfs-open-code-xfs_buf_item_dirty.patch new file mode 100644 index 00000000000..3ed5b1d6426 --- /dev/null +++ b/queue-4.9/xfs-open-code-xfs_buf_item_dirty.patch @@ -0,0 +1,67 @@ +From foo@baz Mon Sep 18 10:16:36 CEST 2017 +From: Christoph Hellwig +Date: Sun, 17 Sep 2017 14:06:59 -0700 +Subject: xfs: open-code xfs_buf_item_dirty() +To: stable@vger.kernel.org +Cc: linux-xfs@vger.kernel.org, Brian Foster , "Darrick J . Wong" +Message-ID: <20170917210712.10804-35-hch@lst.de> + +From: Brian Foster + +commit a4f6cf6b2b6b60ec2a05a33a32e65caa4149aa2b upstream. + +It checks a single flag and has one caller. It probably isn't worth +its own function. + +Signed-off-by: Brian Foster +Reviewed-by: Darrick J. Wong +Reviewed-by: Christoph Hellwig +Signed-off-by: Darrick J. Wong +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/xfs_buf_item.c | 11 ----------- + fs/xfs/xfs_buf_item.h | 1 - + fs/xfs/xfs_trans_buf.c | 2 +- + 3 files changed, 1 insertion(+), 13 deletions(-) + +--- a/fs/xfs/xfs_buf_item.c ++++ b/fs/xfs/xfs_buf_item.c +@@ -945,17 +945,6 @@ xfs_buf_item_log( + } + + +-/* +- * Return 1 if the buffer has been logged or ordered in a transaction (at any +- * point, not just the current transaction) and 0 if not. +- */ +-uint +-xfs_buf_item_dirty( +- xfs_buf_log_item_t *bip) +-{ +- return (bip->bli_flags & XFS_BLI_DIRTY); +-} +- + STATIC void + xfs_buf_item_free( + xfs_buf_log_item_t *bip) +--- a/fs/xfs/xfs_buf_item.h ++++ b/fs/xfs/xfs_buf_item.h +@@ -64,7 +64,6 @@ typedef struct xfs_buf_log_item { + int xfs_buf_item_init(struct xfs_buf *, struct xfs_mount *); + void xfs_buf_item_relse(struct xfs_buf *); + void xfs_buf_item_log(xfs_buf_log_item_t *, uint, uint); +-uint xfs_buf_item_dirty(xfs_buf_log_item_t *); + void xfs_buf_attach_iodone(struct xfs_buf *, + void(*)(struct xfs_buf *, xfs_log_item_t *), + xfs_log_item_t *); +--- a/fs/xfs/xfs_trans_buf.c ++++ b/fs/xfs/xfs_trans_buf.c +@@ -435,7 +435,7 @@ xfs_trans_brelse(xfs_trans_t *tp, + if (XFS_FORCED_SHUTDOWN(tp->t_mountp) && freed) { + xfs_trans_ail_remove(&bip->bli_item, SHUTDOWN_LOG_IO_ERROR); + xfs_buf_item_relse(bp); +- } else if (!xfs_buf_item_dirty(bip)) { ++ } else if (!(bip->bli_flags & XFS_BLI_DIRTY)) { + /*** + ASSERT(bp->b_pincount == 0); + ***/ diff --git a/queue-4.9/xfs-ordered-buffer-log-items-are-never-formatted.patch b/queue-4.9/xfs-ordered-buffer-log-items-are-never-formatted.patch new file mode 100644 index 00000000000..6b314188183 --- /dev/null +++ b/queue-4.9/xfs-ordered-buffer-log-items-are-never-formatted.patch @@ -0,0 +1,70 @@ +From foo@baz Mon Sep 18 10:16:36 CEST 2017 +From: Christoph Hellwig +Date: Sun, 17 Sep 2017 14:07:01 -0700 +Subject: xfs: ordered buffer log items are never formatted +To: stable@vger.kernel.org +Cc: linux-xfs@vger.kernel.org, Brian Foster , "Darrick J . Wong" +Message-ID: <20170917210712.10804-37-hch@lst.de> + +From: Brian Foster + +commit e9385cc6fb7edf23702de33a2dc82965d92d9392 upstream. + +Ordered buffers pass through the logging infrastructure without ever +being written to the log. 
The way this works is that the ordered +buffer status is transferred to the log vector at commit time via +the ->iop_size() callback. In xlog_cil_insert_format_items(), +ordered log vectors bypass ->iop_format() processing altogether. + +Therefore it is unnecessary for xfs_buf_item_format() to handle +ordered buffers. Remove the unnecessary logic and assert that an +ordered buffer never reaches this point. + +Signed-off-by: Brian Foster +Reviewed-by: Christoph Hellwig +Reviewed-by: Darrick J. Wong +Signed-off-by: Darrick J. Wong +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/xfs_buf_item.c | 12 ++---------- + fs/xfs/xfs_trace.h | 1 - + 2 files changed, 2 insertions(+), 11 deletions(-) + +--- a/fs/xfs/xfs_buf_item.c ++++ b/fs/xfs/xfs_buf_item.c +@@ -323,6 +323,8 @@ xfs_buf_item_format( + ASSERT((bip->bli_flags & XFS_BLI_STALE) || + (xfs_blft_from_flags(&bip->__bli_format) > XFS_BLFT_UNKNOWN_BUF + && xfs_blft_from_flags(&bip->__bli_format) < XFS_BLFT_MAX_BUF)); ++ ASSERT(!(bip->bli_flags & XFS_BLI_ORDERED) || ++ (bip->bli_flags & XFS_BLI_STALE)); + + + /* +@@ -347,16 +349,6 @@ xfs_buf_item_format( + bip->bli_flags &= ~XFS_BLI_INODE_BUF; + } + +- if ((bip->bli_flags & (XFS_BLI_ORDERED|XFS_BLI_STALE)) == +- XFS_BLI_ORDERED) { +- /* +- * The buffer has been logged just to order it. It is not being +- * included in the transaction commit, so don't format it. +- */ +- trace_xfs_buf_item_format_ordered(bip); +- return; +- } +- + for (i = 0; i < bip->bli_format_count; i++) { + xfs_buf_item_format_segment(bip, lv, &vecp, offset, + &bip->bli_formats[i]); +--- a/fs/xfs/xfs_trace.h ++++ b/fs/xfs/xfs_trace.h +@@ -520,7 +520,6 @@ DEFINE_BUF_ITEM_EVENT(xfs_buf_item_size) + DEFINE_BUF_ITEM_EVENT(xfs_buf_item_size_ordered); + DEFINE_BUF_ITEM_EVENT(xfs_buf_item_size_stale); + DEFINE_BUF_ITEM_EVENT(xfs_buf_item_format); +-DEFINE_BUF_ITEM_EVENT(xfs_buf_item_format_ordered); + DEFINE_BUF_ITEM_EVENT(xfs_buf_item_format_stale); + DEFINE_BUF_ITEM_EVENT(xfs_buf_item_ordered); + DEFINE_BUF_ITEM_EVENT(xfs_buf_item_pin); diff --git a/queue-4.9/xfs-properly-retry-failed-inode-items-in-case-of-error-during-buffer-writeback.patch b/queue-4.9/xfs-properly-retry-failed-inode-items-in-case-of-error-during-buffer-writeback.patch new file mode 100644 index 00000000000..e8361be924c --- /dev/null +++ b/queue-4.9/xfs-properly-retry-failed-inode-items-in-case-of-error-during-buffer-writeback.patch @@ -0,0 +1,265 @@ +From foo@baz Mon Sep 18 10:16:36 CEST 2017 +From: Christoph Hellwig +Date: Sun, 17 Sep 2017 14:06:50 -0700 +Subject: xfs: Properly retry failed inode items in case of error during buffer writeback +To: stable@vger.kernel.org +Cc: linux-xfs@vger.kernel.org, Carlos Maiolino , "Darrick J . Wong" +Message-ID: <20170917210712.10804-26-hch@lst.de> + +From: Carlos Maiolino + +commit d3a304b6292168b83b45d624784f973fdc1ca674 upstream. + +When a buffer has been failed during writeback, the inode items into it +are kept flush locked, and are never resubmitted due the flush lock, so, +if any buffer fails to be written, the items in AIL are never written to +disk and never unlocked. + +This causes unmount operation to hang due these items flush locked in AIL, +but this also causes the items in AIL to never be written back, even when +the IO device comes back to normal. + +I've been testing this patch with a DM-thin device, creating a +filesystem larger than the real device. 
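+
+For reference, the reason flush locked items are never resubmitted is the
+trylock at the top of xfs_inode_item_push(); roughly (a simplified sketch
+of the pre-patch 4.9 logic, for illustration only):
+
+	if (!xfs_iflock_nowait(ip)) {
+		/*
+		 * Someone else holds the flush lock, so assume a flush
+		 * is in progress and report the item as flushing.  If
+		 * that flush actually failed, nothing ever releases the
+		 * flush lock, and the item is skipped here forever.
+		 */
+		rval = XFS_ITEM_FLUSHING;
+		goto out_unlock;
+	}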
+ +When writing enough data to fill the DM-thin device, XFS receives ENOSPC +errors from the device, and keep spinning on xfsaild (when 'retry +forever' configuration is set). + +At this point, the filesystem can not be unmounted because of the flush locked +items in AIL, but worse, the items in AIL are never retried at all +(once xfs_inode_item_push() will skip the items that are flush locked), +even if the underlying DM-thin device is expanded to the proper size. + +This patch fixes both cases, retrying any item that has been failed +previously, using the infra-structure provided by the previous patch. + +Reviewed-by: Brian Foster +Signed-off-by: Carlos Maiolino +Reviewed-by: Darrick J. Wong +Signed-off-by: Darrick J. Wong +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/xfs_buf_item.c | 28 ++++++++++++++++++++++++++++ + fs/xfs/xfs_buf_item.h | 3 +++ + fs/xfs/xfs_inode_item.c | 47 +++++++++++++++++++++++++++++++++++++++++++---- + fs/xfs/xfs_trans.h | 1 + + fs/xfs/xfs_trans_ail.c | 3 ++- + fs/xfs/xfs_trans_priv.h | 31 +++++++++++++++++++++++++++++++ + 6 files changed, 108 insertions(+), 5 deletions(-) + +--- a/fs/xfs/xfs_buf_item.c ++++ b/fs/xfs/xfs_buf_item.c +@@ -1234,3 +1234,31 @@ xfs_buf_iodone( + xfs_trans_ail_delete(ailp, lip, SHUTDOWN_CORRUPT_INCORE); + xfs_buf_item_free(BUF_ITEM(lip)); + } ++ ++/* ++ * Requeue a failed buffer for writeback ++ * ++ * Return true if the buffer has been re-queued properly, false otherwise ++ */ ++bool ++xfs_buf_resubmit_failed_buffers( ++ struct xfs_buf *bp, ++ struct xfs_log_item *lip, ++ struct list_head *buffer_list) ++{ ++ struct xfs_log_item *next; ++ ++ /* ++ * Clear XFS_LI_FAILED flag from all items before resubmit ++ * ++ * XFS_LI_FAILED set/clear is protected by xa_lock, caller this ++ * function already have it acquired ++ */ ++ for (; lip; lip = next) { ++ next = lip->li_bio_list; ++ xfs_clear_li_failed(lip); ++ } ++ ++ /* Add this buffer back to the delayed write list */ ++ return xfs_buf_delwri_queue(bp, buffer_list); ++} +--- a/fs/xfs/xfs_buf_item.h ++++ b/fs/xfs/xfs_buf_item.h +@@ -70,6 +70,9 @@ void xfs_buf_attach_iodone(struct xfs_bu + xfs_log_item_t *); + void xfs_buf_iodone_callbacks(struct xfs_buf *); + void xfs_buf_iodone(struct xfs_buf *, struct xfs_log_item *); ++bool xfs_buf_resubmit_failed_buffers(struct xfs_buf *, ++ struct xfs_log_item *, ++ struct list_head *); + + extern kmem_zone_t *xfs_buf_item_zone; + +--- a/fs/xfs/xfs_inode_item.c ++++ b/fs/xfs/xfs_inode_item.c +@@ -27,6 +27,7 @@ + #include "xfs_error.h" + #include "xfs_trace.h" + #include "xfs_trans_priv.h" ++#include "xfs_buf_item.h" + #include "xfs_log.h" + + +@@ -475,6 +476,23 @@ xfs_inode_item_unpin( + wake_up_bit(&ip->i_flags, __XFS_IPINNED_BIT); + } + ++/* ++ * Callback used to mark a buffer with XFS_LI_FAILED when items in the buffer ++ * have been failed during writeback ++ * ++ * This informs the AIL that the inode is already flush locked on the next push, ++ * and acquires a hold on the buffer to ensure that it isn't reclaimed before ++ * dirty data makes it to disk. 
++ */ ++STATIC void ++xfs_inode_item_error( ++ struct xfs_log_item *lip, ++ struct xfs_buf *bp) ++{ ++ ASSERT(xfs_isiflocked(INODE_ITEM(lip)->ili_inode)); ++ xfs_set_li_failed(lip, bp); ++} ++ + STATIC uint + xfs_inode_item_push( + struct xfs_log_item *lip, +@@ -484,13 +502,28 @@ xfs_inode_item_push( + { + struct xfs_inode_log_item *iip = INODE_ITEM(lip); + struct xfs_inode *ip = iip->ili_inode; +- struct xfs_buf *bp = NULL; ++ struct xfs_buf *bp = lip->li_buf; + uint rval = XFS_ITEM_SUCCESS; + int error; + + if (xfs_ipincount(ip) > 0) + return XFS_ITEM_PINNED; + ++ /* ++ * The buffer containing this item failed to be written back ++ * previously. Resubmit the buffer for IO. ++ */ ++ if (lip->li_flags & XFS_LI_FAILED) { ++ if (!xfs_buf_trylock(bp)) ++ return XFS_ITEM_LOCKED; ++ ++ if (!xfs_buf_resubmit_failed_buffers(bp, lip, buffer_list)) ++ rval = XFS_ITEM_FLUSHING; ++ ++ xfs_buf_unlock(bp); ++ return rval; ++ } ++ + if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) + return XFS_ITEM_LOCKED; + +@@ -622,7 +655,8 @@ static const struct xfs_item_ops xfs_ino + .iop_unlock = xfs_inode_item_unlock, + .iop_committed = xfs_inode_item_committed, + .iop_push = xfs_inode_item_push, +- .iop_committing = xfs_inode_item_committing ++ .iop_committing = xfs_inode_item_committing, ++ .iop_error = xfs_inode_item_error + }; + + +@@ -710,7 +744,8 @@ xfs_iflush_done( + * the AIL lock. + */ + iip = INODE_ITEM(blip); +- if (iip->ili_logged && blip->li_lsn == iip->ili_flush_lsn) ++ if ((iip->ili_logged && blip->li_lsn == iip->ili_flush_lsn) || ++ lip->li_flags & XFS_LI_FAILED) + need_ail++; + + blip = next; +@@ -718,7 +753,8 @@ xfs_iflush_done( + + /* make sure we capture the state of the initial inode. */ + iip = INODE_ITEM(lip); +- if (iip->ili_logged && lip->li_lsn == iip->ili_flush_lsn) ++ if ((iip->ili_logged && lip->li_lsn == iip->ili_flush_lsn) || ++ lip->li_flags & XFS_LI_FAILED) + need_ail++; + + /* +@@ -739,6 +775,9 @@ xfs_iflush_done( + if (INODE_ITEM(blip)->ili_logged && + blip->li_lsn == INODE_ITEM(blip)->ili_flush_lsn) + mlip_changed |= xfs_ail_delete_one(ailp, blip); ++ else { ++ xfs_clear_li_failed(blip); ++ } + } + + if (mlip_changed) { +--- a/fs/xfs/xfs_trans.h ++++ b/fs/xfs/xfs_trans.h +@@ -50,6 +50,7 @@ typedef struct xfs_log_item { + struct xfs_ail *li_ailp; /* ptr to AIL */ + uint li_type; /* item type */ + uint li_flags; /* misc flags */ ++ struct xfs_buf *li_buf; /* real buffer pointer */ + struct xfs_log_item *li_bio_list; /* buffer item list */ + void (*li_cb)(struct xfs_buf *, + struct xfs_log_item *); +--- a/fs/xfs/xfs_trans_ail.c ++++ b/fs/xfs/xfs_trans_ail.c +@@ -687,12 +687,13 @@ xfs_trans_ail_update_bulk( + bool + xfs_ail_delete_one( + struct xfs_ail *ailp, +- struct xfs_log_item *lip) ++ struct xfs_log_item *lip) + { + struct xfs_log_item *mlip = xfs_ail_min(ailp); + + trace_xfs_ail_delete(lip, mlip->li_lsn, lip->li_lsn); + xfs_ail_delete(ailp, lip); ++ xfs_clear_li_failed(lip); + lip->li_flags &= ~XFS_LI_IN_AIL; + lip->li_lsn = 0; + +--- a/fs/xfs/xfs_trans_priv.h ++++ b/fs/xfs/xfs_trans_priv.h +@@ -164,4 +164,35 @@ xfs_trans_ail_copy_lsn( + *dst = *src; + } + #endif ++ ++static inline void ++xfs_clear_li_failed( ++ struct xfs_log_item *lip) ++{ ++ struct xfs_buf *bp = lip->li_buf; ++ ++ ASSERT(lip->li_flags & XFS_LI_IN_AIL); ++ lockdep_assert_held(&lip->li_ailp->xa_lock); ++ ++ if (lip->li_flags & XFS_LI_FAILED) { ++ lip->li_flags &= ~XFS_LI_FAILED; ++ lip->li_buf = NULL; ++ xfs_buf_rele(bp); ++ } ++} ++ ++static inline void ++xfs_set_li_failed( ++ struct xfs_log_item *lip, ++ 
struct xfs_buf *bp) ++{ ++ lockdep_assert_held(&lip->li_ailp->xa_lock); ++ ++ if (!(lip->li_flags & XFS_LI_FAILED)) { ++ xfs_buf_hold(bp); ++ lip->li_flags |= XFS_LI_FAILED; ++ lip->li_buf = bp; ++ } ++} ++ + #endif /* __XFS_TRANS_PRIV_H__ */ diff --git a/queue-4.9/xfs-refactor-buffer-logging-into-buffer-dirtying-helper.patch b/queue-4.9/xfs-refactor-buffer-logging-into-buffer-dirtying-helper.patch new file mode 100644 index 00000000000..0d572079ca2 --- /dev/null +++ b/queue-4.9/xfs-refactor-buffer-logging-into-buffer-dirtying-helper.patch @@ -0,0 +1,137 @@ +From foo@baz Mon Sep 18 10:16:36 CEST 2017 +From: Christoph Hellwig +Date: Sun, 17 Sep 2017 14:07:02 -0700 +Subject: xfs: refactor buffer logging into buffer dirtying helper +To: stable@vger.kernel.org +Cc: linux-xfs@vger.kernel.org, Brian Foster , "Darrick J . Wong" +Message-ID: <20170917210712.10804-38-hch@lst.de> + +From: Brian Foster + +commit 9684010d38eccda733b61106765e9357cf436f65 upstream. + +xfs_trans_log_buf() is responsible for logging the dirty segments of +a buffer along with setting all of the necessary state on the +transaction, buffer, bli, etc., to ensure that the associated items +are marked as dirty and prepared for I/O. We have a couple use cases +that need to to dirty a buffer in a transaction without actually +logging dirty ranges of the buffer. One existing use case is +ordered buffers, which are currently logged with arbitrary ranges to +accomplish this even though the content of ordered buffers is never +written to the log. Another pending use case is to relog an already +dirty buffer across rolled transactions within the deferred +operations infrastructure. This is required to prevent a held +(XFS_BLI_HOLD) buffer from pinning the tail of the log. + +Refactor xfs_trans_log_buf() into a new function that contains all +of the logic responsible to dirty the transaction, lidp, buffer and +bli. This new function can be used in the future for the use cases +outlined above. This patch does not introduce functional changes. + +Signed-off-by: Brian Foster +Reviewed-by: Allison Henderson +Reviewed-by: Darrick J. Wong +Reviewed-by: Christoph Hellwig +Signed-off-by: Darrick J. Wong +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/xfs_trans.h | 4 +++- + fs/xfs/xfs_trans_buf.c | 46 ++++++++++++++++++++++++++++++---------------- + 2 files changed, 33 insertions(+), 17 deletions(-) + +--- a/fs/xfs/xfs_trans.h ++++ b/fs/xfs/xfs_trans.h +@@ -222,7 +222,9 @@ void xfs_trans_dquot_buf(xfs_trans_t *, + void xfs_trans_inode_alloc_buf(xfs_trans_t *, struct xfs_buf *); + void xfs_trans_ichgtime(struct xfs_trans *, struct xfs_inode *, int); + void xfs_trans_ijoin(struct xfs_trans *, struct xfs_inode *, uint); +-void xfs_trans_log_buf(xfs_trans_t *, struct xfs_buf *, uint, uint); ++void xfs_trans_log_buf(struct xfs_trans *, struct xfs_buf *, uint, ++ uint); ++void xfs_trans_dirty_buf(struct xfs_trans *, struct xfs_buf *); + void xfs_trans_log_inode(xfs_trans_t *, struct xfs_inode *, uint); + + void xfs_extent_free_init_defer_op(void); +--- a/fs/xfs/xfs_trans_buf.c ++++ b/fs/xfs/xfs_trans_buf.c +@@ -493,25 +493,17 @@ xfs_trans_bhold_release(xfs_trans_t *tp, + } + + /* +- * This is called to mark bytes first through last inclusive of the given +- * buffer as needing to be logged when the transaction is committed. +- * The buffer must already be associated with the given transaction. 
+- * +- * First and last are numbers relative to the beginning of this buffer, +- * so the first byte in the buffer is numbered 0 regardless of the +- * value of b_blkno. ++ * Mark a buffer dirty in the transaction. + */ + void +-xfs_trans_log_buf(xfs_trans_t *tp, +- xfs_buf_t *bp, +- uint first, +- uint last) ++xfs_trans_dirty_buf( ++ struct xfs_trans *tp, ++ struct xfs_buf *bp) + { +- xfs_buf_log_item_t *bip = bp->b_fspriv; ++ struct xfs_buf_log_item *bip = bp->b_fspriv; + + ASSERT(bp->b_transp == tp); + ASSERT(bip != NULL); +- ASSERT(first <= last && last < BBTOB(bp->b_length)); + ASSERT(bp->b_iodone == NULL || + bp->b_iodone == xfs_buf_iodone_callbacks); + +@@ -531,8 +523,6 @@ xfs_trans_log_buf(xfs_trans_t *tp, + bp->b_iodone = xfs_buf_iodone_callbacks; + bip->bli_item.li_cb = xfs_buf_iodone; + +- trace_xfs_trans_log_buf(bip); +- + /* + * If we invalidated the buffer within this transaction, then + * cancel the invalidation now that we're dirtying the buffer +@@ -545,15 +535,39 @@ xfs_trans_log_buf(xfs_trans_t *tp, + bp->b_flags &= ~XBF_STALE; + bip->__bli_format.blf_flags &= ~XFS_BLF_CANCEL; + } ++ bip->bli_flags |= XFS_BLI_DIRTY | XFS_BLI_LOGGED; + + tp->t_flags |= XFS_TRANS_DIRTY; + bip->bli_item.li_desc->lid_flags |= XFS_LID_DIRTY; ++} ++ ++/* ++ * This is called to mark bytes first through last inclusive of the given ++ * buffer as needing to be logged when the transaction is committed. ++ * The buffer must already be associated with the given transaction. ++ * ++ * First and last are numbers relative to the beginning of this buffer, ++ * so the first byte in the buffer is numbered 0 regardless of the ++ * value of b_blkno. ++ */ ++void ++xfs_trans_log_buf( ++ struct xfs_trans *tp, ++ struct xfs_buf *bp, ++ uint first, ++ uint last) ++{ ++ struct xfs_buf_log_item *bip = bp->b_fspriv; ++ ++ ASSERT(first <= last && last < BBTOB(bp->b_length)); ++ ++ xfs_trans_dirty_buf(tp, bp); + + /* + * If we have an ordered buffer we are not logging any dirty range but + * it still needs to be marked dirty and that it has been logged. + */ +- bip->bli_flags |= XFS_BLI_DIRTY | XFS_BLI_LOGGED; ++ trace_xfs_trans_log_buf(bip); + if (!(bip->bli_flags & XFS_BLI_ORDERED)) + xfs_buf_item_log(bip, first, last); + } diff --git a/queue-4.9/xfs-relog-dirty-buffers-during-swapext-bmbt-owner-change.patch b/queue-4.9/xfs-relog-dirty-buffers-during-swapext-bmbt-owner-change.patch new file mode 100644 index 00000000000..346318d2998 --- /dev/null +++ b/queue-4.9/xfs-relog-dirty-buffers-during-swapext-bmbt-owner-change.patch @@ -0,0 +1,177 @@ +From foo@baz Mon Sep 18 10:16:36 CEST 2017 +From: Christoph Hellwig +Date: Sun, 17 Sep 2017 14:07:07 -0700 +Subject: xfs: relog dirty buffers during swapext bmbt owner change +To: stable@vger.kernel.org +Cc: linux-xfs@vger.kernel.org, Brian Foster , "Darrick J . Wong" +Message-ID: <20170917210712.10804-43-hch@lst.de> + +From: Brian Foster + +commit 2dd3d709fc4338681a3aa61658122fa8faa5a437 upstream. + +The owner change bmbt scan that occurs during extent swap operations +does not handle ordered buffer failures. Buffers that cannot be +marked ordered must be physically logged so previously dirty ranges +of the buffer can be relogged in the transaction. + +Since the bmbt scan may need to process and potentially log a large +number of blocks, we can't expect to complete this operation in a +single transaction. Update extent swap to use a permanent +transaction with enough log reservation to physically log a buffer. 
+Update the bmbt scan to physically log any buffers that cannot be +ordered and to terminate the scan with -EAGAIN. On -EAGAIN, the +caller rolls the transaction and restarts the scan. Finally, update +the bmbt scan helper function to skip bmbt blocks that already match +the expected owner so they are not reprocessed after scan restarts. + +Signed-off-by: Brian Foster +Reviewed-by: Darrick J. Wong +Reviewed-by: Christoph Hellwig +[darrick: fix the xfs_trans_roll call] +Signed-off-by: Darrick J. Wong +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/libxfs/xfs_btree.c | 26 ++++++++++++++------ + fs/xfs/xfs_bmap_util.c | 59 +++++++++++++++++++++++++++++++++++++--------- + 2 files changed, 66 insertions(+), 19 deletions(-) + +--- a/fs/xfs/libxfs/xfs_btree.c ++++ b/fs/xfs/libxfs/xfs_btree.c +@@ -4435,10 +4435,15 @@ xfs_btree_block_change_owner( + + /* modify the owner */ + block = xfs_btree_get_block(cur, level, &bp); +- if (cur->bc_flags & XFS_BTREE_LONG_PTRS) ++ if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { ++ if (block->bb_u.l.bb_owner == cpu_to_be64(bbcoi->new_owner)) ++ return 0; + block->bb_u.l.bb_owner = cpu_to_be64(bbcoi->new_owner); +- else ++ } else { ++ if (block->bb_u.s.bb_owner == cpu_to_be32(bbcoi->new_owner)) ++ return 0; + block->bb_u.s.bb_owner = cpu_to_be32(bbcoi->new_owner); ++ } + + /* + * If the block is a root block hosted in an inode, we might not have a +@@ -4447,14 +4452,19 @@ xfs_btree_block_change_owner( + * block is formatted into the on-disk inode fork. We still change it, + * though, so everything is consistent in memory. + */ +- if (bp) { +- if (cur->bc_tp) +- xfs_trans_ordered_buf(cur->bc_tp, bp); +- else +- xfs_buf_delwri_queue(bp, bbcoi->buffer_list); +- } else { ++ if (!bp) { + ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE); + ASSERT(level == cur->bc_nlevels - 1); ++ return 0; ++ } ++ ++ if (cur->bc_tp) { ++ if (!xfs_trans_ordered_buf(cur->bc_tp, bp)) { ++ xfs_btree_log_block(cur, bp, XFS_BB_OWNER); ++ return -EAGAIN; ++ } ++ } else { ++ xfs_buf_delwri_queue(bp, bbcoi->buffer_list); + } + + return 0; +--- a/fs/xfs/xfs_bmap_util.c ++++ b/fs/xfs/xfs_bmap_util.c +@@ -1914,6 +1914,48 @@ xfs_swap_extent_forks( + return 0; + } + ++/* ++ * Fix up the owners of the bmbt blocks to refer to the current inode. The ++ * change owner scan attempts to order all modified buffers in the current ++ * transaction. In the event of ordered buffer failure, the offending buffer is ++ * physically logged as a fallback and the scan returns -EAGAIN. We must roll ++ * the transaction in this case to replenish the fallback log reservation and ++ * restart the scan. This process repeats until the scan completes. ++ */ ++static int ++xfs_swap_change_owner( ++ struct xfs_trans **tpp, ++ struct xfs_inode *ip, ++ struct xfs_inode *tmpip) ++{ ++ int error; ++ struct xfs_trans *tp = *tpp; ++ ++ do { ++ error = xfs_bmbt_change_owner(tp, ip, XFS_DATA_FORK, ip->i_ino, ++ NULL); ++ /* success or fatal error */ ++ if (error != -EAGAIN) ++ break; ++ ++ error = xfs_trans_roll(tpp, NULL); ++ if (error) ++ break; ++ tp = *tpp; ++ ++ /* ++ * Redirty both inodes so they can relog and keep the log tail ++ * moving forward. 
++ */ ++ xfs_trans_ijoin(tp, ip, 0); ++ xfs_trans_ijoin(tp, tmpip, 0); ++ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); ++ xfs_trans_log_inode(tp, tmpip, XFS_ILOG_CORE); ++ } while (true); ++ ++ return error; ++} ++ + int + xfs_swap_extents( + struct xfs_inode *ip, /* target inode */ +@@ -1927,8 +1969,8 @@ xfs_swap_extents( + int error = 0; + int lock_flags; + struct xfs_ifork *cowfp; +- __uint64_t f; +- int resblks; ++ uint64_t f; ++ int resblks = 0; + + /* + * Lock the inodes against other IO, page faults and truncate to +@@ -1976,11 +2018,8 @@ xfs_swap_extents( + XFS_SWAP_RMAP_SPACE_RES(mp, + XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK), + XFS_DATA_FORK); +- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, +- 0, 0, &tp); +- } else +- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, +- 0, 0, &tp); ++ } ++ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, &tp); + if (error) + goto out_unlock; + +@@ -2072,14 +2111,12 @@ xfs_swap_extents( + * inode number of the current inode. + */ + if (src_log_flags & XFS_ILOG_DOWNER) { +- error = xfs_bmbt_change_owner(tp, ip, XFS_DATA_FORK, +- ip->i_ino, NULL); ++ error = xfs_swap_change_owner(&tp, ip, tip); + if (error) + goto out_trans_cancel; + } + if (target_log_flags & XFS_ILOG_DOWNER) { +- error = xfs_bmbt_change_owner(tp, tip, XFS_DATA_FORK, +- tip->i_ino, NULL); ++ error = xfs_swap_change_owner(&tp, tip, ip); + if (error) + goto out_trans_cancel; + } diff --git a/queue-4.9/xfs-remove-unnecessary-dirty-bli-format-check-for-ordered-bufs.patch b/queue-4.9/xfs-remove-unnecessary-dirty-bli-format-check-for-ordered-bufs.patch new file mode 100644 index 00000000000..e5b5b36f5ee --- /dev/null +++ b/queue-4.9/xfs-remove-unnecessary-dirty-bli-format-check-for-ordered-bufs.patch @@ -0,0 +1,158 @@ +From foo@baz Mon Sep 18 10:16:36 CEST 2017 +From: Christoph Hellwig +Date: Sun, 17 Sep 2017 14:07:00 -0700 +Subject: xfs: remove unnecessary dirty bli format check for ordered bufs +To: stable@vger.kernel.org +Cc: linux-xfs@vger.kernel.org, Brian Foster , "Darrick J . Wong" +Message-ID: <20170917210712.10804-36-hch@lst.de> + +From: Brian Foster + +commit 6453c65d3576bc3e602abb5add15f112755c08ca upstream. + +xfs_buf_item_unlock() historically checked the dirty state of the +buffer by manually checking the buffer log formats for dirty +segments. The introduction of ordered buffers invalidated this check +because ordered buffers have dirty bli's but no dirty (logged) +segments. The check was updated to accommodate ordered buffers by +looking at the bli state first and considering the blf only if the +bli is clean. + +This logic is safe but unnecessary. There is no valid case where the +bli is clean yet the blf has dirty segments. The bli is set dirty +whenever the blf is logged (via xfs_trans_log_buf()) and the blf is +cleared in the only place BLI_DIRTY is cleared (xfs_trans_binval()). + +Remove the conditional blf dirty checks and replace with an assert +that should catch any discrepencies between bli and blf dirty +states. Refactor the old blf dirty check into a helper function to +be used by the assert. + +Signed-off-by: Brian Foster +Reviewed-by: Darrick J. Wong +Signed-off-by: Darrick J. 
Wong +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/xfs_buf_item.c | 62 +++++++++++++++++++++++++------------------------- + fs/xfs/xfs_buf_item.h | 1 + 2 files changed, 33 insertions(+), 30 deletions(-) + +--- a/fs/xfs/xfs_buf_item.c ++++ b/fs/xfs/xfs_buf_item.c +@@ -575,26 +575,18 @@ xfs_buf_item_unlock( + { + struct xfs_buf_log_item *bip = BUF_ITEM(lip); + struct xfs_buf *bp = bip->bli_buf; +- bool clean; +- bool aborted; +- int flags; ++ bool aborted = !!(lip->li_flags & XFS_LI_ABORTED); ++ bool hold = !!(bip->bli_flags & XFS_BLI_HOLD); ++ bool dirty = !!(bip->bli_flags & XFS_BLI_DIRTY); ++ bool ordered = !!(bip->bli_flags & XFS_BLI_ORDERED); + + /* Clear the buffer's association with this transaction. */ + bp->b_transp = NULL; + + /* +- * If this is a transaction abort, don't return early. Instead, allow +- * the brelse to happen. Normally it would be done for stale +- * (cancelled) buffers at unpin time, but we'll never go through the +- * pin/unpin cycle if we abort inside commit. +- */ +- aborted = (lip->li_flags & XFS_LI_ABORTED) ? true : false; +- /* +- * Before possibly freeing the buf item, copy the per-transaction state +- * so we can reference it safely later after clearing it from the +- * buffer log item. ++ * The per-transaction state has been copied above so clear it from the ++ * bli. + */ +- flags = bip->bli_flags; + bip->bli_flags &= ~(XFS_BLI_LOGGED | XFS_BLI_HOLD | XFS_BLI_ORDERED); + + /* +@@ -602,7 +594,7 @@ xfs_buf_item_unlock( + * unlock the buffer and free the buf item when the buffer is unpinned + * for the last time. + */ +- if (flags & XFS_BLI_STALE) { ++ if (bip->bli_flags & XFS_BLI_STALE) { + trace_xfs_buf_item_unlock_stale(bip); + ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL); + if (!aborted) { +@@ -620,20 +612,11 @@ xfs_buf_item_unlock( + * regardless of whether it is dirty or not. A dirty abort implies a + * shutdown, anyway. + * +- * Ordered buffers are dirty but may have no recorded changes, so ensure +- * we only release clean items here. ++ * The bli dirty state should match whether the blf has logged segments ++ * except for ordered buffers, where only the bli should be dirty. + */ +- clean = (flags & XFS_BLI_DIRTY) ? false : true; +- if (clean) { +- int i; +- for (i = 0; i < bip->bli_format_count; i++) { +- if (!xfs_bitmap_empty(bip->bli_formats[i].blf_data_map, +- bip->bli_formats[i].blf_map_size)) { +- clean = false; +- break; +- } +- } +- } ++ ASSERT((!ordered && dirty == xfs_buf_item_dirty_format(bip)) || ++ (ordered && dirty && !xfs_buf_item_dirty_format(bip))); + + /* + * Clean buffers, by definition, cannot be in the AIL. However, aborted +@@ -652,11 +635,11 @@ xfs_buf_item_unlock( + ASSERT(XFS_FORCED_SHUTDOWN(lip->li_mountp)); + xfs_trans_ail_remove(lip, SHUTDOWN_LOG_IO_ERROR); + xfs_buf_item_relse(bp); +- } else if (clean) ++ } else if (!dirty) + xfs_buf_item_relse(bp); + } + +- if (!(flags & XFS_BLI_HOLD)) ++ if (!hold) + xfs_buf_relse(bp); + } + +@@ -945,6 +928,25 @@ xfs_buf_item_log( + } + + ++/* ++ * Return true if the buffer has any ranges logged/dirtied by a transaction, ++ * false otherwise. 
++ */ ++bool ++xfs_buf_item_dirty_format( ++ struct xfs_buf_log_item *bip) ++{ ++ int i; ++ ++ for (i = 0; i < bip->bli_format_count; i++) { ++ if (!xfs_bitmap_empty(bip->bli_formats[i].blf_data_map, ++ bip->bli_formats[i].blf_map_size)) ++ return true; ++ } ++ ++ return false; ++} ++ + STATIC void + xfs_buf_item_free( + xfs_buf_log_item_t *bip) +--- a/fs/xfs/xfs_buf_item.h ++++ b/fs/xfs/xfs_buf_item.h +@@ -64,6 +64,7 @@ typedef struct xfs_buf_log_item { + int xfs_buf_item_init(struct xfs_buf *, struct xfs_mount *); + void xfs_buf_item_relse(struct xfs_buf *); + void xfs_buf_item_log(xfs_buf_log_item_t *, uint, uint); ++bool xfs_buf_item_dirty_format(struct xfs_buf_log_item *); + void xfs_buf_attach_iodone(struct xfs_buf *, + void(*)(struct xfs_buf *, xfs_log_item_t *), + xfs_log_item_t *); diff --git a/queue-4.9/xfs-remove-xfs_trans_ail_delete_bulk.patch b/queue-4.9/xfs-remove-xfs_trans_ail_delete_bulk.patch new file mode 100644 index 00000000000..428abd0253d --- /dev/null +++ b/queue-4.9/xfs-remove-xfs_trans_ail_delete_bulk.patch @@ -0,0 +1,193 @@ +From foo@baz Mon Sep 18 10:16:36 CEST 2017 +From: Christoph Hellwig +Date: Sun, 17 Sep 2017 14:06:48 -0700 +Subject: xfs: remove xfs_trans_ail_delete_bulk +To: stable@vger.kernel.org +Cc: linux-xfs@vger.kernel.org, "Darrick J . Wong" +Message-ID: <20170917210712.10804-24-hch@lst.de> + +commit 27af1bbf524459962d1477a38ac6e0b7f79aaecc upstream. + +xfs_iflush_done uses an on-stack variable length array to pass the log +items to be deleted to xfs_trans_ail_delete_bulk. On-stack VLAs are a +nasty gcc extension that can lead to unbounded stack allocations, but +fortunately we can easily avoid them by simply open coding +xfs_trans_ail_delete_bulk in xfs_iflush_done, which is the only caller +of it except for the single-item xfs_trans_ail_delete. + +Signed-off-by: Christoph Hellwig +Reviewed-by: Darrick J. Wong +Signed-off-by: Darrick J. Wong +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/xfs_inode_item.c | 29 +++++++++++-------- + fs/xfs/xfs_trans_ail.c | 73 +++++++++++++++++++++++------------------------- + fs/xfs/xfs_trans_priv.h | 15 +-------- + 3 files changed, 56 insertions(+), 61 deletions(-) + +--- a/fs/xfs/xfs_inode_item.c ++++ b/fs/xfs/xfs_inode_item.c +@@ -731,22 +731,27 @@ xfs_iflush_done( + * holding the lock before removing the inode from the AIL. + */ + if (need_ail) { +- struct xfs_log_item *log_items[need_ail]; +- int i = 0; ++ bool mlip_changed = false; ++ ++ /* this is an opencoded batch version of xfs_trans_ail_delete */ + spin_lock(&ailp->xa_lock); + for (blip = lip; blip; blip = blip->li_bio_list) { +- iip = INODE_ITEM(blip); +- if (iip->ili_logged && +- blip->li_lsn == iip->ili_flush_lsn) { +- log_items[i++] = blip; +- } +- ASSERT(i <= need_ail); ++ if (INODE_ITEM(blip)->ili_logged && ++ blip->li_lsn == INODE_ITEM(blip)->ili_flush_lsn) ++ mlip_changed |= xfs_ail_delete_one(ailp, blip); + } +- /* xfs_trans_ail_delete_bulk() drops the AIL lock. */ +- xfs_trans_ail_delete_bulk(ailp, log_items, i, +- SHUTDOWN_CORRUPT_INCORE); +- } + ++ if (mlip_changed) { ++ if (!XFS_FORCED_SHUTDOWN(ailp->xa_mount)) ++ xlog_assign_tail_lsn_locked(ailp->xa_mount); ++ if (list_empty(&ailp->xa_ail)) ++ wake_up_all(&ailp->xa_empty); ++ } ++ spin_unlock(&ailp->xa_lock); ++ ++ if (mlip_changed) ++ xfs_log_space_wake(ailp->xa_mount); ++ } + + /* + * clean up and unlock the flush lock now we are done. 
We can clear the +--- a/fs/xfs/xfs_trans_ail.c ++++ b/fs/xfs/xfs_trans_ail.c +@@ -684,8 +684,23 @@ xfs_trans_ail_update_bulk( + } + } + +-/* +- * xfs_trans_ail_delete_bulk - remove multiple log items from the AIL ++bool ++xfs_ail_delete_one( ++ struct xfs_ail *ailp, ++ struct xfs_log_item *lip) ++{ ++ struct xfs_log_item *mlip = xfs_ail_min(ailp); ++ ++ trace_xfs_ail_delete(lip, mlip->li_lsn, lip->li_lsn); ++ xfs_ail_delete(ailp, lip); ++ lip->li_flags &= ~XFS_LI_IN_AIL; ++ lip->li_lsn = 0; ++ ++ return mlip == lip; ++} ++ ++/** ++ * Remove a log items from the AIL + * + * @xfs_trans_ail_delete_bulk takes an array of log items that all need to + * removed from the AIL. The caller is already holding the AIL lock, and done +@@ -706,52 +721,36 @@ xfs_trans_ail_update_bulk( + * before returning. + */ + void +-xfs_trans_ail_delete_bulk( ++xfs_trans_ail_delete( + struct xfs_ail *ailp, +- struct xfs_log_item **log_items, +- int nr_items, ++ struct xfs_log_item *lip, + int shutdown_type) __releases(ailp->xa_lock) + { +- xfs_log_item_t *mlip; +- int mlip_changed = 0; +- int i; +- +- mlip = xfs_ail_min(ailp); +- +- for (i = 0; i < nr_items; i++) { +- struct xfs_log_item *lip = log_items[i]; +- if (!(lip->li_flags & XFS_LI_IN_AIL)) { +- struct xfs_mount *mp = ailp->xa_mount; +- +- spin_unlock(&ailp->xa_lock); +- if (!XFS_FORCED_SHUTDOWN(mp)) { +- xfs_alert_tag(mp, XFS_PTAG_AILDELETE, +- "%s: attempting to delete a log item that is not in the AIL", +- __func__); +- xfs_force_shutdown(mp, shutdown_type); +- } +- return; +- } ++ struct xfs_mount *mp = ailp->xa_mount; ++ bool mlip_changed; + +- trace_xfs_ail_delete(lip, mlip->li_lsn, lip->li_lsn); +- xfs_ail_delete(ailp, lip); +- lip->li_flags &= ~XFS_LI_IN_AIL; +- lip->li_lsn = 0; +- if (mlip == lip) +- mlip_changed = 1; ++ if (!(lip->li_flags & XFS_LI_IN_AIL)) { ++ spin_unlock(&ailp->xa_lock); ++ if (!XFS_FORCED_SHUTDOWN(mp)) { ++ xfs_alert_tag(mp, XFS_PTAG_AILDELETE, ++ "%s: attempting to delete a log item that is not in the AIL", ++ __func__); ++ xfs_force_shutdown(mp, shutdown_type); ++ } ++ return; + } + ++ mlip_changed = xfs_ail_delete_one(ailp, lip); + if (mlip_changed) { +- if (!XFS_FORCED_SHUTDOWN(ailp->xa_mount)) +- xlog_assign_tail_lsn_locked(ailp->xa_mount); ++ if (!XFS_FORCED_SHUTDOWN(mp)) ++ xlog_assign_tail_lsn_locked(mp); + if (list_empty(&ailp->xa_ail)) + wake_up_all(&ailp->xa_empty); +- spin_unlock(&ailp->xa_lock); ++ } + ++ spin_unlock(&ailp->xa_lock); ++ if (mlip_changed) + xfs_log_space_wake(ailp->xa_mount); +- } else { +- spin_unlock(&ailp->xa_lock); +- } + } + + int +--- a/fs/xfs/xfs_trans_priv.h ++++ b/fs/xfs/xfs_trans_priv.h +@@ -106,18 +106,9 @@ xfs_trans_ail_update( + xfs_trans_ail_update_bulk(ailp, NULL, &lip, 1, lsn); + } + +-void xfs_trans_ail_delete_bulk(struct xfs_ail *ailp, +- struct xfs_log_item **log_items, int nr_items, +- int shutdown_type) +- __releases(ailp->xa_lock); +-static inline void +-xfs_trans_ail_delete( +- struct xfs_ail *ailp, +- xfs_log_item_t *lip, +- int shutdown_type) __releases(ailp->xa_lock) +-{ +- xfs_trans_ail_delete_bulk(ailp, &lip, 1, shutdown_type); +-} ++bool xfs_ail_delete_one(struct xfs_ail *ailp, struct xfs_log_item *lip); ++void xfs_trans_ail_delete(struct xfs_ail *ailp, struct xfs_log_item *lip, ++ int shutdown_type) __releases(ailp->xa_lock); + + static inline void + xfs_trans_ail_remove( diff --git a/queue-4.9/xfs-skip-bmbt-block-ino-validation-during-owner-change.patch b/queue-4.9/xfs-skip-bmbt-block-ino-validation-during-owner-change.patch new file mode 100644 index 
00000000000..90303e5850a --- /dev/null +++ b/queue-4.9/xfs-skip-bmbt-block-ino-validation-during-owner-change.patch @@ -0,0 +1,80 @@ +From foo@baz Mon Sep 18 10:16:36 CEST 2017 +From: Christoph Hellwig +Date: Sun, 17 Sep 2017 14:07:04 -0700 +Subject: xfs: skip bmbt block ino validation during owner change +To: stable@vger.kernel.org +Cc: linux-xfs@vger.kernel.org, Brian Foster , "Darrick J . Wong" +Message-ID: <20170917210712.10804-40-hch@lst.de> + +From: Brian Foster + +commit 99c794c639a65cc7b74f30a674048fd100fe9ac8 upstream. + +Extent swap uses xfs_btree_visit_blocks() to fix up bmbt block +owners on v5 (!rmapbt) filesystems. The bmbt scan uses +xfs_btree_lookup_get_block() to read bmbt blocks which verifies the +current owner of the block against the parent inode of the bmbt. +This works during extent swap because the bmbt owners are updated to +the opposite inode number before the inode extent forks are swapped. + +The modified bmbt blocks are marked as ordered buffers which allows +everything to commit in a single transaction. If the transaction +commits to the log and the system crashes such that recovery of the +extent swap is required, log recovery restarts the bmbt scan to fix +up any bmbt blocks that may have not been written back before the +crash. The log recovery bmbt scan occurs after the inode forks have +been swapped, however. This causes the bmbt block owner verification +to fail, leads to log recovery failure and requires xfs_repair to +zap the log to recover. + +Define a new invalid inode owner flag to inform the btree block +lookup mechanism that the current inode may be invalid with respect +to the current owner of the bmbt block. Set this flag on the cursor +used for change owner scans to allow this operation to work at +runtime and during log recovery. + +Signed-off-by: Brian Foster +Fixes: bb3be7e7c ("xfs: check for bogus values in btree block headers") +Cc: stable@vger.kernel.org +Reviewed-by: Darrick J. Wong +Reviewed-by: Christoph Hellwig +Signed-off-by: Darrick J. Wong +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/libxfs/xfs_bmap_btree.c | 1 + + fs/xfs/libxfs/xfs_btree.c | 1 + + fs/xfs/libxfs/xfs_btree.h | 3 ++- + 3 files changed, 4 insertions(+), 1 deletion(-) + +--- a/fs/xfs/libxfs/xfs_bmap_btree.c ++++ b/fs/xfs/libxfs/xfs_bmap_btree.c +@@ -888,6 +888,7 @@ xfs_bmbt_change_owner( + cur = xfs_bmbt_init_cursor(ip->i_mount, tp, ip, whichfork); + if (!cur) + return -ENOMEM; ++ cur->bc_private.b.flags |= XFS_BTCUR_BPRV_INVALID_OWNER; + + error = xfs_btree_change_owner(cur, new_owner, buffer_list); + xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR); +--- a/fs/xfs/libxfs/xfs_btree.c ++++ b/fs/xfs/libxfs/xfs_btree.c +@@ -1774,6 +1774,7 @@ xfs_btree_lookup_get_block( + + /* Check the inode owner since the verifiers don't. 
*/ + if (xfs_sb_version_hascrc(&cur->bc_mp->m_sb) && ++ !(cur->bc_private.b.flags & XFS_BTCUR_BPRV_INVALID_OWNER) && + (cur->bc_flags & XFS_BTREE_LONG_PTRS) && + be64_to_cpu((*blkp)->bb_u.l.bb_owner) != + cur->bc_private.b.ip->i_ino) +--- a/fs/xfs/libxfs/xfs_btree.h ++++ b/fs/xfs/libxfs/xfs_btree.h +@@ -268,7 +268,8 @@ typedef struct xfs_btree_cur + short forksize; /* fork's inode space */ + char whichfork; /* data or attr fork */ + char flags; /* flags */ +-#define XFS_BTCUR_BPRV_WASDEL 1 /* was delayed */ ++#define XFS_BTCUR_BPRV_WASDEL (1<<0) /* was delayed */ ++#define XFS_BTCUR_BPRV_INVALID_OWNER (1<<1) /* for ext swap */ + } b; + } bc_private; /* per-btree type data */ + } xfs_btree_cur_t; diff --git a/queue-4.9/xfs-stop-searching-for-free-slots-in-an-inode-chunk-when-there-are-none.patch b/queue-4.9/xfs-stop-searching-for-free-slots-in-an-inode-chunk-when-there-are-none.patch new file mode 100644 index 00000000000..7a4371e5069 --- /dev/null +++ b/queue-4.9/xfs-stop-searching-for-free-slots-in-an-inode-chunk-when-there-are-none.patch @@ -0,0 +1,136 @@ +From foo@baz Mon Sep 18 10:16:36 CEST 2017 +From: Christoph Hellwig +Date: Sun, 17 Sep 2017 14:06:56 -0700 +Subject: xfs: stop searching for free slots in an inode chunk when there are none +To: stable@vger.kernel.org +Cc: linux-xfs@vger.kernel.org, Carlos Maiolino , "Darrick J . Wong" +Message-ID: <20170917210712.10804-32-hch@lst.de> + +From: Carlos Maiolino + +commit 2d32311cf19bfb8c1d2b4601974ddd951f9cfd0b upstream. + +In a filesystem without finobt, the Space manager selects an AG to alloc a new +inode, where xfs_dialloc_ag_inobt() will search the AG for the free slot chunk. + +When the new inode is in the same AG as its parent, the btree will be searched +starting on the parent's record, and then retried from the top if no slot is +available beyond the parent's record. + +To exit this loop though, xfs_dialloc_ag_inobt() relies on the fact that the +btree must have a free slot available, once its callers relied on the +agi->freecount when deciding how/where to allocate this new inode. + +In the case when the agi->freecount is corrupted, showing available inodes in an +AG, when in fact there is none, this becomes an infinite loop. + +Add a way to stop the loop when a free slot is not found in the btree, making +the function to fall into the whole AG scan which will then, be able to detect +the corruption and shut the filesystem down. + +As pointed by Brian, this might impact performance, giving the fact we +don't reset the search distance anymore when we reach the end of the +tree, giving it fewer tries before falling back to the whole AG search, but +it will only affect searches that start within 10 records to the end of the tree. + +Signed-off-by: Carlos Maiolino +Reviewed-by: Darrick J. Wong +Signed-off-by: Darrick J. 
Wong +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/libxfs/xfs_ialloc.c | 55 ++++++++++++++++++++++----------------------- + 1 file changed, 27 insertions(+), 28 deletions(-) + +--- a/fs/xfs/libxfs/xfs_ialloc.c ++++ b/fs/xfs/libxfs/xfs_ialloc.c +@@ -1123,6 +1123,7 @@ xfs_dialloc_ag_inobt( + int error; + int offset; + int i, j; ++ int searchdistance = 10; + + pag = xfs_perag_get(mp, agno); + +@@ -1149,7 +1150,6 @@ xfs_dialloc_ag_inobt( + if (pagno == agno) { + int doneleft; /* done, to the left */ + int doneright; /* done, to the right */ +- int searchdistance = 10; + + error = xfs_inobt_lookup(cur, pagino, XFS_LOOKUP_LE, &i); + if (error) +@@ -1210,21 +1210,9 @@ xfs_dialloc_ag_inobt( + /* + * Loop until we find an inode chunk with a free inode. + */ +- while (!doneleft || !doneright) { ++ while (--searchdistance > 0 && (!doneleft || !doneright)) { + int useleft; /* using left inode chunk this time */ + +- if (!--searchdistance) { +- /* +- * Not in range - save last search +- * location and allocate a new inode +- */ +- xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR); +- pag->pagl_leftrec = trec.ir_startino; +- pag->pagl_rightrec = rec.ir_startino; +- pag->pagl_pagino = pagino; +- goto newino; +- } +- + /* figure out the closer block if both are valid. */ + if (!doneleft && !doneright) { + useleft = pagino - +@@ -1268,26 +1256,37 @@ xfs_dialloc_ag_inobt( + goto error1; + } + +- /* +- * We've reached the end of the btree. because +- * we are only searching a small chunk of the +- * btree each search, there is obviously free +- * inodes closer to the parent inode than we +- * are now. restart the search again. +- */ +- pag->pagl_pagino = NULLAGINO; +- pag->pagl_leftrec = NULLAGINO; +- pag->pagl_rightrec = NULLAGINO; +- xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR); +- xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); +- goto restart_pagno; ++ if (searchdistance <= 0) { ++ /* ++ * Not in range - save last search ++ * location and allocate a new inode ++ */ ++ xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR); ++ pag->pagl_leftrec = trec.ir_startino; ++ pag->pagl_rightrec = rec.ir_startino; ++ pag->pagl_pagino = pagino; ++ ++ } else { ++ /* ++ * We've reached the end of the btree. because ++ * we are only searching a small chunk of the ++ * btree each search, there is obviously free ++ * inodes closer to the parent inode than we ++ * are now. restart the search again. ++ */ ++ pag->pagl_pagino = NULLAGINO; ++ pag->pagl_leftrec = NULLAGINO; ++ pag->pagl_rightrec = NULLAGINO; ++ xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR); ++ xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); ++ goto restart_pagno; ++ } + } + + /* + * In a different AG from the parent. + * See if the most recently allocated block has any free. + */ +-newino: + if (agi->agi_newino != cpu_to_be32(NULLAGINO)) { + error = xfs_inobt_lookup(cur, be32_to_cpu(agi->agi_newino), + XFS_LOOKUP_EQ, &i); diff --git a/queue-4.9/xfs-toggle-readonly-state-around-xfs_log_mount_finish.patch b/queue-4.9/xfs-toggle-readonly-state-around-xfs_log_mount_finish.patch new file mode 100644 index 00000000000..9694f99d6d5 --- /dev/null +++ b/queue-4.9/xfs-toggle-readonly-state-around-xfs_log_mount_finish.patch @@ -0,0 +1,62 @@ +From foo@baz Mon Sep 18 10:16:36 CEST 2017 +From: Christoph Hellwig +Date: Sun, 17 Sep 2017 14:06:47 -0700 +Subject: xfs: toggle readonly state around xfs_log_mount_finish +To: stable@vger.kernel.org +Cc: linux-xfs@vger.kernel.org, Eric Sandeen , Eric Sandeen , "Darrick J . 
Wong" +Message-ID: <20170917210712.10804-23-hch@lst.de> + +From: Eric Sandeen + +commit 6f4a1eefdd0ad4561543270a7fceadabcca075dd upstream. + +When we do log recovery on a readonly mount, unlinked inode +processing does not happen due to the readonly checks in +xfs_inactive(), which are trying to prevent any I/O on a +readonly mount. + +This is misguided - we do I/O on readonly mounts all the time, +for consistency; for example, log recovery. So do the same +RDONLY flag twiddling around xfs_log_mount_finish() as we +do around xfs_log_mount(), for the same reason. + +This all cries out for a big rework but for now this is a +simple fix to an obvious problem. + +Signed-off-by: Eric Sandeen +Reviewed-by: Brian Foster +Reviewed-by: Christoph Hellwig +Reviewed-by: Darrick J. Wong +Signed-off-by: Darrick J. Wong +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/xfs_log.c | 7 +++++++ + 1 file changed, 7 insertions(+) + +--- a/fs/xfs/xfs_log.c ++++ b/fs/xfs/xfs_log.c +@@ -743,10 +743,14 @@ xfs_log_mount_finish( + struct xfs_mount *mp) + { + int error = 0; ++ bool readonly = (mp->m_flags & XFS_MOUNT_RDONLY); + + if (mp->m_flags & XFS_MOUNT_NORECOVERY) { + ASSERT(mp->m_flags & XFS_MOUNT_RDONLY); + return 0; ++ } else if (readonly) { ++ /* Allow unlinked processing to proceed */ ++ mp->m_flags &= ~XFS_MOUNT_RDONLY; + } + + /* +@@ -764,6 +768,9 @@ xfs_log_mount_finish( + xfs_log_work_queue(mp); + mp->m_super->s_flags &= ~MS_ACTIVE; + ++ if (readonly) ++ mp->m_flags |= XFS_MOUNT_RDONLY; ++ + return error; + } + diff --git a/queue-4.9/xfs-use-kmem_free-to-free-return-value-of-kmem_zalloc.patch b/queue-4.9/xfs-use-kmem_free-to-free-return-value-of-kmem_zalloc.patch new file mode 100644 index 00000000000..1af93992f7d --- /dev/null +++ b/queue-4.9/xfs-use-kmem_free-to-free-return-value-of-kmem_zalloc.patch @@ -0,0 +1,34 @@ +From foo@baz Mon Sep 18 10:16:36 CEST 2017 +From: Christoph Hellwig +Date: Sun, 17 Sep 2017 14:07:12 -0700 +Subject: xfs: use kmem_free to free return value of kmem_zalloc +To: stable@vger.kernel.org +Cc: linux-xfs@vger.kernel.org, Pan Bian , "Darrick J . Wong" +Message-ID: <20170917210712.10804-48-hch@lst.de> + +From: Pan Bian + +commit 6c370590cfe0c36bcd62d548148aa65c984540b7 upstream. + +In function xfs_test_remount_options(), kfree() is used to free memory +allocated by kmem_zalloc(). But it is better to use kmem_free(). + +Signed-off-by: Pan Bian +Reviewed-by: Darrick J. Wong +Signed-off-by: Darrick J. Wong +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/xfs_super.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/fs/xfs/xfs_super.c ++++ b/fs/xfs/xfs_super.c +@@ -1214,7 +1214,7 @@ xfs_test_remount_options( + tmp_mp->m_super = sb; + error = xfs_parseargs(tmp_mp, options); + xfs_free_fsname(tmp_mp); +- kfree(tmp_mp); ++ kmem_free(tmp_mp); + + return error; + } diff --git a/queue-4.9/xfs-write-unmount-record-for-ro-mounts.patch b/queue-4.9/xfs-write-unmount-record-for-ro-mounts.patch new file mode 100644 index 00000000000..5d2bca563bb --- /dev/null +++ b/queue-4.9/xfs-write-unmount-record-for-ro-mounts.patch @@ -0,0 +1,66 @@ +From foo@baz Mon Sep 18 10:16:36 CEST 2017 +From: Christoph Hellwig +Date: Sun, 17 Sep 2017 14:06:46 -0700 +Subject: xfs: write unmount record for ro mounts +To: stable@vger.kernel.org +Cc: linux-xfs@vger.kernel.org, Eric Sandeen , Eric Sandeen , "Darrick J . Wong" +Message-ID: <20170917210712.10804-22-hch@lst.de> + +From: Eric Sandeen + +commit 757a69ef6cf2bf839bd4088e5609ddddd663b0c4 upstream. 
+ +There are dueling comments in the xfs code about intent +for log writes when unmounting a readonly filesystem. + +In xfs_mountfs, we see the intent: + +/* + * Now the log is fully replayed, we can transition to full read-only + * mode for read-only mounts. This will sync all the metadata and clean + * the log so that the recovery we just performed does not have to be + * replayed again on the next mount. + */ + +and it calls xfs_quiesce_attr(), but by the time we get to +xfs_log_unmount_write(), it returns early for a RDONLY mount: + + * Don't write out unmount record on read-only mounts. + +Because of this, sequential ro mounts of a filesystem with +a dirty log will replay the log each time, which seems odd. + +Fix this by writing an unmount record even for RO mounts, as long +as norecovery wasn't specified (don't write a clean log record +if a dirty log may still be there!) and the log device is +writable. + +Signed-off-by: Eric Sandeen +Reviewed-by: Brian Foster +Reviewed-by: Christoph Hellwig +Reviewed-by: Darrick J. Wong +Signed-off-by: Darrick J. Wong +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/xfs_log.c | 7 +++++-- + 1 file changed, 5 insertions(+), 2 deletions(-) + +--- a/fs/xfs/xfs_log.c ++++ b/fs/xfs/xfs_log.c +@@ -812,11 +812,14 @@ xfs_log_unmount_write(xfs_mount_t *mp) + int error; + + /* +- * Don't write out unmount record on read-only mounts. ++ * Don't write out unmount record on norecovery mounts or ro devices. + * Or, if we are doing a forced umount (typically because of IO errors). + */ +- if (mp->m_flags & XFS_MOUNT_RDONLY) ++ if (mp->m_flags & XFS_MOUNT_NORECOVERY || ++ xfs_readonly_buftarg(log->l_mp->m_logdev_targp)) { ++ ASSERT(mp->m_flags & XFS_MOUNT_RDONLY); + return 0; ++ } + + error = _xfs_log_force(mp, XFS_LOG_SYNC, NULL); + ASSERT(error || !(XLOG_FORCED_SHUTDOWN(log)));