--- /dev/null
+From foo@baz Mon Sep 18 10:16:36 CEST 2017
+From: Christoph Hellwig <hch@lst.de>
+Date: Sun, 17 Sep 2017 14:06:45 -0700
+Subject: iomap: fix integer truncation issues in the zeroing and dirtying helpers
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, "Darrick J . Wong" <darrick.wong@oracle.com>
+Message-ID: <20170917210712.10804-21-hch@lst.de>
+
+From: Christoph Hellwig <hch@lst.de>
+
+commit e28ae8e428fefe2facd72cea9f29906ecb9c861d upstream.
+
+Fix the min_t calls in the zeroing and dirtying helpers to perform the
+comparisons on 64-bit types, which prevents them from being incorrectly
+truncated and larger zeroing operations from getting stuck in a
+never-ending loop.
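+
+As an illustration, a minimal userspace sketch of the truncation (not
+the kernel code; the min_t() stand-in below mimics the kernel macro's
+cast of both arguments to the given type):
+
+	#include <stdio.h>
+
+	/* simplified stand-in for the kernel's min_t() */
+	#define min_t(type, x, y) \
+		((type)(x) < (type)(y) ? (type)(x) : (type)(y))
+
+	int main(void)
+	{
+		long long length = 1LL << 32;	/* 4 GiB zeroing request */
+		unsigned offset = 0;
+
+		/* 32-bit comparison: length truncates to 0, so the actor
+		 * loop would never make progress */
+		unsigned bad = min_t(unsigned, 4096 - offset, length);
+		/* 64-bit comparison clamps to the page correctly */
+		long long good = min_t(long long, 4096 - offset, length);
+
+		printf("bad=%u good=%lld\n", bad, good); /* bad=0 good=4096 */
+		return 0;
+	}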
+
+Special thanks to Markus Stockhausen for spotting the bug.
+
+Reported-by: Paul Menzel <pmenzel@molgen.mpg.de>
+Tested-by: Paul Menzel <pmenzel@molgen.mpg.de>
+Signed-off-by: Christoph Hellwig <hch@lst.de>
+Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/iomap.c | 4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+--- a/fs/iomap.c
++++ b/fs/iomap.c
+@@ -281,7 +281,7 @@ iomap_dirty_actor(struct inode *inode, l
+ unsigned long bytes; /* Bytes to write to page */
+
+ offset = (pos & (PAGE_SIZE - 1));
+- bytes = min_t(unsigned long, PAGE_SIZE - offset, length);
++ bytes = min_t(loff_t, PAGE_SIZE - offset, length);
+
+ rpage = __iomap_read_page(inode, pos);
+ if (IS_ERR(rpage))
+@@ -376,7 +376,7 @@ iomap_zero_range_actor(struct inode *ino
+ unsigned offset, bytes;
+
+ offset = pos & (PAGE_SIZE - 1); /* Within page */
+- bytes = min_t(unsigned, PAGE_SIZE - offset, count);
++ bytes = min_t(loff_t, PAGE_SIZE - offset, count);
+
+ if (IS_DAX(inode))
+ status = iomap_dax_zero(pos, offset, bytes, iomap);
xfs-fix-multi-ag-deadlock-in-xfs_bunmapi.patch
xfs-fix-per-inode-dax-flag-inheritance.patch
xfs-fix-inobt-inode-allocation-search-optimization.patch
+xfs-clear-ms_active-after-finishing-log-recovery.patch
+xfs-don-t-leak-quotacheck-dquots-when-cow-recovery.patch
+iomap-fix-integer-truncation-issues-in-the-zeroing-and-dirtying-helpers.patch
+xfs-write-unmount-record-for-ro-mounts.patch
+xfs-toggle-readonly-state-around-xfs_log_mount_finish.patch
+xfs-remove-xfs_trans_ail_delete_bulk.patch
+xfs-add-infrastructure-needed-for-error-propagation-during-buffer-io-failure.patch
+xfs-properly-retry-failed-inode-items-in-case-of-error-during-buffer-writeback.patch
+xfs-fix-recovery-failure-when-log-record-header-wraps-log-end.patch
+xfs-always-verify-the-log-tail-during-recovery.patch
+xfs-fix-log-recovery-corruption-error-due-to-tail-overwrite.patch
+xfs-handle-efscorrupted-during-head-tail-verification.patch
+xfs-add-log-recovery-tracepoint-for-head-tail.patch
+xfs-stop-searching-for-free-slots-in-an-inode-chunk-when-there-are-none.patch
+xfs-evict-all-inodes-involved-with-log-redo-item.patch
+xfs-check-for-race-with-xfs_reclaim_inode-in-xfs_ifree_cluster.patch
+xfs-open-code-xfs_buf_item_dirty.patch
+xfs-remove-unnecessary-dirty-bli-format-check-for-ordered-bufs.patch
+xfs-ordered-buffer-log-items-are-never-formatted.patch
+xfs-refactor-buffer-logging-into-buffer-dirtying-helper.patch
+xfs-don-t-log-dirty-ranges-for-ordered-buffers.patch
+xfs-skip-bmbt-block-ino-validation-during-owner-change.patch
+xfs-move-bmbt-owner-change-to-last-step-of-extent-swap.patch
+xfs-disallow-marking-previously-dirty-buffers-as-ordered.patch
+xfs-relog-dirty-buffers-during-swapext-bmbt-owner-change.patch
+xfs-disable-per-inode-dax-flag.patch
+xfs-fix-incorrect-log_flushed-on-fsync.patch
+xfs-don-t-set-v3-xflags-for-v2-inodes.patch
+xfs-open-code-end_buffer_async_write-in-xfs_finish_page_writeback.patch
+xfs-use-kmem_free-to-free-return-value-of-kmem_zalloc.patch
--- /dev/null
+From foo@baz Mon Sep 18 10:16:36 CEST 2017
+From: Christoph Hellwig <hch@lst.de>
+Date: Sun, 17 Sep 2017 14:06:49 -0700
+Subject: xfs: Add infrastructure needed for error propagation during buffer IO failure
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, Carlos Maiolino <cmaiolino@redhat.com>, "Darrick J . Wong" <darrick.wong@oracle.com>
+Message-ID: <20170917210712.10804-25-hch@lst.de>
+
+From: Carlos Maiolino <cmaiolino@redhat.com>
+
+commit 0b80ae6ed13169bd3a244e71169f2cc020b0c57a upstream.
+
+With the current code, XFS never re-submits a failed buffer for I/O,
+because the failed item in the buffer is kept in the flush-locked state
+forever.
+
+To be able to resubmit a log item for I/O, we need a way to mark an
+item as failed if, for any reason, the buffer to which the item is
+attached fails during writeback.
+
+Add a new log item callback to be used after an I/O completion failure
+and make the needed cleanups.
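+
+For reference, a sketch of an ->iop_error handler as a later patch in
+this series ("xfs: properly retry failed inode items in case of error
+during buffer writeback") wires it up for inode items; the
+xfs_set_li_failed() helper is assumed here and marks the item
+XFS_LI_FAILED while holding a buffer reference for a later resubmit:
+
+	STATIC void
+	xfs_inode_item_error(
+		struct xfs_log_item	*lip,
+		struct xfs_buf		*bp)
+	{
+		/* record the failure so a later AIL push can resubmit bp */
+		xfs_set_li_failed(lip, bp);
+	}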
+
+Reviewed-by: Brian Foster <bfoster@redhat.com>
+Signed-off-by: Carlos Maiolino <cmaiolino@redhat.com>
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/xfs_buf_item.c | 32 +++++++++++++++++++++++++++++++-
+ fs/xfs/xfs_trans.h | 7 +++++--
+ 2 files changed, 36 insertions(+), 3 deletions(-)
+
+--- a/fs/xfs/xfs_buf_item.c
++++ b/fs/xfs/xfs_buf_item.c
+@@ -29,6 +29,7 @@
+ #include "xfs_error.h"
+ #include "xfs_trace.h"
+ #include "xfs_log.h"
++#include "xfs_inode.h"
+
+
+ kmem_zone_t *xfs_buf_item_zone;
+@@ -1054,6 +1055,31 @@ xfs_buf_do_callbacks(
+ }
+ }
+
++/*
++ * Invoke the error state callback for each log item affected by the failed I/O.
++ *
++ * If a metadata buffer write fails with a non-permanent error, the buffer is
++ * eventually resubmitted and so the completion callbacks are not run. The error
++ * state may need to be propagated to the log items attached to the buffer,
++ * however, so the next AIL push of the item knows how to handle it correctly.
++ */
++STATIC void
++xfs_buf_do_callbacks_fail(
++ struct xfs_buf *bp)
++{
++ struct xfs_log_item *next;
++ struct xfs_log_item *lip = bp->b_fspriv;
++ struct xfs_ail *ailp = lip->li_ailp;
++
++ spin_lock(&ailp->xa_lock);
++ for (; lip; lip = next) {
++ next = lip->li_bio_list;
++ if (lip->li_ops->iop_error)
++ lip->li_ops->iop_error(lip, bp);
++ }
++ spin_unlock(&ailp->xa_lock);
++}
++
+ static bool
+ xfs_buf_iodone_callback_error(
+ struct xfs_buf *bp)
+@@ -1123,7 +1149,11 @@ xfs_buf_iodone_callback_error(
+ if ((mp->m_flags & XFS_MOUNT_UNMOUNTING) && mp->m_fail_unmount)
+ goto permanent_error;
+
+- /* still a transient error, higher layers will retry */
++ /*
++ * Still a transient error, run IO completion failure callbacks and let
++ * the higher layers retry the buffer.
++ */
++ xfs_buf_do_callbacks_fail(bp);
+ xfs_buf_ioerror(bp, 0);
+ xfs_buf_relse(bp);
+ return true;
+--- a/fs/xfs/xfs_trans.h
++++ b/fs/xfs/xfs_trans.h
+@@ -65,11 +65,13 @@ typedef struct xfs_log_item {
+ } xfs_log_item_t;
+
+ #define XFS_LI_IN_AIL 0x1
+-#define XFS_LI_ABORTED 0x2
++#define XFS_LI_ABORTED 0x2
++#define XFS_LI_FAILED 0x4
+
+ #define XFS_LI_FLAGS \
+ { XFS_LI_IN_AIL, "IN_AIL" }, \
+- { XFS_LI_ABORTED, "ABORTED" }
++ { XFS_LI_ABORTED, "ABORTED" }, \
++ { XFS_LI_FAILED, "FAILED" }
+
+ struct xfs_item_ops {
+ void (*iop_size)(xfs_log_item_t *, int *, int *);
+@@ -80,6 +82,7 @@ struct xfs_item_ops {
+ void (*iop_unlock)(xfs_log_item_t *);
+ xfs_lsn_t (*iop_committed)(xfs_log_item_t *, xfs_lsn_t);
+ void (*iop_committing)(xfs_log_item_t *, xfs_lsn_t);
++ void (*iop_error)(xfs_log_item_t *, xfs_buf_t *);
+ };
+
+ void xfs_log_item_init(struct xfs_mount *mp, struct xfs_log_item *item,
--- /dev/null
+From foo@baz Mon Sep 18 10:16:36 CEST 2017
+From: Christoph Hellwig <hch@lst.de>
+Date: Sun, 17 Sep 2017 14:06:55 -0700
+Subject: xfs: add log recovery tracepoint for head/tail
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, Brian Foster <bfoster@redhat.com>, "Darrick J . Wong" <darrick.wong@oracle.com>
+Message-ID: <20170917210712.10804-31-hch@lst.de>
+
+From: Brian Foster <bfoster@redhat.com>
+
+commit e67d3d4246e5fbb0c7c700426d11241ca9c6f473 upstream.
+
+Torn write detection and tail overwrite detection can shift the log
+head and tail respectively in the event of CRC mismatch or
+corruption errors. Add a high-level log recovery tracepoint to dump
+the final log head/tail and make those values easily attainable in
+debug/diagnostic situations.
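+
+Once applied, the tracepoint can be enabled like any other xfs
+tracepoint (e.g. via events/xfs/xfs_log_recover under the tracefs
+mount) before mounting the filesystem, so the recovered head/tail
+block numbers land in the trace buffer.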
+
+Signed-off-by: Brian Foster <bfoster@redhat.com>
+Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/xfs_log_recover.c | 2 ++
+ fs/xfs/xfs_trace.h | 18 ++++++++++++++++++
+ 2 files changed, 20 insertions(+)
+
+--- a/fs/xfs/xfs_log_recover.c
++++ b/fs/xfs/xfs_log_recover.c
+@@ -5596,6 +5596,8 @@ xlog_do_recover(
+ xfs_buf_t *bp;
+ xfs_sb_t *sbp;
+
++ trace_xfs_log_recover(log, head_blk, tail_blk);
++
+ /*
+ * First replay the images in the log.
+ */
+--- a/fs/xfs/xfs_trace.h
++++ b/fs/xfs/xfs_trace.h
+@@ -1991,6 +1991,24 @@ DEFINE_EVENT(xfs_swap_extent_class, name
+ DEFINE_SWAPEXT_EVENT(xfs_swap_extent_before);
+ DEFINE_SWAPEXT_EVENT(xfs_swap_extent_after);
+
++TRACE_EVENT(xfs_log_recover,
++ TP_PROTO(struct xlog *log, xfs_daddr_t headblk, xfs_daddr_t tailblk),
++ TP_ARGS(log, headblk, tailblk),
++ TP_STRUCT__entry(
++ __field(dev_t, dev)
++ __field(xfs_daddr_t, headblk)
++ __field(xfs_daddr_t, tailblk)
++ ),
++ TP_fast_assign(
++ __entry->dev = log->l_mp->m_super->s_dev;
++ __entry->headblk = headblk;
++ __entry->tailblk = tailblk;
++ ),
++ TP_printk("dev %d:%d headblk 0x%llx tailblk 0x%llx",
++ MAJOR(__entry->dev), MINOR(__entry->dev), __entry->headblk,
++ __entry->tailblk)
++)
++
+ TRACE_EVENT(xfs_log_recover_record,
+ TP_PROTO(struct xlog *log, struct xlog_rec_header *rhead, int pass),
+ TP_ARGS(log, rhead, pass),
--- /dev/null
+From foo@baz Mon Sep 18 10:16:36 CEST 2017
+From: Christoph Hellwig <hch@lst.de>
+Date: Sun, 17 Sep 2017 14:06:52 -0700
+Subject: xfs: always verify the log tail during recovery
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, Brian Foster <bfoster@redhat.com>, "Darrick J . Wong" <darrick.wong@oracle.com>
+Message-ID: <20170917210712.10804-28-hch@lst.de>
+
+From: Brian Foster <bfoster@redhat.com>
+
+commit 5297ac1f6d7cbf45464a49b9558831f271dfc559 upstream.
+
+Log tail verification currently only occurs when torn writes are
+detected at the head of the log. This was introduced because a
+change in the head block due to torn writes can lead to a change in
+the tail block (each log record header references the current tail)
+and the tail block should be verified before log recovery proceeds.
+
+Tail corruption is possible outside of torn write scenarios,
+however. For example, partial log writes can be detected and cleared
+during the initial head/tail block discovery process. If the partial
+write coincides with a tail overwrite, the log tail is corrupted and
+recovery fails.
+
+To facilitate correct handling of log tail overwrites, update log
+recovery to always perform tail verification. This is necessary to
+detect potential tail overwrite conditions when torn writes may not
+have occurred. This changes normal (i.e., no torn writes) recovery
+behavior slightly to detect and return CRC related errors near the
+tail before actual recovery starts.
+
+Signed-off-by: Brian Foster <bfoster@redhat.com>
+Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/xfs_log_recover.c | 26 +++-----------------------
+ 1 file changed, 3 insertions(+), 23 deletions(-)
+
+--- a/fs/xfs/xfs_log_recover.c
++++ b/fs/xfs/xfs_log_recover.c
+@@ -1183,31 +1183,11 @@ xlog_verify_head(
+ ASSERT(0);
+ return 0;
+ }
+-
+- /*
+- * Now verify the tail based on the updated head. This is
+- * required because the torn writes trimmed from the head could
+- * have been written over the tail of a previous record. Return
+- * any errors since recovery cannot proceed if the tail is
+- * corrupt.
+- *
+- * XXX: This leaves a gap in truly robust protection from torn
+- * writes in the log. If the head is behind the tail, the tail
+- * pushes forward to create some space and then a crash occurs
+- * causing the writes into the previous record's tail region to
+- * tear, log recovery isn't able to recover.
+- *
+- * How likely is this to occur? If possible, can we do something
+- * more intelligent here? Is it safe to push the tail forward if
+- * we can determine that the tail is within the range of the
+- * torn write (e.g., the kernel can only overwrite the tail if
+- * it has actually been pushed forward)? Alternatively, could we
+- * somehow prevent this condition at runtime?
+- */
+- error = xlog_verify_tail(log, *head_blk, *tail_blk);
+ }
++ if (error)
++ return error;
+
+- return error;
++ return xlog_verify_tail(log, *head_blk, *tail_blk);
+ }
+
+ /*
--- /dev/null
+From foo@baz Mon Sep 18 10:16:36 CEST 2017
+From: Christoph Hellwig <hch@lst.de>
+Date: Sun, 17 Sep 2017 14:06:58 -0700
+Subject: xfs: check for race with xfs_reclaim_inode() in xfs_ifree_cluster()
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, Omar Sandoval <osandov@fb.com>, "Darrick J . Wong" <darrick.wong@oracle.com>
+Message-ID: <20170917210712.10804-34-hch@lst.de>
+
+From: Omar Sandoval <osandov@fb.com>
+
+commit f2e9ad212def50bcf4c098c6288779dd97fff0f0 upstream.
+
+After xfs_ifree_cluster() finds an inode in the radix tree and verifies
+that the inode number is what it expected, xfs_reclaim_inode() can swoop
+in and free it. xfs_ifree_cluster() will then happily continue working
+on the freed inode. Most importantly, it will mark the inode stale,
+which will probably be overwritten when the inode slab object is
+reallocated, but if it has already been reallocated then we can end up
+with an inode spuriously marked stale.
+
+In 8a17d7ddedb4 ("xfs: mark reclaimed inodes invalid earlier") we added
+a second check to xfs_iflush_cluster() to detect this race, but the
+similar RCU lookup in xfs_ifree_cluster() needs the same treatment.
+
+Signed-off-by: Omar Sandoval <osandov@fb.com>
+Reviewed-by: Brian Foster <bfoster@redhat.com>
+Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/xfs_icache.c | 10 +++++-----
+ fs/xfs/xfs_inode.c | 23 ++++++++++++++++++-----
+ 2 files changed, 23 insertions(+), 10 deletions(-)
+
+--- a/fs/xfs/xfs_icache.c
++++ b/fs/xfs/xfs_icache.c
+@@ -1078,11 +1078,11 @@ reclaim:
+ * Because we use RCU freeing we need to ensure the inode always appears
+ * to be reclaimed with an invalid inode number when in the free state.
+ * We do this as early as possible under the ILOCK so that
+- * xfs_iflush_cluster() can be guaranteed to detect races with us here.
+- * By doing this, we guarantee that once xfs_iflush_cluster has locked
+- * XFS_ILOCK that it will see either a valid, flushable inode that will
+- * serialise correctly, or it will see a clean (and invalid) inode that
+- * it can skip.
++ * xfs_iflush_cluster() and xfs_ifree_cluster() can be guaranteed to
++ * detect races with us here. By doing this, we guarantee that once
++ * xfs_iflush_cluster() or xfs_ifree_cluster() has locked XFS_ILOCK that
++ * it will see either a valid inode that will serialise correctly, or it
++ * will see an invalid inode that it can skip.
+ */
+ spin_lock(&ip->i_flags_lock);
+ ip->i_flags = XFS_IRECLAIM;
+--- a/fs/xfs/xfs_inode.c
++++ b/fs/xfs/xfs_inode.c
+@@ -2368,11 +2368,24 @@ retry:
+ * already marked stale. If we can't lock it, back off
+ * and retry.
+ */
+- if (ip != free_ip &&
+- !xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
+- rcu_read_unlock();
+- delay(1);
+- goto retry;
++ if (ip != free_ip) {
++ if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
++ rcu_read_unlock();
++ delay(1);
++ goto retry;
++ }
++
++ /*
++ * Check the inode number again in case we're
++ * racing with freeing in xfs_reclaim_inode().
++ * See the comments in that function for more
++ * information as to why the initial check is
++ * not sufficient.
++ */
++ if (ip->i_ino != inum + i) {
++ xfs_iunlock(ip, XFS_ILOCK_EXCL);
++ continue;
++ }
+ }
+ rcu_read_unlock();
+
--- /dev/null
+From foo@baz Mon Sep 18 10:16:36 CEST 2017
+From: Christoph Hellwig <hch@lst.de>
+Date: Sun, 17 Sep 2017 14:06:43 -0700
+Subject: xfs: clear MS_ACTIVE after finishing log recovery
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, "Darrick J. Wong" <darrick.wong@oracle.com>
+Message-ID: <20170917210712.10804-19-hch@lst.de>
+
+From: "Darrick J. Wong" <darrick.wong@oracle.com>
+
+commit 8204f8ddaafafcae074746fcf2a05a45e6827603 upstream.
+
+Way back when we established inode block-map redo log items, it was
+discovered that we needed to prevent the VFS from evicting inodes during
+log recovery because any given inode might be have bmap redo items to
+replay even if the inode has no link count and is ultimately deleted,
+and any eviction of an unlinked inode causes the inode to be truncated
+and freed too early.
+
+To make this possible, we set MS_ACTIVE so that inodes would not be torn
+down immediately upon release. Unfortunately, this also results in the
+quota inodes not being released at all if a later part of the mount
+process should fail, because we never reclaim the inodes. So, set
+MS_ACTIVE right before we do the last part of log recovery and clear it
+immediately after we finish the log recovery so that everything
+will be torn down properly if we abort the mount.
+
+Fixes: 17c12bcd30 ("xfs: when replaying bmap operations, don't let unlinked inodes get reaped")
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Reviewed-by: Brian Foster <bfoster@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/xfs_log.c | 11 +++++++++++
+ fs/xfs/xfs_mount.c | 10 ----------
+ 2 files changed, 11 insertions(+), 10 deletions(-)
+
+--- a/fs/xfs/xfs_log.c
++++ b/fs/xfs/xfs_log.c
+@@ -749,9 +749,20 @@ xfs_log_mount_finish(
+ return 0;
+ }
+
++ /*
++ * During the second phase of log recovery, we need iget and
++ * iput to behave like they do for an active filesystem.
++ * xfs_fs_drop_inode needs to be able to prevent the deletion
++ * of inodes before we're done replaying log items on those
++ * inodes. Turn it off immediately after recovery finishes
++ * so that we don't leak the quota inodes if subsequent mount
++ * activities fail.
++ */
++ mp->m_super->s_flags |= MS_ACTIVE;
+ error = xlog_recover_finish(mp->m_log);
+ if (!error)
+ xfs_log_work_queue(mp);
++ mp->m_super->s_flags &= ~MS_ACTIVE;
+
+ return error;
+ }
+--- a/fs/xfs/xfs_mount.c
++++ b/fs/xfs/xfs_mount.c
+@@ -925,15 +925,6 @@ xfs_mountfs(
+ }
+
+ /*
+- * During the second phase of log recovery, we need iget and
+- * iput to behave like they do for an active filesystem.
+- * xfs_fs_drop_inode needs to be able to prevent the deletion
+- * of inodes before we're done replaying log items on those
+- * inodes.
+- */
+- mp->m_super->s_flags |= MS_ACTIVE;
+-
+- /*
+ * Finish recovering the file system. This part needed to be delayed
+ * until after the root and real-time bitmap inodes were consistently
+ * read in.
+@@ -1008,7 +999,6 @@ xfs_mountfs(
+ out_quota:
+ xfs_qm_unmount_quotas(mp);
+ out_rtunmount:
+- mp->m_super->s_flags &= ~MS_ACTIVE;
+ xfs_rtunmount_inodes(mp);
+ out_rele_rip:
+ IRELE(rip);
--- /dev/null
+From foo@baz Mon Sep 18 10:16:36 CEST 2017
+From: Christoph Hellwig <hch@lst.de>
+Date: Sun, 17 Sep 2017 14:07:08 -0700
+Subject: xfs: disable per-inode DAX flag
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, "Darrick J . Wong" <darrick.wong@oracle.com>
+Message-ID: <20170917210712.10804-44-hch@lst.de>
+
+From: Christoph Hellwig <hch@lst.de>
+
+commit 742d84290739ae908f1b61b7d17ea382c8c0073a upstream.
+
+Currently flag switching can be used to easily crash the kernel. Disable
+the per-inode DAX flag until that is sorted out.
+
+Signed-off-by: Christoph Hellwig <hch@lst.de>
+Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/xfs_ioctl.c | 3 ++-
+ 1 file changed, 2 insertions(+), 1 deletion(-)
+
+--- a/fs/xfs/xfs_ioctl.c
++++ b/fs/xfs/xfs_ioctl.c
+@@ -1005,11 +1005,12 @@ xfs_diflags_to_linux(
+ inode->i_flags |= S_NOATIME;
+ else
+ inode->i_flags &= ~S_NOATIME;
++#if 0 /* disabled until the flag switching races are sorted out */
+ if (xflags & FS_XFLAG_DAX)
+ inode->i_flags |= S_DAX;
+ else
+ inode->i_flags &= ~S_DAX;
+-
++#endif
+ }
+
+ static int
--- /dev/null
+From foo@baz Mon Sep 18 10:16:36 CEST 2017
+From: Christoph Hellwig <hch@lst.de>
+Date: Sun, 17 Sep 2017 14:07:06 -0700
+Subject: xfs: disallow marking previously dirty buffers as ordered
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, Brian Foster <bfoster@redhat.com>, "Darrick J . Wong" <darrick.wong@oracle.com>
+Message-ID: <20170917210712.10804-42-hch@lst.de>
+
+From: Brian Foster <bfoster@redhat.com>
+
+commit a5814bceea48ee1c57c4db2bd54b0c0246daf54a upstream.
+
+Ordered buffers are used in situations where the buffer is not
+physically logged but must pass through the transaction/logging
+pipeline for a particular transaction. As a result, ordered buffers
+are not unpinned and written back until the transaction commits to
+the log. Ordered buffers have a strict requirement that the target
+buffer must not be currently dirty and resident in the log pipeline
+at the time it is marked ordered. If a dirty+ordered buffer is
+committed, the buffer is reinserted to the AIL but not physically
+relogged at the LSN of the associated checkpoint. The buffer log
+item is assigned the LSN of the latest checkpoint and the AIL
+effectively releases the previously logged buffer content from the
+active log before the buffer has been written back. If the tail
+pushes forward and a filesystem crash occurs while in this state, an
+inconsistent filesystem could result.
+
+It is currently the caller's responsibility to ensure an ordered
+buffer is not already dirty from a previous modification. This is
+unclear and error-prone when not used in situations where it is
+guaranteed a buffer has not been previously modified (such as new
+metadata allocations).
+
+To facilitate general purpose use of ordered buffers, update
+xfs_trans_ordered_buf() to conditionally order the buffer based on
+state of the log item and return the status of the result. If the
+bli is dirty, do not order the buffer and return false. The caller
+must either physically log the buffer (having acquired the
+appropriate log reservation) or push it from the AIL to clean it
+before it can be marked ordered in the current transaction.
+
+Note that ordered buffers are currently only used in two situations:
+1.) inode chunk allocation where previously logged buffers are not
+possible and 2.) extent swap which will be updated to handle ordered
+buffer failures in a separate patch.
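+
+A sketch of the resulting calling convention (illustrative; the extent
+swap conversion to this pattern lands in a later patch in this series):
+
+	if (!xfs_trans_ordered_buf(tp, bp)) {
+		/* bli was already dirty: physically log the buffer */
+		xfs_trans_log_buf(tp, bp, 0, BBTOB(bp->b_length) - 1);
+	}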
+
+Signed-off-by: Brian Foster <bfoster@redhat.com>
+Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/xfs_trans.h | 2 +-
+ fs/xfs/xfs_trans_buf.c | 7 +++++--
+ 2 files changed, 6 insertions(+), 3 deletions(-)
+
+--- a/fs/xfs/xfs_trans.h
++++ b/fs/xfs/xfs_trans.h
+@@ -217,7 +217,7 @@ void xfs_trans_bhold_release(xfs_trans_
+ void xfs_trans_binval(xfs_trans_t *, struct xfs_buf *);
+ void xfs_trans_inode_buf(xfs_trans_t *, struct xfs_buf *);
+ void xfs_trans_stale_inode_buf(xfs_trans_t *, struct xfs_buf *);
+-void xfs_trans_ordered_buf(xfs_trans_t *, struct xfs_buf *);
++bool xfs_trans_ordered_buf(xfs_trans_t *, struct xfs_buf *);
+ void xfs_trans_dquot_buf(xfs_trans_t *, struct xfs_buf *, uint);
+ void xfs_trans_inode_alloc_buf(xfs_trans_t *, struct xfs_buf *);
+ void xfs_trans_ichgtime(struct xfs_trans *, struct xfs_inode *, int);
+--- a/fs/xfs/xfs_trans_buf.c
++++ b/fs/xfs/xfs_trans_buf.c
+@@ -724,7 +724,7 @@ xfs_trans_inode_alloc_buf(
+ * transactions rather than the physical changes we make to the buffer without
+ * changing writeback ordering constraints of metadata buffers.
+ */
+-void
++bool
+ xfs_trans_ordered_buf(
+ struct xfs_trans *tp,
+ struct xfs_buf *bp)
+@@ -734,7 +734,9 @@ xfs_trans_ordered_buf(
+ ASSERT(bp->b_transp == tp);
+ ASSERT(bip != NULL);
+ ASSERT(atomic_read(&bip->bli_refcount) > 0);
+- ASSERT(!xfs_buf_item_dirty_format(bip));
++
++ if (xfs_buf_item_dirty_format(bip))
++ return false;
+
+ bip->bli_flags |= XFS_BLI_ORDERED;
+ trace_xfs_buf_item_ordered(bip);
+@@ -744,6 +746,7 @@ xfs_trans_ordered_buf(
+ * to be marked dirty and that it has been logged.
+ */
+ xfs_trans_dirty_buf(tp, bp);
++ return true;
+ }
+
+ /*
--- /dev/null
+From foo@baz Mon Sep 18 10:16:36 CEST 2017
+From: Christoph Hellwig <hch@lst.de>
+Date: Sun, 17 Sep 2017 14:06:44 -0700
+Subject: xfs: don't leak quotacheck dquots when cow recovery
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, "Darrick J. Wong" <darrick.wong@oracle.com>
+Message-ID: <20170917210712.10804-20-hch@lst.de>
+
+From: "Darrick J. Wong" <darrick.wong@oracle.com>
+
+commit 77aff8c76425c8f49b50d0b9009915066739e7d2 upstream.
+
+If we fail a mount on account of cow recovery errors, it's possible that
+a previous quotacheck left some dquots in memory. The bailout clause of
+xfs_mountfs forgets to purge these, and so we leak them. Fix that.
+
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Reviewed-by: Brian Foster <bfoster@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/xfs_mount.c | 2 ++
+ 1 file changed, 2 insertions(+)
+
+--- a/fs/xfs/xfs_mount.c
++++ b/fs/xfs/xfs_mount.c
+@@ -1004,6 +1004,8 @@ xfs_mountfs(
+ IRELE(rip);
+ cancel_delayed_work_sync(&mp->m_reclaim_work);
+ xfs_reclaim_inodes(mp, SYNC_WAIT);
++ /* Clean out dquots that might be in memory after quotacheck. */
++ xfs_qm_unmount(mp);
+ out_log_dealloc:
+ mp->m_flags |= XFS_MOUNT_UNMOUNTING;
+ xfs_log_mount_cancel(mp);
--- /dev/null
+From foo@baz Mon Sep 18 10:16:36 CEST 2017
+From: Christoph Hellwig <hch@lst.de>
+Date: Sun, 17 Sep 2017 14:07:03 -0700
+Subject: xfs: don't log dirty ranges for ordered buffers
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, Brian Foster <bfoster@redhat.com>, "Darrick J . Wong" <darrick.wong@oracle.com>
+Message-ID: <20170917210712.10804-39-hch@lst.de>
+
+From: Brian Foster <bfoster@redhat.com>
+
+commit 8dc518dfa7dbd079581269e51074b3c55a65a880 upstream.
+
+Ordered buffers are attached to transactions and pushed through the
+logging infrastructure just like normal buffers with the exception
+that they are not actually written to the log. Therefore, we don't
+need to log dirty ranges of ordered buffers. xfs_trans_log_buf() is
+called on ordered buffers to set up all of the dirty state on the
+transaction, buffer and log item and prepare the buffer for I/O.
+
+Now that xfs_trans_dirty_buf() is available, call it from
+xfs_trans_ordered_buf() so the latter is now mutually exclusive with
+xfs_trans_log_buf(). This reflects the implementation of ordered
+buffers and helps eliminate confusion over the need to log ranges of
+ordered buffers just to set up internal log state.
+
+Signed-off-by: Brian Foster <bfoster@redhat.com>
+Reviewed-by: Allison Henderson <allison.henderson@oracle.com>
+Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/libxfs/xfs_btree.c | 6 ++----
+ fs/xfs/libxfs/xfs_ialloc.c | 2 --
+ fs/xfs/xfs_trans_buf.c | 26 ++++++++++++++------------
+ 3 files changed, 16 insertions(+), 18 deletions(-)
+
+--- a/fs/xfs/libxfs/xfs_btree.c
++++ b/fs/xfs/libxfs/xfs_btree.c
+@@ -4447,12 +4447,10 @@ xfs_btree_block_change_owner(
+ * though, so everything is consistent in memory.
+ */
+ if (bp) {
+- if (cur->bc_tp) {
++ if (cur->bc_tp)
+ xfs_trans_ordered_buf(cur->bc_tp, bp);
+- xfs_btree_log_block(cur, bp, XFS_BB_OWNER);
+- } else {
++ else
+ xfs_buf_delwri_queue(bp, bbcoi->buffer_list);
+- }
+ } else {
+ ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
+ ASSERT(level == cur->bc_nlevels - 1);
+--- a/fs/xfs/libxfs/xfs_ialloc.c
++++ b/fs/xfs/libxfs/xfs_ialloc.c
+@@ -368,8 +368,6 @@ xfs_ialloc_inode_init(
+ * transaction and pin the log appropriately.
+ */
+ xfs_trans_ordered_buf(tp, fbuf);
+- xfs_trans_log_buf(tp, fbuf, 0,
+- BBTOB(fbuf->b_length) - 1);
+ }
+ } else {
+ fbuf->b_flags |= XBF_DONE;
+--- a/fs/xfs/xfs_trans_buf.c
++++ b/fs/xfs/xfs_trans_buf.c
+@@ -560,16 +560,12 @@ xfs_trans_log_buf(
+ struct xfs_buf_log_item *bip = bp->b_fspriv;
+
+ ASSERT(first <= last && last < BBTOB(bp->b_length));
++ ASSERT(!(bip->bli_flags & XFS_BLI_ORDERED));
+
+ xfs_trans_dirty_buf(tp, bp);
+
+- /*
+- * If we have an ordered buffer we are not logging any dirty range but
+- * it still needs to be marked dirty and that it has been logged.
+- */
+ trace_xfs_trans_log_buf(bip);
+- if (!(bip->bli_flags & XFS_BLI_ORDERED))
+- xfs_buf_item_log(bip, first, last);
++ xfs_buf_item_log(bip, first, last);
+ }
+
+
+@@ -722,12 +718,11 @@ xfs_trans_inode_alloc_buf(
+ }
+
+ /*
+- * Mark the buffer as ordered for this transaction. This means
+- * that the contents of the buffer are not recorded in the transaction
+- * but it is tracked in the AIL as though it was. This allows us
+- * to record logical changes in transactions rather than the physical
+- * changes we make to the buffer without changing writeback ordering
+- * constraints of metadata buffers.
++ * Mark the buffer as ordered for this transaction. This means that the contents
++ * of the buffer are not recorded in the transaction but it is tracked in the
++ * AIL as though it was. This allows us to record logical changes in
++ * transactions rather than the physical changes we make to the buffer without
++ * changing writeback ordering constraints of metadata buffers.
+ */
+ void
+ xfs_trans_ordered_buf(
+@@ -739,9 +734,16 @@ xfs_trans_ordered_buf(
+ ASSERT(bp->b_transp == tp);
+ ASSERT(bip != NULL);
+ ASSERT(atomic_read(&bip->bli_refcount) > 0);
++ ASSERT(!xfs_buf_item_dirty_format(bip));
+
+ bip->bli_flags |= XFS_BLI_ORDERED;
+ trace_xfs_buf_item_ordered(bip);
++
++ /*
++ * We don't log a dirty range of an ordered buffer but it still needs
++ * to be marked dirty and that it has been logged.
++ */
++ xfs_trans_dirty_buf(tp, bp);
+ }
+
+ /*
--- /dev/null
+From foo@baz Mon Sep 18 10:16:36 CEST 2017
+From: Christoph Hellwig <hch@lst.de>
+Date: Sun, 17 Sep 2017 14:07:10 -0700
+Subject: xfs: don't set v3 xflags for v2 inodes
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, "Darrick J . Wong" <darrick.wong@oracle.com>
+Message-ID: <20170917210712.10804-46-hch@lst.de>
+
+From: Christoph Hellwig <hch@lst.de>
+
+commit dd60687ee541ca3f6df8758f38e6f22f57c42a37 upstream.
+
+Reject attempts to set XFLAGS that correspond to di_flags2 inode flags
+if the inode isn't a v3 inode, because di_flags2 only exists on v3.
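+
+From userspace the new behavior looks roughly like this (a hedged
+sketch using the standard FS_IOC_FSSETXATTR interface; error handling
+trimmed):
+
+	#include <sys/ioctl.h>
+	#include <linux/fs.h>
+
+	static int set_dax_xflag(int fd)
+	{
+		struct fsxattr fa;
+
+		if (ioctl(fd, FS_IOC_FSGETXATTR, &fa) < 0)
+			return -1;
+		fa.fsx_xflags |= FS_XFLAG_DAX;	/* backed by di_flags2 */
+		/* now fails with EINVAL on v2 inodes instead of the
+		 * flag being silently dropped */
+		return ioctl(fd, FS_IOC_FSSETXATTR, &fa);
+	}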
+
+Signed-off-by: Christoph Hellwig <hch@lst.de>
+Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/xfs_ioctl.c | 38 +++++++++++++++++++++++++-------------
+ 1 file changed, 25 insertions(+), 13 deletions(-)
+
+--- a/fs/xfs/xfs_ioctl.c
++++ b/fs/xfs/xfs_ioctl.c
+@@ -928,16 +928,15 @@ xfs_ioc_fsgetxattr(
+ return 0;
+ }
+
+-STATIC void
+-xfs_set_diflags(
++STATIC uint16_t
++xfs_flags2diflags(
+ struct xfs_inode *ip,
+ unsigned int xflags)
+ {
+- unsigned int di_flags;
+- uint64_t di_flags2;
+-
+ /* can't set PREALLOC this way, just preserve it */
+- di_flags = (ip->i_d.di_flags & XFS_DIFLAG_PREALLOC);
++ uint16_t di_flags =
++ (ip->i_d.di_flags & XFS_DIFLAG_PREALLOC);
++
+ if (xflags & FS_XFLAG_IMMUTABLE)
+ di_flags |= XFS_DIFLAG_IMMUTABLE;
+ if (xflags & FS_XFLAG_APPEND)
+@@ -967,19 +966,24 @@ xfs_set_diflags(
+ if (xflags & FS_XFLAG_EXTSIZE)
+ di_flags |= XFS_DIFLAG_EXTSIZE;
+ }
+- ip->i_d.di_flags = di_flags;
+
+- /* diflags2 only valid for v3 inodes. */
+- if (ip->i_d.di_version < 3)
+- return;
++ return di_flags;
++}
++
++STATIC uint64_t
++xfs_flags2diflags2(
++ struct xfs_inode *ip,
++ unsigned int xflags)
++{
++ uint64_t di_flags2 =
++ (ip->i_d.di_flags2 & XFS_DIFLAG2_REFLINK);
+
+- di_flags2 = (ip->i_d.di_flags2 & XFS_DIFLAG2_REFLINK);
+ if (xflags & FS_XFLAG_DAX)
+ di_flags2 |= XFS_DIFLAG2_DAX;
+ if (xflags & FS_XFLAG_COWEXTSIZE)
+ di_flags2 |= XFS_DIFLAG2_COWEXTSIZE;
+
+- ip->i_d.di_flags2 = di_flags2;
++ return di_flags2;
+ }
+
+ STATIC void
+@@ -1020,6 +1024,7 @@ xfs_ioctl_setattr_xflags(
+ struct fsxattr *fa)
+ {
+ struct xfs_mount *mp = ip->i_mount;
++ uint64_t di_flags2;
+
+ /* Can't change realtime flag if any extents are allocated. */
+ if ((ip->i_d.di_nextents || ip->i_delayed_blks) &&
+@@ -1050,7 +1055,14 @@ xfs_ioctl_setattr_xflags(
+ !capable(CAP_LINUX_IMMUTABLE))
+ return -EPERM;
+
+- xfs_set_diflags(ip, fa->fsx_xflags);
++ /* diflags2 only valid for v3 inodes. */
++ di_flags2 = xfs_flags2diflags2(ip, fa->fsx_xflags);
++ if (di_flags2 && ip->i_d.di_version < 3)
++ return -EINVAL;
++
++ ip->i_d.di_flags = xfs_flags2diflags(ip, fa->fsx_xflags);
++ ip->i_d.di_flags2 = di_flags2;
++
+ xfs_diflags_to_linux(ip);
+ xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
--- /dev/null
+From foo@baz Mon Sep 18 10:16:36 CEST 2017
+From: Christoph Hellwig <hch@lst.de>
+Date: Sun, 17 Sep 2017 14:06:57 -0700
+Subject: xfs: evict all inodes involved with log redo item
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, "Darrick J. Wong" <darrick.wong@oracle.com>, viro@ZenIV.linux.org.uk
+Message-ID: <20170917210712.10804-33-hch@lst.de>
+
+From: "Darrick J. Wong" <darrick.wong@oracle.com>
+
+commit 799ea9e9c59949008770aab4e1da87f10e99dbe4 upstream.
+
+When we introduced the bmap redo log items, we set MS_ACTIVE on the
+mountpoint and XFS_IRECOVERY on the inode to prevent unlinked inodes
+from being truncated prematurely during log recovery. This also had the
+effect of putting linked inodes on the lru instead of evicting them.
+
+Unfortunately, we neglected to find all those unreferenced lru inodes
+and evict them after finishing log recovery, which means that we leak
+them if anything goes wrong in the rest of xfs_mountfs, because the lru
+is only cleaned out on unmount.
+
+Therefore, evict unreferenced inodes in the lru list immediately
+after clearing MS_ACTIVE.
+
+Fixes: 17c12bcd30 ("xfs: when replaying bmap operations, don't let unlinked inodes get reaped")
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Cc: viro@ZenIV.linux.org.uk
+Reviewed-by: Brian Foster <bfoster@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/inode.c | 1 +
+ fs/internal.h | 1 -
+ fs/xfs/xfs_log.c | 12 ++++++++++++
+ include/linux/fs.h | 1 +
+ 4 files changed, 14 insertions(+), 1 deletion(-)
+
+--- a/fs/inode.c
++++ b/fs/inode.c
+@@ -637,6 +637,7 @@ again:
+
+ dispose_list(&dispose);
+ }
++EXPORT_SYMBOL_GPL(evict_inodes);
+
+ /**
+ * invalidate_inodes - attempt to free all inodes on a superblock
+--- a/fs/internal.h
++++ b/fs/internal.h
+@@ -136,7 +136,6 @@ extern bool atime_needs_update_rcu(const
+ extern void inode_io_list_del(struct inode *inode);
+
+ extern long get_nr_dirty_inodes(void);
+-extern void evict_inodes(struct super_block *);
+ extern int invalidate_inodes(struct super_block *, bool);
+
+ /*
+--- a/fs/xfs/xfs_log.c
++++ b/fs/xfs/xfs_log.c
+@@ -761,12 +761,24 @@ xfs_log_mount_finish(
+ * inodes. Turn it off immediately after recovery finishes
+ * so that we don't leak the quota inodes if subsequent mount
+ * activities fail.
++ *
++ * We let all inodes involved in redo item processing end up on
++ * the LRU instead of being evicted immediately so that if we do
++ * something to an unlinked inode, the irele won't cause
++ * premature truncation and freeing of the inode, which results
++ * in log recovery failure. We have to evict the unreferenced
++ * lru inodes after clearing MS_ACTIVE because we don't
++ * otherwise clean up the lru if there's a subsequent failure in
++ * xfs_mountfs, which leads to us leaking the inodes if nothing
++ * else (e.g. quotacheck) references the inodes before the
++ * mount failure occurs.
+ */
+ mp->m_super->s_flags |= MS_ACTIVE;
+ error = xlog_recover_finish(mp->m_log);
+ if (!error)
+ xfs_log_work_queue(mp);
+ mp->m_super->s_flags &= ~MS_ACTIVE;
++ evict_inodes(mp->m_super);
+
+ if (readonly)
+ mp->m_flags |= XFS_MOUNT_RDONLY;
+--- a/include/linux/fs.h
++++ b/include/linux/fs.h
+@@ -2760,6 +2760,7 @@ static inline void lockdep_annotate_inod
+ #endif
+ extern void unlock_new_inode(struct inode *);
+ extern unsigned int get_next_ino(void);
++extern void evict_inodes(struct super_block *sb);
+
+ extern void __iget(struct inode * inode);
+ extern void iget_failed(struct inode *);
--- /dev/null
+From foo@baz Mon Sep 18 10:16:36 CEST 2017
+From: Christoph Hellwig <hch@lst.de>
+Date: Sun, 17 Sep 2017 14:07:09 -0700
+Subject: xfs: fix incorrect log_flushed on fsync
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, Amir Goldstein <amir73il@gmail.com>, Josef Bacik <jbacik@fb.com>, "Darrick J . Wong" <darrick.wong@oracle.com>
+Message-ID: <20170917210712.10804-45-hch@lst.de>
+
+From: Amir Goldstein <amir73il@gmail.com>
+
+commit 47c7d0b19502583120c3f396c7559e7a77288a68 upstream.
+
+When calling into _xfs_log_force{,_lsn}() with a pointer
+to a log_flushed variable, log_flushed will be set to 1 if:
+1. xlog_sync() is called to flush the active log buffer
+AND/OR
+2. xlog_wait() is called to wait on a syncing log buffer
+
+xfs_file_fsync() checks the value of log_flushed after the
+_xfs_log_force_lsn() call to optimize away an explicit
+PREFLUSH request to the data block device after writing
+out all the file's pages to disk.
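+
+A simplified sketch of that caller-side check (assuming the 4.x code
+shape; xfs_blkdev_issue_flush() issues the explicit PREFLUSH being
+optimized away):
+
+	int log_flushed = 0;
+
+	error = _xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, &log_flushed);
+
+	/* skip the explicit flush when the log force already flushed
+	 * the device cache -- the assumption this patch shows to be
+	 * unsafe when the flush was merely waited upon */
+	if (!error && !log_flushed)
+		xfs_blkdev_issue_flush(mp->m_ddev_targp);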
+
+This optimization is incorrect in the following sequence of events:
+
+ Task A Task B
+ -------------------------------------------------------
+ xfs_file_fsync()
+ _xfs_log_force_lsn()
+ xlog_sync()
+ [submit PREFLUSH]
+ xfs_file_fsync()
+ file_write_and_wait_range()
+ [submit WRITE X]
+ [endio WRITE X]
+ _xfs_log_force_lsn()
+ xlog_wait()
+ [endio PREFLUSH]
+
+Write X is not guaranteed to be on persistent storage when the
+PREFLUSH request is completed, because write X was submitted
+after the PREFLUSH request, but xfs_file_fsync() of task A will
+be notified of log_flushed=1 and will skip the explicit flush.
+
+If the system crashes after fsync of task A, write X may not be
+present on disk after reboot.
+
+This bug was discovered and demonstrated using Josef Bacik's
+dm-log-writes target, which can be used to record block io operations
+and then replay a subset of these operations onto the target device.
+The test goes something like this:
+- Use fsx to execute ops on a file and record the ops on the log device
+- Every now and then fsync the file, store md5 of file and mark
+ the location in the log
+- Then replay log onto device for each mark, mount fs and compare
+ md5 of file to stored value
+
+Cc: Christoph Hellwig <hch@lst.de>
+Cc: Josef Bacik <jbacik@fb.com>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Amir Goldstein <amir73il@gmail.com>
+Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/xfs_log.c | 7 -------
+ 1 file changed, 7 deletions(-)
+
+--- a/fs/xfs/xfs_log.c
++++ b/fs/xfs/xfs_log.c
+@@ -3337,8 +3337,6 @@ maybe_sleep:
+ */
+ if (iclog->ic_state & XLOG_STATE_IOERROR)
+ return -EIO;
+- if (log_flushed)
+- *log_flushed = 1;
+ } else {
+
+ no_sleep:
+@@ -3442,8 +3440,6 @@ try_again:
+
+ xlog_wait(&iclog->ic_prev->ic_write_wait,
+ &log->l_icloglock);
+- if (log_flushed)
+- *log_flushed = 1;
+ already_slept = 1;
+ goto try_again;
+ }
+@@ -3477,9 +3473,6 @@ try_again:
+ */
+ if (iclog->ic_state & XLOG_STATE_IOERROR)
+ return -EIO;
+-
+- if (log_flushed)
+- *log_flushed = 1;
+ } else { /* just return */
+ spin_unlock(&log->l_icloglock);
+ }
--- /dev/null
+From foo@baz Mon Sep 18 10:16:36 CEST 2017
+From: Christoph Hellwig <hch@lst.de>
+Date: Sun, 17 Sep 2017 14:06:53 -0700
+Subject: xfs: fix log recovery corruption error due to tail overwrite
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, Brian Foster <bfoster@redhat.com>, "Darrick J . Wong" <darrick.wong@oracle.com>
+Message-ID: <20170917210712.10804-29-hch@lst.de>
+
+From: Brian Foster <bfoster@redhat.com>
+
+commit 4a4f66eac4681378996a1837ad1ffec3a2e2981f upstream.
+
+If we consider the case where the tail (T) of the log is pinned long
+enough for the head (H) to push and block behind the tail, we can
+end up blocked in the following state without enough free space (f)
+in the log to satisfy a transaction reservation:
+
+ 0 phys. log N
+ [-------HffT---H'--T'---]
+
+The last good record in the log (before H) refers to T. The tail
+eventually pushes forward (T') leaving more free space in the log
+for writes to H. At this point, suppose space frees up in the log
+for the maximum of 8 in-core log buffers to start flushing out to
+the log. If this pushes the head from H to H', these next writes
+overwrite the previous tail T. This is safe because the items logged
+from T to T' have been written back and removed from the AIL.
+
+If the next log writes (H -> H') happen to fail and result in
+partial records in the log, the filesystem shuts down having
+overwritten T with invalid data. Log recovery correctly locates H on
+the subsequent mount, but H still refers to the now corrupted tail
+T. This results in log corruption errors and recovery failure.
+
+Since the tail overwrite results from otherwise correct runtime
+behavior, it is up to log recovery to try and deal with this
+situation. Update log recovery tail verification to run a CRC pass
+from the first record past the tail to the head. This facilitates
+error detection at T and moves the recovery tail to the first good
+record past H' (similar to truncating the head on torn write
+detection). If corruption is detected beyond the range possibly
+affected by the max number of iclogs, the log is legitimately
+corrupted and log recovery failure is expected.
+
+Signed-off-by: Brian Foster <bfoster@redhat.com>
+Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/xfs_log_recover.c | 108 +++++++++++++++++++++++++++++++++--------------
+ 1 file changed, 77 insertions(+), 31 deletions(-)
+
+--- a/fs/xfs/xfs_log_recover.c
++++ b/fs/xfs/xfs_log_recover.c
+@@ -1029,61 +1029,106 @@ out_error:
+ }
+
+ /*
+- * Check the log tail for torn writes. This is required when torn writes are
+- * detected at the head and the head had to be walked back to a previous record.
+- * The tail of the previous record must now be verified to ensure the torn
+- * writes didn't corrupt the previous tail.
++ * Calculate distance from head to tail (i.e., unused space in the log).
++ */
++static inline int
++xlog_tail_distance(
++ struct xlog *log,
++ xfs_daddr_t head_blk,
++ xfs_daddr_t tail_blk)
++{
++ if (head_blk < tail_blk)
++ return tail_blk - head_blk;
++
++ return tail_blk + (log->l_logBBsize - head_blk);
++}
++
++/*
++ * Verify the log tail. This is particularly important when torn or incomplete
++ * writes have been detected near the front of the log and the head has been
++ * walked back accordingly.
++ *
++ * We also have to handle the case where the tail was pinned and the head
++ * blocked behind the tail right before a crash. If the tail had been pushed
++ * immediately prior to the crash and the subsequent checkpoint was only
++ * partially written, it's possible it overwrote the last referenced tail in the
++ * log with garbage. This is not a coherency problem because the tail must have
++ * been pushed before it can be overwritten, but appears as log corruption to
++ * recovery because we have no way to know the tail was updated if the
++ * subsequent checkpoint didn't write successfully.
+ *
+- * Return an error if CRC verification fails as recovery cannot proceed.
++ * Therefore, CRC check the log from tail to head. If a failure occurs and the
++ * offending record is within max iclog bufs from the head, walk the tail
++ * forward and retry until a valid tail is found or corruption is detected out
++ * of the range of a possible overwrite.
+ */
+ STATIC int
+ xlog_verify_tail(
+ struct xlog *log,
+ xfs_daddr_t head_blk,
+- xfs_daddr_t tail_blk)
++ xfs_daddr_t *tail_blk,
++ int hsize)
+ {
+ struct xlog_rec_header *thead;
+ struct xfs_buf *bp;
+ xfs_daddr_t first_bad;
+- int count;
+ int error = 0;
+ bool wrapped;
+- xfs_daddr_t tmp_head;
++ xfs_daddr_t tmp_tail;
++ xfs_daddr_t orig_tail = *tail_blk;
+
+ bp = xlog_get_bp(log, 1);
+ if (!bp)
+ return -ENOMEM;
+
+ /*
+- * Seek XLOG_MAX_ICLOGS + 1 records past the current tail record to get
+- * a temporary head block that points after the last possible
+- * concurrently written record of the tail.
++ * Make sure the tail points to a record (returns positive count on
++ * success).
+ */
+- count = xlog_seek_logrec_hdr(log, head_blk, tail_blk,
+- XLOG_MAX_ICLOGS + 1, bp, &tmp_head, &thead,
+- &wrapped);
+- if (count < 0) {
+- error = count;
++ error = xlog_seek_logrec_hdr(log, head_blk, *tail_blk, 1, bp,
++ &tmp_tail, &thead, &wrapped);
++ if (error < 0)
+ goto out;
+- }
++ if (*tail_blk != tmp_tail)
++ *tail_blk = tmp_tail;
+
+ /*
+- * If the call above didn't find XLOG_MAX_ICLOGS + 1 records, we ran
+- * into the actual log head. tmp_head points to the start of the record
+- * so update it to the actual head block.
++ * Run a CRC check from the tail to the head. We can't just check
++ * MAX_ICLOGS records past the tail because the tail may point to stale
++ * blocks cleared during the search for the head/tail. These blocks are
++ * overwritten with zero-length records and thus record count is not a
++ * reliable indicator of the iclog state before a crash.
+ */
+- if (count < XLOG_MAX_ICLOGS + 1)
+- tmp_head = head_blk;
+-
+- /*
+- * We now have a tail and temporary head block that covers at least
+- * XLOG_MAX_ICLOGS records from the tail. We need to verify that these
+- * records were completely written. Run a CRC verification pass from
+- * tail to head and return the result.
+- */
+- error = xlog_do_recovery_pass(log, tmp_head, tail_blk,
++ first_bad = 0;
++ error = xlog_do_recovery_pass(log, head_blk, *tail_blk,
+ XLOG_RECOVER_CRCPASS, &first_bad);
++ while (error == -EFSBADCRC && first_bad) {
++ int tail_distance;
++
++ /*
++ * Is corruption within range of the head? If so, retry from
++ * the next record. Otherwise return an error.
++ */
++ tail_distance = xlog_tail_distance(log, head_blk, first_bad);
++ if (tail_distance > BTOBB(XLOG_MAX_ICLOGS * hsize))
++ break;
++
++ /* skip to the next record; returns positive count on success */
++ error = xlog_seek_logrec_hdr(log, head_blk, first_bad, 2, bp,
++ &tmp_tail, &thead, &wrapped);
++ if (error < 0)
++ goto out;
++
++ *tail_blk = tmp_tail;
++ first_bad = 0;
++ error = xlog_do_recovery_pass(log, head_blk, *tail_blk,
++ XLOG_RECOVER_CRCPASS, &first_bad);
++ }
+
++ if (!error && *tail_blk != orig_tail)
++ xfs_warn(log->l_mp,
++ "Tail block (0x%llx) overwrite detected. Updated to 0x%llx",
++ orig_tail, *tail_blk);
+ out:
+ xlog_put_bp(bp);
+ return error;
+@@ -1187,7 +1232,8 @@ xlog_verify_head(
+ if (error)
+ return error;
+
+- return xlog_verify_tail(log, *head_blk, *tail_blk);
++ return xlog_verify_tail(log, *head_blk, tail_blk,
++ be32_to_cpu((*rhead)->h_size));
+ }
+
+ /*
--- /dev/null
+From foo@baz Mon Sep 18 10:16:36 CEST 2017
+From: Christoph Hellwig <hch@lst.de>
+Date: Sun, 17 Sep 2017 14:06:51 -0700
+Subject: xfs: fix recovery failure when log record header wraps log end
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, Brian Foster <bfoster@redhat.com>, "Darrick J . Wong" <darrick.wong@oracle.com>
+Message-ID: <20170917210712.10804-27-hch@lst.de>
+
+From: Brian Foster <bfoster@redhat.com>
+
+commit 284f1c2c9bebf871861184b0e2c40fa921dd380b upstream.
+
+The high-level log recovery algorithm consists of two loops that
+walk the physical log and process log records from the tail to the
+head. The first loop handles the case where the tail is beyond the
+head and processes records up to the end of the physical log. The
+subsequent loop processes records from the beginning of the physical
+log to the head.
+
+Because log records can wrap around the end of the physical log, the
+first loop mentioned above must handle this case appropriately.
+Records are processed from in-core buffers, which means that this
+algorithm must split the reads of such records into two partial
+I/Os: 1.) from the beginning of the record to the end of the log and
+2.) from the beginning of the log to the end of the record. This is
+further complicated by the fact that the log record header and log
+record data are read into independent buffers.
+
+The current handling of each buffer correctly splits the reads when
+either the header or data starts before the end of the log and wraps
+around the end. The data read does not correctly handle the case
+where the prior header read wrapped or ends on the physical log end
+boundary. blk_no is incremented to or beyond the log end after the
+header read to point to the record data, but the split data read
+logic triggers, attempts to read from an invalid log block and
+ultimately causes log recovery to fail. This can be reproduced
+fairly reliably via xfstests tests generic/047 and generic/388 with
+large iclog sizes (256k) and small (10M) logs.
+
+If the record header read has pushed beyond the end of the physical
+log, the subsequent data read is actually contiguous. Update the
+data read logic to detect the case where blk_no has wrapped, mod it
+against the log size to read from the correct address and issue one
+contiguous read for the log data buffer. The log record is processed
+as normal from the buffer(s), the loop exits after the current
+iteration and the subsequent loop picks up with the first new record
+after the start of the log.
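+
+As a worked example (illustrative numbers): with a 10M log, l_logBBsize
+is 20480 basic blocks. If an 8-block record header (256k iclogs) starts
+at block 20476, the header read wraps and blk_no advances to 20484,
+past the physical end; do_mod(20484, 20480) = 4, so the record data is
+read contiguously starting at block 4 rather than triggering the
+split-read path with an invalid block number.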
+
+Signed-off-by: Brian Foster <bfoster@redhat.com>
+Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/xfs_log_recover.c | 18 ++++++++++++++----
+ 1 file changed, 14 insertions(+), 4 deletions(-)
+
+--- a/fs/xfs/xfs_log_recover.c
++++ b/fs/xfs/xfs_log_recover.c
+@@ -5216,7 +5216,7 @@ xlog_do_recovery_pass(
+ xfs_daddr_t *first_bad) /* out: first bad log rec */
+ {
+ xlog_rec_header_t *rhead;
+- xfs_daddr_t blk_no;
++ xfs_daddr_t blk_no, rblk_no;
+ xfs_daddr_t rhead_blk;
+ char *offset;
+ xfs_buf_t *hbp, *dbp;
+@@ -5369,9 +5369,19 @@ xlog_do_recovery_pass(
+ bblks = (int)BTOBB(be32_to_cpu(rhead->h_len));
+ blk_no += hblks;
+
+- /* Read in data for log record */
+- if (blk_no + bblks <= log->l_logBBsize) {
+- error = xlog_bread(log, blk_no, bblks, dbp,
++ /*
++ * Read the log record data in multiple reads if it
++ * wraps around the end of the log. Note that if the
++ * header already wrapped, blk_no could point past the
++ * end of the log. The record data is contiguous in
++ * that case.
++ */
++ if (blk_no + bblks <= log->l_logBBsize ||
++ blk_no >= log->l_logBBsize) {
++ /* mod blk_no in case the header wrapped and
++ * pushed it beyond the end of the log */
++ rblk_no = do_mod(blk_no, log->l_logBBsize);
++ error = xlog_bread(log, rblk_no, bblks, dbp,
+ &offset);
+ if (error)
+ goto bread_err2;
--- /dev/null
+From foo@baz Mon Sep 18 10:16:36 CEST 2017
+From: Christoph Hellwig <hch@lst.de>
+Date: Sun, 17 Sep 2017 14:06:54 -0700
+Subject: xfs: handle -EFSCORRUPTED during head/tail verification
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, Brian Foster <bfoster@redhat.com>, "Darrick J . Wong" <darrick.wong@oracle.com>
+Message-ID: <20170917210712.10804-30-hch@lst.de>
+
+From: Brian Foster <bfoster@redhat.com>
+
+commit a4c9b34d6a17081005ec459b57b8effc08f4c731 upstream.
+
+Torn write and tail overwrite detection both trigger only on
+-EFSBADCRC errors. While this is the most likely failure scenario
+for each condition, -EFSCORRUPTED is still possible in certain cases
+depending on what ends up on disk when a torn write or partial tail
+overwrite occurs. For example, an invalid log record h_len can lead
+to an -EFSCORRUPTED error when running the log recovery CRC pass.
+
+Therefore, update log head and tail verification to trigger the
+associated head/tail fixups in the event of -EFSCORRUPTED errors
+along with -EFSBADCRC. Also, -EFSCORRUPTED can currently be returned
+from xlog_do_recovery_pass() before rhead_blk is initialized if the
+first record encountered happens to be corrupted. This leads to an
+incorrect 'first_bad' return value. Initialize rhead_blk earlier in
+the function to address that problem as well.
+
+Signed-off-by: Brian Foster <bfoster@redhat.com>
+Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/xfs_log_recover.c | 7 +++----
+ 1 file changed, 3 insertions(+), 4 deletions(-)
+
+--- a/fs/xfs/xfs_log_recover.c
++++ b/fs/xfs/xfs_log_recover.c
+@@ -1102,7 +1102,7 @@ xlog_verify_tail(
+ first_bad = 0;
+ error = xlog_do_recovery_pass(log, head_blk, *tail_blk,
+ XLOG_RECOVER_CRCPASS, &first_bad);
+- while (error == -EFSBADCRC && first_bad) {
++ while ((error == -EFSBADCRC || error == -EFSCORRUPTED) && first_bad) {
+ int tail_distance;
+
+ /*
+@@ -1188,7 +1188,7 @@ xlog_verify_head(
+ */
+ error = xlog_do_recovery_pass(log, *head_blk, tmp_rhead_blk,
+ XLOG_RECOVER_CRCPASS, &first_bad);
+- if (error == -EFSBADCRC) {
++ if ((error == -EFSBADCRC || error == -EFSCORRUPTED) && first_bad) {
+ /*
+ * We've hit a potential torn write. Reset the error and warn
+ * about it.
+@@ -5255,7 +5255,7 @@ xlog_do_recovery_pass(
+ LIST_HEAD (buffer_list);
+
+ ASSERT(head_blk != tail_blk);
+- rhead_blk = 0;
++ blk_no = rhead_blk = tail_blk;
+
+ for (i = 0; i < XLOG_RHASH_SIZE; i++)
+ INIT_HLIST_HEAD(&rhash[i]);
+@@ -5333,7 +5333,6 @@ xlog_do_recovery_pass(
+ }
+
+ memset(rhash, 0, sizeof(rhash));
+- blk_no = rhead_blk = tail_blk;
+ if (tail_blk > head_blk) {
+ /*
+ * Perform recovery around the end of the physical log.
--- /dev/null
+From foo@baz Mon Sep 18 10:16:36 CEST 2017
+From: Christoph Hellwig <hch@lst.de>
+Date: Sun, 17 Sep 2017 14:07:05 -0700
+Subject: xfs: move bmbt owner change to last step of extent swap
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, Brian Foster <bfoster@redhat.com>, "Darrick J . Wong" <darrick.wong@oracle.com>
+Message-ID: <20170917210712.10804-41-hch@lst.de>
+
+From: Brian Foster <bfoster@redhat.com>
+
+commit 6fb10d6d22094bc4062f92b9ccbcee2f54033d04 upstream.
+
+The extent swap operation currently resets bmbt block owners before
+the inode forks are swapped. The bmbt buffers are marked as ordered
+so they do not have to be physically logged in the transaction.
+
+This use of ordered buffers is not safe as bmbt buffers may have
+been previously physically logged. The bmbt owner change algorithm
+needs to be updated to physically log buffers that are already dirty
+when/if they are encountered. This means that an extent swap will
+eventually require multiple rolling transactions to handle large
+btrees. In addition, all inode related changes must be logged before
+the bmbt owner change scan begins and can roll the transaction for
+the first time to preserve fs consistency via log recovery.
+
+In preparation for such fixes to the bmbt owner change algorithm,
+refactor the bmbt scan out of the extent fork swap code to the last
+operation before the transaction is committed. Update
+xfs_swap_extent_forks() to only set the inode log flags when an
+owner change scan is necessary. Update xfs_swap_extents() to trigger
+the owner change based on the inode log flags. Note that since the
+owner change now occurs after the extent fork swap, the inode btrees
+must be fixed up with the inode number of the current inode (similar
+to log recovery).
+
+Signed-off-by: Brian Foster <bfoster@redhat.com>
+Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/xfs_bmap_util.c | 44 ++++++++++++++++++++++++++------------------
+ 1 file changed, 26 insertions(+), 18 deletions(-)
+
+--- a/fs/xfs/xfs_bmap_util.c
++++ b/fs/xfs/xfs_bmap_util.c
+@@ -1825,29 +1825,18 @@ xfs_swap_extent_forks(
+ }
+
+ /*
+- * Before we've swapped the forks, lets set the owners of the forks
+- * appropriately. We have to do this as we are demand paging the btree
+- * buffers, and so the validation done on read will expect the owner
+- * field to be correctly set. Once we change the owners, we can swap the
+- * inode forks.
++ * Btree format (v3) inodes have the inode number stamped in the bmbt
++ * block headers. We can't start changing the bmbt blocks until the
++ * inode owner change is logged so recovery does the right thing in the
++ * event of a crash. Set the owner change log flags now and leave the
++ * bmbt scan as the last step.
+ */
+ if (ip->i_d.di_version == 3 &&
+- ip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
++ ip->i_d.di_format == XFS_DINODE_FMT_BTREE)
+ (*target_log_flags) |= XFS_ILOG_DOWNER;
+- error = xfs_bmbt_change_owner(tp, ip, XFS_DATA_FORK,
+- tip->i_ino, NULL);
+- if (error)
+- return error;
+- }
+-
+ if (tip->i_d.di_version == 3 &&
+- tip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
++ tip->i_d.di_format == XFS_DINODE_FMT_BTREE)
+ (*src_log_flags) |= XFS_ILOG_DOWNER;
+- error = xfs_bmbt_change_owner(tp, tip, XFS_DATA_FORK,
+- ip->i_ino, NULL);
+- if (error)
+- return error;
+- }
+
+ /*
+ * Swap the data forks of the inodes
+@@ -2077,6 +2066,25 @@ xfs_swap_extents(
+ xfs_trans_log_inode(tp, tip, target_log_flags);
+
+ /*
++ * The extent forks have been swapped, but crc=1,rmapbt=0 filesystems
++ * have inode number owner values in the bmbt blocks that still refer to
++ * the old inode. Scan each bmbt to fix up the owner values with the
++ * inode number of the current inode.
++ */
++ if (src_log_flags & XFS_ILOG_DOWNER) {
++ error = xfs_bmbt_change_owner(tp, ip, XFS_DATA_FORK,
++ ip->i_ino, NULL);
++ if (error)
++ goto out_trans_cancel;
++ }
++ if (target_log_flags & XFS_ILOG_DOWNER) {
++ error = xfs_bmbt_change_owner(tp, tip, XFS_DATA_FORK,
++ tip->i_ino, NULL);
++ if (error)
++ goto out_trans_cancel;
++ }
++
++ /*
+ * If this is a synchronous mount, make sure that the
+ * transaction goes to disk before returning to the user.
+ */
--- /dev/null
+From foo@baz Mon Sep 18 10:16:36 CEST 2017
+From: Christoph Hellwig <hch@lst.de>
+Date: Sun, 17 Sep 2017 14:07:11 -0700
+Subject: xfs: open code end_buffer_async_write in xfs_finish_page_writeback
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, "Darrick J . Wong" <darrick.wong@oracle.com>
+Message-ID: <20170917210712.10804-47-hch@lst.de>
+
+From: Christoph Hellwig <hch@lst.de>
+
+commit 8353a814f2518dcfa79a5bb77afd0e7dfa391bb1 upstream.
+
+Our loop in xfs_finish_page_writeback iterates over all buffer heads
+in a page and calls end_buffer_async_write, which in turn iterates
+over all buffers in the page to check if any I/O is still in flight.
+This is not only inefficient, but also potentially dangerous, as
+end_buffer_async_write can cause the page and all its buffers to be
+freed.
+
+Replace it with a single loop that does the work of end_buffer_async_write
+on a per-page basis.
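+
+Roughly, the replacement has this shape (bh_within_bvec() and
+finish_bh() are shorthand here, not real helpers; see the diff below
+for the actual code):
+
+	bit_spin_lock(BH_Uptodate_Lock, &head->b_state);
+	do {
+		if (bh_within_bvec(bh))		/* covered by this I/O */
+			finish_bh(bh);		/* uptodate/error + unlock */
+		else if (buffer_async_write(bh))
+			busy = true;		/* other I/O still pending */
+	} while ((bh = bh->b_this_page) != head);
+	bit_spin_unlock(BH_Uptodate_Lock, &head->b_state);
+
+	if (!busy)
+		end_page_writeback(page);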
+
+Signed-off-by: Christoph Hellwig <hch@lst.de>
+Reviewed-by: Brian Foster <bfoster@redhat.com>
+Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/xfs_aops.c | 72 ++++++++++++++++++++++++++++++++++++------------------
+ 1 file changed, 48 insertions(+), 24 deletions(-)
+
+--- a/fs/xfs/xfs_aops.c
++++ b/fs/xfs/xfs_aops.c
+@@ -90,11 +90,11 @@ xfs_find_bdev_for_inode(
+ * associated buffer_heads, paying attention to the start and end offsets that
+ * we need to process on the page.
+ *
+- * Landmine Warning: bh->b_end_io() will call end_page_writeback() on the last
+- * buffer in the IO. Once it does this, it is unsafe to access the bufferhead or
+- * the page at all, as we may be racing with memory reclaim and it can free both
+- * the bufferhead chain and the page as it will see the page as clean and
+- * unused.
++ * Note that we open code the action in end_buffer_async_write here so that we
++ * only have to iterate over the buffers attached to the page once. This is not
++ * only more efficient, but also ensures that we only call end_page_writeback
++ * at the end of the iteration, and thus avoids the pitfall of having the page
++ * and buffers potentially freed after every call to end_buffer_async_write.
+ */
+ static void
+ xfs_finish_page_writeback(
+@@ -102,29 +102,45 @@ xfs_finish_page_writeback(
+ struct bio_vec *bvec,
+ int error)
+ {
+- unsigned int end = bvec->bv_offset + bvec->bv_len - 1;
+- struct buffer_head *head, *bh, *next;
++ struct buffer_head *head = page_buffers(bvec->bv_page), *bh = head;
++ bool busy = false;
+ unsigned int off = 0;
+- unsigned int bsize;
++ unsigned long flags;
+
+ ASSERT(bvec->bv_offset < PAGE_SIZE);
+ ASSERT((bvec->bv_offset & (i_blocksize(inode) - 1)) == 0);
+- ASSERT(end < PAGE_SIZE);
++ ASSERT(bvec->bv_offset + bvec->bv_len <= PAGE_SIZE);
+ ASSERT((bvec->bv_len & (i_blocksize(inode) - 1)) == 0);
+
+- bh = head = page_buffers(bvec->bv_page);
+-
+- bsize = bh->b_size;
++ local_irq_save(flags);
++ bit_spin_lock(BH_Uptodate_Lock, &head->b_state);
+ do {
+- if (off > end)
+- break;
+- next = bh->b_this_page;
+- if (off < bvec->bv_offset)
+- goto next_bh;
+- bh->b_end_io(bh, !error);
+-next_bh:
+- off += bsize;
+- } while ((bh = next) != head);
++ if (off >= bvec->bv_offset &&
++ off < bvec->bv_offset + bvec->bv_len) {
++ ASSERT(buffer_async_write(bh));
++ ASSERT(bh->b_end_io == NULL);
++
++ if (error) {
++ mapping_set_error(bvec->bv_page->mapping, -EIO);
++ set_buffer_write_io_error(bh);
++ clear_buffer_uptodate(bh);
++ SetPageError(bvec->bv_page);
++ } else {
++ set_buffer_uptodate(bh);
++ }
++ clear_buffer_async_write(bh);
++ unlock_buffer(bh);
++ } else if (buffer_async_write(bh)) {
++ ASSERT(buffer_locked(bh));
++ busy = true;
++ }
++ off += bh->b_size;
++ } while ((bh = bh->b_this_page) != head);
++ bit_spin_unlock(BH_Uptodate_Lock, &head->b_state);
++ local_irq_restore(flags);
++
++ if (!busy)
++ end_page_writeback(bvec->bv_page);
+ }
+
+ /*
+@@ -138,8 +154,10 @@ xfs_destroy_ioend(
+ int error)
+ {
+ struct inode *inode = ioend->io_inode;
+- struct bio *last = ioend->io_bio;
+- struct bio *bio, *next;
++ struct bio *bio = &ioend->io_inline_bio;
++ struct bio *last = ioend->io_bio, *next;
++ u64 start = bio->bi_iter.bi_sector;
++ bool quiet = bio_flagged(bio, BIO_QUIET);
+
+ for (bio = &ioend->io_inline_bio; bio; bio = next) {
+ struct bio_vec *bvec;
+@@ -160,6 +178,11 @@ xfs_destroy_ioend(
+
+ bio_put(bio);
+ }
++
++ if (unlikely(error && !quiet)) {
++ xfs_err_ratelimited(XFS_I(inode)->i_mount,
++ "writeback error on sector %llu", start);
++ }
+ }
+
+ /*
+@@ -427,7 +450,8 @@ xfs_start_buffer_writeback(
+ ASSERT(!buffer_delay(bh));
+ ASSERT(!buffer_unwritten(bh));
+
+- mark_buffer_async_write(bh);
++ bh->b_end_io = NULL;
++ set_buffer_async_write(bh);
+ set_buffer_uptodate(bh);
+ clear_buffer_dirty(bh);
+ }
--- /dev/null
+From foo@baz Mon Sep 18 10:16:36 CEST 2017
+From: Christoph Hellwig <hch@lst.de>
+Date: Sun, 17 Sep 2017 14:06:59 -0700
+Subject: xfs: open-code xfs_buf_item_dirty()
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, Brian Foster <bfoster@redhat.com>, "Darrick J . Wong" <darrick.wong@oracle.com>
+Message-ID: <20170917210712.10804-35-hch@lst.de>
+
+From: Brian Foster <bfoster@redhat.com>
+
+commit a4f6cf6b2b6b60ec2a05a33a32e65caa4149aa2b upstream.
+
+It checks a single flag and has one caller. It probably isn't worth
+its own function.
+
+Signed-off-by: Brian Foster <bfoster@redhat.com>
+Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/xfs_buf_item.c | 11 -----------
+ fs/xfs/xfs_buf_item.h | 1 -
+ fs/xfs/xfs_trans_buf.c | 2 +-
+ 3 files changed, 1 insertion(+), 13 deletions(-)
+
+--- a/fs/xfs/xfs_buf_item.c
++++ b/fs/xfs/xfs_buf_item.c
+@@ -945,17 +945,6 @@ xfs_buf_item_log(
+ }
+
+
+-/*
+- * Return 1 if the buffer has been logged or ordered in a transaction (at any
+- * point, not just the current transaction) and 0 if not.
+- */
+-uint
+-xfs_buf_item_dirty(
+- xfs_buf_log_item_t *bip)
+-{
+- return (bip->bli_flags & XFS_BLI_DIRTY);
+-}
+-
+ STATIC void
+ xfs_buf_item_free(
+ xfs_buf_log_item_t *bip)
+--- a/fs/xfs/xfs_buf_item.h
++++ b/fs/xfs/xfs_buf_item.h
+@@ -64,7 +64,6 @@ typedef struct xfs_buf_log_item {
+ int xfs_buf_item_init(struct xfs_buf *, struct xfs_mount *);
+ void xfs_buf_item_relse(struct xfs_buf *);
+ void xfs_buf_item_log(xfs_buf_log_item_t *, uint, uint);
+-uint xfs_buf_item_dirty(xfs_buf_log_item_t *);
+ void xfs_buf_attach_iodone(struct xfs_buf *,
+ void(*)(struct xfs_buf *, xfs_log_item_t *),
+ xfs_log_item_t *);
+--- a/fs/xfs/xfs_trans_buf.c
++++ b/fs/xfs/xfs_trans_buf.c
+@@ -435,7 +435,7 @@ xfs_trans_brelse(xfs_trans_t *tp,
+ if (XFS_FORCED_SHUTDOWN(tp->t_mountp) && freed) {
+ xfs_trans_ail_remove(&bip->bli_item, SHUTDOWN_LOG_IO_ERROR);
+ xfs_buf_item_relse(bp);
+- } else if (!xfs_buf_item_dirty(bip)) {
++ } else if (!(bip->bli_flags & XFS_BLI_DIRTY)) {
+ /***
+ ASSERT(bp->b_pincount == 0);
+ ***/
--- /dev/null
+From foo@baz Mon Sep 18 10:16:36 CEST 2017
+From: Christoph Hellwig <hch@lst.de>
+Date: Sun, 17 Sep 2017 14:07:01 -0700
+Subject: xfs: ordered buffer log items are never formatted
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, Brian Foster <bfoster@redhat.com>, "Darrick J . Wong" <darrick.wong@oracle.com>
+Message-ID: <20170917210712.10804-37-hch@lst.de>
+
+From: Brian Foster <bfoster@redhat.com>
+
+commit e9385cc6fb7edf23702de33a2dc82965d92d9392 upstream.
+
+Ordered buffers pass through the logging infrastructure without ever
+being written to the log. The way this works is that the ordered
+buffer status is transferred to the log vector at commit time via
+the ->iop_size() callback. In xlog_cil_insert_format_items(),
+ordered log vectors bypass ->iop_format() processing altogether.
+
+Therefore it is unnecessary for xfs_buf_item_format() to handle
+ordered buffers. Remove the unnecessary logic and assert that an
+ordered buffer never reaches this point.
+
+Signed-off-by: Brian Foster <bfoster@redhat.com>
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/xfs_buf_item.c | 12 ++----------
+ fs/xfs/xfs_trace.h | 1 -
+ 2 files changed, 2 insertions(+), 11 deletions(-)
+
+--- a/fs/xfs/xfs_buf_item.c
++++ b/fs/xfs/xfs_buf_item.c
+@@ -323,6 +323,8 @@ xfs_buf_item_format(
+ ASSERT((bip->bli_flags & XFS_BLI_STALE) ||
+ (xfs_blft_from_flags(&bip->__bli_format) > XFS_BLFT_UNKNOWN_BUF
+ && xfs_blft_from_flags(&bip->__bli_format) < XFS_BLFT_MAX_BUF));
++ ASSERT(!(bip->bli_flags & XFS_BLI_ORDERED) ||
++ (bip->bli_flags & XFS_BLI_STALE));
+
+
+ /*
+@@ -347,16 +349,6 @@ xfs_buf_item_format(
+ bip->bli_flags &= ~XFS_BLI_INODE_BUF;
+ }
+
+- if ((bip->bli_flags & (XFS_BLI_ORDERED|XFS_BLI_STALE)) ==
+- XFS_BLI_ORDERED) {
+- /*
+- * The buffer has been logged just to order it. It is not being
+- * included in the transaction commit, so don't format it.
+- */
+- trace_xfs_buf_item_format_ordered(bip);
+- return;
+- }
+-
+ for (i = 0; i < bip->bli_format_count; i++) {
+ xfs_buf_item_format_segment(bip, lv, &vecp, offset,
+ &bip->bli_formats[i]);
+--- a/fs/xfs/xfs_trace.h
++++ b/fs/xfs/xfs_trace.h
+@@ -520,7 +520,6 @@ DEFINE_BUF_ITEM_EVENT(xfs_buf_item_size)
+ DEFINE_BUF_ITEM_EVENT(xfs_buf_item_size_ordered);
+ DEFINE_BUF_ITEM_EVENT(xfs_buf_item_size_stale);
+ DEFINE_BUF_ITEM_EVENT(xfs_buf_item_format);
+-DEFINE_BUF_ITEM_EVENT(xfs_buf_item_format_ordered);
+ DEFINE_BUF_ITEM_EVENT(xfs_buf_item_format_stale);
+ DEFINE_BUF_ITEM_EVENT(xfs_buf_item_ordered);
+ DEFINE_BUF_ITEM_EVENT(xfs_buf_item_pin);
--- /dev/null
+From foo@baz Mon Sep 18 10:16:36 CEST 2017
+From: Christoph Hellwig <hch@lst.de>
+Date: Sun, 17 Sep 2017 14:06:50 -0700
+Subject: xfs: Properly retry failed inode items in case of error during buffer writeback
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, Carlos Maiolino <cmaiolino@redhat.com>, "Darrick J . Wong" <darrick.wong@oracle.com>
+Message-ID: <20170917210712.10804-26-hch@lst.de>
+
+From: Carlos Maiolino <cmaiolino@redhat.com>
+
+commit d3a304b6292168b83b45d624784f973fdc1ca674 upstream.
+
+When a buffer fails during writeback, the inode items attached to it
+are kept flush locked, and are never resubmitted because of the flush
+lock, so if any buffer fails to be written, the items in the AIL are
+never written to disk and never unlocked.
+
+This causes the unmount operation to hang because of these flush-locked
+items in the AIL, but it also means the items in the AIL are never
+written back, even when the I/O device comes back to normal.
+
+I've been testing this patch with a DM-thin device, creating a
+filesystem larger than the real device.
+
+When writing enough data to fill the DM-thin device, XFS receives
+ENOSPC errors from the device and keeps spinning in xfsaild (when the
+'retry forever' configuration is set).
+
+At this point, the filesystem cannot be unmounted because of the flush
+locked items in the AIL, but worse, those items are never retried at
+all (since xfs_inode_item_push() skips items that are flush locked),
+even if the underlying DM-thin device is expanded to the proper size.
+
+This patch fixes both cases by retrying any item that failed
+previously, using the infrastructure provided by the previous patch.
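+
+Roughly, the new check at the top of xfs_inode_item_push() looks like
+(simplified from the diff below):
+
+	if (lip->li_flags & XFS_LI_FAILED) {
+		if (!xfs_buf_trylock(bp))
+			return XFS_ITEM_LOCKED;
+		if (!xfs_buf_resubmit_failed_buffers(bp, lip, buffer_list))
+			rval = XFS_ITEM_FLUSHING;
+		xfs_buf_unlock(bp);
+		return rval;
+	}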
+
+Reviewed-by: Brian Foster <bfoster@redhat.com>
+Signed-off-by: Carlos Maiolino <cmaiolino@redhat.com>
+Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/xfs_buf_item.c | 28 ++++++++++++++++++++++++++++
+ fs/xfs/xfs_buf_item.h | 3 +++
+ fs/xfs/xfs_inode_item.c | 47 +++++++++++++++++++++++++++++++++++++++++++----
+ fs/xfs/xfs_trans.h | 1 +
+ fs/xfs/xfs_trans_ail.c | 3 ++-
+ fs/xfs/xfs_trans_priv.h | 31 +++++++++++++++++++++++++++++++
+ 6 files changed, 108 insertions(+), 5 deletions(-)
+
+--- a/fs/xfs/xfs_buf_item.c
++++ b/fs/xfs/xfs_buf_item.c
+@@ -1234,3 +1234,31 @@ xfs_buf_iodone(
+ xfs_trans_ail_delete(ailp, lip, SHUTDOWN_CORRUPT_INCORE);
+ xfs_buf_item_free(BUF_ITEM(lip));
+ }
++
++/*
++ * Requeue a failed buffer for writeback
++ *
++ * Return true if the buffer has been re-queued properly, false otherwise
++ */
++bool
++xfs_buf_resubmit_failed_buffers(
++ struct xfs_buf *bp,
++ struct xfs_log_item *lip,
++ struct list_head *buffer_list)
++{
++ struct xfs_log_item *next;
++
++ /*
++ * Clear XFS_LI_FAILED flag from all items before resubmit
++ *
++ * XFS_LI_FAILED set/clear is protected by xa_lock; the caller of this
++ * function already has it acquired.
++ */
++ for (; lip; lip = next) {
++ next = lip->li_bio_list;
++ xfs_clear_li_failed(lip);
++ }
++
++ /* Add this buffer back to the delayed write list */
++ return xfs_buf_delwri_queue(bp, buffer_list);
++}
+--- a/fs/xfs/xfs_buf_item.h
++++ b/fs/xfs/xfs_buf_item.h
+@@ -70,6 +70,9 @@ void xfs_buf_attach_iodone(struct xfs_bu
+ xfs_log_item_t *);
+ void xfs_buf_iodone_callbacks(struct xfs_buf *);
+ void xfs_buf_iodone(struct xfs_buf *, struct xfs_log_item *);
++bool xfs_buf_resubmit_failed_buffers(struct xfs_buf *,
++ struct xfs_log_item *,
++ struct list_head *);
+
+ extern kmem_zone_t *xfs_buf_item_zone;
+
+--- a/fs/xfs/xfs_inode_item.c
++++ b/fs/xfs/xfs_inode_item.c
+@@ -27,6 +27,7 @@
+ #include "xfs_error.h"
+ #include "xfs_trace.h"
+ #include "xfs_trans_priv.h"
++#include "xfs_buf_item.h"
+ #include "xfs_log.h"
+
+
+@@ -475,6 +476,23 @@ xfs_inode_item_unpin(
+ wake_up_bit(&ip->i_flags, __XFS_IPINNED_BIT);
+ }
+
++/*
++ * Callback used to mark a buffer with XFS_LI_FAILED when items in the buffer
++ * have been failed during writeback
++ *
++ * This informs the AIL that the inode is already flush locked on the next push,
++ * and acquires a hold on the buffer to ensure that it isn't reclaimed before
++ * dirty data makes it to disk.
++ */
++STATIC void
++xfs_inode_item_error(
++ struct xfs_log_item *lip,
++ struct xfs_buf *bp)
++{
++ ASSERT(xfs_isiflocked(INODE_ITEM(lip)->ili_inode));
++ xfs_set_li_failed(lip, bp);
++}
++
+ STATIC uint
+ xfs_inode_item_push(
+ struct xfs_log_item *lip,
+@@ -484,13 +502,28 @@ xfs_inode_item_push(
+ {
+ struct xfs_inode_log_item *iip = INODE_ITEM(lip);
+ struct xfs_inode *ip = iip->ili_inode;
+- struct xfs_buf *bp = NULL;
++ struct xfs_buf *bp = lip->li_buf;
+ uint rval = XFS_ITEM_SUCCESS;
+ int error;
+
+ if (xfs_ipincount(ip) > 0)
+ return XFS_ITEM_PINNED;
+
++ /*
++ * The buffer containing this item failed to be written back
++ * previously. Resubmit the buffer for IO.
++ */
++ if (lip->li_flags & XFS_LI_FAILED) {
++ if (!xfs_buf_trylock(bp))
++ return XFS_ITEM_LOCKED;
++
++ if (!xfs_buf_resubmit_failed_buffers(bp, lip, buffer_list))
++ rval = XFS_ITEM_FLUSHING;
++
++ xfs_buf_unlock(bp);
++ return rval;
++ }
++
+ if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED))
+ return XFS_ITEM_LOCKED;
+
+@@ -622,7 +655,8 @@ static const struct xfs_item_ops xfs_ino
+ .iop_unlock = xfs_inode_item_unlock,
+ .iop_committed = xfs_inode_item_committed,
+ .iop_push = xfs_inode_item_push,
+- .iop_committing = xfs_inode_item_committing
++ .iop_committing = xfs_inode_item_committing,
++ .iop_error = xfs_inode_item_error
+ };
+
+
+@@ -710,7 +744,8 @@ xfs_iflush_done(
+ * the AIL lock.
+ */
+ iip = INODE_ITEM(blip);
+- if (iip->ili_logged && blip->li_lsn == iip->ili_flush_lsn)
++ if ((iip->ili_logged && blip->li_lsn == iip->ili_flush_lsn) ||
++ lip->li_flags & XFS_LI_FAILED)
+ need_ail++;
+
+ blip = next;
+@@ -718,7 +753,8 @@ xfs_iflush_done(
+
+ /* make sure we capture the state of the initial inode. */
+ iip = INODE_ITEM(lip);
+- if (iip->ili_logged && lip->li_lsn == iip->ili_flush_lsn)
++ if ((iip->ili_logged && lip->li_lsn == iip->ili_flush_lsn) ||
++ lip->li_flags & XFS_LI_FAILED)
+ need_ail++;
+
+ /*
+@@ -739,6 +775,9 @@ xfs_iflush_done(
+ if (INODE_ITEM(blip)->ili_logged &&
+ blip->li_lsn == INODE_ITEM(blip)->ili_flush_lsn)
+ mlip_changed |= xfs_ail_delete_one(ailp, blip);
++ else {
++ xfs_clear_li_failed(blip);
++ }
+ }
+
+ if (mlip_changed) {
+--- a/fs/xfs/xfs_trans.h
++++ b/fs/xfs/xfs_trans.h
+@@ -50,6 +50,7 @@ typedef struct xfs_log_item {
+ struct xfs_ail *li_ailp; /* ptr to AIL */
+ uint li_type; /* item type */
+ uint li_flags; /* misc flags */
++ struct xfs_buf *li_buf; /* real buffer pointer */
+ struct xfs_log_item *li_bio_list; /* buffer item list */
+ void (*li_cb)(struct xfs_buf *,
+ struct xfs_log_item *);
+--- a/fs/xfs/xfs_trans_ail.c
++++ b/fs/xfs/xfs_trans_ail.c
+@@ -687,12 +687,13 @@ xfs_trans_ail_update_bulk(
+ bool
+ xfs_ail_delete_one(
+ struct xfs_ail *ailp,
+- struct xfs_log_item *lip)
++ struct xfs_log_item *lip)
+ {
+ struct xfs_log_item *mlip = xfs_ail_min(ailp);
+
+ trace_xfs_ail_delete(lip, mlip->li_lsn, lip->li_lsn);
+ xfs_ail_delete(ailp, lip);
++ xfs_clear_li_failed(lip);
+ lip->li_flags &= ~XFS_LI_IN_AIL;
+ lip->li_lsn = 0;
+
+--- a/fs/xfs/xfs_trans_priv.h
++++ b/fs/xfs/xfs_trans_priv.h
+@@ -164,4 +164,35 @@ xfs_trans_ail_copy_lsn(
+ *dst = *src;
+ }
+ #endif
++
++static inline void
++xfs_clear_li_failed(
++ struct xfs_log_item *lip)
++{
++ struct xfs_buf *bp = lip->li_buf;
++
++ ASSERT(lip->li_flags & XFS_LI_IN_AIL);
++ lockdep_assert_held(&lip->li_ailp->xa_lock);
++
++ if (lip->li_flags & XFS_LI_FAILED) {
++ lip->li_flags &= ~XFS_LI_FAILED;
++ lip->li_buf = NULL;
++ xfs_buf_rele(bp);
++ }
++}
++
++static inline void
++xfs_set_li_failed(
++ struct xfs_log_item *lip,
++ struct xfs_buf *bp)
++{
++ lockdep_assert_held(&lip->li_ailp->xa_lock);
++
++ if (!(lip->li_flags & XFS_LI_FAILED)) {
++ xfs_buf_hold(bp);
++ lip->li_flags |= XFS_LI_FAILED;
++ lip->li_buf = bp;
++ }
++}
++
+ #endif /* __XFS_TRANS_PRIV_H__ */
--- /dev/null
+From foo@baz Mon Sep 18 10:16:36 CEST 2017
+From: Christoph Hellwig <hch@lst.de>
+Date: Sun, 17 Sep 2017 14:07:02 -0700
+Subject: xfs: refactor buffer logging into buffer dirtying helper
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, Brian Foster <bfoster@redhat.com>, "Darrick J . Wong" <darrick.wong@oracle.com>
+Message-ID: <20170917210712.10804-38-hch@lst.de>
+
+From: Brian Foster <bfoster@redhat.com>
+
+commit 9684010d38eccda733b61106765e9357cf436f65 upstream.
+
+xfs_trans_log_buf() is responsible for logging the dirty segments of
+a buffer along with setting all of the necessary state on the
+transaction, buffer, bli, etc., to ensure that the associated items
+are marked as dirty and prepared for I/O. We have a couple of use cases
+that need to dirty a buffer in a transaction without actually
+logging dirty ranges of the buffer. One existing use case is
+ordered buffers, which are currently logged with arbitrary ranges to
+accomplish this even though the content of ordered buffers is never
+written to the log. Another pending use case is to relog an already
+dirty buffer across rolled transactions within the deferred
+operations infrastructure. This is required to prevent a held
+(XFS_BLI_HOLD) buffer from pinning the tail of the log.
+
+Refactor xfs_trans_log_buf() into a new function that contains all
+of the logic responsible for dirtying the transaction, lidp, buffer and
+bli. This new function can be used in the future for the use cases
+outlined above. This patch does not introduce functional changes.
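+
+Schematically, after the refactor (simplified):
+
+	void xfs_trans_log_buf(tp, bp, first, last)
+	{
+		xfs_trans_dirty_buf(tp, bp);	/* tp/lidp/bli dirty state */
+		if (!(bip->bli_flags & XFS_BLI_ORDERED))
+			xfs_buf_item_log(bip, first, last); /* dirty ranges */
+	}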
+
+Signed-off-by: Brian Foster <bfoster@redhat.com>
+Reviewed-by: Allison Henderson <allison.henderson@oracle.com>
+Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/xfs_trans.h | 4 +++-
+ fs/xfs/xfs_trans_buf.c | 46 ++++++++++++++++++++++++++++++----------------
+ 2 files changed, 33 insertions(+), 17 deletions(-)
+
+--- a/fs/xfs/xfs_trans.h
++++ b/fs/xfs/xfs_trans.h
+@@ -222,7 +222,9 @@ void xfs_trans_dquot_buf(xfs_trans_t *,
+ void xfs_trans_inode_alloc_buf(xfs_trans_t *, struct xfs_buf *);
+ void xfs_trans_ichgtime(struct xfs_trans *, struct xfs_inode *, int);
+ void xfs_trans_ijoin(struct xfs_trans *, struct xfs_inode *, uint);
+-void xfs_trans_log_buf(xfs_trans_t *, struct xfs_buf *, uint, uint);
++void xfs_trans_log_buf(struct xfs_trans *, struct xfs_buf *, uint,
++ uint);
++void xfs_trans_dirty_buf(struct xfs_trans *, struct xfs_buf *);
+ void xfs_trans_log_inode(xfs_trans_t *, struct xfs_inode *, uint);
+
+ void xfs_extent_free_init_defer_op(void);
+--- a/fs/xfs/xfs_trans_buf.c
++++ b/fs/xfs/xfs_trans_buf.c
+@@ -493,25 +493,17 @@ xfs_trans_bhold_release(xfs_trans_t *tp,
+ }
+
+ /*
+- * This is called to mark bytes first through last inclusive of the given
+- * buffer as needing to be logged when the transaction is committed.
+- * The buffer must already be associated with the given transaction.
+- *
+- * First and last are numbers relative to the beginning of this buffer,
+- * so the first byte in the buffer is numbered 0 regardless of the
+- * value of b_blkno.
++ * Mark a buffer dirty in the transaction.
+ */
+ void
+-xfs_trans_log_buf(xfs_trans_t *tp,
+- xfs_buf_t *bp,
+- uint first,
+- uint last)
++xfs_trans_dirty_buf(
++ struct xfs_trans *tp,
++ struct xfs_buf *bp)
+ {
+- xfs_buf_log_item_t *bip = bp->b_fspriv;
++ struct xfs_buf_log_item *bip = bp->b_fspriv;
+
+ ASSERT(bp->b_transp == tp);
+ ASSERT(bip != NULL);
+- ASSERT(first <= last && last < BBTOB(bp->b_length));
+ ASSERT(bp->b_iodone == NULL ||
+ bp->b_iodone == xfs_buf_iodone_callbacks);
+
+@@ -531,8 +523,6 @@ xfs_trans_log_buf(xfs_trans_t *tp,
+ bp->b_iodone = xfs_buf_iodone_callbacks;
+ bip->bli_item.li_cb = xfs_buf_iodone;
+
+- trace_xfs_trans_log_buf(bip);
+-
+ /*
+ * If we invalidated the buffer within this transaction, then
+ * cancel the invalidation now that we're dirtying the buffer
+@@ -545,15 +535,39 @@ xfs_trans_log_buf(xfs_trans_t *tp,
+ bp->b_flags &= ~XBF_STALE;
+ bip->__bli_format.blf_flags &= ~XFS_BLF_CANCEL;
+ }
++ bip->bli_flags |= XFS_BLI_DIRTY | XFS_BLI_LOGGED;
+
+ tp->t_flags |= XFS_TRANS_DIRTY;
+ bip->bli_item.li_desc->lid_flags |= XFS_LID_DIRTY;
++}
++
++/*
++ * This is called to mark bytes first through last inclusive of the given
++ * buffer as needing to be logged when the transaction is committed.
++ * The buffer must already be associated with the given transaction.
++ *
++ * First and last are numbers relative to the beginning of this buffer,
++ * so the first byte in the buffer is numbered 0 regardless of the
++ * value of b_blkno.
++ */
++void
++xfs_trans_log_buf(
++ struct xfs_trans *tp,
++ struct xfs_buf *bp,
++ uint first,
++ uint last)
++{
++ struct xfs_buf_log_item *bip = bp->b_fspriv;
++
++ ASSERT(first <= last && last < BBTOB(bp->b_length));
++
++ xfs_trans_dirty_buf(tp, bp);
+
+ /*
+ * If we have an ordered buffer we are not logging any dirty range but
+ * it still needs to be marked dirty and that it has been logged.
+ */
+- bip->bli_flags |= XFS_BLI_DIRTY | XFS_BLI_LOGGED;
++ trace_xfs_trans_log_buf(bip);
+ if (!(bip->bli_flags & XFS_BLI_ORDERED))
+ xfs_buf_item_log(bip, first, last);
+ }
--- /dev/null
+From foo@baz Mon Sep 18 10:16:36 CEST 2017
+From: Christoph Hellwig <hch@lst.de>
+Date: Sun, 17 Sep 2017 14:07:07 -0700
+Subject: xfs: relog dirty buffers during swapext bmbt owner change
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, Brian Foster <bfoster@redhat.com>, "Darrick J . Wong" <darrick.wong@oracle.com>
+Message-ID: <20170917210712.10804-43-hch@lst.de>
+
+From: Brian Foster <bfoster@redhat.com>
+
+commit 2dd3d709fc4338681a3aa61658122fa8faa5a437 upstream.
+
+The owner change bmbt scan that occurs during extent swap operations
+does not handle ordered buffer failures. Buffers that cannot be
+marked ordered must be physically logged so previously dirty ranges
+of the buffer can be relogged in the transaction.
+
+Since the bmbt scan may need to process and potentially log a large
+number of blocks, we can't expect to complete this operation in a
+single transaction. Update extent swap to use a permanent
+transaction with enough log reservation to physically log a buffer.
+Update the bmbt scan to physically log any buffers that cannot be
+ordered and to terminate the scan with -EAGAIN. On -EAGAIN, the
+caller rolls the transaction and restarts the scan. Finally, update
+the bmbt scan helper function to skip bmbt blocks that already match
+the expected owner so they are not reprocessed after scan restarts.
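+
+The retry loop, in sketch form (simplified from xfs_swap_change_owner()
+in the diff below):
+
+	do {
+		error = xfs_bmbt_change_owner(tp, ip, XFS_DATA_FORK,
+					      ip->i_ino, NULL);
+		if (error != -EAGAIN)
+			break;			/* done, or a fatal error */
+		error = xfs_trans_roll(&tp, NULL);
+		if (error)
+			break;
+		/* rejoin and redirty both inodes, then rescan */
+	} while (true);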
+
+Signed-off-by: Brian Foster <bfoster@redhat.com>
+Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+[darrick: fix the xfs_trans_roll call]
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/libxfs/xfs_btree.c | 26 ++++++++++++++------
+ fs/xfs/xfs_bmap_util.c | 59 +++++++++++++++++++++++++++++++++++++---------
+ 2 files changed, 66 insertions(+), 19 deletions(-)
+
+--- a/fs/xfs/libxfs/xfs_btree.c
++++ b/fs/xfs/libxfs/xfs_btree.c
+@@ -4435,10 +4435,15 @@ xfs_btree_block_change_owner(
+
+ /* modify the owner */
+ block = xfs_btree_get_block(cur, level, &bp);
+- if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
++ if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
++ if (block->bb_u.l.bb_owner == cpu_to_be64(bbcoi->new_owner))
++ return 0;
+ block->bb_u.l.bb_owner = cpu_to_be64(bbcoi->new_owner);
+- else
++ } else {
++ if (block->bb_u.s.bb_owner == cpu_to_be32(bbcoi->new_owner))
++ return 0;
+ block->bb_u.s.bb_owner = cpu_to_be32(bbcoi->new_owner);
++ }
+
+ /*
+ * If the block is a root block hosted in an inode, we might not have a
+@@ -4447,14 +4452,19 @@ xfs_btree_block_change_owner(
+ * block is formatted into the on-disk inode fork. We still change it,
+ * though, so everything is consistent in memory.
+ */
+- if (bp) {
+- if (cur->bc_tp)
+- xfs_trans_ordered_buf(cur->bc_tp, bp);
+- else
+- xfs_buf_delwri_queue(bp, bbcoi->buffer_list);
+- } else {
++ if (!bp) {
+ ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
+ ASSERT(level == cur->bc_nlevels - 1);
++ return 0;
++ }
++
++ if (cur->bc_tp) {
++ if (!xfs_trans_ordered_buf(cur->bc_tp, bp)) {
++ xfs_btree_log_block(cur, bp, XFS_BB_OWNER);
++ return -EAGAIN;
++ }
++ } else {
++ xfs_buf_delwri_queue(bp, bbcoi->buffer_list);
+ }
+
+ return 0;
+--- a/fs/xfs/xfs_bmap_util.c
++++ b/fs/xfs/xfs_bmap_util.c
+@@ -1914,6 +1914,48 @@ xfs_swap_extent_forks(
+ return 0;
+ }
+
++/*
++ * Fix up the owners of the bmbt blocks to refer to the current inode. The
++ * change owner scan attempts to order all modified buffers in the current
++ * transaction. In the event of ordered buffer failure, the offending buffer is
++ * physically logged as a fallback and the scan returns -EAGAIN. We must roll
++ * the transaction in this case to replenish the fallback log reservation and
++ * restart the scan. This process repeats until the scan completes.
++ */
++static int
++xfs_swap_change_owner(
++ struct xfs_trans **tpp,
++ struct xfs_inode *ip,
++ struct xfs_inode *tmpip)
++{
++ int error;
++ struct xfs_trans *tp = *tpp;
++
++ do {
++ error = xfs_bmbt_change_owner(tp, ip, XFS_DATA_FORK, ip->i_ino,
++ NULL);
++ /* success or fatal error */
++ if (error != -EAGAIN)
++ break;
++
++ error = xfs_trans_roll(tpp, NULL);
++ if (error)
++ break;
++ tp = *tpp;
++
++ /*
++ * Redirty both inodes so they can relog and keep the log tail
++ * moving forward.
++ */
++ xfs_trans_ijoin(tp, ip, 0);
++ xfs_trans_ijoin(tp, tmpip, 0);
++ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
++ xfs_trans_log_inode(tp, tmpip, XFS_ILOG_CORE);
++ } while (true);
++
++ return error;
++}
++
+ int
+ xfs_swap_extents(
+ struct xfs_inode *ip, /* target inode */
+@@ -1927,8 +1969,8 @@ xfs_swap_extents(
+ int error = 0;
+ int lock_flags;
+ struct xfs_ifork *cowfp;
+- __uint64_t f;
+- int resblks;
++ uint64_t f;
++ int resblks = 0;
+
+ /*
+ * Lock the inodes against other IO, page faults and truncate to
+@@ -1976,11 +2018,8 @@ xfs_swap_extents(
+ XFS_SWAP_RMAP_SPACE_RES(mp,
+ XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK),
+ XFS_DATA_FORK);
+- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks,
+- 0, 0, &tp);
+- } else
+- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0,
+- 0, 0, &tp);
++ }
++ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, &tp);
+ if (error)
+ goto out_unlock;
+
+@@ -2072,14 +2111,12 @@ xfs_swap_extents(
+ * inode number of the current inode.
+ */
+ if (src_log_flags & XFS_ILOG_DOWNER) {
+- error = xfs_bmbt_change_owner(tp, ip, XFS_DATA_FORK,
+- ip->i_ino, NULL);
++ error = xfs_swap_change_owner(&tp, ip, tip);
+ if (error)
+ goto out_trans_cancel;
+ }
+ if (target_log_flags & XFS_ILOG_DOWNER) {
+- error = xfs_bmbt_change_owner(tp, tip, XFS_DATA_FORK,
+- tip->i_ino, NULL);
++ error = xfs_swap_change_owner(&tp, tip, ip);
+ if (error)
+ goto out_trans_cancel;
+ }
--- /dev/null
+From foo@baz Mon Sep 18 10:16:36 CEST 2017
+From: Christoph Hellwig <hch@lst.de>
+Date: Sun, 17 Sep 2017 14:07:00 -0700
+Subject: xfs: remove unnecessary dirty bli format check for ordered bufs
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, Brian Foster <bfoster@redhat.com>, "Darrick J . Wong" <darrick.wong@oracle.com>
+Message-ID: <20170917210712.10804-36-hch@lst.de>
+
+From: Brian Foster <bfoster@redhat.com>
+
+commit 6453c65d3576bc3e602abb5add15f112755c08ca upstream.
+
+xfs_buf_item_unlock() historically checked the dirty state of the
+buffer by manually checking the buffer log formats for dirty
+segments. The introduction of ordered buffers invalidated this check
+because ordered buffers have dirty bli's but no dirty (logged)
+segments. The check was updated to accommodate ordered buffers by
+looking at the bli state first and considering the blf only if the
+bli is clean.
+
+This logic is safe but unnecessary. There is no valid case where the
+bli is clean yet the blf has dirty segments. The bli is set dirty
+whenever the blf is logged (via xfs_trans_log_buf()) and the blf is
+cleared in the only place BLI_DIRTY is cleared (xfs_trans_binval()).
+
+Remove the conditional blf dirty checks and replace with an assert
+that should catch any discrepancies between bli and blf dirty
+states. Refactor the old blf dirty check into a helper function to
+be used by the assert.
+
+Signed-off-by: Brian Foster <bfoster@redhat.com>
+Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/xfs_buf_item.c | 62 +++++++++++++++++++++++++-------------------------
+ fs/xfs/xfs_buf_item.h | 1
+ 2 files changed, 33 insertions(+), 30 deletions(-)
+
+--- a/fs/xfs/xfs_buf_item.c
++++ b/fs/xfs/xfs_buf_item.c
+@@ -575,26 +575,18 @@ xfs_buf_item_unlock(
+ {
+ struct xfs_buf_log_item *bip = BUF_ITEM(lip);
+ struct xfs_buf *bp = bip->bli_buf;
+- bool clean;
+- bool aborted;
+- int flags;
++ bool aborted = !!(lip->li_flags & XFS_LI_ABORTED);
++ bool hold = !!(bip->bli_flags & XFS_BLI_HOLD);
++ bool dirty = !!(bip->bli_flags & XFS_BLI_DIRTY);
++ bool ordered = !!(bip->bli_flags & XFS_BLI_ORDERED);
+
+ /* Clear the buffer's association with this transaction. */
+ bp->b_transp = NULL;
+
+ /*
+- * If this is a transaction abort, don't return early. Instead, allow
+- * the brelse to happen. Normally it would be done for stale
+- * (cancelled) buffers at unpin time, but we'll never go through the
+- * pin/unpin cycle if we abort inside commit.
+- */
+- aborted = (lip->li_flags & XFS_LI_ABORTED) ? true : false;
+- /*
+- * Before possibly freeing the buf item, copy the per-transaction state
+- * so we can reference it safely later after clearing it from the
+- * buffer log item.
++ * The per-transaction state has been copied above so clear it from the
++ * bli.
+ */
+- flags = bip->bli_flags;
+ bip->bli_flags &= ~(XFS_BLI_LOGGED | XFS_BLI_HOLD | XFS_BLI_ORDERED);
+
+ /*
+@@ -602,7 +594,7 @@ xfs_buf_item_unlock(
+ * unlock the buffer and free the buf item when the buffer is unpinned
+ * for the last time.
+ */
+- if (flags & XFS_BLI_STALE) {
++ if (bip->bli_flags & XFS_BLI_STALE) {
+ trace_xfs_buf_item_unlock_stale(bip);
+ ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL);
+ if (!aborted) {
+@@ -620,20 +612,11 @@ xfs_buf_item_unlock(
+ * regardless of whether it is dirty or not. A dirty abort implies a
+ * shutdown, anyway.
+ *
+- * Ordered buffers are dirty but may have no recorded changes, so ensure
+- * we only release clean items here.
++ * The bli dirty state should match whether the blf has logged segments
++ * except for ordered buffers, where only the bli should be dirty.
+ */
+- clean = (flags & XFS_BLI_DIRTY) ? false : true;
+- if (clean) {
+- int i;
+- for (i = 0; i < bip->bli_format_count; i++) {
+- if (!xfs_bitmap_empty(bip->bli_formats[i].blf_data_map,
+- bip->bli_formats[i].blf_map_size)) {
+- clean = false;
+- break;
+- }
+- }
+- }
++ ASSERT((!ordered && dirty == xfs_buf_item_dirty_format(bip)) ||
++ (ordered && dirty && !xfs_buf_item_dirty_format(bip)));
+
+ /*
+ * Clean buffers, by definition, cannot be in the AIL. However, aborted
+@@ -652,11 +635,11 @@ xfs_buf_item_unlock(
+ ASSERT(XFS_FORCED_SHUTDOWN(lip->li_mountp));
+ xfs_trans_ail_remove(lip, SHUTDOWN_LOG_IO_ERROR);
+ xfs_buf_item_relse(bp);
+- } else if (clean)
++ } else if (!dirty)
+ xfs_buf_item_relse(bp);
+ }
+
+- if (!(flags & XFS_BLI_HOLD))
++ if (!hold)
+ xfs_buf_relse(bp);
+ }
+
+@@ -945,6 +928,25 @@ xfs_buf_item_log(
+ }
+
+
++/*
++ * Return true if the buffer has any ranges logged/dirtied by a transaction,
++ * false otherwise.
++ */
++bool
++xfs_buf_item_dirty_format(
++ struct xfs_buf_log_item *bip)
++{
++ int i;
++
++ for (i = 0; i < bip->bli_format_count; i++) {
++ if (!xfs_bitmap_empty(bip->bli_formats[i].blf_data_map,
++ bip->bli_formats[i].blf_map_size))
++ return true;
++ }
++
++ return false;
++}
++
+ STATIC void
+ xfs_buf_item_free(
+ xfs_buf_log_item_t *bip)
+--- a/fs/xfs/xfs_buf_item.h
++++ b/fs/xfs/xfs_buf_item.h
+@@ -64,6 +64,7 @@ typedef struct xfs_buf_log_item {
+ int xfs_buf_item_init(struct xfs_buf *, struct xfs_mount *);
+ void xfs_buf_item_relse(struct xfs_buf *);
+ void xfs_buf_item_log(xfs_buf_log_item_t *, uint, uint);
++bool xfs_buf_item_dirty_format(struct xfs_buf_log_item *);
+ void xfs_buf_attach_iodone(struct xfs_buf *,
+ void(*)(struct xfs_buf *, xfs_log_item_t *),
+ xfs_log_item_t *);
--- /dev/null
+From foo@baz Mon Sep 18 10:16:36 CEST 2017
+From: Christoph Hellwig <hch@lst.de>
+Date: Sun, 17 Sep 2017 14:06:48 -0700
+Subject: xfs: remove xfs_trans_ail_delete_bulk
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, "Darrick J . Wong" <darrick.wong@oracle.com>
+Message-ID: <20170917210712.10804-24-hch@lst.de>
+
+commit 27af1bbf524459962d1477a38ac6e0b7f79aaecc upstream.
+
+xfs_iflush_done uses an on-stack variable length array to pass the log
+items to be deleted to xfs_trans_ail_delete_bulk. On-stack VLAs are a
+nasty gcc extension that can lead to unbounded stack allocations, but
+fortunately we can easily avoid them by simply open coding
+xfs_trans_ail_delete_bulk in xfs_iflush_done, which is the only caller
+of it except for the single-item xfs_trans_ail_delete.
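+
+For illustration only (generic C, not the kernel code): a VLA's stack
+footprint scales with a runtime value, which is exactly what the small,
+fixed-size kernel stack cannot tolerate:
+
+	void process_items(struct item **items, int n)
+	{
+		struct item *batch[n];	/* n * sizeof(ptr) bytes of stack */
+		/* fine for small n, stack overflow for large n */
+	}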
+
+Signed-off-by: Christoph Hellwig <hch@lst.de>
+Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/xfs_inode_item.c | 29 +++++++++++--------
+ fs/xfs/xfs_trans_ail.c | 73 +++++++++++++++++++++++-------------------------
+ fs/xfs/xfs_trans_priv.h | 15 +--------
+ 3 files changed, 56 insertions(+), 61 deletions(-)
+
+--- a/fs/xfs/xfs_inode_item.c
++++ b/fs/xfs/xfs_inode_item.c
+@@ -731,22 +731,27 @@ xfs_iflush_done(
+ * holding the lock before removing the inode from the AIL.
+ */
+ if (need_ail) {
+- struct xfs_log_item *log_items[need_ail];
+- int i = 0;
++ bool mlip_changed = false;
++
++ /* this is an opencoded batch version of xfs_trans_ail_delete */
+ spin_lock(&ailp->xa_lock);
+ for (blip = lip; blip; blip = blip->li_bio_list) {
+- iip = INODE_ITEM(blip);
+- if (iip->ili_logged &&
+- blip->li_lsn == iip->ili_flush_lsn) {
+- log_items[i++] = blip;
+- }
+- ASSERT(i <= need_ail);
++ if (INODE_ITEM(blip)->ili_logged &&
++ blip->li_lsn == INODE_ITEM(blip)->ili_flush_lsn)
++ mlip_changed |= xfs_ail_delete_one(ailp, blip);
+ }
+- /* xfs_trans_ail_delete_bulk() drops the AIL lock. */
+- xfs_trans_ail_delete_bulk(ailp, log_items, i,
+- SHUTDOWN_CORRUPT_INCORE);
+- }
+
++ if (mlip_changed) {
++ if (!XFS_FORCED_SHUTDOWN(ailp->xa_mount))
++ xlog_assign_tail_lsn_locked(ailp->xa_mount);
++ if (list_empty(&ailp->xa_ail))
++ wake_up_all(&ailp->xa_empty);
++ }
++ spin_unlock(&ailp->xa_lock);
++
++ if (mlip_changed)
++ xfs_log_space_wake(ailp->xa_mount);
++ }
+
+ /*
+ * clean up and unlock the flush lock now we are done. We can clear the
+--- a/fs/xfs/xfs_trans_ail.c
++++ b/fs/xfs/xfs_trans_ail.c
+@@ -684,8 +684,23 @@ xfs_trans_ail_update_bulk(
+ }
+ }
+
+-/*
+- * xfs_trans_ail_delete_bulk - remove multiple log items from the AIL
++bool
++xfs_ail_delete_one(
++ struct xfs_ail *ailp,
++ struct xfs_log_item *lip)
++{
++ struct xfs_log_item *mlip = xfs_ail_min(ailp);
++
++ trace_xfs_ail_delete(lip, mlip->li_lsn, lip->li_lsn);
++ xfs_ail_delete(ailp, lip);
++ lip->li_flags &= ~XFS_LI_IN_AIL;
++ lip->li_lsn = 0;
++
++ return mlip == lip;
++}
++
++/**
+ * Remove a log item from the AIL
+ *
+ * @xfs_trans_ail_delete_bulk takes an array of log items that all need to
+ * removed from the AIL. The caller is already holding the AIL lock, and done
+@@ -706,52 +721,36 @@ xfs_trans_ail_update_bulk(
+ * before returning.
+ */
+ void
+-xfs_trans_ail_delete_bulk(
++xfs_trans_ail_delete(
+ struct xfs_ail *ailp,
+- struct xfs_log_item **log_items,
+- int nr_items,
++ struct xfs_log_item *lip,
+ int shutdown_type) __releases(ailp->xa_lock)
+ {
+- xfs_log_item_t *mlip;
+- int mlip_changed = 0;
+- int i;
+-
+- mlip = xfs_ail_min(ailp);
+-
+- for (i = 0; i < nr_items; i++) {
+- struct xfs_log_item *lip = log_items[i];
+- if (!(lip->li_flags & XFS_LI_IN_AIL)) {
+- struct xfs_mount *mp = ailp->xa_mount;
+-
+- spin_unlock(&ailp->xa_lock);
+- if (!XFS_FORCED_SHUTDOWN(mp)) {
+- xfs_alert_tag(mp, XFS_PTAG_AILDELETE,
+- "%s: attempting to delete a log item that is not in the AIL",
+- __func__);
+- xfs_force_shutdown(mp, shutdown_type);
+- }
+- return;
+- }
++ struct xfs_mount *mp = ailp->xa_mount;
++ bool mlip_changed;
+
+- trace_xfs_ail_delete(lip, mlip->li_lsn, lip->li_lsn);
+- xfs_ail_delete(ailp, lip);
+- lip->li_flags &= ~XFS_LI_IN_AIL;
+- lip->li_lsn = 0;
+- if (mlip == lip)
+- mlip_changed = 1;
++ if (!(lip->li_flags & XFS_LI_IN_AIL)) {
++ spin_unlock(&ailp->xa_lock);
++ if (!XFS_FORCED_SHUTDOWN(mp)) {
++ xfs_alert_tag(mp, XFS_PTAG_AILDELETE,
++ "%s: attempting to delete a log item that is not in the AIL",
++ __func__);
++ xfs_force_shutdown(mp, shutdown_type);
++ }
++ return;
+ }
+
++ mlip_changed = xfs_ail_delete_one(ailp, lip);
+ if (mlip_changed) {
+- if (!XFS_FORCED_SHUTDOWN(ailp->xa_mount))
+- xlog_assign_tail_lsn_locked(ailp->xa_mount);
++ if (!XFS_FORCED_SHUTDOWN(mp))
++ xlog_assign_tail_lsn_locked(mp);
+ if (list_empty(&ailp->xa_ail))
+ wake_up_all(&ailp->xa_empty);
+- spin_unlock(&ailp->xa_lock);
++ }
+
++ spin_unlock(&ailp->xa_lock);
++ if (mlip_changed)
+ xfs_log_space_wake(ailp->xa_mount);
+- } else {
+- spin_unlock(&ailp->xa_lock);
+- }
+ }
+
+ int
+--- a/fs/xfs/xfs_trans_priv.h
++++ b/fs/xfs/xfs_trans_priv.h
+@@ -106,18 +106,9 @@ xfs_trans_ail_update(
+ xfs_trans_ail_update_bulk(ailp, NULL, &lip, 1, lsn);
+ }
+
+-void xfs_trans_ail_delete_bulk(struct xfs_ail *ailp,
+- struct xfs_log_item **log_items, int nr_items,
+- int shutdown_type)
+- __releases(ailp->xa_lock);
+-static inline void
+-xfs_trans_ail_delete(
+- struct xfs_ail *ailp,
+- xfs_log_item_t *lip,
+- int shutdown_type) __releases(ailp->xa_lock)
+-{
+- xfs_trans_ail_delete_bulk(ailp, &lip, 1, shutdown_type);
+-}
++bool xfs_ail_delete_one(struct xfs_ail *ailp, struct xfs_log_item *lip);
++void xfs_trans_ail_delete(struct xfs_ail *ailp, struct xfs_log_item *lip,
++ int shutdown_type) __releases(ailp->xa_lock);
+
+ static inline void
+ xfs_trans_ail_remove(
--- /dev/null
+From foo@baz Mon Sep 18 10:16:36 CEST 2017
+From: Christoph Hellwig <hch@lst.de>
+Date: Sun, 17 Sep 2017 14:07:04 -0700
+Subject: xfs: skip bmbt block ino validation during owner change
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, Brian Foster <bfoster@redhat.com>, "Darrick J . Wong" <darrick.wong@oracle.com>
+Message-ID: <20170917210712.10804-40-hch@lst.de>
+
+From: Brian Foster <bfoster@redhat.com>
+
+commit 99c794c639a65cc7b74f30a674048fd100fe9ac8 upstream.
+
+Extent swap uses xfs_btree_visit_blocks() to fix up bmbt block
+owners on v5 (!rmapbt) filesystems. The bmbt scan uses
+xfs_btree_lookup_get_block() to read bmbt blocks which verifies the
+current owner of the block against the parent inode of the bmbt.
+This works during extent swap because the bmbt owners are updated to
+the opposite inode number before the inode extent forks are swapped.
+
+The modified bmbt blocks are marked as ordered buffers which allows
+everything to commit in a single transaction. If the transaction
+commits to the log and the system crashes such that recovery of the
+extent swap is required, log recovery restarts the bmbt scan to fix
+up any bmbt blocks that may have not been written back before the
+crash. The log recovery bmbt scan occurs after the inode forks have
+been swapped, however. This causes the bmbt block owner verification
+to fail, leads to log recovery failure and requires xfs_repair to
+zap the log to recover.
+
+Define a new invalid inode owner flag to inform the btree block
+lookup mechanism that the current inode may be invalid with respect
+to the current owner of the bmbt block. Set this flag on the cursor
+used for change owner scans to allow this operation to work at
+runtime and during log recovery.
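+
+In sketch form, the owner check in xfs_btree_lookup_get_block() becomes
+(has_crc, long_ptrs and block_owner are shorthand for the real
+expressions):
+
+	if (has_crc && long_ptrs &&
+	    !(cur->bc_private.b.flags & XFS_BTCUR_BPRV_INVALID_OWNER) &&
+	    block_owner != cur->bc_private.b.ip->i_ino)
+		goto out_bad;		/* treated as corruption */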
+
+Signed-off-by: Brian Foster <bfoster@redhat.com>
+Fixes: bb3be7e7c ("xfs: check for bogus values in btree block headers")
+Cc: stable@vger.kernel.org
+Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/libxfs/xfs_bmap_btree.c | 1 +
+ fs/xfs/libxfs/xfs_btree.c | 1 +
+ fs/xfs/libxfs/xfs_btree.h | 3 ++-
+ 3 files changed, 4 insertions(+), 1 deletion(-)
+
+--- a/fs/xfs/libxfs/xfs_bmap_btree.c
++++ b/fs/xfs/libxfs/xfs_bmap_btree.c
+@@ -888,6 +888,7 @@ xfs_bmbt_change_owner(
+ cur = xfs_bmbt_init_cursor(ip->i_mount, tp, ip, whichfork);
+ if (!cur)
+ return -ENOMEM;
++ cur->bc_private.b.flags |= XFS_BTCUR_BPRV_INVALID_OWNER;
+
+ error = xfs_btree_change_owner(cur, new_owner, buffer_list);
+ xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
+--- a/fs/xfs/libxfs/xfs_btree.c
++++ b/fs/xfs/libxfs/xfs_btree.c
+@@ -1774,6 +1774,7 @@ xfs_btree_lookup_get_block(
+
+ /* Check the inode owner since the verifiers don't. */
+ if (xfs_sb_version_hascrc(&cur->bc_mp->m_sb) &&
++ !(cur->bc_private.b.flags & XFS_BTCUR_BPRV_INVALID_OWNER) &&
+ (cur->bc_flags & XFS_BTREE_LONG_PTRS) &&
+ be64_to_cpu((*blkp)->bb_u.l.bb_owner) !=
+ cur->bc_private.b.ip->i_ino)
+--- a/fs/xfs/libxfs/xfs_btree.h
++++ b/fs/xfs/libxfs/xfs_btree.h
+@@ -268,7 +268,8 @@ typedef struct xfs_btree_cur
+ short forksize; /* fork's inode space */
+ char whichfork; /* data or attr fork */
+ char flags; /* flags */
+-#define XFS_BTCUR_BPRV_WASDEL 1 /* was delayed */
++#define XFS_BTCUR_BPRV_WASDEL (1<<0) /* was delayed */
++#define XFS_BTCUR_BPRV_INVALID_OWNER (1<<1) /* for ext swap */
+ } b;
+ } bc_private; /* per-btree type data */
+ } xfs_btree_cur_t;
--- /dev/null
+From foo@baz Mon Sep 18 10:16:36 CEST 2017
+From: Christoph Hellwig <hch@lst.de>
+Date: Sun, 17 Sep 2017 14:06:56 -0700
+Subject: xfs: stop searching for free slots in an inode chunk when there are none
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, Carlos Maiolino <cmaiolino@redhat.com>, "Darrick J . Wong" <darrick.wong@oracle.com>
+Message-ID: <20170917210712.10804-32-hch@lst.de>
+
+From: Carlos Maiolino <cmaiolino@redhat.com>
+
+commit 2d32311cf19bfb8c1d2b4601974ddd951f9cfd0b upstream.
+
+In a filesystem without finobt, the space manager selects an AG to allocate a
+new inode in, and xfs_dialloc_ag_inobt() then searches that AG for a chunk
+with a free slot.
+
+When the new inode is in the same AG as its parent, the btree is searched
+starting at the parent's record, and then retried from the top if no slot is
+available beyond the parent's record.
+
+To exit this loop, though, xfs_dialloc_ag_inobt() relies on the btree having
+a free slot available, since its callers relied on agi->freecount when
+deciding how/where to allocate this new inode.
+
+When agi->freecount is corrupted, showing available inodes in an AG where in
+fact there are none, this becomes an infinite loop.
+
+Add a way to stop the loop when a free slot is not found in the btree, making
+the function fall back to the whole-AG scan, which will then be able to
+detect the corruption and shut the filesystem down.
+
+As pointed out by Brian, this might impact performance, given that we no
+longer reset the search distance when we reach the end of the tree, giving it
+fewer tries before falling back to the whole-AG search; but it will only
+affect searches that start within 10 records of the end of the tree.
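+
+The bounded loop, in sketch form (simplified):
+
+	int searchdistance = 10;
+
+	while (--searchdistance > 0 && (!doneleft || !doneright)) {
+		/* walk left/right from the parent's record */
+	}
+	/* distance or btree exhausted: fall back to the whole-AG
+	 * scan, which can detect the freecount corruption */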
+
+Signed-off-by: Carlos Maiolino <cmaiolino@redhat.com>
+Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/libxfs/xfs_ialloc.c | 55 ++++++++++++++++++++++-----------------------
+ 1 file changed, 27 insertions(+), 28 deletions(-)
+
+--- a/fs/xfs/libxfs/xfs_ialloc.c
++++ b/fs/xfs/libxfs/xfs_ialloc.c
+@@ -1123,6 +1123,7 @@ xfs_dialloc_ag_inobt(
+ int error;
+ int offset;
+ int i, j;
++ int searchdistance = 10;
+
+ pag = xfs_perag_get(mp, agno);
+
+@@ -1149,7 +1150,6 @@ xfs_dialloc_ag_inobt(
+ if (pagno == agno) {
+ int doneleft; /* done, to the left */
+ int doneright; /* done, to the right */
+- int searchdistance = 10;
+
+ error = xfs_inobt_lookup(cur, pagino, XFS_LOOKUP_LE, &i);
+ if (error)
+@@ -1210,21 +1210,9 @@ xfs_dialloc_ag_inobt(
+ /*
+ * Loop until we find an inode chunk with a free inode.
+ */
+- while (!doneleft || !doneright) {
++ while (--searchdistance > 0 && (!doneleft || !doneright)) {
+ int useleft; /* using left inode chunk this time */
+
+- if (!--searchdistance) {
+- /*
+- * Not in range - save last search
+- * location and allocate a new inode
+- */
+- xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
+- pag->pagl_leftrec = trec.ir_startino;
+- pag->pagl_rightrec = rec.ir_startino;
+- pag->pagl_pagino = pagino;
+- goto newino;
+- }
+-
+ /* figure out the closer block if both are valid. */
+ if (!doneleft && !doneright) {
+ useleft = pagino -
+@@ -1268,26 +1256,37 @@ xfs_dialloc_ag_inobt(
+ goto error1;
+ }
+
+- /*
+- * We've reached the end of the btree. because
+- * we are only searching a small chunk of the
+- * btree each search, there is obviously free
+- * inodes closer to the parent inode than we
+- * are now. restart the search again.
+- */
+- pag->pagl_pagino = NULLAGINO;
+- pag->pagl_leftrec = NULLAGINO;
+- pag->pagl_rightrec = NULLAGINO;
+- xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
+- xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+- goto restart_pagno;
++ if (searchdistance <= 0) {
++ /*
++ * Not in range - save last search
++ * location and allocate a new inode
++ */
++ xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
++ pag->pagl_leftrec = trec.ir_startino;
++ pag->pagl_rightrec = rec.ir_startino;
++ pag->pagl_pagino = pagino;
++
++ } else {
++ /*
++ * We've reached the end of the btree. because
++ * we are only searching a small chunk of the
++ * btree each search, there is obviously free
++ * inodes closer to the parent inode than we
++ * are now. restart the search again.
++ */
++ pag->pagl_pagino = NULLAGINO;
++ pag->pagl_leftrec = NULLAGINO;
++ pag->pagl_rightrec = NULLAGINO;
++ xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
++ xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
++ goto restart_pagno;
++ }
+ }
+
+ /*
+ * In a different AG from the parent.
+ * See if the most recently allocated block has any free.
+ */
+-newino:
+ if (agi->agi_newino != cpu_to_be32(NULLAGINO)) {
+ error = xfs_inobt_lookup(cur, be32_to_cpu(agi->agi_newino),
+ XFS_LOOKUP_EQ, &i);
--- /dev/null
+From foo@baz Mon Sep 18 10:16:36 CEST 2017
+From: Christoph Hellwig <hch@lst.de>
+Date: Sun, 17 Sep 2017 14:06:47 -0700
+Subject: xfs: toggle readonly state around xfs_log_mount_finish
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, Eric Sandeen <sandeen@sandeen.net>, Eric Sandeen <sandeen@redhat.com>, "Darrick J . Wong" <darrick.wong@oracle.com>
+Message-ID: <20170917210712.10804-23-hch@lst.de>
+
+From: Eric Sandeen <sandeen@sandeen.net>
+
+commit 6f4a1eefdd0ad4561543270a7fceadabcca075dd upstream.
+
+When we do log recovery on a readonly mount, unlinked inode
+processing does not happen due to the readonly checks in
+xfs_inactive(), which are trying to prevent any I/O on a
+readonly mount.
+
+This is misguided - we do I/O on readonly mounts all the time,
+for consistency; for example, log recovery. So do the same
+RDONLY flag twiddling around xfs_log_mount_finish() as we
+do around xfs_log_mount(), for the same reason.
+
+This all cries out for a big rework but for now this is a
+simple fix to an obvious problem.
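+
+The flag twiddling, in sketch form (the patch does this inside
+xfs_log_mount_finish() itself):
+
+	readonly = (mp->m_flags & XFS_MOUNT_RDONLY);
+	if (readonly)
+		mp->m_flags &= ~XFS_MOUNT_RDONLY; /* let recovery do I/O */
+	/* ... unlinked inode processing ... */
+	if (readonly)
+		mp->m_flags |= XFS_MOUNT_RDONLY;  /* restore ro state */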
+
+Signed-off-by: Eric Sandeen <sandeen@redhat.com>
+Reviewed-by: Brian Foster <bfoster@redhat.com>
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/xfs_log.c | 7 +++++++
+ 1 file changed, 7 insertions(+)
+
+--- a/fs/xfs/xfs_log.c
++++ b/fs/xfs/xfs_log.c
+@@ -743,10 +743,14 @@ xfs_log_mount_finish(
+ struct xfs_mount *mp)
+ {
+ int error = 0;
++ bool readonly = (mp->m_flags & XFS_MOUNT_RDONLY);
+
+ if (mp->m_flags & XFS_MOUNT_NORECOVERY) {
+ ASSERT(mp->m_flags & XFS_MOUNT_RDONLY);
+ return 0;
++ } else if (readonly) {
++ /* Allow unlinked processing to proceed */
++ mp->m_flags &= ~XFS_MOUNT_RDONLY;
+ }
+
+ /*
+@@ -764,6 +768,9 @@ xfs_log_mount_finish(
+ xfs_log_work_queue(mp);
+ mp->m_super->s_flags &= ~MS_ACTIVE;
+
++ if (readonly)
++ mp->m_flags |= XFS_MOUNT_RDONLY;
++
+ return error;
+ }
+
--- /dev/null
+From foo@baz Mon Sep 18 10:16:36 CEST 2017
+From: Christoph Hellwig <hch@lst.de>
+Date: Sun, 17 Sep 2017 14:07:12 -0700
+Subject: xfs: use kmem_free to free return value of kmem_zalloc
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, Pan Bian <bianpan2016@163.com>, "Darrick J . Wong" <darrick.wong@oracle.com>
+Message-ID: <20170917210712.10804-48-hch@lst.de>
+
+From: Pan Bian <bianpan2016@163.com>
+
+commit 6c370590cfe0c36bcd62d548148aa65c984540b7 upstream.
+
+In function xfs_test_remount_options(), kfree() is used to free memory
+allocated by kmem_zalloc(). But it is better to use kmem_free().
+
+Signed-off-by: Pan Bian <bianpan2016@163.com>
+Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/xfs_super.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/fs/xfs/xfs_super.c
++++ b/fs/xfs/xfs_super.c
+@@ -1214,7 +1214,7 @@ xfs_test_remount_options(
+ tmp_mp->m_super = sb;
+ error = xfs_parseargs(tmp_mp, options);
+ xfs_free_fsname(tmp_mp);
+- kfree(tmp_mp);
++ kmem_free(tmp_mp);
+
+ return error;
+ }
--- /dev/null
+From foo@baz Mon Sep 18 10:16:36 CEST 2017
+From: Christoph Hellwig <hch@lst.de>
+Date: Sun, 17 Sep 2017 14:06:46 -0700
+Subject: xfs: write unmount record for ro mounts
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, Eric Sandeen <sandeen@sandeen.net>, Eric Sandeen <sandeen@redhat.com>, "Darrick J . Wong" <darrick.wong@oracle.com>
+Message-ID: <20170917210712.10804-22-hch@lst.de>
+
+From: Eric Sandeen <sandeen@sandeen.net>
+
+commit 757a69ef6cf2bf839bd4088e5609ddddd663b0c4 upstream.
+
+There are dueling comments in the xfs code about intent
+for log writes when unmounting a readonly filesystem.
+
+In xfs_mountfs, we see the intent:
+
+/*
+ * Now the log is fully replayed, we can transition to full read-only
+ * mode for read-only mounts. This will sync all the metadata and clean
+ * the log so that the recovery we just performed does not have to be
+ * replayed again on the next mount.
+ */
+
+and it calls xfs_quiesce_attr(), but by the time we get to
+xfs_log_unmount_write(), it returns early for a RDONLY mount:
+
+ * Don't write out unmount record on read-only mounts.
+
+Because of this, sequential ro mounts of a filesystem with
+a dirty log will replay the log each time, which seems odd.
+
+Fix this by writing an unmount record even for RO mounts, as long
+as norecovery wasn't specified (don't write a clean log record
+if a dirty log may still be there!) and the log device is
+writable.
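+
+The resulting skip condition, in sketch form (matching the diff below):
+
+	/* skip the unmount record only when we truly cannot write one */
+	if ((mp->m_flags & XFS_MOUNT_NORECOVERY) ||
+	    xfs_readonly_buftarg(log->l_mp->m_logdev_targp))
+		return 0;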
+
+Signed-off-by: Eric Sandeen <sandeen@redhat.com>
+Reviewed-by: Brian Foster <bfoster@redhat.com>
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/xfs_log.c | 7 +++++--
+ 1 file changed, 5 insertions(+), 2 deletions(-)
+
+--- a/fs/xfs/xfs_log.c
++++ b/fs/xfs/xfs_log.c
+@@ -812,11 +812,14 @@ xfs_log_unmount_write(xfs_mount_t *mp)
+ int error;
+
+ /*
+- * Don't write out unmount record on read-only mounts.
++ * Don't write out unmount record on norecovery mounts or ro devices.
+ * Or, if we are doing a forced umount (typically because of IO errors).
+ */
+- if (mp->m_flags & XFS_MOUNT_RDONLY)
++ if (mp->m_flags & XFS_MOUNT_NORECOVERY ||
++ xfs_readonly_buftarg(log->l_mp->m_logdev_targp)) {
++ ASSERT(mp->m_flags & XFS_MOUNT_RDONLY);
+ return 0;
++ }
+
+ error = _xfs_log_force(mp, XFS_LOG_SYNC, NULL);
+ ASSERT(error || !(XLOG_FORCED_SHUTDOWN(log)));