git.ipfire.org Git - thirdparty/kernel/stable-queue.git/commitdiff
4.9-stable patches
authorGreg Kroah-Hartman <gregkh@linuxfoundation.org>
Mon, 18 Sep 2017 08:21:23 +0000 (10:21 +0200)
committerGreg Kroah-Hartman <gregkh@linuxfoundation.org>
Mon, 18 Sep 2017 08:21:23 +0000 (10:21 +0200)
added patches:
iomap-fix-integer-truncation-issues-in-the-zeroing-and-dirtying-helpers.patch
xfs-add-infrastructure-needed-for-error-propagation-during-buffer-io-failure.patch
xfs-add-log-recovery-tracepoint-for-head-tail.patch
xfs-always-verify-the-log-tail-during-recovery.patch
xfs-check-for-race-with-xfs_reclaim_inode-in-xfs_ifree_cluster.patch
xfs-clear-ms_active-after-finishing-log-recovery.patch
xfs-disable-per-inode-dax-flag.patch
xfs-disallow-marking-previously-dirty-buffers-as-ordered.patch
xfs-don-t-leak-quotacheck-dquots-when-cow-recovery.patch
xfs-don-t-log-dirty-ranges-for-ordered-buffers.patch
xfs-don-t-set-v3-xflags-for-v2-inodes.patch
xfs-evict-all-inodes-involved-with-log-redo-item.patch
xfs-fix-incorrect-log_flushed-on-fsync.patch
xfs-fix-log-recovery-corruption-error-due-to-tail-overwrite.patch
xfs-fix-recovery-failure-when-log-record-header-wraps-log-end.patch
xfs-handle-efscorrupted-during-head-tail-verification.patch
xfs-move-bmbt-owner-change-to-last-step-of-extent-swap.patch
xfs-open-code-end_buffer_async_write-in-xfs_finish_page_writeback.patch
xfs-open-code-xfs_buf_item_dirty.patch
xfs-ordered-buffer-log-items-are-never-formatted.patch
xfs-properly-retry-failed-inode-items-in-case-of-error-during-buffer-writeback.patch
xfs-refactor-buffer-logging-into-buffer-dirtying-helper.patch
xfs-relog-dirty-buffers-during-swapext-bmbt-owner-change.patch
xfs-remove-unnecessary-dirty-bli-format-check-for-ordered-bufs.patch
xfs-remove-xfs_trans_ail_delete_bulk.patch
xfs-skip-bmbt-block-ino-validation-during-owner-change.patch
xfs-stop-searching-for-free-slots-in-an-inode-chunk-when-there-are-none.patch
xfs-toggle-readonly-state-around-xfs_log_mount_finish.patch
xfs-use-kmem_free-to-free-return-value-of-kmem_zalloc.patch
xfs-write-unmount-record-for-ro-mounts.patch

31 files changed:
queue-4.9/iomap-fix-integer-truncation-issues-in-the-zeroing-and-dirtying-helpers.patch [new file with mode: 0644]
queue-4.9/series
queue-4.9/xfs-add-infrastructure-needed-for-error-propagation-during-buffer-io-failure.patch [new file with mode: 0644]
queue-4.9/xfs-add-log-recovery-tracepoint-for-head-tail.patch [new file with mode: 0644]
queue-4.9/xfs-always-verify-the-log-tail-during-recovery.patch [new file with mode: 0644]
queue-4.9/xfs-check-for-race-with-xfs_reclaim_inode-in-xfs_ifree_cluster.patch [new file with mode: 0644]
queue-4.9/xfs-clear-ms_active-after-finishing-log-recovery.patch [new file with mode: 0644]
queue-4.9/xfs-disable-per-inode-dax-flag.patch [new file with mode: 0644]
queue-4.9/xfs-disallow-marking-previously-dirty-buffers-as-ordered.patch [new file with mode: 0644]
queue-4.9/xfs-don-t-leak-quotacheck-dquots-when-cow-recovery.patch [new file with mode: 0644]
queue-4.9/xfs-don-t-log-dirty-ranges-for-ordered-buffers.patch [new file with mode: 0644]
queue-4.9/xfs-don-t-set-v3-xflags-for-v2-inodes.patch [new file with mode: 0644]
queue-4.9/xfs-evict-all-inodes-involved-with-log-redo-item.patch [new file with mode: 0644]
queue-4.9/xfs-fix-incorrect-log_flushed-on-fsync.patch [new file with mode: 0644]
queue-4.9/xfs-fix-log-recovery-corruption-error-due-to-tail-overwrite.patch [new file with mode: 0644]
queue-4.9/xfs-fix-recovery-failure-when-log-record-header-wraps-log-end.patch [new file with mode: 0644]
queue-4.9/xfs-handle-efscorrupted-during-head-tail-verification.patch [new file with mode: 0644]
queue-4.9/xfs-move-bmbt-owner-change-to-last-step-of-extent-swap.patch [new file with mode: 0644]
queue-4.9/xfs-open-code-end_buffer_async_write-in-xfs_finish_page_writeback.patch [new file with mode: 0644]
queue-4.9/xfs-open-code-xfs_buf_item_dirty.patch [new file with mode: 0644]
queue-4.9/xfs-ordered-buffer-log-items-are-never-formatted.patch [new file with mode: 0644]
queue-4.9/xfs-properly-retry-failed-inode-items-in-case-of-error-during-buffer-writeback.patch [new file with mode: 0644]
queue-4.9/xfs-refactor-buffer-logging-into-buffer-dirtying-helper.patch [new file with mode: 0644]
queue-4.9/xfs-relog-dirty-buffers-during-swapext-bmbt-owner-change.patch [new file with mode: 0644]
queue-4.9/xfs-remove-unnecessary-dirty-bli-format-check-for-ordered-bufs.patch [new file with mode: 0644]
queue-4.9/xfs-remove-xfs_trans_ail_delete_bulk.patch [new file with mode: 0644]
queue-4.9/xfs-skip-bmbt-block-ino-validation-during-owner-change.patch [new file with mode: 0644]
queue-4.9/xfs-stop-searching-for-free-slots-in-an-inode-chunk-when-there-are-none.patch [new file with mode: 0644]
queue-4.9/xfs-toggle-readonly-state-around-xfs_log_mount_finish.patch [new file with mode: 0644]
queue-4.9/xfs-use-kmem_free-to-free-return-value-of-kmem_zalloc.patch [new file with mode: 0644]
queue-4.9/xfs-write-unmount-record-for-ro-mounts.patch [new file with mode: 0644]

diff --git a/queue-4.9/iomap-fix-integer-truncation-issues-in-the-zeroing-and-dirtying-helpers.patch b/queue-4.9/iomap-fix-integer-truncation-issues-in-the-zeroing-and-dirtying-helpers.patch
new file mode 100644 (file)
index 0000000..fff65d9
--- /dev/null
@@ -0,0 +1,49 @@
+From foo@baz Mon Sep 18 10:16:36 CEST 2017
+From: Christoph Hellwig <hch@lst.de>
+Date: Sun, 17 Sep 2017 14:06:45 -0700
+Subject: iomap: fix integer truncation issues in the zeroing and dirtying helpers
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, "Darrick J . Wong" <darrick.wong@oracle.com>
+Message-ID: <20170917210712.10804-21-hch@lst.de>
+
+From: Christoph Hellwig <hch@lst.de>
+
+commit e28ae8e428fefe2facd72cea9f29906ecb9c861d upstream.
+
+Fix the min_t calls in the zeroing and dirtying helpers to perform the
+comparisms on 64-bit types, which prevents them from incorrectly
+being truncated, and larger zeroing operations being stuck in a never
+ending loop.
+
+Special thanks to Markus Stockhausen for spotting the bug.
+
+Reported-by: Paul Menzel <pmenzel@molgen.mpg.de>
+Tested-by: Paul Menzel <pmenzel@molgen.mpg.de>
+Signed-off-by: Christoph Hellwig <hch@lst.de>
+Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/iomap.c |    4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+--- a/fs/iomap.c
++++ b/fs/iomap.c
+@@ -281,7 +281,7 @@ iomap_dirty_actor(struct inode *inode, l
+               unsigned long bytes;    /* Bytes to write to page */
+               offset = (pos & (PAGE_SIZE - 1));
+-              bytes = min_t(unsigned long, PAGE_SIZE - offset, length);
++              bytes = min_t(loff_t, PAGE_SIZE - offset, length);
+               rpage = __iomap_read_page(inode, pos);
+               if (IS_ERR(rpage))
+@@ -376,7 +376,7 @@ iomap_zero_range_actor(struct inode *ino
+               unsigned offset, bytes;
+               offset = pos & (PAGE_SIZE - 1); /* Within page */
+-              bytes = min_t(unsigned, PAGE_SIZE - offset, count);
++              bytes = min_t(loff_t, PAGE_SIZE - offset, count);
+               if (IS_DAX(inode))
+                       status = iomap_dax_zero(pos, offset, bytes, iomap);
index cc556c6deeafee4c2a1edb008fd70249dc0599a4..5c530ede7c1a89e21df9312531db5d49c91462cc 100644 (file)
@@ -45,3 +45,33 @@ xfs-fix-quotacheck-dquot-id-overflow-infinite-loop.patch
 xfs-fix-multi-ag-deadlock-in-xfs_bunmapi.patch
 xfs-fix-per-inode-dax-flag-inheritance.patch
 xfs-fix-inobt-inode-allocation-search-optimization.patch
+xfs-clear-ms_active-after-finishing-log-recovery.patch
+xfs-don-t-leak-quotacheck-dquots-when-cow-recovery.patch
+iomap-fix-integer-truncation-issues-in-the-zeroing-and-dirtying-helpers.patch
+xfs-write-unmount-record-for-ro-mounts.patch
+xfs-toggle-readonly-state-around-xfs_log_mount_finish.patch
+xfs-remove-xfs_trans_ail_delete_bulk.patch
+xfs-add-infrastructure-needed-for-error-propagation-during-buffer-io-failure.patch
+xfs-properly-retry-failed-inode-items-in-case-of-error-during-buffer-writeback.patch
+xfs-fix-recovery-failure-when-log-record-header-wraps-log-end.patch
+xfs-always-verify-the-log-tail-during-recovery.patch
+xfs-fix-log-recovery-corruption-error-due-to-tail-overwrite.patch
+xfs-handle-efscorrupted-during-head-tail-verification.patch
+xfs-add-log-recovery-tracepoint-for-head-tail.patch
+xfs-stop-searching-for-free-slots-in-an-inode-chunk-when-there-are-none.patch
+xfs-evict-all-inodes-involved-with-log-redo-item.patch
+xfs-check-for-race-with-xfs_reclaim_inode-in-xfs_ifree_cluster.patch
+xfs-open-code-xfs_buf_item_dirty.patch
+xfs-remove-unnecessary-dirty-bli-format-check-for-ordered-bufs.patch
+xfs-ordered-buffer-log-items-are-never-formatted.patch
+xfs-refactor-buffer-logging-into-buffer-dirtying-helper.patch
+xfs-don-t-log-dirty-ranges-for-ordered-buffers.patch
+xfs-skip-bmbt-block-ino-validation-during-owner-change.patch
+xfs-move-bmbt-owner-change-to-last-step-of-extent-swap.patch
+xfs-disallow-marking-previously-dirty-buffers-as-ordered.patch
+xfs-relog-dirty-buffers-during-swapext-bmbt-owner-change.patch
+xfs-disable-per-inode-dax-flag.patch
+xfs-fix-incorrect-log_flushed-on-fsync.patch
+xfs-don-t-set-v3-xflags-for-v2-inodes.patch
+xfs-open-code-end_buffer_async_write-in-xfs_finish_page_writeback.patch
+xfs-use-kmem_free-to-free-return-value-of-kmem_zalloc.patch
diff --git a/queue-4.9/xfs-add-infrastructure-needed-for-error-propagation-during-buffer-io-failure.patch b/queue-4.9/xfs-add-infrastructure-needed-for-error-propagation-during-buffer-io-failure.patch
new file mode 100644 (file)
index 0000000..662ed8f
--- /dev/null
@@ -0,0 +1,115 @@
+From foo@baz Mon Sep 18 10:16:36 CEST 2017
+From: Christoph Hellwig <hch@lst.de>
+Date: Sun, 17 Sep 2017 14:06:49 -0700
+Subject: xfs: Add infrastructure needed for error propagation during buffer IO failure
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, Carlos Maiolino <cmaiolino@redhat.com>, "Darrick J . Wong" <darrick.wong@oracle.com>
+Message-ID: <20170917210712.10804-25-hch@lst.de>
+
+From: Carlos Maiolino <cmaiolino@redhat.com>
+
+commit 0b80ae6ed13169bd3a244e71169f2cc020b0c57a upstream.
+
+With the current code, XFS never re-submit a failed buffer for IO,
+because the failed item in the buffer is kept in the flush locked state
+forever.
+
+To be able to resubmit an log item for IO, we need a way to mark an item
+as failed, if, for any reason the buffer which the item belonged to
+failed during writeback.
+
+Add a new log item callback to be used after an IO completion failure
+and make the needed clean ups.
+
+Reviewed-by: Brian Foster <bfoster@redhat.com>
+Signed-off-by: Carlos Maiolino <cmaiolino@redhat.com>
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/xfs_buf_item.c |   32 +++++++++++++++++++++++++++++++-
+ fs/xfs/xfs_trans.h    |    7 +++++--
+ 2 files changed, 36 insertions(+), 3 deletions(-)
+
+--- a/fs/xfs/xfs_buf_item.c
++++ b/fs/xfs/xfs_buf_item.c
+@@ -29,6 +29,7 @@
+ #include "xfs_error.h"
+ #include "xfs_trace.h"
+ #include "xfs_log.h"
++#include "xfs_inode.h"
+ kmem_zone_t   *xfs_buf_item_zone;
+@@ -1054,6 +1055,31 @@ xfs_buf_do_callbacks(
+       }
+ }
++/*
++ * Invoke the error state callback for each log item affected by the failed I/O.
++ *
++ * If a metadata buffer write fails with a non-permanent error, the buffer is
++ * eventually resubmitted and so the completion callbacks are not run. The error
++ * state may need to be propagated to the log items attached to the buffer,
++ * however, so the next AIL push of the item knows hot to handle it correctly.
++ */
++STATIC void
++xfs_buf_do_callbacks_fail(
++      struct xfs_buf          *bp)
++{
++      struct xfs_log_item     *next;
++      struct xfs_log_item     *lip = bp->b_fspriv;
++      struct xfs_ail          *ailp = lip->li_ailp;
++
++      spin_lock(&ailp->xa_lock);
++      for (; lip; lip = next) {
++              next = lip->li_bio_list;
++              if (lip->li_ops->iop_error)
++                      lip->li_ops->iop_error(lip, bp);
++      }
++      spin_unlock(&ailp->xa_lock);
++}
++
+ static bool
+ xfs_buf_iodone_callback_error(
+       struct xfs_buf          *bp)
+@@ -1123,7 +1149,11 @@ xfs_buf_iodone_callback_error(
+       if ((mp->m_flags & XFS_MOUNT_UNMOUNTING) && mp->m_fail_unmount)
+               goto permanent_error;
+-      /* still a transient error, higher layers will retry */
++      /*
++       * Still a transient error, run IO completion failure callbacks and let
++       * the higher layers retry the buffer.
++       */
++      xfs_buf_do_callbacks_fail(bp);
+       xfs_buf_ioerror(bp, 0);
+       xfs_buf_relse(bp);
+       return true;
+--- a/fs/xfs/xfs_trans.h
++++ b/fs/xfs/xfs_trans.h
+@@ -65,11 +65,13 @@ typedef struct xfs_log_item {
+ } xfs_log_item_t;
+ #define       XFS_LI_IN_AIL   0x1
+-#define XFS_LI_ABORTED        0x2
++#define       XFS_LI_ABORTED  0x2
++#define       XFS_LI_FAILED   0x4
+ #define XFS_LI_FLAGS \
+       { XFS_LI_IN_AIL,        "IN_AIL" }, \
+-      { XFS_LI_ABORTED,       "ABORTED" }
++      { XFS_LI_ABORTED,       "ABORTED" }, \
++      { XFS_LI_FAILED,        "FAILED" }
+ struct xfs_item_ops {
+       void (*iop_size)(xfs_log_item_t *, int *, int *);
+@@ -80,6 +82,7 @@ struct xfs_item_ops {
+       void (*iop_unlock)(xfs_log_item_t *);
+       xfs_lsn_t (*iop_committed)(xfs_log_item_t *, xfs_lsn_t);
+       void (*iop_committing)(xfs_log_item_t *, xfs_lsn_t);
++      void (*iop_error)(xfs_log_item_t *, xfs_buf_t *);
+ };
+ void  xfs_log_item_init(struct xfs_mount *mp, struct xfs_log_item *item,
diff --git a/queue-4.9/xfs-add-log-recovery-tracepoint-for-head-tail.patch b/queue-4.9/xfs-add-log-recovery-tracepoint-for-head-tail.patch
new file mode 100644 (file)
index 0000000..98aa89c
--- /dev/null
@@ -0,0 +1,65 @@
+From foo@baz Mon Sep 18 10:16:36 CEST 2017
+From: Christoph Hellwig <hch@lst.de>
+Date: Sun, 17 Sep 2017 14:06:55 -0700
+Subject: xfs: add log recovery tracepoint for head/tail
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, Brian Foster <bfoster@redhat.com>, "Darrick J . Wong" <darrick.wong@oracle.com>
+Message-ID: <20170917210712.10804-31-hch@lst.de>
+
+From: Brian Foster <bfoster@redhat.com>
+
+commit e67d3d4246e5fbb0c7c700426d11241ca9c6f473 upstream.
+
+Torn write detection and tail overwrite detection can shift the log
+head and tail respectively in the event of CRC mismatch or
+corruption errors. Add a high-level log recovery tracepoint to dump
+the final log head/tail and make those values easily attainable in
+debug/diagnostic situations.
+
+Signed-off-by: Brian Foster <bfoster@redhat.com>
+Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/xfs_log_recover.c |    2 ++
+ fs/xfs/xfs_trace.h       |   18 ++++++++++++++++++
+ 2 files changed, 20 insertions(+)
+
+--- a/fs/xfs/xfs_log_recover.c
++++ b/fs/xfs/xfs_log_recover.c
+@@ -5596,6 +5596,8 @@ xlog_do_recover(
+       xfs_buf_t       *bp;
+       xfs_sb_t        *sbp;
++      trace_xfs_log_recover(log, head_blk, tail_blk);
++
+       /*
+        * First replay the images in the log.
+        */
+--- a/fs/xfs/xfs_trace.h
++++ b/fs/xfs/xfs_trace.h
+@@ -1991,6 +1991,24 @@ DEFINE_EVENT(xfs_swap_extent_class, name
+ DEFINE_SWAPEXT_EVENT(xfs_swap_extent_before);
+ DEFINE_SWAPEXT_EVENT(xfs_swap_extent_after);
++TRACE_EVENT(xfs_log_recover,
++      TP_PROTO(struct xlog *log, xfs_daddr_t headblk, xfs_daddr_t tailblk),
++      TP_ARGS(log, headblk, tailblk),
++      TP_STRUCT__entry(
++              __field(dev_t, dev)
++              __field(xfs_daddr_t, headblk)
++              __field(xfs_daddr_t, tailblk)
++      ),
++      TP_fast_assign(
++              __entry->dev = log->l_mp->m_super->s_dev;
++              __entry->headblk = headblk;
++              __entry->tailblk = tailblk;
++      ),
++      TP_printk("dev %d:%d headblk 0x%llx tailblk 0x%llx",
++                MAJOR(__entry->dev), MINOR(__entry->dev), __entry->headblk,
++                __entry->tailblk)
++)
++
+ TRACE_EVENT(xfs_log_recover_record,
+       TP_PROTO(struct xlog *log, struct xlog_rec_header *rhead, int pass),
+       TP_ARGS(log, rhead, pass),
diff --git a/queue-4.9/xfs-always-verify-the-log-tail-during-recovery.patch b/queue-4.9/xfs-always-verify-the-log-tail-during-recovery.patch
new file mode 100644 (file)
index 0000000..76fa9b1
--- /dev/null
@@ -0,0 +1,76 @@
+From foo@baz Mon Sep 18 10:16:36 CEST 2017
+From: Christoph Hellwig <hch@lst.de>
+Date: Sun, 17 Sep 2017 14:06:52 -0700
+Subject: xfs: always verify the log tail during recovery
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, Brian Foster <bfoster@redhat.com>, "Darrick J . Wong" <darrick.wong@oracle.com>
+Message-ID: <20170917210712.10804-28-hch@lst.de>
+
+From: Brian Foster <bfoster@redhat.com>
+
+commit 5297ac1f6d7cbf45464a49b9558831f271dfc559 upstream.
+
+Log tail verification currently only occurs when torn writes are
+detected at the head of the log. This was introduced because a
+change in the head block due to torn writes can lead to a change in
+the tail block (each log record header references the current tail)
+and the tail block should be verified before log recovery proceeds.
+
+Tail corruption is possible outside of torn write scenarios,
+however. For example, partial log writes can be detected and cleared
+during the initial head/tail block discovery process. If the partial
+write coincides with a tail overwrite, the log tail is corrupted and
+recovery fails.
+
+To facilitate correct handling of log tail overwites, update log
+recovery to always perform tail verification. This is necessary to
+detect potential tail overwrite conditions when torn writes may not
+have occurred. This changes normal (i.e., no torn writes) recovery
+behavior slightly to detect and return CRC related errors near the
+tail before actual recovery starts.
+
+Signed-off-by: Brian Foster <bfoster@redhat.com>
+Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/xfs_log_recover.c |   26 +++-----------------------
+ 1 file changed, 3 insertions(+), 23 deletions(-)
+
+--- a/fs/xfs/xfs_log_recover.c
++++ b/fs/xfs/xfs_log_recover.c
+@@ -1183,31 +1183,11 @@ xlog_verify_head(
+                       ASSERT(0);
+                       return 0;
+               }
+-
+-              /*
+-               * Now verify the tail based on the updated head. This is
+-               * required because the torn writes trimmed from the head could
+-               * have been written over the tail of a previous record. Return
+-               * any errors since recovery cannot proceed if the tail is
+-               * corrupt.
+-               *
+-               * XXX: This leaves a gap in truly robust protection from torn
+-               * writes in the log. If the head is behind the tail, the tail
+-               * pushes forward to create some space and then a crash occurs
+-               * causing the writes into the previous record's tail region to
+-               * tear, log recovery isn't able to recover.
+-               *
+-               * How likely is this to occur? If possible, can we do something
+-               * more intelligent here? Is it safe to push the tail forward if
+-               * we can determine that the tail is within the range of the
+-               * torn write (e.g., the kernel can only overwrite the tail if
+-               * it has actually been pushed forward)? Alternatively, could we
+-               * somehow prevent this condition at runtime?
+-               */
+-              error = xlog_verify_tail(log, *head_blk, *tail_blk);
+       }
++      if (error)
++              return error;
+-      return error;
++      return xlog_verify_tail(log, *head_blk, *tail_blk);
+ }
+ /*
diff --git a/queue-4.9/xfs-check-for-race-with-xfs_reclaim_inode-in-xfs_ifree_cluster.patch b/queue-4.9/xfs-check-for-race-with-xfs_reclaim_inode-in-xfs_ifree_cluster.patch
new file mode 100644 (file)
index 0000000..a4c13aa
--- /dev/null
@@ -0,0 +1,85 @@
+From foo@baz Mon Sep 18 10:16:36 CEST 2017
+From: Christoph Hellwig <hch@lst.de>
+Date: Sun, 17 Sep 2017 14:06:58 -0700
+Subject: xfs: check for race with xfs_reclaim_inode() in xfs_ifree_cluster()
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, Omar Sandoval <osandov@fb.com>, "Darrick J . Wong" <darrick.wong@oracle.com>
+Message-ID: <20170917210712.10804-34-hch@lst.de>
+
+From: Omar Sandoval <osandov@fb.com>
+
+commit f2e9ad212def50bcf4c098c6288779dd97fff0f0 upstream.
+
+After xfs_ifree_cluster() finds an inode in the radix tree and verifies
+that the inode number is what it expected, xfs_reclaim_inode() can swoop
+in and free it. xfs_ifree_cluster() will then happily continue working
+on the freed inode. Most importantly, it will mark the inode stale,
+which will probably be overwritten when the inode slab object is
+reallocated, but if it has already been reallocated then we can end up
+with an inode spuriously marked stale.
+
+In 8a17d7ddedb4 ("xfs: mark reclaimed inodes invalid earlier") we added
+a second check to xfs_iflush_cluster() to detect this race, but the
+similar RCU lookup in xfs_ifree_cluster() needs the same treatment.
+
+Signed-off-by: Omar Sandoval <osandov@fb.com>
+Reviewed-by: Brian Foster <bfoster@redhat.com>
+Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/xfs_icache.c |   10 +++++-----
+ fs/xfs/xfs_inode.c  |   23 ++++++++++++++++++-----
+ 2 files changed, 23 insertions(+), 10 deletions(-)
+
+--- a/fs/xfs/xfs_icache.c
++++ b/fs/xfs/xfs_icache.c
+@@ -1078,11 +1078,11 @@ reclaim:
+        * Because we use RCU freeing we need to ensure the inode always appears
+        * to be reclaimed with an invalid inode number when in the free state.
+        * We do this as early as possible under the ILOCK so that
+-       * xfs_iflush_cluster() can be guaranteed to detect races with us here.
+-       * By doing this, we guarantee that once xfs_iflush_cluster has locked
+-       * XFS_ILOCK that it will see either a valid, flushable inode that will
+-       * serialise correctly, or it will see a clean (and invalid) inode that
+-       * it can skip.
++       * xfs_iflush_cluster() and xfs_ifree_cluster() can be guaranteed to
++       * detect races with us here. By doing this, we guarantee that once
++       * xfs_iflush_cluster() or xfs_ifree_cluster() has locked XFS_ILOCK that
++       * it will see either a valid inode that will serialise correctly, or it
++       * will see an invalid inode that it can skip.
+        */
+       spin_lock(&ip->i_flags_lock);
+       ip->i_flags = XFS_IRECLAIM;
+--- a/fs/xfs/xfs_inode.c
++++ b/fs/xfs/xfs_inode.c
+@@ -2368,11 +2368,24 @@ retry:
+                        * already marked stale. If we can't lock it, back off
+                        * and retry.
+                        */
+-                      if (ip != free_ip &&
+-                          !xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
+-                              rcu_read_unlock();
+-                              delay(1);
+-                              goto retry;
++                      if (ip != free_ip) {
++                              if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
++                                      rcu_read_unlock();
++                                      delay(1);
++                                      goto retry;
++                              }
++
++                              /*
++                               * Check the inode number again in case we're
++                               * racing with freeing in xfs_reclaim_inode().
++                               * See the comments in that function for more
++                               * information as to why the initial check is
++                               * not sufficient.
++                               */
++                              if (ip->i_ino != inum + i) {
++                                      xfs_iunlock(ip, XFS_ILOCK_EXCL);
++                                      continue;
++                              }
+                       }
+                       rcu_read_unlock();
diff --git a/queue-4.9/xfs-clear-ms_active-after-finishing-log-recovery.patch b/queue-4.9/xfs-clear-ms_active-after-finishing-log-recovery.patch
new file mode 100644 (file)
index 0000000..72835e6
--- /dev/null
@@ -0,0 +1,85 @@
+From foo@baz Mon Sep 18 10:16:36 CEST 2017
+From: Christoph Hellwig <hch@lst.de>
+Date: Sun, 17 Sep 2017 14:06:43 -0700
+Subject: xfs: clear MS_ACTIVE after finishing log recovery
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, "Darrick J. Wong" <darrick.wong@oracle.com>
+Message-ID: <20170917210712.10804-19-hch@lst.de>
+
+From: "Darrick J. Wong" <darrick.wong@oracle.com>
+
+commit 8204f8ddaafafcae074746fcf2a05a45e6827603 upstream.
+
+Way back when we established inode block-map redo log items, it was
+discovered that we needed to prevent the VFS from evicting inodes during
+log recovery because any given inode might be have bmap redo items to
+replay even if the inode has no link count and is ultimately deleted,
+and any eviction of an unlinked inode causes the inode to be truncated
+and freed too early.
+
+To make this possible, we set MS_ACTIVE so that inodes would not be torn
+down immediately upon release.  Unfortunately, this also results in the
+quota inodes not being released at all if a later part of the mount
+process should fail, because we never reclaim the inodes.  So, set
+MS_ACTIVE right before we do the last part of log recovery and clear it
+immediately after we finish the log recovery so that everything
+will be torn down properly if we abort the mount.
+
+Fixes: 17c12bcd30 ("xfs: when replaying bmap operations, don't let unlinked inodes get reaped")
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Reviewed-by: Brian Foster <bfoster@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/xfs_log.c   |   11 +++++++++++
+ fs/xfs/xfs_mount.c |   10 ----------
+ 2 files changed, 11 insertions(+), 10 deletions(-)
+
+--- a/fs/xfs/xfs_log.c
++++ b/fs/xfs/xfs_log.c
+@@ -749,9 +749,20 @@ xfs_log_mount_finish(
+               return 0;
+       }
++      /*
++       * During the second phase of log recovery, we need iget and
++       * iput to behave like they do for an active filesystem.
++       * xfs_fs_drop_inode needs to be able to prevent the deletion
++       * of inodes before we're done replaying log items on those
++       * inodes.  Turn it off immediately after recovery finishes
++       * so that we don't leak the quota inodes if subsequent mount
++       * activities fail.
++       */
++      mp->m_super->s_flags |= MS_ACTIVE;
+       error = xlog_recover_finish(mp->m_log);
+       if (!error)
+               xfs_log_work_queue(mp);
++      mp->m_super->s_flags &= ~MS_ACTIVE;
+       return error;
+ }
+--- a/fs/xfs/xfs_mount.c
++++ b/fs/xfs/xfs_mount.c
+@@ -925,15 +925,6 @@ xfs_mountfs(
+       }
+       /*
+-       * During the second phase of log recovery, we need iget and
+-       * iput to behave like they do for an active filesystem.
+-       * xfs_fs_drop_inode needs to be able to prevent the deletion
+-       * of inodes before we're done replaying log items on those
+-       * inodes.
+-       */
+-      mp->m_super->s_flags |= MS_ACTIVE;
+-
+-      /*
+        * Finish recovering the file system.  This part needed to be delayed
+        * until after the root and real-time bitmap inodes were consistently
+        * read in.
+@@ -1008,7 +999,6 @@ xfs_mountfs(
+  out_quota:
+       xfs_qm_unmount_quotas(mp);
+  out_rtunmount:
+-      mp->m_super->s_flags &= ~MS_ACTIVE;
+       xfs_rtunmount_inodes(mp);
+  out_rele_rip:
+       IRELE(rip);
diff --git a/queue-4.9/xfs-disable-per-inode-dax-flag.patch b/queue-4.9/xfs-disable-per-inode-dax-flag.patch
new file mode 100644 (file)
index 0000000..d4b60e3
--- /dev/null
@@ -0,0 +1,39 @@
+From foo@baz Mon Sep 18 10:16:36 CEST 2017
+From: Christoph Hellwig <hch@lst.de>
+Date: Sun, 17 Sep 2017 14:07:08 -0700
+Subject: xfs: disable per-inode DAX flag
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, "Darrick J . Wong" <darrick.wong@oracle.com>
+Message-ID: <20170917210712.10804-44-hch@lst.de>
+
+From: Christoph Hellwig <hch@lst.de>
+
+commit 742d84290739ae908f1b61b7d17ea382c8c0073a upstream.
+
+Currently flag switching can be used to easily crash the kernel.  Disable
+the per-inode DAX flag until that is sorted out.
+
+Signed-off-by: Christoph Hellwig <hch@lst.de>
+Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/xfs_ioctl.c |    3 ++-
+ 1 file changed, 2 insertions(+), 1 deletion(-)
+
+--- a/fs/xfs/xfs_ioctl.c
++++ b/fs/xfs/xfs_ioctl.c
+@@ -1005,11 +1005,12 @@ xfs_diflags_to_linux(
+               inode->i_flags |= S_NOATIME;
+       else
+               inode->i_flags &= ~S_NOATIME;
++#if 0 /* disabled until the flag switching races are sorted out */
+       if (xflags & FS_XFLAG_DAX)
+               inode->i_flags |= S_DAX;
+       else
+               inode->i_flags &= ~S_DAX;
+-
++#endif
+ }
+ static int
diff --git a/queue-4.9/xfs-disallow-marking-previously-dirty-buffers-as-ordered.patch b/queue-4.9/xfs-disallow-marking-previously-dirty-buffers-as-ordered.patch
new file mode 100644 (file)
index 0000000..17b9b5c
--- /dev/null
@@ -0,0 +1,97 @@
+From foo@baz Mon Sep 18 10:16:36 CEST 2017
+From: Christoph Hellwig <hch@lst.de>
+Date: Sun, 17 Sep 2017 14:07:06 -0700
+Subject: xfs: disallow marking previously dirty buffers as ordered
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, Brian Foster <bfoster@redhat.com>, "Darrick J . Wong" <darrick.wong@oracle.com>
+Message-ID: <20170917210712.10804-42-hch@lst.de>
+
+From: Brian Foster <bfoster@redhat.com>
+
+commit a5814bceea48ee1c57c4db2bd54b0c0246daf54a upstream.
+
+Ordered buffers are used in situations where the buffer is not
+physically logged but must pass through the transaction/logging
+pipeline for a particular transaction. As a result, ordered buffers
+are not unpinned and written back until the transaction commits to
+the log. Ordered buffers have a strict requirement that the target
+buffer must not be currently dirty and resident in the log pipeline
+at the time it is marked ordered. If a dirty+ordered buffer is
+committed, the buffer is reinserted to the AIL but not physically
+relogged at the LSN of the associated checkpoint. The buffer log
+item is assigned the LSN of the latest checkpoint and the AIL
+effectively releases the previously logged buffer content from the
+active log before the buffer has been written back. If the tail
+pushes forward and a filesystem crash occurs while in this state, an
+inconsistent filesystem could result.
+
+It is currently the caller responsibility to ensure an ordered
+buffer is not already dirty from a previous modification. This is
+unclear and error prone when not used in situations where it is
+guaranteed a buffer has not been previously modified (such as new
+metadata allocations).
+
+To facilitate general purpose use of ordered buffers, update
+xfs_trans_ordered_buf() to conditionally order the buffer based on
+state of the log item and return the status of the result. If the
+bli is dirty, do not order the buffer and return false. The caller
+must either physically log the buffer (having acquired the
+appropriate log reservation) or push it from the AIL to clean it
+before it can be marked ordered in the current transaction.
+
+Note that ordered buffers are currently only used in two situations:
+1.) inode chunk allocation where previously logged buffers are not
+possible and 2.) extent swap which will be updated to handle ordered
+buffer failures in a separate patch.
+
+Signed-off-by: Brian Foster <bfoster@redhat.com>
+Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/xfs_trans.h     |    2 +-
+ fs/xfs/xfs_trans_buf.c |    7 +++++--
+ 2 files changed, 6 insertions(+), 3 deletions(-)
+
+--- a/fs/xfs/xfs_trans.h
++++ b/fs/xfs/xfs_trans.h
+@@ -217,7 +217,7 @@ void               xfs_trans_bhold_release(xfs_trans_
+ void          xfs_trans_binval(xfs_trans_t *, struct xfs_buf *);
+ void          xfs_trans_inode_buf(xfs_trans_t *, struct xfs_buf *);
+ void          xfs_trans_stale_inode_buf(xfs_trans_t *, struct xfs_buf *);
+-void          xfs_trans_ordered_buf(xfs_trans_t *, struct xfs_buf *);
++bool          xfs_trans_ordered_buf(xfs_trans_t *, struct xfs_buf *);
+ void          xfs_trans_dquot_buf(xfs_trans_t *, struct xfs_buf *, uint);
+ void          xfs_trans_inode_alloc_buf(xfs_trans_t *, struct xfs_buf *);
+ void          xfs_trans_ichgtime(struct xfs_trans *, struct xfs_inode *, int);
+--- a/fs/xfs/xfs_trans_buf.c
++++ b/fs/xfs/xfs_trans_buf.c
+@@ -724,7 +724,7 @@ xfs_trans_inode_alloc_buf(
+  * transactions rather than the physical changes we make to the buffer without
+  * changing writeback ordering constraints of metadata buffers.
+  */
+-void
++bool
+ xfs_trans_ordered_buf(
+       struct xfs_trans        *tp,
+       struct xfs_buf          *bp)
+@@ -734,7 +734,9 @@ xfs_trans_ordered_buf(
+       ASSERT(bp->b_transp == tp);
+       ASSERT(bip != NULL);
+       ASSERT(atomic_read(&bip->bli_refcount) > 0);
+-      ASSERT(!xfs_buf_item_dirty_format(bip));
++
++      if (xfs_buf_item_dirty_format(bip))
++              return false;
+       bip->bli_flags |= XFS_BLI_ORDERED;
+       trace_xfs_buf_item_ordered(bip);
+@@ -744,6 +746,7 @@ xfs_trans_ordered_buf(
+        * to be marked dirty and that it has been logged.
+        */
+       xfs_trans_dirty_buf(tp, bp);
++      return true;
+ }
+ /*
diff --git a/queue-4.9/xfs-don-t-leak-quotacheck-dquots-when-cow-recovery.patch b/queue-4.9/xfs-don-t-leak-quotacheck-dquots-when-cow-recovery.patch
new file mode 100644 (file)
index 0000000..c1fe019
--- /dev/null
@@ -0,0 +1,34 @@
+From foo@baz Mon Sep 18 10:16:36 CEST 2017
+From: Christoph Hellwig <hch@lst.de>
+Date: Sun, 17 Sep 2017 14:06:44 -0700
+Subject: xfs: don't leak quotacheck dquots when cow recovery
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, "Darrick J. Wong" <darrick.wong@oracle.com>
+Message-ID: <20170917210712.10804-20-hch@lst.de>
+
+From: "Darrick J. Wong" <darrick.wong@oracle.com>
+
+commit 77aff8c76425c8f49b50d0b9009915066739e7d2 upstream.
+
+If we fail a mount on account of cow recovery errors, it's possible that
+a previous quotacheck left some dquots in memory.  The bailout clause of
+xfs_mountfs forgets to purge these, and so we leak them.  Fix that.
+
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Reviewed-by: Brian Foster <bfoster@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/xfs_mount.c |    2 ++
+ 1 file changed, 2 insertions(+)
+
+--- a/fs/xfs/xfs_mount.c
++++ b/fs/xfs/xfs_mount.c
+@@ -1004,6 +1004,8 @@ xfs_mountfs(
+       IRELE(rip);
+       cancel_delayed_work_sync(&mp->m_reclaim_work);
+       xfs_reclaim_inodes(mp, SYNC_WAIT);
++      /* Clean out dquots that might be in memory after quotacheck. */
++      xfs_qm_unmount(mp);
+  out_log_dealloc:
+       mp->m_flags |= XFS_MOUNT_UNMOUNTING;
+       xfs_log_mount_cancel(mp);
diff --git a/queue-4.9/xfs-don-t-log-dirty-ranges-for-ordered-buffers.patch b/queue-4.9/xfs-don-t-log-dirty-ranges-for-ordered-buffers.patch
new file mode 100644 (file)
index 0000000..1aa9581
--- /dev/null
@@ -0,0 +1,121 @@
+From foo@baz Mon Sep 18 10:16:36 CEST 2017
+From: Christoph Hellwig <hch@lst.de>
+Date: Sun, 17 Sep 2017 14:07:03 -0700
+Subject: xfs: don't log dirty ranges for ordered buffers
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, Brian Foster <bfoster@redhat.com>, "Darrick J . Wong" <darrick.wong@oracle.com>
+Message-ID: <20170917210712.10804-39-hch@lst.de>
+
+From: Brian Foster <bfoster@redhat.com>
+
+commit 8dc518dfa7dbd079581269e51074b3c55a65a880 upstream.
+
+Ordered buffers are attached to transactions and pushed through the
+logging infrastructure just like normal buffers with the exception
+that they are not actually written to the log. Therefore, we don't
+need to log dirty ranges of ordered buffers. xfs_trans_log_buf() is
+called on ordered buffers to set up all of the dirty state on the
+transaction, buffer and log item and prepare the buffer for I/O.
+
+Now that xfs_trans_dirty_buf() is available, call it from
+xfs_trans_ordered_buf() so the latter is now mutually exclusive with
+xfs_trans_log_buf(). This reflects the implementation of ordered
+buffers and helps eliminate confusion over the need to log ranges of
+ordered buffers just to set up internal log state.
+
+Signed-off-by: Brian Foster <bfoster@redhat.com>
+Reviewed-by: Allison Henderson <allison.henderson@oracle.com>
+Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/libxfs/xfs_btree.c  |    6 ++----
+ fs/xfs/libxfs/xfs_ialloc.c |    2 --
+ fs/xfs/xfs_trans_buf.c     |   26 ++++++++++++++------------
+ 3 files changed, 16 insertions(+), 18 deletions(-)
+
+--- a/fs/xfs/libxfs/xfs_btree.c
++++ b/fs/xfs/libxfs/xfs_btree.c
+@@ -4447,12 +4447,10 @@ xfs_btree_block_change_owner(
+        * though, so everything is consistent in memory.
+        */
+       if (bp) {
+-              if (cur->bc_tp) {
++              if (cur->bc_tp)
+                       xfs_trans_ordered_buf(cur->bc_tp, bp);
+-                      xfs_btree_log_block(cur, bp, XFS_BB_OWNER);
+-              } else {
++              else
+                       xfs_buf_delwri_queue(bp, bbcoi->buffer_list);
+-              }
+       } else {
+               ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
+               ASSERT(level == cur->bc_nlevels - 1);
+--- a/fs/xfs/libxfs/xfs_ialloc.c
++++ b/fs/xfs/libxfs/xfs_ialloc.c
+@@ -368,8 +368,6 @@ xfs_ialloc_inode_init(
+                                * transaction and pin the log appropriately.
+                                */
+                               xfs_trans_ordered_buf(tp, fbuf);
+-                              xfs_trans_log_buf(tp, fbuf, 0,
+-                                                BBTOB(fbuf->b_length) - 1);
+                       }
+               } else {
+                       fbuf->b_flags |= XBF_DONE;
+--- a/fs/xfs/xfs_trans_buf.c
++++ b/fs/xfs/xfs_trans_buf.c
+@@ -560,16 +560,12 @@ xfs_trans_log_buf(
+       struct xfs_buf_log_item *bip = bp->b_fspriv;
+       ASSERT(first <= last && last < BBTOB(bp->b_length));
++      ASSERT(!(bip->bli_flags & XFS_BLI_ORDERED));
+       xfs_trans_dirty_buf(tp, bp);
+-      /*
+-       * If we have an ordered buffer we are not logging any dirty range but
+-       * it still needs to be marked dirty and that it has been logged.
+-       */
+       trace_xfs_trans_log_buf(bip);
+-      if (!(bip->bli_flags & XFS_BLI_ORDERED))
+-              xfs_buf_item_log(bip, first, last);
++      xfs_buf_item_log(bip, first, last);
+ }
+@@ -722,12 +718,11 @@ xfs_trans_inode_alloc_buf(
+ }
+ /*
+- * Mark the buffer as ordered for this transaction. This means
+- * that the contents of the buffer are not recorded in the transaction
+- * but it is tracked in the AIL as though it was. This allows us
+- * to record logical changes in transactions rather than the physical
+- * changes we make to the buffer without changing writeback ordering
+- * constraints of metadata buffers.
++ * Mark the buffer as ordered for this transaction. This means that the contents
++ * of the buffer are not recorded in the transaction but it is tracked in the
++ * AIL as though it was. This allows us to record logical changes in
++ * transactions rather than the physical changes we make to the buffer without
++ * changing writeback ordering constraints of metadata buffers.
+  */
+ void
+ xfs_trans_ordered_buf(
+@@ -739,9 +734,16 @@ xfs_trans_ordered_buf(
+       ASSERT(bp->b_transp == tp);
+       ASSERT(bip != NULL);
+       ASSERT(atomic_read(&bip->bli_refcount) > 0);
++      ASSERT(!xfs_buf_item_dirty_format(bip));
+       bip->bli_flags |= XFS_BLI_ORDERED;
+       trace_xfs_buf_item_ordered(bip);
++
++      /*
++       * We don't log a dirty range of an ordered buffer but it still needs
++       * to be marked dirty and that it has been logged.
++       */
++      xfs_trans_dirty_buf(tp, bp);
+ }
+ /*
diff --git a/queue-4.9/xfs-don-t-set-v3-xflags-for-v2-inodes.patch b/queue-4.9/xfs-don-t-set-v3-xflags-for-v2-inodes.patch
new file mode 100644 (file)
index 0000000..9d8f073
--- /dev/null
@@ -0,0 +1,102 @@
+From foo@baz Mon Sep 18 10:16:36 CEST 2017
+From: Christoph Hellwig <hch@lst.de>
+Date: Sun, 17 Sep 2017 14:07:10 -0700
+Subject: xfs: don't set v3 xflags for v2 inodes
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, "Darrick J . Wong" <darrick.wong@oracle.com>
+Message-ID: <20170917210712.10804-46-hch@lst.de>
+
+From: Christoph Hellwig <hch@lst.de>
+
+commit dd60687ee541ca3f6df8758f38e6f22f57c42a37 upstream.
+
+Reject attempts to set XFLAGS that correspond to di_flags2 inode flags
+if the inode isn't a v3 inode, because di_flags2 only exists on v3.
+
+Signed-off-by: Christoph Hellwig <hch@lst.de>
+Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/xfs_ioctl.c |   38 +++++++++++++++++++++++++-------------
+ 1 file changed, 25 insertions(+), 13 deletions(-)
+
+--- a/fs/xfs/xfs_ioctl.c
++++ b/fs/xfs/xfs_ioctl.c
+@@ -928,16 +928,15 @@ xfs_ioc_fsgetxattr(
+       return 0;
+ }
+-STATIC void
+-xfs_set_diflags(
++STATIC uint16_t
++xfs_flags2diflags(
+       struct xfs_inode        *ip,
+       unsigned int            xflags)
+ {
+-      unsigned int            di_flags;
+-      uint64_t                di_flags2;
+-
+       /* can't set PREALLOC this way, just preserve it */
+-      di_flags = (ip->i_d.di_flags & XFS_DIFLAG_PREALLOC);
++      uint16_t                di_flags =
++              (ip->i_d.di_flags & XFS_DIFLAG_PREALLOC);
++
+       if (xflags & FS_XFLAG_IMMUTABLE)
+               di_flags |= XFS_DIFLAG_IMMUTABLE;
+       if (xflags & FS_XFLAG_APPEND)
+@@ -967,19 +966,24 @@ xfs_set_diflags(
+               if (xflags & FS_XFLAG_EXTSIZE)
+                       di_flags |= XFS_DIFLAG_EXTSIZE;
+       }
+-      ip->i_d.di_flags = di_flags;
+-      /* diflags2 only valid for v3 inodes. */
+-      if (ip->i_d.di_version < 3)
+-              return;
++      return di_flags;
++}
++
++STATIC uint64_t
++xfs_flags2diflags2(
++      struct xfs_inode        *ip,
++      unsigned int            xflags)
++{
++      uint64_t                di_flags2 =
++              (ip->i_d.di_flags2 & XFS_DIFLAG2_REFLINK);
+-      di_flags2 = (ip->i_d.di_flags2 & XFS_DIFLAG2_REFLINK);
+       if (xflags & FS_XFLAG_DAX)
+               di_flags2 |= XFS_DIFLAG2_DAX;
+       if (xflags & FS_XFLAG_COWEXTSIZE)
+               di_flags2 |= XFS_DIFLAG2_COWEXTSIZE;
+-      ip->i_d.di_flags2 = di_flags2;
++      return di_flags2;
+ }
+ STATIC void
+@@ -1020,6 +1024,7 @@ xfs_ioctl_setattr_xflags(
+       struct fsxattr          *fa)
+ {
+       struct xfs_mount        *mp = ip->i_mount;
++      uint64_t                di_flags2;
+       /* Can't change realtime flag if any extents are allocated. */
+       if ((ip->i_d.di_nextents || ip->i_delayed_blks) &&
+@@ -1050,7 +1055,14 @@ xfs_ioctl_setattr_xflags(
+           !capable(CAP_LINUX_IMMUTABLE))
+               return -EPERM;
+-      xfs_set_diflags(ip, fa->fsx_xflags);
++      /* diflags2 only valid for v3 inodes. */
++      di_flags2 = xfs_flags2diflags2(ip, fa->fsx_xflags);
++      if (di_flags2 && ip->i_d.di_version < 3)
++              return -EINVAL;
++
++      ip->i_d.di_flags = xfs_flags2diflags(ip, fa->fsx_xflags);
++      ip->i_d.di_flags2 = di_flags2;
++
+       xfs_diflags_to_linux(ip);
+       xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
+       xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
diff --git a/queue-4.9/xfs-evict-all-inodes-involved-with-log-redo-item.patch b/queue-4.9/xfs-evict-all-inodes-involved-with-log-redo-item.patch
new file mode 100644 (file)
index 0000000..2f310bc
--- /dev/null
@@ -0,0 +1,94 @@
+From foo@baz Mon Sep 18 10:16:36 CEST 2017
+From: Christoph Hellwig <hch@lst.de>
+Date: Sun, 17 Sep 2017 14:06:57 -0700
+Subject: xfs: evict all inodes involved with log redo item
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, "Darrick J. Wong" <darrick.wong@oracle.com>, viro@ZenIV.linux.org.uk
+Message-ID: <20170917210712.10804-33-hch@lst.de>
+
+From: "Darrick J. Wong" <darrick.wong@oracle.com>
+
+commit 799ea9e9c59949008770aab4e1da87f10e99dbe4 upstream.
+
+When we introduced the bmap redo log items, we set MS_ACTIVE on the
+mountpoint and XFS_IRECOVERY on the inode to prevent unlinked inodes
+from being truncated prematurely during log recovery.  This also had the
+effect of putting linked inodes on the lru instead of evicting them.
+
+Unfortunately, we neglected to find all those unreferenced lru inodes
+and evict them after finishing log recovery, which means that we leak
+them if anything goes wrong in the rest of xfs_mountfs, because the lru
+is only cleaned out on unmount.
+
+Therefore, evict unreferenced inodes in the lru list immediately
+after clearing MS_ACTIVE.
+
+Fixes: 17c12bcd30 ("xfs: when replaying bmap operations, don't let unlinked inodes get reaped")
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Cc: viro@ZenIV.linux.org.uk
+Reviewed-by: Brian Foster <bfoster@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/inode.c         |    1 +
+ fs/internal.h      |    1 -
+ fs/xfs/xfs_log.c   |   12 ++++++++++++
+ include/linux/fs.h |    1 +
+ 4 files changed, 14 insertions(+), 1 deletion(-)
+
+--- a/fs/inode.c
++++ b/fs/inode.c
+@@ -637,6 +637,7 @@ again:
+       dispose_list(&dispose);
+ }
++EXPORT_SYMBOL_GPL(evict_inodes);
+ /**
+  * invalidate_inodes  - attempt to free all inodes on a superblock
+--- a/fs/internal.h
++++ b/fs/internal.h
+@@ -136,7 +136,6 @@ extern bool atime_needs_update_rcu(const
+ extern void inode_io_list_del(struct inode *inode);
+ extern long get_nr_dirty_inodes(void);
+-extern void evict_inodes(struct super_block *);
+ extern int invalidate_inodes(struct super_block *, bool);
+ /*
+--- a/fs/xfs/xfs_log.c
++++ b/fs/xfs/xfs_log.c
+@@ -761,12 +761,24 @@ xfs_log_mount_finish(
+        * inodes.  Turn it off immediately after recovery finishes
+        * so that we don't leak the quota inodes if subsequent mount
+        * activities fail.
++       *
++       * We let all inodes involved in redo item processing end up on
++       * the LRU instead of being evicted immediately so that if we do
++       * something to an unlinked inode, the irele won't cause
++       * premature truncation and freeing of the inode, which results
++       * in log recovery failure.  We have to evict the unreferenced
++       * lru inodes after clearing MS_ACTIVE because we don't
++       * otherwise clean up the lru if there's a subsequent failure in
++       * xfs_mountfs, which leads to us leaking the inodes if nothing
++       * else (e.g. quotacheck) references the inodes before the
++       * mount failure occurs.
+        */
+       mp->m_super->s_flags |= MS_ACTIVE;
+       error = xlog_recover_finish(mp->m_log);
+       if (!error)
+               xfs_log_work_queue(mp);
+       mp->m_super->s_flags &= ~MS_ACTIVE;
++      evict_inodes(mp->m_super);
+       if (readonly)
+               mp->m_flags |= XFS_MOUNT_RDONLY;
+--- a/include/linux/fs.h
++++ b/include/linux/fs.h
+@@ -2760,6 +2760,7 @@ static inline void lockdep_annotate_inod
+ #endif
+ extern void unlock_new_inode(struct inode *);
+ extern unsigned int get_next_ino(void);
++extern void evict_inodes(struct super_block *sb);
+ extern void __iget(struct inode * inode);
+ extern void iget_failed(struct inode *);
diff --git a/queue-4.9/xfs-fix-incorrect-log_flushed-on-fsync.patch b/queue-4.9/xfs-fix-incorrect-log_flushed-on-fsync.patch
new file mode 100644 (file)
index 0000000..2bf5e66
--- /dev/null
@@ -0,0 +1,98 @@
+From foo@baz Mon Sep 18 10:16:36 CEST 2017
+From: Christoph Hellwig <hch@lst.de>
+Date: Sun, 17 Sep 2017 14:07:09 -0700
+Subject: xfs: fix incorrect log_flushed on fsync
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, Amir Goldstein <amir73il@gmail.com>, Josef Bacik <jbacik@fb.com>, "Darrick J . Wong" <darrick.wong@oracle.com>
+Message-ID: <20170917210712.10804-45-hch@lst.de>
+
+From: Amir Goldstein <amir73il@gmail.com>
+
+commit 47c7d0b19502583120c3f396c7559e7a77288a68 upstream.
+
+When calling into _xfs_log_force{,_lsn}() with a pointer
+to log_flushed variable, log_flushed will be set to 1 if:
+1. xlog_sync() is called to flush the active log buffer
+AND/OR
+2. xlog_wait() is called to wait on a syncing log buffers
+
+xfs_file_fsync() checks the value of log_flushed after
+_xfs_log_force_lsn() call to optimize away an explicit
+PREFLUSH request to the data block device after writing
+out all the file's pages to disk.
+
+This optimization is incorrect in the following sequence of events:
+
+ Task A                    Task B
+ -------------------------------------------------------
+ xfs_file_fsync()
+   _xfs_log_force_lsn()
+     xlog_sync()
+        [submit PREFLUSH]
+                           xfs_file_fsync()
+                             file_write_and_wait_range()
+                               [submit WRITE X]
+                               [endio  WRITE X]
+                             _xfs_log_force_lsn()
+                               xlog_wait()
+        [endio  PREFLUSH]
+
+The write X is not guarantied to be on persistent storage
+when PREFLUSH request in completed, because write A was submitted
+after the PREFLUSH request, but xfs_file_fsync() of task A will
+be notified of log_flushed=1 and will skip explicit flush.
+
+If the system crashes after fsync of task A, write X may not be
+present on disk after reboot.
+
+This bug was discovered and demonstrated using Josef Bacik's
+dm-log-writes target, which can be used to record block io operations
+and then replay a subset of these operations onto the target device.
+The test goes something like this:
+- Use fsx to execute ops of a file and record ops on log device
+- Every now and then fsync the file, store md5 of file and mark
+  the location in the log
+- Then replay log onto device for each mark, mount fs and compare
+  md5 of file to stored value
+
+Cc: Christoph Hellwig <hch@lst.de>
+Cc: Josef Bacik <jbacik@fb.com>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Amir Goldstein <amir73il@gmail.com>
+Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/xfs_log.c |    7 -------
+ 1 file changed, 7 deletions(-)
+
+--- a/fs/xfs/xfs_log.c
++++ b/fs/xfs/xfs_log.c
+@@ -3337,8 +3337,6 @@ maybe_sleep:
+                */
+               if (iclog->ic_state & XLOG_STATE_IOERROR)
+                       return -EIO;
+-              if (log_flushed)
+-                      *log_flushed = 1;
+       } else {
+ no_sleep:
+@@ -3442,8 +3440,6 @@ try_again:
+                               xlog_wait(&iclog->ic_prev->ic_write_wait,
+                                                       &log->l_icloglock);
+-                              if (log_flushed)
+-                                      *log_flushed = 1;
+                               already_slept = 1;
+                               goto try_again;
+                       }
+@@ -3477,9 +3473,6 @@ try_again:
+                        */
+                       if (iclog->ic_state & XLOG_STATE_IOERROR)
+                               return -EIO;
+-
+-                      if (log_flushed)
+-                              *log_flushed = 1;
+               } else {                /* just return */
+                       spin_unlock(&log->l_icloglock);
+               }
diff --git a/queue-4.9/xfs-fix-log-recovery-corruption-error-due-to-tail-overwrite.patch b/queue-4.9/xfs-fix-log-recovery-corruption-error-due-to-tail-overwrite.patch
new file mode 100644 (file)
index 0000000..8ccc26b
--- /dev/null
@@ -0,0 +1,201 @@
+From foo@baz Mon Sep 18 10:16:36 CEST 2017
+From: Christoph Hellwig <hch@lst.de>
+Date: Sun, 17 Sep 2017 14:06:53 -0700
+Subject: xfs: fix log recovery corruption error due to tail overwrite
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, Brian Foster <bfoster@redhat.com>, "Darrick J . Wong" <darrick.wong@oracle.com>
+Message-ID: <20170917210712.10804-29-hch@lst.de>
+
+From: Brian Foster <bfoster@redhat.com>
+
+commit 4a4f66eac4681378996a1837ad1ffec3a2e2981f upstream.
+
+If we consider the case where the tail (T) of the log is pinned long
+enough for the head (H) to push and block behind the tail, we can
+end up blocked in the following state without enough free space (f)
+in the log to satisfy a transaction reservation:
+
+       0       phys. log       N
+       [-------HffT---H'--T'---]
+
+The last good record in the log (before H) refers to T. The tail
+eventually pushes forward (T') leaving more free space in the log
+for writes to H. At this point, suppose space frees up in the log
+for the maximum of 8 in-core log buffers to start flushing out to
+the log. If this pushes the head from H to H', these next writes
+overwrite the previous tail T. This is safe because the items logged
+from T to T' have been written back and removed from the AIL.
+
+If the next log writes (H -> H') happen to fail and result in
+partial records in the log, the filesystem shuts down having
+overwritten T with invalid data. Log recovery correctly locates H on
+the subsequent mount, but H still refers to the now corrupted tail
+T. This results in log corruption errors and recovery failure.
+
+Since the tail overwrite results from otherwise correct runtime
+behavior, it is up to log recovery to try and deal with this
+situation. Update log recovery tail verification to run a CRC pass
+from the first record past the tail to the head. This facilitates
+error detection at T and moves the recovery tail to the first good
+record past H' (similar to truncating the head on torn write
+detection). If corruption is detected beyond the range possibly
+affected by the max number of iclogs, the log is legitimately
+corrupted and log recovery failure is expected.
+
+Signed-off-by: Brian Foster <bfoster@redhat.com>
+Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/xfs_log_recover.c |  108 +++++++++++++++++++++++++++++++++--------------
+ 1 file changed, 77 insertions(+), 31 deletions(-)
+
+--- a/fs/xfs/xfs_log_recover.c
++++ b/fs/xfs/xfs_log_recover.c
+@@ -1029,61 +1029,106 @@ out_error:
+ }
+ /*
+- * Check the log tail for torn writes. This is required when torn writes are
+- * detected at the head and the head had to be walked back to a previous record.
+- * The tail of the previous record must now be verified to ensure the torn
+- * writes didn't corrupt the previous tail.
++ * Calculate distance from head to tail (i.e., unused space in the log).
++ */
++static inline int
++xlog_tail_distance(
++      struct xlog     *log,
++      xfs_daddr_t     head_blk,
++      xfs_daddr_t     tail_blk)
++{
++      if (head_blk < tail_blk)
++              return tail_blk - head_blk;
++
++      return tail_blk + (log->l_logBBsize - head_blk);
++}
++
++/*
++ * Verify the log tail. This is particularly important when torn or incomplete
++ * writes have been detected near the front of the log and the head has been
++ * walked back accordingly.
++ *
++ * We also have to handle the case where the tail was pinned and the head
++ * blocked behind the tail right before a crash. If the tail had been pushed
++ * immediately prior to the crash and the subsequent checkpoint was only
++ * partially written, it's possible it overwrote the last referenced tail in the
++ * log with garbage. This is not a coherency problem because the tail must have
++ * been pushed before it can be overwritten, but appears as log corruption to
++ * recovery because we have no way to know the tail was updated if the
++ * subsequent checkpoint didn't write successfully.
+  *
+- * Return an error if CRC verification fails as recovery cannot proceed.
++ * Therefore, CRC check the log from tail to head. If a failure occurs and the
++ * offending record is within max iclog bufs from the head, walk the tail
++ * forward and retry until a valid tail is found or corruption is detected out
++ * of the range of a possible overwrite.
+  */
+ STATIC int
+ xlog_verify_tail(
+       struct xlog             *log,
+       xfs_daddr_t             head_blk,
+-      xfs_daddr_t             tail_blk)
++      xfs_daddr_t             *tail_blk,
++      int                     hsize)
+ {
+       struct xlog_rec_header  *thead;
+       struct xfs_buf          *bp;
+       xfs_daddr_t             first_bad;
+-      int                     count;
+       int                     error = 0;
+       bool                    wrapped;
+-      xfs_daddr_t             tmp_head;
++      xfs_daddr_t             tmp_tail;
++      xfs_daddr_t             orig_tail = *tail_blk;
+       bp = xlog_get_bp(log, 1);
+       if (!bp)
+               return -ENOMEM;
+       /*
+-       * Seek XLOG_MAX_ICLOGS + 1 records past the current tail record to get
+-       * a temporary head block that points after the last possible
+-       * concurrently written record of the tail.
++       * Make sure the tail points to a record (returns positive count on
++       * success).
+        */
+-      count = xlog_seek_logrec_hdr(log, head_blk, tail_blk,
+-                                   XLOG_MAX_ICLOGS + 1, bp, &tmp_head, &thead,
+-                                   &wrapped);
+-      if (count < 0) {
+-              error = count;
++      error = xlog_seek_logrec_hdr(log, head_blk, *tail_blk, 1, bp,
++                      &tmp_tail, &thead, &wrapped);
++      if (error < 0)
+               goto out;
+-      }
++      if (*tail_blk != tmp_tail)
++              *tail_blk = tmp_tail;
+       /*
+-       * If the call above didn't find XLOG_MAX_ICLOGS + 1 records, we ran
+-       * into the actual log head. tmp_head points to the start of the record
+-       * so update it to the actual head block.
++       * Run a CRC check from the tail to the head. We can't just check
++       * MAX_ICLOGS records past the tail because the tail may point to stale
++       * blocks cleared during the search for the head/tail. These blocks are
++       * overwritten with zero-length records and thus record count is not a
++       * reliable indicator of the iclog state before a crash.
+        */
+-      if (count < XLOG_MAX_ICLOGS + 1)
+-              tmp_head = head_blk;
+-
+-      /*
+-       * We now have a tail and temporary head block that covers at least
+-       * XLOG_MAX_ICLOGS records from the tail. We need to verify that these
+-       * records were completely written. Run a CRC verification pass from
+-       * tail to head and return the result.
+-       */
+-      error = xlog_do_recovery_pass(log, tmp_head, tail_blk,
++      first_bad = 0;
++      error = xlog_do_recovery_pass(log, head_blk, *tail_blk,
+                                     XLOG_RECOVER_CRCPASS, &first_bad);
++      while (error == -EFSBADCRC && first_bad) {
++              int     tail_distance;
++
++              /*
++               * Is corruption within range of the head? If so, retry from
++               * the next record. Otherwise return an error.
++               */
++              tail_distance = xlog_tail_distance(log, head_blk, first_bad);
++              if (tail_distance > BTOBB(XLOG_MAX_ICLOGS * hsize))
++                      break;
++
++              /* skip to the next record; returns positive count on success */
++              error = xlog_seek_logrec_hdr(log, head_blk, first_bad, 2, bp,
++                              &tmp_tail, &thead, &wrapped);
++              if (error < 0)
++                      goto out;
++
++              *tail_blk = tmp_tail;
++              first_bad = 0;
++              error = xlog_do_recovery_pass(log, head_blk, *tail_blk,
++                                            XLOG_RECOVER_CRCPASS, &first_bad);
++      }
++      if (!error && *tail_blk != orig_tail)
++              xfs_warn(log->l_mp,
++              "Tail block (0x%llx) overwrite detected. Updated to 0x%llx",
++                       orig_tail, *tail_blk);
+ out:
+       xlog_put_bp(bp);
+       return error;
+@@ -1187,7 +1232,8 @@ xlog_verify_head(
+       if (error)
+               return error;
+-      return xlog_verify_tail(log, *head_blk, *tail_blk);
++      return xlog_verify_tail(log, *head_blk, tail_blk,
++                              be32_to_cpu((*rhead)->h_size));
+ }
+ /*
diff --git a/queue-4.9/xfs-fix-recovery-failure-when-log-record-header-wraps-log-end.patch b/queue-4.9/xfs-fix-recovery-failure-when-log-record-header-wraps-log-end.patch
new file mode 100644 (file)
index 0000000..1f74f45
--- /dev/null
@@ -0,0 +1,90 @@
+From foo@baz Mon Sep 18 10:16:36 CEST 2017
+From: Christoph Hellwig <hch@lst.de>
+Date: Sun, 17 Sep 2017 14:06:51 -0700
+Subject: xfs: fix recovery failure when log record header wraps log end
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, Brian Foster <bfoster@redhat.com>, "Darrick J . Wong" <darrick.wong@oracle.com>
+Message-ID: <20170917210712.10804-27-hch@lst.de>
+
+From: Brian Foster <bfoster@redhat.com>
+
+commit 284f1c2c9bebf871861184b0e2c40fa921dd380b upstream.
+
+The high-level log recovery algorithm consists of two loops that
+walk the physical log and process log records from the tail to the
+head. The first loop handles the case where the tail is beyond the
+head and processes records up to the end of the physical log. The
+subsequent loop processes records from the beginning of the physical
+log to the head.
+
+Because log records can wrap around the end of the physical log, the
+first loop mentioned above must handle this case appropriately.
+Records are processed from in-core buffers, which means that this
+algorithm must split the reads of such records into two partial
+I/Os: 1.) from the beginning of the record to the end of the log and
+2.) from the beginning of the log to the end of the record. This is
+further complicated by the fact that the log record header and log
+record data are read into independent buffers.
+
+The current handling of each buffer correctly splits the reads when
+either the header or data starts before the end of the log and wraps
+around the end. The data read does not correctly handle the case
+where the prior header read wrapped or ends on the physical log end
+boundary. blk_no is incremented to or beyond the log end after the
+header read to point to the record data, but the split data read
+logic triggers, attempts to read from an invalid log block and
+ultimately causes log recovery to fail. This can be reproduced
+fairly reliably via xfstests tests generic/047 and generic/388 with
+large iclog sizes (256k) and small (10M) logs.
+
+If the record header read has pushed beyond the end of the physical
+log, the subsequent data read is actually contiguous. Update the
+data read logic to detect the case where blk_no has wrapped, mod it
+against the log size to read from the correct address and issue one
+contiguous read for the log data buffer. The log record is processed
+as normal from the buffer(s), the loop exits after the current
+iteration and the subsequent loop picks up with the first new record
+after the start of the log.
+
+Signed-off-by: Brian Foster <bfoster@redhat.com>
+Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/xfs_log_recover.c |   18 ++++++++++++++----
+ 1 file changed, 14 insertions(+), 4 deletions(-)
+
+--- a/fs/xfs/xfs_log_recover.c
++++ b/fs/xfs/xfs_log_recover.c
+@@ -5216,7 +5216,7 @@ xlog_do_recovery_pass(
+       xfs_daddr_t             *first_bad)     /* out: first bad log rec */
+ {
+       xlog_rec_header_t       *rhead;
+-      xfs_daddr_t             blk_no;
++      xfs_daddr_t             blk_no, rblk_no;
+       xfs_daddr_t             rhead_blk;
+       char                    *offset;
+       xfs_buf_t               *hbp, *dbp;
+@@ -5369,9 +5369,19 @@ xlog_do_recovery_pass(
+                       bblks = (int)BTOBB(be32_to_cpu(rhead->h_len));
+                       blk_no += hblks;
+-                      /* Read in data for log record */
+-                      if (blk_no + bblks <= log->l_logBBsize) {
+-                              error = xlog_bread(log, blk_no, bblks, dbp,
++                      /*
++                       * Read the log record data in multiple reads if it
++                       * wraps around the end of the log. Note that if the
++                       * header already wrapped, blk_no could point past the
++                       * end of the log. The record data is contiguous in
++                       * that case.
++                       */
++                      if (blk_no + bblks <= log->l_logBBsize ||
++                          blk_no >= log->l_logBBsize) {
++                              /* mod blk_no in case the header wrapped and
++                               * pushed it beyond the end of the log */
++                              rblk_no = do_mod(blk_no, log->l_logBBsize);
++                              error = xlog_bread(log, rblk_no, bblks, dbp,
+                                                  &offset);
+                               if (error)
+                                       goto bread_err2;
diff --git a/queue-4.9/xfs-handle-efscorrupted-during-head-tail-verification.patch b/queue-4.9/xfs-handle-efscorrupted-during-head-tail-verification.patch
new file mode 100644 (file)
index 0000000..dab3e54
--- /dev/null
@@ -0,0 +1,72 @@
+From foo@baz Mon Sep 18 10:16:36 CEST 2017
+From: Christoph Hellwig <hch@lst.de>
+Date: Sun, 17 Sep 2017 14:06:54 -0700
+Subject: xfs: handle -EFSCORRUPTED during head/tail verification
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, Brian Foster <bfoster@redhat.com>, "Darrick J . Wong" <darrick.wong@oracle.com>
+Message-ID: <20170917210712.10804-30-hch@lst.de>
+
+From: Brian Foster <bfoster@redhat.com>
+
+commit a4c9b34d6a17081005ec459b57b8effc08f4c731 upstream.
+
+Torn write and tail overwrite detection both trigger only on
+-EFSBADCRC errors. While this is the most likely failure scenario
+for each condition, -EFSCORRUPTED is still possible in certain cases
+depending on what ends up on disk when a torn write or partial tail
+overwrite occurs. For example, an invalid log record h_len can lead
+to an -EFSCORRUPTED error when running the log recovery CRC pass.
+
+Therefore, update log head and tail verification to trigger the
+associated head/tail fixups in the event of -EFSCORRUPTED errors
+along with -EFSBADCRC. Also, -EFSCORRUPTED can currently be returned
+from xlog_do_recovery_pass() before rhead_blk is initialized if the
+first record encountered happens to be corrupted. This leads to an
+incorrect 'first_bad' return value. Initialize rhead_blk earlier in
+the function to address that problem as well.
+
+Signed-off-by: Brian Foster <bfoster@redhat.com>
+Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/xfs_log_recover.c |    7 +++----
+ 1 file changed, 3 insertions(+), 4 deletions(-)
+
+--- a/fs/xfs/xfs_log_recover.c
++++ b/fs/xfs/xfs_log_recover.c
+@@ -1102,7 +1102,7 @@ xlog_verify_tail(
+       first_bad = 0;
+       error = xlog_do_recovery_pass(log, head_blk, *tail_blk,
+                                     XLOG_RECOVER_CRCPASS, &first_bad);
+-      while (error == -EFSBADCRC && first_bad) {
++      while ((error == -EFSBADCRC || error == -EFSCORRUPTED) && first_bad) {
+               int     tail_distance;
+               /*
+@@ -1188,7 +1188,7 @@ xlog_verify_head(
+        */
+       error = xlog_do_recovery_pass(log, *head_blk, tmp_rhead_blk,
+                                     XLOG_RECOVER_CRCPASS, &first_bad);
+-      if (error == -EFSBADCRC) {
++      if ((error == -EFSBADCRC || error == -EFSCORRUPTED) && first_bad) {
+               /*
+                * We've hit a potential torn write. Reset the error and warn
+                * about it.
+@@ -5255,7 +5255,7 @@ xlog_do_recovery_pass(
+       LIST_HEAD               (buffer_list);
+       ASSERT(head_blk != tail_blk);
+-      rhead_blk = 0;
++      blk_no = rhead_blk = tail_blk;
+       for (i = 0; i < XLOG_RHASH_SIZE; i++)
+               INIT_HLIST_HEAD(&rhash[i]);
+@@ -5333,7 +5333,6 @@ xlog_do_recovery_pass(
+       }
+       memset(rhash, 0, sizeof(rhash));
+-      blk_no = rhead_blk = tail_blk;
+       if (tail_blk > head_blk) {
+               /*
+                * Perform recovery around the end of the physical log.
diff --git a/queue-4.9/xfs-move-bmbt-owner-change-to-last-step-of-extent-swap.patch b/queue-4.9/xfs-move-bmbt-owner-change-to-last-step-of-extent-swap.patch
new file mode 100644 (file)
index 0000000..c2283bb
--- /dev/null
@@ -0,0 +1,109 @@
+From foo@baz Mon Sep 18 10:16:36 CEST 2017
+From: Christoph Hellwig <hch@lst.de>
+Date: Sun, 17 Sep 2017 14:07:05 -0700
+Subject: xfs: move bmbt owner change to last step of extent swap
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, Brian Foster <bfoster@redhat.com>, "Darrick J . Wong" <darrick.wong@oracle.com>
+Message-ID: <20170917210712.10804-41-hch@lst.de>
+
+From: Brian Foster <bfoster@redhat.com>
+
+commit 6fb10d6d22094bc4062f92b9ccbcee2f54033d04 upstream.
+
+The extent swap operation currently resets bmbt block owners before
+the inode forks are swapped. The bmbt buffers are marked as ordered
+so they do not have to be physically logged in the transaction.
+
+This use of ordered buffers is not safe as bmbt buffers may have
+been previously physically logged. The bmbt owner change algorithm
+needs to be updated to physically log buffers that are already dirty
+when/if they are encountered. This means that an extent swap will
+eventually require multiple rolling transactions to handle large
+btrees. In addition, all inode related changes must be logged before
+the bmbt owner change scan begins and can roll the transaction for
+the first time to preserve fs consistency via log recovery.
+
+In preparation for such fixes to the bmbt owner change algorithm,
+refactor the bmbt scan out of the extent fork swap code to the last
+operation before the transaction is committed. Update
+xfs_swap_extent_forks() to only set the inode log flags when an
+owner change scan is necessary. Update xfs_swap_extents() to trigger
+the owner change based on the inode log flags. Note that since the
+owner change now occurs after the extent fork swap, the inode btrees
+must be fixed up with the inode number of the current inode (similar
+to log recovery).
+
+Signed-off-by: Brian Foster <bfoster@redhat.com>
+Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/xfs_bmap_util.c |   44 ++++++++++++++++++++++++++------------------
+ 1 file changed, 26 insertions(+), 18 deletions(-)
+
+--- a/fs/xfs/xfs_bmap_util.c
++++ b/fs/xfs/xfs_bmap_util.c
+@@ -1825,29 +1825,18 @@ xfs_swap_extent_forks(
+       }
+       /*
+-       * Before we've swapped the forks, lets set the owners of the forks
+-       * appropriately. We have to do this as we are demand paging the btree
+-       * buffers, and so the validation done on read will expect the owner
+-       * field to be correctly set. Once we change the owners, we can swap the
+-       * inode forks.
++       * Btree format (v3) inodes have the inode number stamped in the bmbt
++       * block headers. We can't start changing the bmbt blocks until the
++       * inode owner change is logged so recovery does the right thing in the
++       * event of a crash. Set the owner change log flags now and leave the
++       * bmbt scan as the last step.
+        */
+       if (ip->i_d.di_version == 3 &&
+-          ip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
++          ip->i_d.di_format == XFS_DINODE_FMT_BTREE)
+               (*target_log_flags) |= XFS_ILOG_DOWNER;
+-              error = xfs_bmbt_change_owner(tp, ip, XFS_DATA_FORK,
+-                                            tip->i_ino, NULL);
+-              if (error)
+-                      return error;
+-      }
+-
+       if (tip->i_d.di_version == 3 &&
+-          tip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
++          tip->i_d.di_format == XFS_DINODE_FMT_BTREE)
+               (*src_log_flags) |= XFS_ILOG_DOWNER;
+-              error = xfs_bmbt_change_owner(tp, tip, XFS_DATA_FORK,
+-                                            ip->i_ino, NULL);
+-              if (error)
+-                      return error;
+-      }
+       /*
+        * Swap the data forks of the inodes
+@@ -2077,6 +2066,25 @@ xfs_swap_extents(
+       xfs_trans_log_inode(tp, tip, target_log_flags);
+       /*
++       * The extent forks have been swapped, but crc=1,rmapbt=0 filesystems
++       * have inode number owner values in the bmbt blocks that still refer to
++       * the old inode. Scan each bmbt to fix up the owner values with the
++       * inode number of the current inode.
++       */
++      if (src_log_flags & XFS_ILOG_DOWNER) {
++              error = xfs_bmbt_change_owner(tp, ip, XFS_DATA_FORK,
++                                            ip->i_ino, NULL);
++              if (error)
++                      goto out_trans_cancel;
++      }
++      if (target_log_flags & XFS_ILOG_DOWNER) {
++              error = xfs_bmbt_change_owner(tp, tip, XFS_DATA_FORK,
++                                            tip->i_ino, NULL);
++              if (error)
++                      goto out_trans_cancel;
++      }
++
++      /*
+        * If this is a synchronous mount, make sure that the
+        * transaction goes to disk before returning to the user.
+        */
diff --git a/queue-4.9/xfs-open-code-end_buffer_async_write-in-xfs_finish_page_writeback.patch b/queue-4.9/xfs-open-code-end_buffer_async_write-in-xfs_finish_page_writeback.patch
new file mode 100644 (file)
index 0000000..ee6eafd
--- /dev/null
@@ -0,0 +1,146 @@
+From foo@baz Mon Sep 18 10:16:36 CEST 2017
+From: Christoph Hellwig <hch@lst.de>
+Date: Sun, 17 Sep 2017 14:07:11 -0700
+Subject: xfs: open code end_buffer_async_write in xfs_finish_page_writeback
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, "Darrick J . Wong" <darrick.wong@oracle.com>
+Message-ID: <20170917210712.10804-47-hch@lst.de>
+
+From: Christoph Hellwig <hch@lst.de>
+
+commit 8353a814f2518dcfa79a5bb77afd0e7dfa391bb1 upstream.
+
+Our loop in xfs_finish_page_writeback, which iterates over all buffer
+heads in a page and then calls end_buffer_async_write, which also
+iterates over all buffers in the page to check if any I/O is in flight
+is not only inefficient, but also potentially dangerous as
+end_buffer_async_write can cause the page and all buffers to be freed.
+
+Replace it with a single loop that does the work of end_buffer_async_write
+on a per-page basis.
+
+Signed-off-by: Christoph Hellwig <hch@lst.de>
+Reviewed-by: Brian Foster <bfoster@redhat.com>
+Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/xfs_aops.c |   72 ++++++++++++++++++++++++++++++++++++------------------
+ 1 file changed, 48 insertions(+), 24 deletions(-)
+
+--- a/fs/xfs/xfs_aops.c
++++ b/fs/xfs/xfs_aops.c
+@@ -90,11 +90,11 @@ xfs_find_bdev_for_inode(
+  * associated buffer_heads, paying attention to the start and end offsets that
+  * we need to process on the page.
+  *
+- * Landmine Warning: bh->b_end_io() will call end_page_writeback() on the last
+- * buffer in the IO. Once it does this, it is unsafe to access the bufferhead or
+- * the page at all, as we may be racing with memory reclaim and it can free both
+- * the bufferhead chain and the page as it will see the page as clean and
+- * unused.
++ * Note that we open code the action in end_buffer_async_write here so that we
++ * only have to iterate over the buffers attached to the page once.  This is not
++ * only more efficient, but also ensures that we only calls end_page_writeback
++ * at the end of the iteration, and thus avoids the pitfall of having the page
++ * and buffers potentially freed after every call to end_buffer_async_write.
+  */
+ static void
+ xfs_finish_page_writeback(
+@@ -102,29 +102,45 @@ xfs_finish_page_writeback(
+       struct bio_vec          *bvec,
+       int                     error)
+ {
+-      unsigned int            end = bvec->bv_offset + bvec->bv_len - 1;
+-      struct buffer_head      *head, *bh, *next;
++      struct buffer_head      *head = page_buffers(bvec->bv_page), *bh = head;
++      bool                    busy = false;
+       unsigned int            off = 0;
+-      unsigned int            bsize;
++      unsigned long           flags;
+       ASSERT(bvec->bv_offset < PAGE_SIZE);
+       ASSERT((bvec->bv_offset & (i_blocksize(inode) - 1)) == 0);
+-      ASSERT(end < PAGE_SIZE);
++      ASSERT(bvec->bv_offset + bvec->bv_len <= PAGE_SIZE);
+       ASSERT((bvec->bv_len & (i_blocksize(inode) - 1)) == 0);
+-      bh = head = page_buffers(bvec->bv_page);
+-
+-      bsize = bh->b_size;
++      local_irq_save(flags);
++      bit_spin_lock(BH_Uptodate_Lock, &head->b_state);
+       do {
+-              if (off > end)
+-                      break;
+-              next = bh->b_this_page;
+-              if (off < bvec->bv_offset)
+-                      goto next_bh;
+-              bh->b_end_io(bh, !error);
+-next_bh:
+-              off += bsize;
+-      } while ((bh = next) != head);
++              if (off >= bvec->bv_offset &&
++                  off < bvec->bv_offset + bvec->bv_len) {
++                      ASSERT(buffer_async_write(bh));
++                      ASSERT(bh->b_end_io == NULL);
++
++                      if (error) {
++                              mapping_set_error(bvec->bv_page->mapping, -EIO);
++                              set_buffer_write_io_error(bh);
++                              clear_buffer_uptodate(bh);
++                              SetPageError(bvec->bv_page);
++                      } else {
++                              set_buffer_uptodate(bh);
++                      }
++                      clear_buffer_async_write(bh);
++                      unlock_buffer(bh);
++              } else if (buffer_async_write(bh)) {
++                      ASSERT(buffer_locked(bh));
++                      busy = true;
++              }
++              off += bh->b_size;
++      } while ((bh = bh->b_this_page) != head);
++      bit_spin_unlock(BH_Uptodate_Lock, &head->b_state);
++      local_irq_restore(flags);
++
++      if (!busy)
++              end_page_writeback(bvec->bv_page);
+ }
+ /*
+@@ -138,8 +154,10 @@ xfs_destroy_ioend(
+       int                     error)
+ {
+       struct inode            *inode = ioend->io_inode;
+-      struct bio              *last = ioend->io_bio;
+-      struct bio              *bio, *next;
++      struct bio              *bio = &ioend->io_inline_bio;
++      struct bio              *last = ioend->io_bio, *next;
++      u64                     start = bio->bi_iter.bi_sector;
++      bool                    quiet = bio_flagged(bio, BIO_QUIET);
+       for (bio = &ioend->io_inline_bio; bio; bio = next) {
+               struct bio_vec  *bvec;
+@@ -160,6 +178,11 @@ xfs_destroy_ioend(
+               bio_put(bio);
+       }
++
++      if (unlikely(error && !quiet)) {
++              xfs_err_ratelimited(XFS_I(inode)->i_mount,
++                      "writeback error on sector %llu", start);
++      }
+ }
+ /*
+@@ -427,7 +450,8 @@ xfs_start_buffer_writeback(
+       ASSERT(!buffer_delay(bh));
+       ASSERT(!buffer_unwritten(bh));
+-      mark_buffer_async_write(bh);
++      bh->b_end_io = NULL;
++      set_buffer_async_write(bh);
+       set_buffer_uptodate(bh);
+       clear_buffer_dirty(bh);
+ }
diff --git a/queue-4.9/xfs-open-code-xfs_buf_item_dirty.patch b/queue-4.9/xfs-open-code-xfs_buf_item_dirty.patch
new file mode 100644 (file)
index 0000000..3ed5b1d
--- /dev/null
@@ -0,0 +1,67 @@
+From foo@baz Mon Sep 18 10:16:36 CEST 2017
+From: Christoph Hellwig <hch@lst.de>
+Date: Sun, 17 Sep 2017 14:06:59 -0700
+Subject: xfs: open-code xfs_buf_item_dirty()
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, Brian Foster <bfoster@redhat.com>, "Darrick J . Wong" <darrick.wong@oracle.com>
+Message-ID: <20170917210712.10804-35-hch@lst.de>
+
+From: Brian Foster <bfoster@redhat.com>
+
+commit a4f6cf6b2b6b60ec2a05a33a32e65caa4149aa2b upstream.
+
+It checks a single flag and has one caller. It probably isn't worth
+its own function.
+
+Signed-off-by: Brian Foster <bfoster@redhat.com>
+Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/xfs_buf_item.c  |   11 -----------
+ fs/xfs/xfs_buf_item.h  |    1 -
+ fs/xfs/xfs_trans_buf.c |    2 +-
+ 3 files changed, 1 insertion(+), 13 deletions(-)
+
+--- a/fs/xfs/xfs_buf_item.c
++++ b/fs/xfs/xfs_buf_item.c
+@@ -945,17 +945,6 @@ xfs_buf_item_log(
+ }
+-/*
+- * Return 1 if the buffer has been logged or ordered in a transaction (at any
+- * point, not just the current transaction) and 0 if not.
+- */
+-uint
+-xfs_buf_item_dirty(
+-      xfs_buf_log_item_t      *bip)
+-{
+-      return (bip->bli_flags & XFS_BLI_DIRTY);
+-}
+-
+ STATIC void
+ xfs_buf_item_free(
+       xfs_buf_log_item_t      *bip)
+--- a/fs/xfs/xfs_buf_item.h
++++ b/fs/xfs/xfs_buf_item.h
+@@ -64,7 +64,6 @@ typedef struct xfs_buf_log_item {
+ int   xfs_buf_item_init(struct xfs_buf *, struct xfs_mount *);
+ void  xfs_buf_item_relse(struct xfs_buf *);
+ void  xfs_buf_item_log(xfs_buf_log_item_t *, uint, uint);
+-uint  xfs_buf_item_dirty(xfs_buf_log_item_t *);
+ void  xfs_buf_attach_iodone(struct xfs_buf *,
+                             void(*)(struct xfs_buf *, xfs_log_item_t *),
+                             xfs_log_item_t *);
+--- a/fs/xfs/xfs_trans_buf.c
++++ b/fs/xfs/xfs_trans_buf.c
+@@ -435,7 +435,7 @@ xfs_trans_brelse(xfs_trans_t       *tp,
+       if (XFS_FORCED_SHUTDOWN(tp->t_mountp) && freed) {
+               xfs_trans_ail_remove(&bip->bli_item, SHUTDOWN_LOG_IO_ERROR);
+               xfs_buf_item_relse(bp);
+-      } else if (!xfs_buf_item_dirty(bip)) {
++      } else if (!(bip->bli_flags & XFS_BLI_DIRTY)) {
+ /***
+               ASSERT(bp->b_pincount == 0);
+ ***/
diff --git a/queue-4.9/xfs-ordered-buffer-log-items-are-never-formatted.patch b/queue-4.9/xfs-ordered-buffer-log-items-are-never-formatted.patch
new file mode 100644 (file)
index 0000000..6b31418
--- /dev/null
@@ -0,0 +1,70 @@
+From foo@baz Mon Sep 18 10:16:36 CEST 2017
+From: Christoph Hellwig <hch@lst.de>
+Date: Sun, 17 Sep 2017 14:07:01 -0700
+Subject: xfs: ordered buffer log items are never formatted
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, Brian Foster <bfoster@redhat.com>, "Darrick J . Wong" <darrick.wong@oracle.com>
+Message-ID: <20170917210712.10804-37-hch@lst.de>
+
+From: Brian Foster <bfoster@redhat.com>
+
+commit e9385cc6fb7edf23702de33a2dc82965d92d9392 upstream.
+
+Ordered buffers pass through the logging infrastructure without ever
+being written to the log. The way this works is that the ordered
+buffer status is transferred to the log vector at commit time via
+the ->iop_size() callback. In xlog_cil_insert_format_items(),
+ordered log vectors bypass ->iop_format() processing altogether.
+
+Therefore it is unnecessary for xfs_buf_item_format() to handle
+ordered buffers. Remove the unnecessary logic and assert that an
+ordered buffer never reaches this point.
+
+Signed-off-by: Brian Foster <bfoster@redhat.com>
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/xfs_buf_item.c |   12 ++----------
+ fs/xfs/xfs_trace.h    |    1 -
+ 2 files changed, 2 insertions(+), 11 deletions(-)
+
+--- a/fs/xfs/xfs_buf_item.c
++++ b/fs/xfs/xfs_buf_item.c
+@@ -323,6 +323,8 @@ xfs_buf_item_format(
+       ASSERT((bip->bli_flags & XFS_BLI_STALE) ||
+              (xfs_blft_from_flags(&bip->__bli_format) > XFS_BLFT_UNKNOWN_BUF
+               && xfs_blft_from_flags(&bip->__bli_format) < XFS_BLFT_MAX_BUF));
++      ASSERT(!(bip->bli_flags & XFS_BLI_ORDERED) ||
++             (bip->bli_flags & XFS_BLI_STALE));
+       /*
+@@ -347,16 +349,6 @@ xfs_buf_item_format(
+               bip->bli_flags &= ~XFS_BLI_INODE_BUF;
+       }
+-      if ((bip->bli_flags & (XFS_BLI_ORDERED|XFS_BLI_STALE)) ==
+-                                                      XFS_BLI_ORDERED) {
+-              /*
+-               * The buffer has been logged just to order it.  It is not being
+-               * included in the transaction commit, so don't format it.
+-               */
+-              trace_xfs_buf_item_format_ordered(bip);
+-              return;
+-      }
+-
+       for (i = 0; i < bip->bli_format_count; i++) {
+               xfs_buf_item_format_segment(bip, lv, &vecp, offset,
+                                           &bip->bli_formats[i]);
+--- a/fs/xfs/xfs_trace.h
++++ b/fs/xfs/xfs_trace.h
+@@ -520,7 +520,6 @@ DEFINE_BUF_ITEM_EVENT(xfs_buf_item_size)
+ DEFINE_BUF_ITEM_EVENT(xfs_buf_item_size_ordered);
+ DEFINE_BUF_ITEM_EVENT(xfs_buf_item_size_stale);
+ DEFINE_BUF_ITEM_EVENT(xfs_buf_item_format);
+-DEFINE_BUF_ITEM_EVENT(xfs_buf_item_format_ordered);
+ DEFINE_BUF_ITEM_EVENT(xfs_buf_item_format_stale);
+ DEFINE_BUF_ITEM_EVENT(xfs_buf_item_ordered);
+ DEFINE_BUF_ITEM_EVENT(xfs_buf_item_pin);
diff --git a/queue-4.9/xfs-properly-retry-failed-inode-items-in-case-of-error-during-buffer-writeback.patch b/queue-4.9/xfs-properly-retry-failed-inode-items-in-case-of-error-during-buffer-writeback.patch
new file mode 100644 (file)
index 0000000..e8361be
--- /dev/null
@@ -0,0 +1,265 @@
+From foo@baz Mon Sep 18 10:16:36 CEST 2017
+From: Christoph Hellwig <hch@lst.de>
+Date: Sun, 17 Sep 2017 14:06:50 -0700
+Subject: xfs: Properly retry failed inode items in case of error during buffer writeback
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, Carlos Maiolino <cmaiolino@redhat.com>, "Darrick J . Wong" <darrick.wong@oracle.com>
+Message-ID: <20170917210712.10804-26-hch@lst.de>
+
+From: Carlos Maiolino <cmaiolino@redhat.com>
+
+commit d3a304b6292168b83b45d624784f973fdc1ca674 upstream.
+
+When a buffer has been failed during writeback, the inode items into it
+are kept flush locked, and are never resubmitted due the flush lock, so,
+if any buffer fails to be written, the items in AIL are never written to
+disk and never unlocked.
+
+This causes unmount operation to hang due these items flush locked in AIL,
+but this also causes the items in AIL to never be written back, even when
+the IO device comes back to normal.
+
+I've been testing this patch with a DM-thin device, creating a
+filesystem larger than the real device.
+
+When writing enough data to fill the DM-thin device, XFS receives ENOSPC
+errors from the device, and keep spinning on xfsaild (when 'retry
+forever' configuration is set).
+
+At this point, the filesystem can not be unmounted because of the flush locked
+items in AIL, but worse, the items in AIL are never retried at all
+(once xfs_inode_item_push() will skip the items that are flush locked),
+even if the underlying DM-thin device is expanded to the proper size.
+
+This patch fixes both cases, retrying any item that has been failed
+previously, using the infra-structure provided by the previous patch.
+
+Reviewed-by: Brian Foster <bfoster@redhat.com>
+Signed-off-by: Carlos Maiolino <cmaiolino@redhat.com>
+Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/xfs_buf_item.c   |   28 ++++++++++++++++++++++++++++
+ fs/xfs/xfs_buf_item.h   |    3 +++
+ fs/xfs/xfs_inode_item.c |   47 +++++++++++++++++++++++++++++++++++++++++++----
+ fs/xfs/xfs_trans.h      |    1 +
+ fs/xfs/xfs_trans_ail.c  |    3 ++-
+ fs/xfs/xfs_trans_priv.h |   31 +++++++++++++++++++++++++++++++
+ 6 files changed, 108 insertions(+), 5 deletions(-)
+
+--- a/fs/xfs/xfs_buf_item.c
++++ b/fs/xfs/xfs_buf_item.c
+@@ -1234,3 +1234,31 @@ xfs_buf_iodone(
+       xfs_trans_ail_delete(ailp, lip, SHUTDOWN_CORRUPT_INCORE);
+       xfs_buf_item_free(BUF_ITEM(lip));
+ }
++
++/*
++ * Requeue a failed buffer for writeback
++ *
++ * Return true if the buffer has been re-queued properly, false otherwise
++ */
++bool
++xfs_buf_resubmit_failed_buffers(
++      struct xfs_buf          *bp,
++      struct xfs_log_item     *lip,
++      struct list_head        *buffer_list)
++{
++      struct xfs_log_item     *next;
++
++      /*
++       * Clear XFS_LI_FAILED flag from all items before resubmit
++       *
++       * XFS_LI_FAILED set/clear is protected by xa_lock, caller  this
++       * function already have it acquired
++       */
++      for (; lip; lip = next) {
++              next = lip->li_bio_list;
++              xfs_clear_li_failed(lip);
++      }
++
++      /* Add this buffer back to the delayed write list */
++      return xfs_buf_delwri_queue(bp, buffer_list);
++}
+--- a/fs/xfs/xfs_buf_item.h
++++ b/fs/xfs/xfs_buf_item.h
+@@ -70,6 +70,9 @@ void xfs_buf_attach_iodone(struct xfs_bu
+                             xfs_log_item_t *);
+ void  xfs_buf_iodone_callbacks(struct xfs_buf *);
+ void  xfs_buf_iodone(struct xfs_buf *, struct xfs_log_item *);
++bool  xfs_buf_resubmit_failed_buffers(struct xfs_buf *,
++                                      struct xfs_log_item *,
++                                      struct list_head *);
+ extern kmem_zone_t    *xfs_buf_item_zone;
+--- a/fs/xfs/xfs_inode_item.c
++++ b/fs/xfs/xfs_inode_item.c
+@@ -27,6 +27,7 @@
+ #include "xfs_error.h"
+ #include "xfs_trace.h"
+ #include "xfs_trans_priv.h"
++#include "xfs_buf_item.h"
+ #include "xfs_log.h"
+@@ -475,6 +476,23 @@ xfs_inode_item_unpin(
+               wake_up_bit(&ip->i_flags, __XFS_IPINNED_BIT);
+ }
++/*
++ * Callback used to mark a buffer with XFS_LI_FAILED when items in the buffer
++ * have been failed during writeback
++ *
++ * This informs the AIL that the inode is already flush locked on the next push,
++ * and acquires a hold on the buffer to ensure that it isn't reclaimed before
++ * dirty data makes it to disk.
++ */
++STATIC void
++xfs_inode_item_error(
++      struct xfs_log_item     *lip,
++      struct xfs_buf          *bp)
++{
++      ASSERT(xfs_isiflocked(INODE_ITEM(lip)->ili_inode));
++      xfs_set_li_failed(lip, bp);
++}
++
+ STATIC uint
+ xfs_inode_item_push(
+       struct xfs_log_item     *lip,
+@@ -484,13 +502,28 @@ xfs_inode_item_push(
+ {
+       struct xfs_inode_log_item *iip = INODE_ITEM(lip);
+       struct xfs_inode        *ip = iip->ili_inode;
+-      struct xfs_buf          *bp = NULL;
++      struct xfs_buf          *bp = lip->li_buf;
+       uint                    rval = XFS_ITEM_SUCCESS;
+       int                     error;
+       if (xfs_ipincount(ip) > 0)
+               return XFS_ITEM_PINNED;
++      /*
++       * The buffer containing this item failed to be written back
++       * previously. Resubmit the buffer for IO.
++       */
++      if (lip->li_flags & XFS_LI_FAILED) {
++              if (!xfs_buf_trylock(bp))
++                      return XFS_ITEM_LOCKED;
++
++              if (!xfs_buf_resubmit_failed_buffers(bp, lip, buffer_list))
++                      rval = XFS_ITEM_FLUSHING;
++
++              xfs_buf_unlock(bp);
++              return rval;
++      }
++
+       if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED))
+               return XFS_ITEM_LOCKED;
+@@ -622,7 +655,8 @@ static const struct xfs_item_ops xfs_ino
+       .iop_unlock     = xfs_inode_item_unlock,
+       .iop_committed  = xfs_inode_item_committed,
+       .iop_push       = xfs_inode_item_push,
+-      .iop_committing = xfs_inode_item_committing
++      .iop_committing = xfs_inode_item_committing,
++      .iop_error      = xfs_inode_item_error
+ };
+@@ -710,7 +744,8 @@ xfs_iflush_done(
+                * the AIL lock.
+                */
+               iip = INODE_ITEM(blip);
+-              if (iip->ili_logged && blip->li_lsn == iip->ili_flush_lsn)
++              if ((iip->ili_logged && blip->li_lsn == iip->ili_flush_lsn) ||
++                  lip->li_flags & XFS_LI_FAILED)
+                       need_ail++;
+               blip = next;
+@@ -718,7 +753,8 @@ xfs_iflush_done(
+       /* make sure we capture the state of the initial inode. */
+       iip = INODE_ITEM(lip);
+-      if (iip->ili_logged && lip->li_lsn == iip->ili_flush_lsn)
++      if ((iip->ili_logged && lip->li_lsn == iip->ili_flush_lsn) ||
++          lip->li_flags & XFS_LI_FAILED)
+               need_ail++;
+       /*
+@@ -739,6 +775,9 @@ xfs_iflush_done(
+                       if (INODE_ITEM(blip)->ili_logged &&
+                           blip->li_lsn == INODE_ITEM(blip)->ili_flush_lsn)
+                               mlip_changed |= xfs_ail_delete_one(ailp, blip);
++                      else {
++                              xfs_clear_li_failed(blip);
++                      }
+               }
+               if (mlip_changed) {
+--- a/fs/xfs/xfs_trans.h
++++ b/fs/xfs/xfs_trans.h
+@@ -50,6 +50,7 @@ typedef struct xfs_log_item {
+       struct xfs_ail                  *li_ailp;       /* ptr to AIL */
+       uint                            li_type;        /* item type */
+       uint                            li_flags;       /* misc flags */
++      struct xfs_buf                  *li_buf;        /* real buffer pointer */
+       struct xfs_log_item             *li_bio_list;   /* buffer item list */
+       void                            (*li_cb)(struct xfs_buf *,
+                                                struct xfs_log_item *);
+--- a/fs/xfs/xfs_trans_ail.c
++++ b/fs/xfs/xfs_trans_ail.c
+@@ -687,12 +687,13 @@ xfs_trans_ail_update_bulk(
+ bool
+ xfs_ail_delete_one(
+       struct xfs_ail          *ailp,
+-      struct xfs_log_item     *lip)
++      struct xfs_log_item     *lip)
+ {
+       struct xfs_log_item     *mlip = xfs_ail_min(ailp);
+       trace_xfs_ail_delete(lip, mlip->li_lsn, lip->li_lsn);
+       xfs_ail_delete(ailp, lip);
++      xfs_clear_li_failed(lip);
+       lip->li_flags &= ~XFS_LI_IN_AIL;
+       lip->li_lsn = 0;
+--- a/fs/xfs/xfs_trans_priv.h
++++ b/fs/xfs/xfs_trans_priv.h
+@@ -164,4 +164,35 @@ xfs_trans_ail_copy_lsn(
+       *dst = *src;
+ }
+ #endif
++
++static inline void
++xfs_clear_li_failed(
++      struct xfs_log_item     *lip)
++{
++      struct xfs_buf  *bp = lip->li_buf;
++
++      ASSERT(lip->li_flags & XFS_LI_IN_AIL);
++      lockdep_assert_held(&lip->li_ailp->xa_lock);
++
++      if (lip->li_flags & XFS_LI_FAILED) {
++              lip->li_flags &= ~XFS_LI_FAILED;
++              lip->li_buf = NULL;
++              xfs_buf_rele(bp);
++      }
++}
++
++static inline void
++xfs_set_li_failed(
++      struct xfs_log_item     *lip,
++      struct xfs_buf          *bp)
++{
++      lockdep_assert_held(&lip->li_ailp->xa_lock);
++
++      if (!(lip->li_flags & XFS_LI_FAILED)) {
++              xfs_buf_hold(bp);
++              lip->li_flags |= XFS_LI_FAILED;
++              lip->li_buf = bp;
++      }
++}
++
+ #endif        /* __XFS_TRANS_PRIV_H__ */
diff --git a/queue-4.9/xfs-refactor-buffer-logging-into-buffer-dirtying-helper.patch b/queue-4.9/xfs-refactor-buffer-logging-into-buffer-dirtying-helper.patch
new file mode 100644 (file)
index 0000000..0d57207
--- /dev/null
@@ -0,0 +1,137 @@
+From foo@baz Mon Sep 18 10:16:36 CEST 2017
+From: Christoph Hellwig <hch@lst.de>
+Date: Sun, 17 Sep 2017 14:07:02 -0700
+Subject: xfs: refactor buffer logging into buffer dirtying helper
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, Brian Foster <bfoster@redhat.com>, "Darrick J . Wong" <darrick.wong@oracle.com>
+Message-ID: <20170917210712.10804-38-hch@lst.de>
+
+From: Brian Foster <bfoster@redhat.com>
+
+commit 9684010d38eccda733b61106765e9357cf436f65 upstream.
+
+xfs_trans_log_buf() is responsible for logging the dirty segments of
+a buffer along with setting all of the necessary state on the
+transaction, buffer, bli, etc., to ensure that the associated items
+are marked as dirty and prepared for I/O. We have a couple use cases
+that need to to dirty a buffer in a transaction without actually
+logging dirty ranges of the buffer.  One existing use case is
+ordered buffers, which are currently logged with arbitrary ranges to
+accomplish this even though the content of ordered buffers is never
+written to the log. Another pending use case is to relog an already
+dirty buffer across rolled transactions within the deferred
+operations infrastructure. This is required to prevent a held
+(XFS_BLI_HOLD) buffer from pinning the tail of the log.
+
+Refactor xfs_trans_log_buf() into a new function that contains all
+of the logic responsible to dirty the transaction, lidp, buffer and
+bli. This new function can be used in the future for the use cases
+outlined above. This patch does not introduce functional changes.
+
+Signed-off-by: Brian Foster <bfoster@redhat.com>
+Reviewed-by: Allison Henderson <allison.henderson@oracle.com>
+Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/xfs_trans.h     |    4 +++-
+ fs/xfs/xfs_trans_buf.c |   46 ++++++++++++++++++++++++++++++----------------
+ 2 files changed, 33 insertions(+), 17 deletions(-)
+
+--- a/fs/xfs/xfs_trans.h
++++ b/fs/xfs/xfs_trans.h
+@@ -222,7 +222,9 @@ void               xfs_trans_dquot_buf(xfs_trans_t *,
+ void          xfs_trans_inode_alloc_buf(xfs_trans_t *, struct xfs_buf *);
+ void          xfs_trans_ichgtime(struct xfs_trans *, struct xfs_inode *, int);
+ void          xfs_trans_ijoin(struct xfs_trans *, struct xfs_inode *, uint);
+-void          xfs_trans_log_buf(xfs_trans_t *, struct xfs_buf *, uint, uint);
++void          xfs_trans_log_buf(struct xfs_trans *, struct xfs_buf *, uint,
++                                uint);
++void          xfs_trans_dirty_buf(struct xfs_trans *, struct xfs_buf *);
+ void          xfs_trans_log_inode(xfs_trans_t *, struct xfs_inode *, uint);
+ void          xfs_extent_free_init_defer_op(void);
+--- a/fs/xfs/xfs_trans_buf.c
++++ b/fs/xfs/xfs_trans_buf.c
+@@ -493,25 +493,17 @@ xfs_trans_bhold_release(xfs_trans_t      *tp,
+ }
+ /*
+- * This is called to mark bytes first through last inclusive of the given
+- * buffer as needing to be logged when the transaction is committed.
+- * The buffer must already be associated with the given transaction.
+- *
+- * First and last are numbers relative to the beginning of this buffer,
+- * so the first byte in the buffer is numbered 0 regardless of the
+- * value of b_blkno.
++ * Mark a buffer dirty in the transaction.
+  */
+ void
+-xfs_trans_log_buf(xfs_trans_t *tp,
+-                xfs_buf_t     *bp,
+-                uint          first,
+-                uint          last)
++xfs_trans_dirty_buf(
++      struct xfs_trans        *tp,
++      struct xfs_buf          *bp)
+ {
+-      xfs_buf_log_item_t      *bip = bp->b_fspriv;
++      struct xfs_buf_log_item *bip = bp->b_fspriv;
+       ASSERT(bp->b_transp == tp);
+       ASSERT(bip != NULL);
+-      ASSERT(first <= last && last < BBTOB(bp->b_length));
+       ASSERT(bp->b_iodone == NULL ||
+              bp->b_iodone == xfs_buf_iodone_callbacks);
+@@ -531,8 +523,6 @@ xfs_trans_log_buf(xfs_trans_t      *tp,
+       bp->b_iodone = xfs_buf_iodone_callbacks;
+       bip->bli_item.li_cb = xfs_buf_iodone;
+-      trace_xfs_trans_log_buf(bip);
+-
+       /*
+        * If we invalidated the buffer within this transaction, then
+        * cancel the invalidation now that we're dirtying the buffer
+@@ -545,15 +535,39 @@ xfs_trans_log_buf(xfs_trans_t    *tp,
+               bp->b_flags &= ~XBF_STALE;
+               bip->__bli_format.blf_flags &= ~XFS_BLF_CANCEL;
+       }
++      bip->bli_flags |= XFS_BLI_DIRTY | XFS_BLI_LOGGED;
+       tp->t_flags |= XFS_TRANS_DIRTY;
+       bip->bli_item.li_desc->lid_flags |= XFS_LID_DIRTY;
++}
++
++/*
++ * This is called to mark bytes first through last inclusive of the given
++ * buffer as needing to be logged when the transaction is committed.
++ * The buffer must already be associated with the given transaction.
++ *
++ * First and last are numbers relative to the beginning of this buffer,
++ * so the first byte in the buffer is numbered 0 regardless of the
++ * value of b_blkno.
++ */
++void
++xfs_trans_log_buf(
++      struct xfs_trans        *tp,
++      struct xfs_buf          *bp,
++      uint                    first,
++      uint                    last)
++{
++      struct xfs_buf_log_item *bip = bp->b_fspriv;
++
++      ASSERT(first <= last && last < BBTOB(bp->b_length));
++
++      xfs_trans_dirty_buf(tp, bp);
+       /*
+        * If we have an ordered buffer we are not logging any dirty range but
+        * it still needs to be marked dirty and that it has been logged.
+        */
+-      bip->bli_flags |= XFS_BLI_DIRTY | XFS_BLI_LOGGED;
++      trace_xfs_trans_log_buf(bip);
+       if (!(bip->bli_flags & XFS_BLI_ORDERED))
+               xfs_buf_item_log(bip, first, last);
+ }
diff --git a/queue-4.9/xfs-relog-dirty-buffers-during-swapext-bmbt-owner-change.patch b/queue-4.9/xfs-relog-dirty-buffers-during-swapext-bmbt-owner-change.patch
new file mode 100644 (file)
index 0000000..346318d
--- /dev/null
@@ -0,0 +1,177 @@
+From foo@baz Mon Sep 18 10:16:36 CEST 2017
+From: Christoph Hellwig <hch@lst.de>
+Date: Sun, 17 Sep 2017 14:07:07 -0700
+Subject: xfs: relog dirty buffers during swapext bmbt owner change
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, Brian Foster <bfoster@redhat.com>, "Darrick J . Wong" <darrick.wong@oracle.com>
+Message-ID: <20170917210712.10804-43-hch@lst.de>
+
+From: Brian Foster <bfoster@redhat.com>
+
+commit 2dd3d709fc4338681a3aa61658122fa8faa5a437 upstream.
+
+The owner change bmbt scan that occurs during extent swap operations
+does not handle ordered buffer failures. Buffers that cannot be
+marked ordered must be physically logged so previously dirty ranges
+of the buffer can be relogged in the transaction.
+
+Since the bmbt scan may need to process and potentially log a large
+number of blocks, we can't expect to complete this operation in a
+single transaction. Update extent swap to use a permanent
+transaction with enough log reservation to physically log a buffer.
+Update the bmbt scan to physically log any buffers that cannot be
+ordered and to terminate the scan with -EAGAIN. On -EAGAIN, the
+caller rolls the transaction and restarts the scan. Finally, update
+the bmbt scan helper function to skip bmbt blocks that already match
+the expected owner so they are not reprocessed after scan restarts.
+
+Signed-off-by: Brian Foster <bfoster@redhat.com>
+Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+[darrick: fix the xfs_trans_roll call]
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/libxfs/xfs_btree.c |   26 ++++++++++++++------
+ fs/xfs/xfs_bmap_util.c    |   59 +++++++++++++++++++++++++++++++++++++---------
+ 2 files changed, 66 insertions(+), 19 deletions(-)
+
+--- a/fs/xfs/libxfs/xfs_btree.c
++++ b/fs/xfs/libxfs/xfs_btree.c
+@@ -4435,10 +4435,15 @@ xfs_btree_block_change_owner(
+       /* modify the owner */
+       block = xfs_btree_get_block(cur, level, &bp);
+-      if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
++      if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
++              if (block->bb_u.l.bb_owner == cpu_to_be64(bbcoi->new_owner))
++                      return 0;
+               block->bb_u.l.bb_owner = cpu_to_be64(bbcoi->new_owner);
+-      else
++      } else {
++              if (block->bb_u.s.bb_owner == cpu_to_be32(bbcoi->new_owner))
++                      return 0;
+               block->bb_u.s.bb_owner = cpu_to_be32(bbcoi->new_owner);
++      }
+       /*
+        * If the block is a root block hosted in an inode, we might not have a
+@@ -4447,14 +4452,19 @@ xfs_btree_block_change_owner(
+        * block is formatted into the on-disk inode fork. We still change it,
+        * though, so everything is consistent in memory.
+        */
+-      if (bp) {
+-              if (cur->bc_tp)
+-                      xfs_trans_ordered_buf(cur->bc_tp, bp);
+-              else
+-                      xfs_buf_delwri_queue(bp, bbcoi->buffer_list);
+-      } else {
++      if (!bp) {
+               ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
+               ASSERT(level == cur->bc_nlevels - 1);
++              return 0;
++      }
++
++      if (cur->bc_tp) {
++              if (!xfs_trans_ordered_buf(cur->bc_tp, bp)) {
++                      xfs_btree_log_block(cur, bp, XFS_BB_OWNER);
++                      return -EAGAIN;
++              }
++      } else {
++              xfs_buf_delwri_queue(bp, bbcoi->buffer_list);
+       }
+       return 0;
+--- a/fs/xfs/xfs_bmap_util.c
++++ b/fs/xfs/xfs_bmap_util.c
+@@ -1914,6 +1914,48 @@ xfs_swap_extent_forks(
+       return 0;
+ }
++/*
++ * Fix up the owners of the bmbt blocks to refer to the current inode. The
++ * change owner scan attempts to order all modified buffers in the current
++ * transaction. In the event of ordered buffer failure, the offending buffer is
++ * physically logged as a fallback and the scan returns -EAGAIN. We must roll
++ * the transaction in this case to replenish the fallback log reservation and
++ * restart the scan. This process repeats until the scan completes.
++ */
++static int
++xfs_swap_change_owner(
++      struct xfs_trans        **tpp,
++      struct xfs_inode        *ip,
++      struct xfs_inode        *tmpip)
++{
++      int                     error;
++      struct xfs_trans        *tp = *tpp;
++
++      do {
++              error = xfs_bmbt_change_owner(tp, ip, XFS_DATA_FORK, ip->i_ino,
++                                            NULL);
++              /* success or fatal error */
++              if (error != -EAGAIN)
++                      break;
++
++              error = xfs_trans_roll(tpp, NULL);
++              if (error)
++                      break;
++              tp = *tpp;
++
++              /*
++               * Redirty both inodes so they can relog and keep the log tail
++               * moving forward.
++               */
++              xfs_trans_ijoin(tp, ip, 0);
++              xfs_trans_ijoin(tp, tmpip, 0);
++              xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
++              xfs_trans_log_inode(tp, tmpip, XFS_ILOG_CORE);
++      } while (true);
++
++      return error;
++}
++
+ int
+ xfs_swap_extents(
+       struct xfs_inode        *ip,    /* target inode */
+@@ -1927,8 +1969,8 @@ xfs_swap_extents(
+       int                     error = 0;
+       int                     lock_flags;
+       struct xfs_ifork        *cowfp;
+-      __uint64_t              f;
+-      int                     resblks;
++      uint64_t                f;
++      int                     resblks = 0;
+       /*
+        * Lock the inodes against other IO, page faults and truncate to
+@@ -1976,11 +2018,8 @@ xfs_swap_extents(
+                         XFS_SWAP_RMAP_SPACE_RES(mp,
+                               XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK),
+                               XFS_DATA_FORK);
+-              error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks,
+-                              0, 0, &tp);
+-      } else
+-              error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0,
+-                              0, 0, &tp);
++      }
++      error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, &tp);
+       if (error)
+               goto out_unlock;
+@@ -2072,14 +2111,12 @@ xfs_swap_extents(
+        * inode number of the current inode.
+        */
+       if (src_log_flags & XFS_ILOG_DOWNER) {
+-              error = xfs_bmbt_change_owner(tp, ip, XFS_DATA_FORK,
+-                                            ip->i_ino, NULL);
++              error = xfs_swap_change_owner(&tp, ip, tip);
+               if (error)
+                       goto out_trans_cancel;
+       }
+       if (target_log_flags & XFS_ILOG_DOWNER) {
+-              error = xfs_bmbt_change_owner(tp, tip, XFS_DATA_FORK,
+-                                            tip->i_ino, NULL);
++              error = xfs_swap_change_owner(&tp, tip, ip);
+               if (error)
+                       goto out_trans_cancel;
+       }
diff --git a/queue-4.9/xfs-remove-unnecessary-dirty-bli-format-check-for-ordered-bufs.patch b/queue-4.9/xfs-remove-unnecessary-dirty-bli-format-check-for-ordered-bufs.patch
new file mode 100644 (file)
index 0000000..e5b5b36
--- /dev/null
@@ -0,0 +1,158 @@
+From foo@baz Mon Sep 18 10:16:36 CEST 2017
+From: Christoph Hellwig <hch@lst.de>
+Date: Sun, 17 Sep 2017 14:07:00 -0700
+Subject: xfs: remove unnecessary dirty bli format check for ordered bufs
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, Brian Foster <bfoster@redhat.com>, "Darrick J . Wong" <darrick.wong@oracle.com>
+Message-ID: <20170917210712.10804-36-hch@lst.de>
+
+From: Brian Foster <bfoster@redhat.com>
+
+commit 6453c65d3576bc3e602abb5add15f112755c08ca upstream.
+
+xfs_buf_item_unlock() historically checked the dirty state of the
+buffer by manually checking the buffer log formats for dirty
+segments. The introduction of ordered buffers invalidated this check
+because ordered buffers have dirty bli's but no dirty (logged)
+segments. The check was updated to accommodate ordered buffers by
+looking at the bli state first and considering the blf only if the
+bli is clean.
+
+This logic is safe but unnecessary. There is no valid case where the
+bli is clean yet the blf has dirty segments. The bli is set dirty
+whenever the blf is logged (via xfs_trans_log_buf()) and the blf is
+cleared in the only place BLI_DIRTY is cleared (xfs_trans_binval()).
+
+Remove the conditional blf dirty checks and replace with an assert
+that should catch any discrepencies between bli and blf dirty
+states. Refactor the old blf dirty check into a helper function to
+be used by the assert.
+
+Signed-off-by: Brian Foster <bfoster@redhat.com>
+Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/xfs_buf_item.c |   62 +++++++++++++++++++++++++-------------------------
+ fs/xfs/xfs_buf_item.h |    1 
+ 2 files changed, 33 insertions(+), 30 deletions(-)
+
+--- a/fs/xfs/xfs_buf_item.c
++++ b/fs/xfs/xfs_buf_item.c
+@@ -575,26 +575,18 @@ xfs_buf_item_unlock(
+ {
+       struct xfs_buf_log_item *bip = BUF_ITEM(lip);
+       struct xfs_buf          *bp = bip->bli_buf;
+-      bool                    clean;
+-      bool                    aborted;
+-      int                     flags;
++      bool                    aborted = !!(lip->li_flags & XFS_LI_ABORTED);
++      bool                    hold = !!(bip->bli_flags & XFS_BLI_HOLD);
++      bool                    dirty = !!(bip->bli_flags & XFS_BLI_DIRTY);
++      bool                    ordered = !!(bip->bli_flags & XFS_BLI_ORDERED);
+       /* Clear the buffer's association with this transaction. */
+       bp->b_transp = NULL;
+       /*
+-       * If this is a transaction abort, don't return early.  Instead, allow
+-       * the brelse to happen.  Normally it would be done for stale
+-       * (cancelled) buffers at unpin time, but we'll never go through the
+-       * pin/unpin cycle if we abort inside commit.
+-       */
+-      aborted = (lip->li_flags & XFS_LI_ABORTED) ? true : false;
+-      /*
+-       * Before possibly freeing the buf item, copy the per-transaction state
+-       * so we can reference it safely later after clearing it from the
+-       * buffer log item.
++       * The per-transaction state has been copied above so clear it from the
++       * bli.
+        */
+-      flags = bip->bli_flags;
+       bip->bli_flags &= ~(XFS_BLI_LOGGED | XFS_BLI_HOLD | XFS_BLI_ORDERED);
+       /*
+@@ -602,7 +594,7 @@ xfs_buf_item_unlock(
+        * unlock the buffer and free the buf item when the buffer is unpinned
+        * for the last time.
+        */
+-      if (flags & XFS_BLI_STALE) {
++      if (bip->bli_flags & XFS_BLI_STALE) {
+               trace_xfs_buf_item_unlock_stale(bip);
+               ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL);
+               if (!aborted) {
+@@ -620,20 +612,11 @@ xfs_buf_item_unlock(
+        * regardless of whether it is dirty or not. A dirty abort implies a
+        * shutdown, anyway.
+        *
+-       * Ordered buffers are dirty but may have no recorded changes, so ensure
+-       * we only release clean items here.
++       * The bli dirty state should match whether the blf has logged segments
++       * except for ordered buffers, where only the bli should be dirty.
+        */
+-      clean = (flags & XFS_BLI_DIRTY) ? false : true;
+-      if (clean) {
+-              int i;
+-              for (i = 0; i < bip->bli_format_count; i++) {
+-                      if (!xfs_bitmap_empty(bip->bli_formats[i].blf_data_map,
+-                                   bip->bli_formats[i].blf_map_size)) {
+-                              clean = false;
+-                              break;
+-                      }
+-              }
+-      }
++      ASSERT((!ordered && dirty == xfs_buf_item_dirty_format(bip)) ||
++             (ordered && dirty && !xfs_buf_item_dirty_format(bip)));
+       /*
+        * Clean buffers, by definition, cannot be in the AIL. However, aborted
+@@ -652,11 +635,11 @@ xfs_buf_item_unlock(
+                       ASSERT(XFS_FORCED_SHUTDOWN(lip->li_mountp));
+                       xfs_trans_ail_remove(lip, SHUTDOWN_LOG_IO_ERROR);
+                       xfs_buf_item_relse(bp);
+-              } else if (clean)
++              } else if (!dirty)
+                       xfs_buf_item_relse(bp);
+       }
+-      if (!(flags & XFS_BLI_HOLD))
++      if (!hold)
+               xfs_buf_relse(bp);
+ }
+@@ -945,6 +928,25 @@ xfs_buf_item_log(
+ }
++/*
++ * Return true if the buffer has any ranges logged/dirtied by a transaction,
++ * false otherwise.
++ */
++bool
++xfs_buf_item_dirty_format(
++      struct xfs_buf_log_item *bip)
++{
++      int                     i;
++
++      for (i = 0; i < bip->bli_format_count; i++) {
++              if (!xfs_bitmap_empty(bip->bli_formats[i].blf_data_map,
++                           bip->bli_formats[i].blf_map_size))
++                      return true;
++      }
++
++      return false;
++}
++
+ STATIC void
+ xfs_buf_item_free(
+       xfs_buf_log_item_t      *bip)
+--- a/fs/xfs/xfs_buf_item.h
++++ b/fs/xfs/xfs_buf_item.h
+@@ -64,6 +64,7 @@ typedef struct xfs_buf_log_item {
+ int   xfs_buf_item_init(struct xfs_buf *, struct xfs_mount *);
+ void  xfs_buf_item_relse(struct xfs_buf *);
+ void  xfs_buf_item_log(xfs_buf_log_item_t *, uint, uint);
++bool  xfs_buf_item_dirty_format(struct xfs_buf_log_item *);
+ void  xfs_buf_attach_iodone(struct xfs_buf *,
+                             void(*)(struct xfs_buf *, xfs_log_item_t *),
+                             xfs_log_item_t *);
diff --git a/queue-4.9/xfs-remove-xfs_trans_ail_delete_bulk.patch b/queue-4.9/xfs-remove-xfs_trans_ail_delete_bulk.patch
new file mode 100644 (file)
index 0000000..428abd0
--- /dev/null
@@ -0,0 +1,193 @@
+From foo@baz Mon Sep 18 10:16:36 CEST 2017
+From: Christoph Hellwig <hch@lst.de>
+Date: Sun, 17 Sep 2017 14:06:48 -0700
+Subject: xfs: remove xfs_trans_ail_delete_bulk
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, "Darrick J . Wong" <darrick.wong@oracle.com>
+Message-ID: <20170917210712.10804-24-hch@lst.de>
+
+commit 27af1bbf524459962d1477a38ac6e0b7f79aaecc upstream.
+
+xfs_iflush_done uses an on-stack variable length array to pass the log
+items to be deleted to xfs_trans_ail_delete_bulk.  On-stack VLAs are a
+nasty gcc extension that can lead to unbounded stack allocations, but
+fortunately we can easily avoid them by simply open coding
+xfs_trans_ail_delete_bulk in xfs_iflush_done, which is the only caller
+of it except for the single-item xfs_trans_ail_delete.
+
+Signed-off-by: Christoph Hellwig <hch@lst.de>
+Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/xfs_inode_item.c |   29 +++++++++++--------
+ fs/xfs/xfs_trans_ail.c  |   73 +++++++++++++++++++++++-------------------------
+ fs/xfs/xfs_trans_priv.h |   15 +--------
+ 3 files changed, 56 insertions(+), 61 deletions(-)
+
+--- a/fs/xfs/xfs_inode_item.c
++++ b/fs/xfs/xfs_inode_item.c
+@@ -731,22 +731,27 @@ xfs_iflush_done(
+        * holding the lock before removing the inode from the AIL.
+        */
+       if (need_ail) {
+-              struct xfs_log_item *log_items[need_ail];
+-              int i = 0;
++              bool                    mlip_changed = false;
++
++              /* this is an opencoded batch version of xfs_trans_ail_delete */
+               spin_lock(&ailp->xa_lock);
+               for (blip = lip; blip; blip = blip->li_bio_list) {
+-                      iip = INODE_ITEM(blip);
+-                      if (iip->ili_logged &&
+-                          blip->li_lsn == iip->ili_flush_lsn) {
+-                              log_items[i++] = blip;
+-                      }
+-                      ASSERT(i <= need_ail);
++                      if (INODE_ITEM(blip)->ili_logged &&
++                          blip->li_lsn == INODE_ITEM(blip)->ili_flush_lsn)
++                              mlip_changed |= xfs_ail_delete_one(ailp, blip);
+               }
+-              /* xfs_trans_ail_delete_bulk() drops the AIL lock. */
+-              xfs_trans_ail_delete_bulk(ailp, log_items, i,
+-                                        SHUTDOWN_CORRUPT_INCORE);
+-      }
++              if (mlip_changed) {
++                      if (!XFS_FORCED_SHUTDOWN(ailp->xa_mount))
++                              xlog_assign_tail_lsn_locked(ailp->xa_mount);
++                      if (list_empty(&ailp->xa_ail))
++                              wake_up_all(&ailp->xa_empty);
++              }
++              spin_unlock(&ailp->xa_lock);
++
++              if (mlip_changed)
++                      xfs_log_space_wake(ailp->xa_mount);
++      }
+       /*
+        * clean up and unlock the flush lock now we are done. We can clear the
+--- a/fs/xfs/xfs_trans_ail.c
++++ b/fs/xfs/xfs_trans_ail.c
+@@ -684,8 +684,23 @@ xfs_trans_ail_update_bulk(
+       }
+ }
+-/*
+- * xfs_trans_ail_delete_bulk - remove multiple log items from the AIL
++bool
++xfs_ail_delete_one(
++      struct xfs_ail          *ailp,
++      struct xfs_log_item     *lip)
++{
++      struct xfs_log_item     *mlip = xfs_ail_min(ailp);
++
++      trace_xfs_ail_delete(lip, mlip->li_lsn, lip->li_lsn);
++      xfs_ail_delete(ailp, lip);
++      lip->li_flags &= ~XFS_LI_IN_AIL;
++      lip->li_lsn = 0;
++
++      return mlip == lip;
++}
++
++/**
++ * Remove a log items from the AIL
+  *
+  * @xfs_trans_ail_delete_bulk takes an array of log items that all need to
+  * removed from the AIL. The caller is already holding the AIL lock, and done
+@@ -706,52 +721,36 @@ xfs_trans_ail_update_bulk(
+  * before returning.
+  */
+ void
+-xfs_trans_ail_delete_bulk(
++xfs_trans_ail_delete(
+       struct xfs_ail          *ailp,
+-      struct xfs_log_item     **log_items,
+-      int                     nr_items,
++      struct xfs_log_item     *lip,
+       int                     shutdown_type) __releases(ailp->xa_lock)
+ {
+-      xfs_log_item_t          *mlip;
+-      int                     mlip_changed = 0;
+-      int                     i;
+-
+-      mlip = xfs_ail_min(ailp);
+-
+-      for (i = 0; i < nr_items; i++) {
+-              struct xfs_log_item *lip = log_items[i];
+-              if (!(lip->li_flags & XFS_LI_IN_AIL)) {
+-                      struct xfs_mount        *mp = ailp->xa_mount;
+-
+-                      spin_unlock(&ailp->xa_lock);
+-                      if (!XFS_FORCED_SHUTDOWN(mp)) {
+-                              xfs_alert_tag(mp, XFS_PTAG_AILDELETE,
+-              "%s: attempting to delete a log item that is not in the AIL",
+-                                              __func__);
+-                              xfs_force_shutdown(mp, shutdown_type);
+-                      }
+-                      return;
+-              }
++      struct xfs_mount        *mp = ailp->xa_mount;
++      bool                    mlip_changed;
+-              trace_xfs_ail_delete(lip, mlip->li_lsn, lip->li_lsn);
+-              xfs_ail_delete(ailp, lip);
+-              lip->li_flags &= ~XFS_LI_IN_AIL;
+-              lip->li_lsn = 0;
+-              if (mlip == lip)
+-                      mlip_changed = 1;
++      if (!(lip->li_flags & XFS_LI_IN_AIL)) {
++              spin_unlock(&ailp->xa_lock);
++              if (!XFS_FORCED_SHUTDOWN(mp)) {
++                      xfs_alert_tag(mp, XFS_PTAG_AILDELETE,
++      "%s: attempting to delete a log item that is not in the AIL",
++                                      __func__);
++                      xfs_force_shutdown(mp, shutdown_type);
++              }
++              return;
+       }
++      mlip_changed = xfs_ail_delete_one(ailp, lip);
+       if (mlip_changed) {
+-              if (!XFS_FORCED_SHUTDOWN(ailp->xa_mount))
+-                      xlog_assign_tail_lsn_locked(ailp->xa_mount);
++              if (!XFS_FORCED_SHUTDOWN(mp))
++                      xlog_assign_tail_lsn_locked(mp);
+               if (list_empty(&ailp->xa_ail))
+                       wake_up_all(&ailp->xa_empty);
+-              spin_unlock(&ailp->xa_lock);
++      }
++      spin_unlock(&ailp->xa_lock);
++      if (mlip_changed)
+               xfs_log_space_wake(ailp->xa_mount);
+-      } else {
+-              spin_unlock(&ailp->xa_lock);
+-      }
+ }
+ int
+--- a/fs/xfs/xfs_trans_priv.h
++++ b/fs/xfs/xfs_trans_priv.h
+@@ -106,18 +106,9 @@ xfs_trans_ail_update(
+       xfs_trans_ail_update_bulk(ailp, NULL, &lip, 1, lsn);
+ }
+-void  xfs_trans_ail_delete_bulk(struct xfs_ail *ailp,
+-                              struct xfs_log_item **log_items, int nr_items,
+-                              int shutdown_type)
+-                              __releases(ailp->xa_lock);
+-static inline void
+-xfs_trans_ail_delete(
+-      struct xfs_ail  *ailp,
+-      xfs_log_item_t  *lip,
+-      int             shutdown_type) __releases(ailp->xa_lock)
+-{
+-      xfs_trans_ail_delete_bulk(ailp, &lip, 1, shutdown_type);
+-}
++bool xfs_ail_delete_one(struct xfs_ail *ailp, struct xfs_log_item *lip);
++void xfs_trans_ail_delete(struct xfs_ail *ailp, struct xfs_log_item *lip,
++              int shutdown_type) __releases(ailp->xa_lock);
+ static inline void
+ xfs_trans_ail_remove(
diff --git a/queue-4.9/xfs-skip-bmbt-block-ino-validation-during-owner-change.patch b/queue-4.9/xfs-skip-bmbt-block-ino-validation-during-owner-change.patch
new file mode 100644 (file)
index 0000000..90303e5
--- /dev/null
@@ -0,0 +1,80 @@
+From foo@baz Mon Sep 18 10:16:36 CEST 2017
+From: Christoph Hellwig <hch@lst.de>
+Date: Sun, 17 Sep 2017 14:07:04 -0700
+Subject: xfs: skip bmbt block ino validation during owner change
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, Brian Foster <bfoster@redhat.com>, "Darrick J . Wong" <darrick.wong@oracle.com>
+Message-ID: <20170917210712.10804-40-hch@lst.de>
+
+From: Brian Foster <bfoster@redhat.com>
+
+commit 99c794c639a65cc7b74f30a674048fd100fe9ac8 upstream.
+
+Extent swap uses xfs_btree_visit_blocks() to fix up bmbt block
+owners on v5 (!rmapbt) filesystems. The bmbt scan uses
+xfs_btree_lookup_get_block() to read bmbt blocks which verifies the
+current owner of the block against the parent inode of the bmbt.
+This works during extent swap because the bmbt owners are updated to
+the opposite inode number before the inode extent forks are swapped.
+
+The modified bmbt blocks are marked as ordered buffers which allows
+everything to commit in a single transaction. If the transaction
+commits to the log and the system crashes such that recovery of the
+extent swap is required, log recovery restarts the bmbt scan to fix
+up any bmbt blocks that may have not been written back before the
+crash. The log recovery bmbt scan occurs after the inode forks have
+been swapped, however. This causes the bmbt block owner verification
+to fail, leads to log recovery failure and requires xfs_repair to
+zap the log to recover.
+
+Define a new invalid inode owner flag to inform the btree block
+lookup mechanism that the current inode may be invalid with respect
+to the current owner of the bmbt block. Set this flag on the cursor
+used for change owner scans to allow this operation to work at
+runtime and during log recovery.
+
+Signed-off-by: Brian Foster <bfoster@redhat.com>
+Fixes: bb3be7e7c ("xfs: check for bogus values in btree block headers")
+Cc: stable@vger.kernel.org
+Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/libxfs/xfs_bmap_btree.c |    1 +
+ fs/xfs/libxfs/xfs_btree.c      |    1 +
+ fs/xfs/libxfs/xfs_btree.h      |    3 ++-
+ 3 files changed, 4 insertions(+), 1 deletion(-)
+
+--- a/fs/xfs/libxfs/xfs_bmap_btree.c
++++ b/fs/xfs/libxfs/xfs_bmap_btree.c
+@@ -888,6 +888,7 @@ xfs_bmbt_change_owner(
+       cur = xfs_bmbt_init_cursor(ip->i_mount, tp, ip, whichfork);
+       if (!cur)
+               return -ENOMEM;
++      cur->bc_private.b.flags |= XFS_BTCUR_BPRV_INVALID_OWNER;
+       error = xfs_btree_change_owner(cur, new_owner, buffer_list);
+       xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
+--- a/fs/xfs/libxfs/xfs_btree.c
++++ b/fs/xfs/libxfs/xfs_btree.c
+@@ -1774,6 +1774,7 @@ xfs_btree_lookup_get_block(
+       /* Check the inode owner since the verifiers don't. */
+       if (xfs_sb_version_hascrc(&cur->bc_mp->m_sb) &&
++          !(cur->bc_private.b.flags & XFS_BTCUR_BPRV_INVALID_OWNER) &&
+           (cur->bc_flags & XFS_BTREE_LONG_PTRS) &&
+           be64_to_cpu((*blkp)->bb_u.l.bb_owner) !=
+                       cur->bc_private.b.ip->i_ino)
+--- a/fs/xfs/libxfs/xfs_btree.h
++++ b/fs/xfs/libxfs/xfs_btree.h
+@@ -268,7 +268,8 @@ typedef struct xfs_btree_cur
+                       short           forksize;       /* fork's inode space */
+                       char            whichfork;      /* data or attr fork */
+                       char            flags;          /* flags */
+-#define       XFS_BTCUR_BPRV_WASDEL   1                       /* was delayed */
++#define       XFS_BTCUR_BPRV_WASDEL           (1<<0)          /* was delayed */
++#define       XFS_BTCUR_BPRV_INVALID_OWNER    (1<<1)          /* for ext swap */
+               } b;
+       }               bc_private;     /* per-btree type data */
+ } xfs_btree_cur_t;
diff --git a/queue-4.9/xfs-stop-searching-for-free-slots-in-an-inode-chunk-when-there-are-none.patch b/queue-4.9/xfs-stop-searching-for-free-slots-in-an-inode-chunk-when-there-are-none.patch
new file mode 100644 (file)
index 0000000..7a4371e
--- /dev/null
@@ -0,0 +1,136 @@
+From foo@baz Mon Sep 18 10:16:36 CEST 2017
+From: Christoph Hellwig <hch@lst.de>
+Date: Sun, 17 Sep 2017 14:06:56 -0700
+Subject: xfs: stop searching for free slots in an inode chunk when there are none
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, Carlos Maiolino <cmaiolino@redhat.com>, "Darrick J . Wong" <darrick.wong@oracle.com>
+Message-ID: <20170917210712.10804-32-hch@lst.de>
+
+From: Carlos Maiolino <cmaiolino@redhat.com>
+
+commit 2d32311cf19bfb8c1d2b4601974ddd951f9cfd0b upstream.
+
+In a filesystem without finobt, the Space manager selects an AG to alloc a new
+inode, where xfs_dialloc_ag_inobt() will search the AG for the free slot chunk.
+
+When the new inode is in the same AG as its parent, the btree will be searched
+starting on the parent's record, and then retried from the top if no slot is
+available beyond the parent's record.
+
+To exit this loop though, xfs_dialloc_ag_inobt() relies on the fact that the
+btree must have a free slot available, once its callers relied on the
+agi->freecount when deciding how/where to allocate this new inode.
+
+In the case when the agi->freecount is corrupted, showing available inodes in an
+AG, when in fact there is none, this becomes an infinite loop.
+
+Add a way to stop the loop when a free slot is not found in the btree, making
+the function to fall into the whole AG scan which will then, be able to detect
+the corruption and shut the filesystem down.
+
+As pointed by Brian, this might impact performance, giving the fact we
+don't reset the search distance anymore when we reach the end of the
+tree, giving it fewer tries before falling back to the whole AG search, but
+it will only affect searches that start within 10 records to the end of the tree.
+
+Signed-off-by: Carlos Maiolino <cmaiolino@redhat.com>
+Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/libxfs/xfs_ialloc.c |   55 ++++++++++++++++++++++-----------------------
+ 1 file changed, 27 insertions(+), 28 deletions(-)
+
+--- a/fs/xfs/libxfs/xfs_ialloc.c
++++ b/fs/xfs/libxfs/xfs_ialloc.c
+@@ -1123,6 +1123,7 @@ xfs_dialloc_ag_inobt(
+       int                     error;
+       int                     offset;
+       int                     i, j;
++      int                     searchdistance = 10;
+       pag = xfs_perag_get(mp, agno);
+@@ -1149,7 +1150,6 @@ xfs_dialloc_ag_inobt(
+       if (pagno == agno) {
+               int             doneleft;       /* done, to the left */
+               int             doneright;      /* done, to the right */
+-              int             searchdistance = 10;
+               error = xfs_inobt_lookup(cur, pagino, XFS_LOOKUP_LE, &i);
+               if (error)
+@@ -1210,21 +1210,9 @@ xfs_dialloc_ag_inobt(
+               /*
+                * Loop until we find an inode chunk with a free inode.
+                */
+-              while (!doneleft || !doneright) {
++              while (--searchdistance > 0 && (!doneleft || !doneright)) {
+                       int     useleft;  /* using left inode chunk this time */
+-                      if (!--searchdistance) {
+-                              /*
+-                               * Not in range - save last search
+-                               * location and allocate a new inode
+-                               */
+-                              xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
+-                              pag->pagl_leftrec = trec.ir_startino;
+-                              pag->pagl_rightrec = rec.ir_startino;
+-                              pag->pagl_pagino = pagino;
+-                              goto newino;
+-                      }
+-
+                       /* figure out the closer block if both are valid. */
+                       if (!doneleft && !doneright) {
+                               useleft = pagino -
+@@ -1268,26 +1256,37 @@ xfs_dialloc_ag_inobt(
+                               goto error1;
+               }
+-              /*
+-               * We've reached the end of the btree. because
+-               * we are only searching a small chunk of the
+-               * btree each search, there is obviously free
+-               * inodes closer to the parent inode than we
+-               * are now. restart the search again.
+-               */
+-              pag->pagl_pagino = NULLAGINO;
+-              pag->pagl_leftrec = NULLAGINO;
+-              pag->pagl_rightrec = NULLAGINO;
+-              xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
+-              xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+-              goto restart_pagno;
++              if (searchdistance <= 0) {
++                      /*
++                       * Not in range - save last search
++                       * location and allocate a new inode
++                       */
++                      xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
++                      pag->pagl_leftrec = trec.ir_startino;
++                      pag->pagl_rightrec = rec.ir_startino;
++                      pag->pagl_pagino = pagino;
++
++              } else {
++                      /*
++                       * We've reached the end of the btree. because
++                       * we are only searching a small chunk of the
++                       * btree each search, there is obviously free
++                       * inodes closer to the parent inode than we
++                       * are now. restart the search again.
++                       */
++                      pag->pagl_pagino = NULLAGINO;
++                      pag->pagl_leftrec = NULLAGINO;
++                      pag->pagl_rightrec = NULLAGINO;
++                      xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
++                      xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
++                      goto restart_pagno;
++              }
+       }
+       /*
+        * In a different AG from the parent.
+        * See if the most recently allocated block has any free.
+        */
+-newino:
+       if (agi->agi_newino != cpu_to_be32(NULLAGINO)) {
+               error = xfs_inobt_lookup(cur, be32_to_cpu(agi->agi_newino),
+                                        XFS_LOOKUP_EQ, &i);
diff --git a/queue-4.9/xfs-toggle-readonly-state-around-xfs_log_mount_finish.patch b/queue-4.9/xfs-toggle-readonly-state-around-xfs_log_mount_finish.patch
new file mode 100644 (file)
index 0000000..9694f99
--- /dev/null
@@ -0,0 +1,62 @@
+From foo@baz Mon Sep 18 10:16:36 CEST 2017
+From: Christoph Hellwig <hch@lst.de>
+Date: Sun, 17 Sep 2017 14:06:47 -0700
+Subject: xfs: toggle readonly state around xfs_log_mount_finish
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, Eric Sandeen <sandeen@sandeen.net>, Eric Sandeen <sandeen@redhat.com>, "Darrick J . Wong" <darrick.wong@oracle.com>
+Message-ID: <20170917210712.10804-23-hch@lst.de>
+
+From: Eric Sandeen <sandeen@sandeen.net>
+
+commit 6f4a1eefdd0ad4561543270a7fceadabcca075dd upstream.
+
+When we do log recovery on a readonly mount, unlinked inode
+processing does not happen due to the readonly checks in
+xfs_inactive(), which are trying to prevent any I/O on a
+readonly mount.
+
+This is misguided - we do I/O on readonly mounts all the time,
+for consistency; for example, log recovery.  So do the same
+RDONLY flag twiddling around xfs_log_mount_finish() as we
+do around xfs_log_mount(), for the same reason.
+
+This all cries out for a big rework but for now this is a
+simple fix to an obvious problem.
+
+Signed-off-by: Eric Sandeen <sandeen@redhat.com>
+Reviewed-by: Brian Foster <bfoster@redhat.com>
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/xfs_log.c |    7 +++++++
+ 1 file changed, 7 insertions(+)
+
+--- a/fs/xfs/xfs_log.c
++++ b/fs/xfs/xfs_log.c
+@@ -743,10 +743,14 @@ xfs_log_mount_finish(
+       struct xfs_mount        *mp)
+ {
+       int     error = 0;
++      bool    readonly = (mp->m_flags & XFS_MOUNT_RDONLY);
+       if (mp->m_flags & XFS_MOUNT_NORECOVERY) {
+               ASSERT(mp->m_flags & XFS_MOUNT_RDONLY);
+               return 0;
++      } else if (readonly) {
++              /* Allow unlinked processing to proceed */
++              mp->m_flags &= ~XFS_MOUNT_RDONLY;
+       }
+       /*
+@@ -764,6 +768,9 @@ xfs_log_mount_finish(
+               xfs_log_work_queue(mp);
+       mp->m_super->s_flags &= ~MS_ACTIVE;
++      if (readonly)
++              mp->m_flags |= XFS_MOUNT_RDONLY;
++
+       return error;
+ }
diff --git a/queue-4.9/xfs-use-kmem_free-to-free-return-value-of-kmem_zalloc.patch b/queue-4.9/xfs-use-kmem_free-to-free-return-value-of-kmem_zalloc.patch
new file mode 100644 (file)
index 0000000..1af9399
--- /dev/null
@@ -0,0 +1,34 @@
+From foo@baz Mon Sep 18 10:16:36 CEST 2017
+From: Christoph Hellwig <hch@lst.de>
+Date: Sun, 17 Sep 2017 14:07:12 -0700
+Subject: xfs: use kmem_free to free return value of kmem_zalloc
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, Pan Bian <bianpan2016@163.com>, "Darrick J . Wong" <darrick.wong@oracle.com>
+Message-ID: <20170917210712.10804-48-hch@lst.de>
+
+From: Pan Bian <bianpan2016@163.com>
+
+commit 6c370590cfe0c36bcd62d548148aa65c984540b7 upstream.
+
+In function xfs_test_remount_options(), kfree() is used to free memory
+allocated by kmem_zalloc(). But it is better to use kmem_free().
+
+Signed-off-by: Pan Bian <bianpan2016@163.com>
+Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/xfs_super.c |    2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/fs/xfs/xfs_super.c
++++ b/fs/xfs/xfs_super.c
+@@ -1214,7 +1214,7 @@ xfs_test_remount_options(
+       tmp_mp->m_super = sb;
+       error = xfs_parseargs(tmp_mp, options);
+       xfs_free_fsname(tmp_mp);
+-      kfree(tmp_mp);
++      kmem_free(tmp_mp);
+       return error;
+ }
diff --git a/queue-4.9/xfs-write-unmount-record-for-ro-mounts.patch b/queue-4.9/xfs-write-unmount-record-for-ro-mounts.patch
new file mode 100644 (file)
index 0000000..5d2bca5
--- /dev/null
@@ -0,0 +1,66 @@
+From foo@baz Mon Sep 18 10:16:36 CEST 2017
+From: Christoph Hellwig <hch@lst.de>
+Date: Sun, 17 Sep 2017 14:06:46 -0700
+Subject: xfs: write unmount record for ro mounts
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, Eric Sandeen <sandeen@sandeen.net>, Eric Sandeen <sandeen@redhat.com>, "Darrick J . Wong" <darrick.wong@oracle.com>
+Message-ID: <20170917210712.10804-22-hch@lst.de>
+
+From: Eric Sandeen <sandeen@sandeen.net>
+
+commit 757a69ef6cf2bf839bd4088e5609ddddd663b0c4 upstream.
+
+There are dueling comments in the xfs code about intent
+for log writes when unmounting a readonly filesystem.
+
+In xfs_mountfs, we see the intent:
+
+/*
+ * Now the log is fully replayed, we can transition to full read-only
+ * mode for read-only mounts. This will sync all the metadata and clean
+ * the log so that the recovery we just performed does not have to be
+ * replayed again on the next mount.
+ */
+
+and it calls xfs_quiesce_attr(), but by the time we get to
+xfs_log_unmount_write(), it returns early for a RDONLY mount:
+
+ * Don't write out unmount record on read-only mounts.
+
+Because of this, sequential ro mounts of a filesystem with
+a dirty log will replay the log each time, which seems odd.
+
+Fix this by writing an unmount record even for RO mounts, as long
+as norecovery wasn't specified (don't write a clean log record
+if a dirty log may still be there!) and the log device is
+writable.
+
+Signed-off-by: Eric Sandeen <sandeen@redhat.com>
+Reviewed-by: Brian Foster <bfoster@redhat.com>
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/xfs_log.c |    7 +++++--
+ 1 file changed, 5 insertions(+), 2 deletions(-)
+
+--- a/fs/xfs/xfs_log.c
++++ b/fs/xfs/xfs_log.c
+@@ -812,11 +812,14 @@ xfs_log_unmount_write(xfs_mount_t *mp)
+       int              error;
+       /*
+-       * Don't write out unmount record on read-only mounts.
++       * Don't write out unmount record on norecovery mounts or ro devices.
+        * Or, if we are doing a forced umount (typically because of IO errors).
+        */
+-      if (mp->m_flags & XFS_MOUNT_RDONLY)
++      if (mp->m_flags & XFS_MOUNT_NORECOVERY ||
++          xfs_readonly_buftarg(log->l_mp->m_logdev_targp)) {
++              ASSERT(mp->m_flags & XFS_MOUNT_RDONLY);
+               return 0;
++      }
+       error = _xfs_log_force(mp, XFS_LOG_SYNC, NULL);
+       ASSERT(error || !(XLOG_FORCED_SHUTDOWN(log)));