git.ipfire.org Git - thirdparty/kernel/stable-queue.git/commitdiff
4.10-stable patches
authorGreg Kroah-Hartman <gregkh@linuxfoundation.org>
Sat, 1 Apr 2017 17:32:07 +0000 (19:32 +0200)
committerGreg Kroah-Hartman <gregkh@linuxfoundation.org>
Sat, 1 Apr 2017 17:32:07 +0000 (19:32 +0200)
added patches:
xfs-allow-unwritten-extents-in-the-cow-fork.patch
xfs-check-for-obviously-bad-level-values-in-the-bmbt-root.patch
xfs-correct-null-checks-and-error-processing-in-xfs_initialize_perag.patch
xfs-don-t-fail-xfs_extent_busy-allocation.patch
xfs-don-t-reserve-blocks-for-right-shift-transactions.patch
xfs-fail-_dir_open-when-readahead-fails.patch
xfs-filter-out-obviously-bad-btree-pointers.patch
xfs-fix-and-streamline-error-handling-in-xfs_end_io.patch
xfs-fix-eofblocks-race-with-file-extending-async-dio-writes.patch
xfs-fix-toctou-race-when-locking-an-inode-to-access-the-data-map.patch
xfs-fix-uninitialized-variable-in-_reflink_convert_cow.patch
xfs-handle-indlen-shortage-on-delalloc-extent-merge.patch
xfs-mark-speculative-prealloc-cow-fork-extents-unwritten.patch
xfs-only-reclaim-unwritten-cow-extents-periodically.patch
xfs-reject-all-unaligned-direct-writes-to-reflinked-files.patch
xfs-reset-b_first_retry_time-when-clear-the-retry-status-of-xfs_buf_t.patch
xfs-split-indlen-reservations-fairly-when-under-reserved.patch
xfs-try-any-ag-when-allocating-the-first-btree-block-when-reflinking.patch
xfs-tune-down-agno-asserts-in-the-bmap-code.patch
xfs-update-ctime-and-mtime-on-clone-destinatation-inodes.patch
xfs-use-iomap-new-flag-for-newly-allocated-delalloc-blocks.patch
xfs-use-xfs_icluster_size_fsb-to-calculate-inode-alignment-mask.patch
xfs-use-xfs_icluster_size_fsb-to-calculate-inode-chunk-alignment.patch
xfs-verify-free-block-header-fields.patch

25 files changed:
queue-4.10/series
queue-4.10/xfs-allow-unwritten-extents-in-the-cow-fork.patch [new file with mode: 0644]
queue-4.10/xfs-check-for-obviously-bad-level-values-in-the-bmbt-root.patch [new file with mode: 0644]
queue-4.10/xfs-correct-null-checks-and-error-processing-in-xfs_initialize_perag.patch [new file with mode: 0644]
queue-4.10/xfs-don-t-fail-xfs_extent_busy-allocation.patch [new file with mode: 0644]
queue-4.10/xfs-don-t-reserve-blocks-for-right-shift-transactions.patch [new file with mode: 0644]
queue-4.10/xfs-fail-_dir_open-when-readahead-fails.patch [new file with mode: 0644]
queue-4.10/xfs-filter-out-obviously-bad-btree-pointers.patch [new file with mode: 0644]
queue-4.10/xfs-fix-and-streamline-error-handling-in-xfs_end_io.patch [new file with mode: 0644]
queue-4.10/xfs-fix-eofblocks-race-with-file-extending-async-dio-writes.patch [new file with mode: 0644]
queue-4.10/xfs-fix-toctou-race-when-locking-an-inode-to-access-the-data-map.patch [new file with mode: 0644]
queue-4.10/xfs-fix-uninitialized-variable-in-_reflink_convert_cow.patch [new file with mode: 0644]
queue-4.10/xfs-handle-indlen-shortage-on-delalloc-extent-merge.patch [new file with mode: 0644]
queue-4.10/xfs-mark-speculative-prealloc-cow-fork-extents-unwritten.patch [new file with mode: 0644]
queue-4.10/xfs-only-reclaim-unwritten-cow-extents-periodically.patch [new file with mode: 0644]
queue-4.10/xfs-reject-all-unaligned-direct-writes-to-reflinked-files.patch [new file with mode: 0644]
queue-4.10/xfs-reset-b_first_retry_time-when-clear-the-retry-status-of-xfs_buf_t.patch [new file with mode: 0644]
queue-4.10/xfs-split-indlen-reservations-fairly-when-under-reserved.patch [new file with mode: 0644]
queue-4.10/xfs-try-any-ag-when-allocating-the-first-btree-block-when-reflinking.patch [new file with mode: 0644]
queue-4.10/xfs-tune-down-agno-asserts-in-the-bmap-code.patch [new file with mode: 0644]
queue-4.10/xfs-update-ctime-and-mtime-on-clone-destinatation-inodes.patch [new file with mode: 0644]
queue-4.10/xfs-use-iomap-new-flag-for-newly-allocated-delalloc-blocks.patch [new file with mode: 0644]
queue-4.10/xfs-use-xfs_icluster_size_fsb-to-calculate-inode-alignment-mask.patch [new file with mode: 0644]
queue-4.10/xfs-use-xfs_icluster_size_fsb-to-calculate-inode-chunk-alignment.patch [new file with mode: 0644]
queue-4.10/xfs-verify-free-block-header-fields.patch [new file with mode: 0644]

index 2d2ac804b394e3af98af5983c9f5b38a8a675650..f620f80a411c5c6addb448784524959356857e6b 100644 (file)
@@ -2,3 +2,27 @@ libceph-force-gfp_noio-for-socket-allocations.patch
 kvm-nvmx-fix-nested-ept-detection.patch
 xfs-pull-up-iolock-from-xfs_free_eofblocks.patch
 xfs-sync-eofblocks-scans-under-iolock-are-livelock-prone.patch
+xfs-fix-eofblocks-race-with-file-extending-async-dio-writes.patch
+xfs-fix-toctou-race-when-locking-an-inode-to-access-the-data-map.patch
+xfs-fail-_dir_open-when-readahead-fails.patch
+xfs-filter-out-obviously-bad-btree-pointers.patch
+xfs-check-for-obviously-bad-level-values-in-the-bmbt-root.patch
+xfs-verify-free-block-header-fields.patch
+xfs-allow-unwritten-extents-in-the-cow-fork.patch
+xfs-mark-speculative-prealloc-cow-fork-extents-unwritten.patch
+xfs-reset-b_first_retry_time-when-clear-the-retry-status-of-xfs_buf_t.patch
+xfs-reject-all-unaligned-direct-writes-to-reflinked-files.patch
+xfs-update-ctime-and-mtime-on-clone-destinatation-inodes.patch
+xfs-correct-null-checks-and-error-processing-in-xfs_initialize_perag.patch
+xfs-don-t-fail-xfs_extent_busy-allocation.patch
+xfs-handle-indlen-shortage-on-delalloc-extent-merge.patch
+xfs-split-indlen-reservations-fairly-when-under-reserved.patch
+xfs-fix-uninitialized-variable-in-_reflink_convert_cow.patch
+xfs-don-t-reserve-blocks-for-right-shift-transactions.patch
+xfs-use-xfs_icluster_size_fsb-to-calculate-inode-chunk-alignment.patch
+xfs-tune-down-agno-asserts-in-the-bmap-code.patch
+xfs-only-reclaim-unwritten-cow-extents-periodically.patch
+xfs-fix-and-streamline-error-handling-in-xfs_end_io.patch
+xfs-use-xfs_icluster_size_fsb-to-calculate-inode-alignment-mask.patch
+xfs-use-iomap-new-flag-for-newly-allocated-delalloc-blocks.patch
+xfs-try-any-ag-when-allocating-the-first-btree-block-when-reflinking.patch
diff --git a/queue-4.10/xfs-allow-unwritten-extents-in-the-cow-fork.patch b/queue-4.10/xfs-allow-unwritten-extents-in-the-cow-fork.patch
new file mode 100644 (file)
index 0000000..332ba66
--- /dev/null
@@ -0,0 +1,278 @@
+From 05a630d76bd3f39baf0eecfa305bed2820796dee Mon Sep 17 00:00:00 2001
+From: "Darrick J. Wong" <darrick.wong@oracle.com>
+Date: Thu, 2 Feb 2017 15:14:01 -0800
+Subject: xfs: allow unwritten extents in the CoW fork
+
+From: Darrick J. Wong <darrick.wong@oracle.com>
+
+commit 05a630d76bd3f39baf0eecfa305bed2820796dee upstream.
+
+In the data fork, we only allow extents to perform the following state
+transitions:
+
+delay -> real <-> unwritten
+
+There's no way to move directly from a delalloc reservation to an
+/unwritten/ allocated extent.  However, for the CoW fork we want to be
+able to do the following to each extent:
+
+delalloc -> unwritten -> written -> remapped to data fork
+
+This will help us to avoid a race in the speculative CoW preallocation
+code between a first thread that is allocating a CoW extent and a second
+thread that is remapping part of a file after a write.  In order to do
+this, however, we need two things: first, we have to be able to
+transition from da to unwritten, and second the function that converts
+between real and unwritten has to be made aware of the cow fork.  Do
+both of those things.
+
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/xfs/libxfs/xfs_bmap.c |   80 +++++++++++++++++++++++++++++------------------
+ 1 file changed, 50 insertions(+), 30 deletions(-)
+
+--- a/fs/xfs/libxfs/xfs_bmap.c
++++ b/fs/xfs/libxfs/xfs_bmap.c
+@@ -1861,6 +1861,7 @@ xfs_bmap_add_extent_delay_real(
+                */
+               trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_);
+               xfs_bmbt_set_startblock(ep, new->br_startblock);
++              xfs_bmbt_set_state(ep, new->br_state);
+               trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
+               (*nextents)++;
+@@ -2199,6 +2200,7 @@ STATIC int                               /* error */
+ xfs_bmap_add_extent_unwritten_real(
+       struct xfs_trans        *tp,
+       xfs_inode_t             *ip,    /* incore inode pointer */
++      int                     whichfork,
+       xfs_extnum_t            *idx,   /* extent number to update/insert */
+       xfs_btree_cur_t         **curp, /* if *curp is null, not a btree */
+       xfs_bmbt_irec_t         *new,   /* new data to add to file extents */
+@@ -2218,12 +2220,14 @@ xfs_bmap_add_extent_unwritten_real(
+                                       /* left is 0, right is 1, prev is 2 */
+       int                     rval=0; /* return value (logging flags) */
+       int                     state = 0;/* state bits, accessed thru macros */
+-      struct xfs_mount        *mp = tp->t_mountp;
++      struct xfs_mount        *mp = ip->i_mount;
+       *logflagsp = 0;
+       cur = *curp;
+-      ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
++      ifp = XFS_IFORK_PTR(ip, whichfork);
++      if (whichfork == XFS_COW_FORK)
++              state |= BMAP_COWFORK;
+       ASSERT(*idx >= 0);
+       ASSERT(*idx <= xfs_iext_count(ifp));
+@@ -2282,7 +2286,7 @@ xfs_bmap_add_extent_unwritten_real(
+        * Don't set contiguous if the combined extent would be too large.
+        * Also check for all-three-contiguous being too large.
+        */
+-      if (*idx < xfs_iext_count(&ip->i_df) - 1) {
++      if (*idx < xfs_iext_count(ifp) - 1) {
+               state |= BMAP_RIGHT_VALID;
+               xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx + 1), &RIGHT);
+               if (isnullstartblock(RIGHT.br_startblock))
+@@ -2322,7 +2326,8 @@ xfs_bmap_add_extent_unwritten_real(
+               trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+               xfs_iext_remove(ip, *idx + 1, 2, state);
+-              ip->i_d.di_nextents -= 2;
++              XFS_IFORK_NEXT_SET(ip, whichfork,
++                              XFS_IFORK_NEXTENTS(ip, whichfork) - 2);
+               if (cur == NULL)
+                       rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
+               else {
+@@ -2365,7 +2370,8 @@ xfs_bmap_add_extent_unwritten_real(
+               trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+               xfs_iext_remove(ip, *idx + 1, 1, state);
+-              ip->i_d.di_nextents--;
++              XFS_IFORK_NEXT_SET(ip, whichfork,
++                              XFS_IFORK_NEXTENTS(ip, whichfork) - 1);
+               if (cur == NULL)
+                       rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
+               else {
+@@ -2400,7 +2406,8 @@ xfs_bmap_add_extent_unwritten_real(
+               xfs_bmbt_set_state(ep, newext);
+               trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+               xfs_iext_remove(ip, *idx + 1, 1, state);
+-              ip->i_d.di_nextents--;
++              XFS_IFORK_NEXT_SET(ip, whichfork,
++                              XFS_IFORK_NEXTENTS(ip, whichfork) - 1);
+               if (cur == NULL)
+                       rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
+               else {
+@@ -2512,7 +2519,8 @@ xfs_bmap_add_extent_unwritten_real(
+               trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+               xfs_iext_insert(ip, *idx, 1, new, state);
+-              ip->i_d.di_nextents++;
++              XFS_IFORK_NEXT_SET(ip, whichfork,
++                              XFS_IFORK_NEXTENTS(ip, whichfork) + 1);
+               if (cur == NULL)
+                       rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
+               else {
+@@ -2590,7 +2598,8 @@ xfs_bmap_add_extent_unwritten_real(
+               ++*idx;
+               xfs_iext_insert(ip, *idx, 1, new, state);
+-              ip->i_d.di_nextents++;
++              XFS_IFORK_NEXT_SET(ip, whichfork,
++                              XFS_IFORK_NEXTENTS(ip, whichfork) + 1);
+               if (cur == NULL)
+                       rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
+               else {
+@@ -2638,7 +2647,8 @@ xfs_bmap_add_extent_unwritten_real(
+               ++*idx;
+               xfs_iext_insert(ip, *idx, 2, &r[0], state);
+-              ip->i_d.di_nextents += 2;
++              XFS_IFORK_NEXT_SET(ip, whichfork,
++                              XFS_IFORK_NEXTENTS(ip, whichfork) + 2);
+               if (cur == NULL)
+                       rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
+               else {
+@@ -2692,17 +2702,17 @@ xfs_bmap_add_extent_unwritten_real(
+       }
+       /* update reverse mappings */
+-      error = xfs_rmap_convert_extent(mp, dfops, ip, XFS_DATA_FORK, new);
++      error = xfs_rmap_convert_extent(mp, dfops, ip, whichfork, new);
+       if (error)
+               goto done;
+       /* convert to a btree if necessary */
+-      if (xfs_bmap_needs_btree(ip, XFS_DATA_FORK)) {
++      if (xfs_bmap_needs_btree(ip, whichfork)) {
+               int     tmp_logflags;   /* partial log flag return val */
+               ASSERT(cur == NULL);
+               error = xfs_bmap_extents_to_btree(tp, ip, first, dfops, &cur,
+-                              0, &tmp_logflags, XFS_DATA_FORK);
++                              0, &tmp_logflags, whichfork);
+               *logflagsp |= tmp_logflags;
+               if (error)
+                       goto done;
+@@ -2714,7 +2724,7 @@ xfs_bmap_add_extent_unwritten_real(
+               *curp = cur;
+       }
+-      xfs_bmap_check_leaf_extents(*curp, ip, XFS_DATA_FORK);
++      xfs_bmap_check_leaf_extents(*curp, ip, whichfork);
+ done:
+       *logflagsp |= rval;
+       return error;
+@@ -4365,10 +4375,16 @@ xfs_bmapi_allocate(
+       bma->got.br_state = XFS_EXT_NORM;
+       /*
+-       * A wasdelay extent has been initialized, so shouldn't be flagged
+-       * as unwritten.
++       * In the data fork, a wasdelay extent has been initialized, so
++       * shouldn't be flagged as unwritten.
++       *
++       * For the cow fork, however, we convert delalloc reservations
++       * (extents allocated for speculative preallocation) to
++       * allocated unwritten extents, and only convert the unwritten
++       * extents to real extents when we're about to write the data.
+        */
+-      if (!bma->wasdel && (bma->flags & XFS_BMAPI_PREALLOC) &&
++      if ((!bma->wasdel || (bma->flags & XFS_BMAPI_COWFORK)) &&
++          (bma->flags & XFS_BMAPI_PREALLOC) &&
+           xfs_sb_version_hasextflgbit(&mp->m_sb))
+               bma->got.br_state = XFS_EXT_UNWRITTEN;
+@@ -4419,8 +4435,6 @@ xfs_bmapi_convert_unwritten(
+                       (XFS_BMAPI_PREALLOC | XFS_BMAPI_CONVERT))
+               return 0;
+-      ASSERT(whichfork != XFS_COW_FORK);
+-
+       /*
+        * Modify (by adding) the state flag, if writing.
+        */
+@@ -4445,8 +4459,8 @@ xfs_bmapi_convert_unwritten(
+                       return error;
+       }
+-      error = xfs_bmap_add_extent_unwritten_real(bma->tp, bma->ip, &bma->idx,
+-                      &bma->cur, mval, bma->firstblock, bma->dfops,
++      error = xfs_bmap_add_extent_unwritten_real(bma->tp, bma->ip, whichfork,
++                      &bma->idx, &bma->cur, mval, bma->firstblock, bma->dfops,
+                       &tmp_logflags);
+       /*
+        * Log the inode core unconditionally in the unwritten extent conversion
+@@ -4455,8 +4469,12 @@ xfs_bmapi_convert_unwritten(
+        * in the transaction for the sake of fsync(), even if nothing has
+        * changed, because fsync() will not force the log for this transaction
+        * unless it sees the inode pinned.
++       *
++       * Note: If we're only converting cow fork extents, there aren't
++       * any on-disk updates to make, so we don't need to log anything.
+        */
+-      bma->logflags |= tmp_logflags | XFS_ILOG_CORE;
++      if (whichfork != XFS_COW_FORK)
++              bma->logflags |= tmp_logflags | XFS_ILOG_CORE;
+       if (error)
+               return error;
+@@ -4530,15 +4548,15 @@ xfs_bmapi_write(
+       ASSERT(*nmap >= 1);
+       ASSERT(*nmap <= XFS_BMAP_MAX_NMAP);
+       ASSERT(!(flags & XFS_BMAPI_IGSTATE));
+-      ASSERT(tp != NULL);
++      ASSERT(tp != NULL ||
++             (flags & (XFS_BMAPI_CONVERT | XFS_BMAPI_COWFORK)) ==
++                      (XFS_BMAPI_CONVERT | XFS_BMAPI_COWFORK));
+       ASSERT(len > 0);
+       ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL);
+       ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+       ASSERT(!(flags & XFS_BMAPI_REMAP) || whichfork == XFS_DATA_FORK);
+       ASSERT(!(flags & XFS_BMAPI_PREALLOC) || !(flags & XFS_BMAPI_REMAP));
+       ASSERT(!(flags & XFS_BMAPI_CONVERT) || !(flags & XFS_BMAPI_REMAP));
+-      ASSERT(!(flags & XFS_BMAPI_PREALLOC) || whichfork != XFS_COW_FORK);
+-      ASSERT(!(flags & XFS_BMAPI_CONVERT) || whichfork != XFS_COW_FORK);
+       /* zeroing is for currently only for data extents, not metadata */
+       ASSERT((flags & (XFS_BMAPI_METADATA | XFS_BMAPI_ZERO)) !=
+@@ -5553,8 +5571,8 @@ __xfs_bunmapi(
+                       }
+                       del.br_state = XFS_EXT_UNWRITTEN;
+                       error = xfs_bmap_add_extent_unwritten_real(tp, ip,
+-                                      &lastx, &cur, &del, firstblock, dfops,
+-                                      &logflags);
++                                      whichfork, &lastx, &cur, &del,
++                                      firstblock, dfops, &logflags);
+                       if (error)
+                               goto error0;
+                       goto nodelete;
+@@ -5607,8 +5625,9 @@ __xfs_bunmapi(
+                               prev.br_state = XFS_EXT_UNWRITTEN;
+                               lastx--;
+                               error = xfs_bmap_add_extent_unwritten_real(tp,
+-                                              ip, &lastx, &cur, &prev,
+-                                              firstblock, dfops, &logflags);
++                                              ip, whichfork, &lastx, &cur,
++                                              &prev, firstblock, dfops,
++                                              &logflags);
+                               if (error)
+                                       goto error0;
+                               goto nodelete;
+@@ -5616,8 +5635,9 @@ __xfs_bunmapi(
+                               ASSERT(del.br_state == XFS_EXT_NORM);
+                               del.br_state = XFS_EXT_UNWRITTEN;
+                               error = xfs_bmap_add_extent_unwritten_real(tp,
+-                                              ip, &lastx, &cur, &del,
+-                                              firstblock, dfops, &logflags);
++                                              ip, whichfork, &lastx, &cur,
++                                              &del, firstblock, dfops,
++                                              &logflags);
+                               if (error)
+                                       goto error0;
+                               goto nodelete;
diff --git a/queue-4.10/xfs-check-for-obviously-bad-level-values-in-the-bmbt-root.patch b/queue-4.10/xfs-check-for-obviously-bad-level-values-in-the-bmbt-root.patch
new file mode 100644 (file)
index 0000000..5d1c314
--- /dev/null
@@ -0,0 +1,55 @@
+From b3bf607d58520ea8c0666aeb4be60dbb724cd3a2 Mon Sep 17 00:00:00 2001
+From: "Darrick J. Wong" <darrick.wong@oracle.com>
+Date: Thu, 2 Feb 2017 15:13:59 -0800
+Subject: xfs: check for obviously bad level values in the bmbt root
+
+From: Darrick J. Wong <darrick.wong@oracle.com>
+
+commit b3bf607d58520ea8c0666aeb4be60dbb724cd3a2 upstream.
+
+We can't handle a bmbt that's taller than BTREE_MAXLEVELS, and there's
+no such thing as a zero-level bmbt (for that we have extents format),
+so if we see this, send back an error code.
+
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/xfs/libxfs/xfs_inode_fork.c |    6 +++++-
+ 1 file changed, 5 insertions(+), 1 deletion(-)
+
+--- a/fs/xfs/libxfs/xfs_inode_fork.c
++++ b/fs/xfs/libxfs/xfs_inode_fork.c
+@@ -26,6 +26,7 @@
+ #include "xfs_inode.h"
+ #include "xfs_trans.h"
+ #include "xfs_inode_item.h"
++#include "xfs_btree.h"
+ #include "xfs_bmap_btree.h"
+ #include "xfs_bmap.h"
+ #include "xfs_error.h"
+@@ -429,11 +430,13 @@ xfs_iformat_btree(
+       /* REFERENCED */
+       int                     nrecs;
+       int                     size;
++      int                     level;
+       ifp = XFS_IFORK_PTR(ip, whichfork);
+       dfp = (xfs_bmdr_block_t *)XFS_DFORK_PTR(dip, whichfork);
+       size = XFS_BMAP_BROOT_SPACE(mp, dfp);
+       nrecs = be16_to_cpu(dfp->bb_numrecs);
++      level = be16_to_cpu(dfp->bb_level);
+       /*
+        * blow out if -- fork has less extents than can fit in
+@@ -446,7 +449,8 @@ xfs_iformat_btree(
+                                       XFS_IFORK_MAXEXT(ip, whichfork) ||
+                    XFS_BMDR_SPACE_CALC(nrecs) >
+                                       XFS_DFORK_SIZE(dip, mp, whichfork) ||
+-                   XFS_IFORK_NEXTENTS(ip, whichfork) > ip->i_d.di_nblocks)) {
++                   XFS_IFORK_NEXTENTS(ip, whichfork) > ip->i_d.di_nblocks) ||
++                   level == 0 || level > XFS_BTREE_MAXLEVELS) {
+               xfs_warn(mp, "corrupt inode %Lu (btree).",
+                                       (unsigned long long) ip->i_ino);
+               XFS_CORRUPTION_ERROR("xfs_iformat_btree", XFS_ERRLEVEL_LOW,
diff --git a/queue-4.10/xfs-correct-null-checks-and-error-processing-in-xfs_initialize_perag.patch b/queue-4.10/xfs-correct-null-checks-and-error-processing-in-xfs_initialize_perag.patch
new file mode 100644 (file)
index 0000000..1f8ee5a
--- /dev/null
@@ -0,0 +1,109 @@
+From b20fe4730ea5c037c16631fb0df659c7b6d4b3b1 Mon Sep 17 00:00:00 2001
+From: Bill O'Donnell <billodo@redhat.com>
+Date: Tue, 7 Feb 2017 12:59:33 -0800
+Subject: xfs: correct null checks and error processing in xfs_initialize_perag
+
+From: Bill O'Donnell <billodo@redhat.com>
+
+commit b20fe4730ea5c037c16631fb0df659c7b6d4b3b1 upstream.
+
+If pag cannot be allocated, the current error exit path will trip
+a null pointer deference error when calling xfs_buf_hash_destroy
+with a null pag.  Fix this by adding a new error exit labels and
+jumping to those accordingly, avoiding the hash destroy and
+unnecessary kmem_free on pag.
+
+Up to three things need to be properly unwound:
+
+1) pag memory allocation
+2) xfs_buf_hash_init
+3) radix_tree_insert
+
+For any given iteration through the loop, any of the above which
+succeed must be unwound for /this/ pag, and then all prior
+initialized pags must be unwound.
+
+Addresses-Coverity-Id: 1397628 ("Dereference after null check")
+
+Reported-by: Colin Ian King <colin.king@canonical.com>
+Signed-off-by: Bill O'Donnell <billodo@redhat.com>
+Reviewed-by: Eric Sandeen <sandeen@redhat.com>
+Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/xfs/xfs_mount.c |   24 +++++++++++++++---------
+ 1 file changed, 15 insertions(+), 9 deletions(-)
+
+--- a/fs/xfs/xfs_mount.c
++++ b/fs/xfs/xfs_mount.c
+@@ -187,7 +187,7 @@ xfs_initialize_perag(
+       xfs_agnumber_t  *maxagi)
+ {
+       xfs_agnumber_t  index;
+-      xfs_agnumber_t  first_initialised = 0;
++      xfs_agnumber_t  first_initialised = NULLAGNUMBER;
+       xfs_perag_t     *pag;
+       int             error = -ENOMEM;
+@@ -202,22 +202,20 @@ xfs_initialize_perag(
+                       xfs_perag_put(pag);
+                       continue;
+               }
+-              if (!first_initialised)
+-                      first_initialised = index;
+               pag = kmem_zalloc(sizeof(*pag), KM_MAYFAIL);
+               if (!pag)
+-                      goto out_unwind;
++                      goto out_unwind_new_pags;
+               pag->pag_agno = index;
+               pag->pag_mount = mp;
+               spin_lock_init(&pag->pag_ici_lock);
+               mutex_init(&pag->pag_ici_reclaim_lock);
+               INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC);
+               if (xfs_buf_hash_init(pag))
+-                      goto out_unwind;
++                      goto out_free_pag;
+               if (radix_tree_preload(GFP_NOFS))
+-                      goto out_unwind;
++                      goto out_hash_destroy;
+               spin_lock(&mp->m_perag_lock);
+               if (radix_tree_insert(&mp->m_perag_tree, index, pag)) {
+@@ -225,10 +223,13 @@ xfs_initialize_perag(
+                       spin_unlock(&mp->m_perag_lock);
+                       radix_tree_preload_end();
+                       error = -EEXIST;
+-                      goto out_unwind;
++                      goto out_hash_destroy;
+               }
+               spin_unlock(&mp->m_perag_lock);
+               radix_tree_preload_end();
++              /* first new pag is fully initialized */
++              if (first_initialised == NULLAGNUMBER)
++                      first_initialised = index;
+       }
+       index = xfs_set_inode_alloc(mp, agcount);
+@@ -239,11 +240,16 @@ xfs_initialize_perag(
+       mp->m_ag_prealloc_blocks = xfs_prealloc_blocks(mp);
+       return 0;
+-out_unwind:
++out_hash_destroy:
+       xfs_buf_hash_destroy(pag);
++out_free_pag:
+       kmem_free(pag);
+-      for (; index > first_initialised; index--) {
++out_unwind_new_pags:
++      /* unwind any prior newly initialized pags */
++      for (index = first_initialised; index < agcount; index++) {
+               pag = radix_tree_delete(&mp->m_perag_tree, index);
++              if (!pag)
++                      break;
+               xfs_buf_hash_destroy(pag);
+               kmem_free(pag);
+       }
diff --git a/queue-4.10/xfs-don-t-fail-xfs_extent_busy-allocation.patch b/queue-4.10/xfs-don-t-fail-xfs_extent_busy-allocation.patch
new file mode 100644 (file)
index 0000000..4d007c3
--- /dev/null
@@ -0,0 +1,45 @@
+From 5e30c23d13919a718b22d4921dc5c0accc59da27 Mon Sep 17 00:00:00 2001
+From: Christoph Hellwig <hch@lst.de>
+Date: Tue, 7 Feb 2017 14:06:46 -0800
+Subject: xfs: don't fail xfs_extent_busy allocation
+
+From: Christoph Hellwig <hch@lst.de>
+
+commit 5e30c23d13919a718b22d4921dc5c0accc59da27 upstream.
+
+We don't just need the structure to track busy extents which can be
+avoided with a synchronous transaction, but also to keep track of
+pending discard.
+
+Signed-off-by: Christoph Hellwig <hch@lst.de>
+Reviewed-by: Brian Foster <bfoster@redhat.com>
+Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/xfs/xfs_extent_busy.c |   13 +------------
+ 1 file changed, 1 insertion(+), 12 deletions(-)
+
+--- a/fs/xfs/xfs_extent_busy.c
++++ b/fs/xfs/xfs_extent_busy.c
+@@ -45,18 +45,7 @@ xfs_extent_busy_insert(
+       struct rb_node          **rbp;
+       struct rb_node          *parent = NULL;
+-      new = kmem_zalloc(sizeof(struct xfs_extent_busy), KM_MAYFAIL);
+-      if (!new) {
+-              /*
+-               * No Memory!  Since it is now not possible to track the free
+-               * block, make this a synchronous transaction to insure that
+-               * the block is not reused before this transaction commits.
+-               */
+-              trace_xfs_extent_busy_enomem(tp->t_mountp, agno, bno, len);
+-              xfs_trans_set_sync(tp);
+-              return;
+-      }
+-
++      new = kmem_zalloc(sizeof(struct xfs_extent_busy), KM_SLEEP);
+       new->agno = agno;
+       new->bno = bno;
+       new->length = len;
diff --git a/queue-4.10/xfs-don-t-reserve-blocks-for-right-shift-transactions.patch b/queue-4.10/xfs-don-t-reserve-blocks-for-right-shift-transactions.patch
new file mode 100644 (file)
index 0000000..a7aa9fd
--- /dev/null
@@ -0,0 +1,94 @@
+From 48af96ab92bc68fb645068b978ce36df2379e076 Mon Sep 17 00:00:00 2001
+From: Brian Foster <bfoster@redhat.com>
+Date: Wed, 15 Feb 2017 10:18:10 -0800
+Subject: xfs: don't reserve blocks for right shift transactions
+
+From: Brian Foster <bfoster@redhat.com>
+
+commit 48af96ab92bc68fb645068b978ce36df2379e076 upstream.
+
+The block reservation for the transaction allocated in
+xfs_shift_file_space() is an artifact of the original collapse range
+support. It exists to handle the case where a collapse range occurs,
+the initial extent is left shifted into a location that forms a
+contiguous boundary with the previous extent and thus the extents
+are merged. This code was subsequently refactored and reused for
+insert range (right shift) support.
+
+If an insert range occurs under low free space conditions, the
+extent at the starting offset is split before the first shift
+transaction is allocated. If the block reservation fails, this
+leaves separate, but contiguous extents around in the inode. While
+not a fatal problem, this is unexpected and will flag a warning on
+subsequent insert range operations on the inode. This problem has
+been reproduce intermittently by generic/270 running against a
+ramdisk device.
+
+Since right shift does not create new extent boundaries in the
+inode, a block reservation for extent merge is unnecessary. Update
+xfs_shift_file_space() to conditionally reserve fs blocks for left
+shift transactions only. This avoids the warning reproduced by
+generic/270.
+
+Reported-by: Ross Zwisler <ross.zwisler@linux.intel.com>
+Signed-off-by: Brian Foster <bfoster@redhat.com>
+Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/xfs/xfs_bmap_util.c |   20 ++++++++++----------
+ 1 file changed, 10 insertions(+), 10 deletions(-)
+
+--- a/fs/xfs/xfs_bmap_util.c
++++ b/fs/xfs/xfs_bmap_util.c
+@@ -1387,10 +1387,16 @@ xfs_shift_file_space(
+       xfs_fileoff_t           stop_fsb;
+       xfs_fileoff_t           next_fsb;
+       xfs_fileoff_t           shift_fsb;
++      uint                    resblks;
+       ASSERT(direction == SHIFT_LEFT || direction == SHIFT_RIGHT);
+       if (direction == SHIFT_LEFT) {
++              /*
++               * Reserve blocks to cover potential extent merges after left
++               * shift operations.
++               */
++              resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
+               next_fsb = XFS_B_TO_FSB(mp, offset + len);
+               stop_fsb = XFS_B_TO_FSB(mp, VFS_I(ip)->i_size);
+       } else {
+@@ -1398,6 +1404,7 @@ xfs_shift_file_space(
+                * If right shift, delegate the work of initialization of
+                * next_fsb to xfs_bmap_shift_extent as it has ilock held.
+                */
++              resblks = 0;
+               next_fsb = NULLFSBLOCK;
+               stop_fsb = XFS_B_TO_FSB(mp, offset);
+       }
+@@ -1439,21 +1446,14 @@ xfs_shift_file_space(
+       }
+       while (!error && !done) {
+-              /*
+-               * We would need to reserve permanent block for transaction.
+-               * This will come into picture when after shifting extent into
+-               * hole we found that adjacent extents can be merged which
+-               * may lead to freeing of a block during record update.
+-               */
+-              error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write,
+-                              XFS_DIOSTRAT_SPACE_RES(mp, 0), 0, 0, &tp);
++              error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0,
++                                      &tp);
+               if (error)
+                       break;
+               xfs_ilock(ip, XFS_ILOCK_EXCL);
+               error = xfs_trans_reserve_quota(tp, mp, ip->i_udquot,
+-                              ip->i_gdquot, ip->i_pdquot,
+-                              XFS_DIOSTRAT_SPACE_RES(mp, 0), 0,
++                              ip->i_gdquot, ip->i_pdquot, resblks, 0,
+                               XFS_QMOPT_RES_REGBLKS);
+               if (error)
+                       goto out_trans_cancel;
diff --git a/queue-4.10/xfs-fail-_dir_open-when-readahead-fails.patch b/queue-4.10/xfs-fail-_dir_open-when-readahead-fails.patch
new file mode 100644 (file)
index 0000000..4d8d176
--- /dev/null
@@ -0,0 +1,75 @@
+From 7a652bbe366464267190c2792a32ce4fff5595ef Mon Sep 17 00:00:00 2001
+From: "Darrick J. Wong" <darrick.wong@oracle.com>
+Date: Thu, 2 Feb 2017 15:13:58 -0800
+Subject: xfs: fail _dir_open when readahead fails
+
+From: Darrick J. Wong <darrick.wong@oracle.com>
+
+commit 7a652bbe366464267190c2792a32ce4fff5595ef upstream.
+
+When we open a directory, we try to readahead block 0 of the directory
+on the assumption that we're going to need it soon.  If the bmbt is
+corrupt, the directory will never be usable and the readahead fails
+immediately, so we might as well prevent the directory from being opened
+at all.  This prevents a subsequent read or modify operation from
+hitting it and taking the fs offline.
+
+NOTE: We're only checking for early failures in the block mapping, not
+the readahead directory block itself.
+
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Reviewed-by: Eric Sandeen <sandeen@redhat.com>
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/xfs/libxfs/xfs_da_btree.c |    6 ++----
+ fs/xfs/libxfs/xfs_da_btree.h |    2 +-
+ fs/xfs/xfs_file.c            |    4 ++--
+ 3 files changed, 5 insertions(+), 7 deletions(-)
+
+--- a/fs/xfs/libxfs/xfs_da_btree.c
++++ b/fs/xfs/libxfs/xfs_da_btree.c
+@@ -2633,7 +2633,7 @@ out_free:
+ /*
+  * Readahead the dir/attr block.
+  */
+-xfs_daddr_t
++int
+ xfs_da_reada_buf(
+       struct xfs_inode        *dp,
+       xfs_dablk_t             bno,
+@@ -2664,7 +2664,5 @@ out_free:
+       if (mapp != &map)
+               kmem_free(mapp);
+-      if (error)
+-              return -1;
+-      return mappedbno;
++      return error;
+ }
+--- a/fs/xfs/libxfs/xfs_da_btree.h
++++ b/fs/xfs/libxfs/xfs_da_btree.h
+@@ -201,7 +201,7 @@ int        xfs_da_read_buf(struct xfs_trans *tr
+                              xfs_dablk_t bno, xfs_daddr_t mappedbno,
+                              struct xfs_buf **bpp, int whichfork,
+                              const struct xfs_buf_ops *ops);
+-xfs_daddr_t   xfs_da_reada_buf(struct xfs_inode *dp, xfs_dablk_t bno,
++int   xfs_da_reada_buf(struct xfs_inode *dp, xfs_dablk_t bno,
+                               xfs_daddr_t mapped_bno, int whichfork,
+                               const struct xfs_buf_ops *ops);
+ int   xfs_da_shrink_inode(xfs_da_args_t *args, xfs_dablk_t dead_blkno,
+--- a/fs/xfs/xfs_file.c
++++ b/fs/xfs/xfs_file.c
+@@ -913,9 +913,9 @@ xfs_dir_open(
+        */
+       mode = xfs_ilock_data_map_shared(ip);
+       if (ip->i_d.di_nextents > 0)
+-              xfs_dir3_data_readahead(ip, 0, -1);
++              error = xfs_dir3_data_readahead(ip, 0, -1);
+       xfs_iunlock(ip, mode);
+-      return 0;
++      return error;
+ }
+ STATIC int
diff --git a/queue-4.10/xfs-filter-out-obviously-bad-btree-pointers.patch b/queue-4.10/xfs-filter-out-obviously-bad-btree-pointers.patch
new file mode 100644 (file)
index 0000000..2192212
--- /dev/null
@@ -0,0 +1,66 @@
+From d5a91baeb6033c3392121e4d5c011cdc08dfa9f7 Mon Sep 17 00:00:00 2001
+From: "Darrick J. Wong" <darrick.wong@oracle.com>
+Date: Thu, 2 Feb 2017 15:13:58 -0800
+Subject: xfs: filter out obviously bad btree pointers
+
+From: Darrick J. Wong <darrick.wong@oracle.com>
+
+commit d5a91baeb6033c3392121e4d5c011cdc08dfa9f7 upstream.
+
+Don't let anybody load an obviously bad btree pointer.  Since the values
+come from disk, we must return an error, not just ASSERT.
+
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Reviewed-by: Eric Sandeen <sandeen@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/xfs/libxfs/xfs_bmap.c  |    5 +----
+ fs/xfs/libxfs/xfs_btree.c |    3 ++-
+ fs/xfs/libxfs/xfs_btree.h |    2 +-
+ 3 files changed, 4 insertions(+), 6 deletions(-)
+
+--- a/fs/xfs/libxfs/xfs_bmap.c
++++ b/fs/xfs/libxfs/xfs_bmap.c
+@@ -1278,7 +1278,6 @@ xfs_bmap_read_extents(
+       /* REFERENCED */
+       xfs_extnum_t            room;   /* number of entries there's room for */
+-      bno = NULLFSBLOCK;
+       mp = ip->i_mount;
+       ifp = XFS_IFORK_PTR(ip, whichfork);
+       exntf = (whichfork != XFS_DATA_FORK) ? XFS_EXTFMT_NOSTATE :
+@@ -1291,9 +1290,7 @@ xfs_bmap_read_extents(
+       ASSERT(level > 0);
+       pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes);
+       bno = be64_to_cpu(*pp);
+-      ASSERT(bno != NULLFSBLOCK);
+-      ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount);
+-      ASSERT(XFS_FSB_TO_AGBNO(mp, bno) < mp->m_sb.sb_agblocks);
++
+       /*
+        * Go down the tree until leaf level is reached, following the first
+        * pointer (leftmost) at each level.
+--- a/fs/xfs/libxfs/xfs_btree.c
++++ b/fs/xfs/libxfs/xfs_btree.c
+@@ -810,7 +810,8 @@ xfs_btree_read_bufl(
+       xfs_daddr_t             d;              /* real disk block address */
+       int                     error;
+-      ASSERT(fsbno != NULLFSBLOCK);
++      if (!XFS_FSB_SANITY_CHECK(mp, fsbno))
++              return -EFSCORRUPTED;
+       d = XFS_FSB_TO_DADDR(mp, fsbno);
+       error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, d,
+                                  mp->m_bsize, lock, &bp, ops);
+--- a/fs/xfs/libxfs/xfs_btree.h
++++ b/fs/xfs/libxfs/xfs_btree.h
+@@ -456,7 +456,7 @@ static inline int xfs_btree_get_level(st
+ #define       XFS_FILBLKS_MAX(a,b)    max_t(xfs_filblks_t, (a), (b))
+ #define       XFS_FSB_SANITY_CHECK(mp,fsb)    \
+-      (XFS_FSB_TO_AGNO(mp, fsb) < mp->m_sb.sb_agcount && \
++      (fsb && XFS_FSB_TO_AGNO(mp, fsb) < mp->m_sb.sb_agcount && \
+               XFS_FSB_TO_AGBNO(mp, fsb) < mp->m_sb.sb_agblocks)
+ /*
diff --git a/queue-4.10/xfs-fix-and-streamline-error-handling-in-xfs_end_io.patch b/queue-4.10/xfs-fix-and-streamline-error-handling-in-xfs_end_io.patch
new file mode 100644 (file)
index 0000000..6c22505
--- /dev/null
@@ -0,0 +1,111 @@
+From 787eb485509f9d58962bd8b4dbc6a5ac6e2034fe Mon Sep 17 00:00:00 2001
+From: Christoph Hellwig <hch@lst.de>
+Date: Thu, 2 Mar 2017 15:02:51 -0800
+Subject: xfs: fix and streamline error handling in xfs_end_io
+
+From: Christoph Hellwig <hch@lst.de>
+
+commit 787eb485509f9d58962bd8b4dbc6a5ac6e2034fe upstream.
+
+There are two different cases of buffered I/O errors:
+
+ - first we can have an already shutdown fs.  In that case we should skip
+   any on-disk operations and just clean up the append transaction if
+   present and destroy the ioend
+ - a real I/O error.  In that case we should cleanup any lingering COW
+   blocks.  This gets skipped in the current code and is fixed by this
+   patch.
+
+Signed-off-by: Christoph Hellwig <hch@lst.de>
+Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/xfs/xfs_aops.c |   59 ++++++++++++++++++++++++------------------------------
+ 1 file changed, 27 insertions(+), 32 deletions(-)
+
+--- a/fs/xfs/xfs_aops.c
++++ b/fs/xfs/xfs_aops.c
+@@ -274,54 +274,49 @@ xfs_end_io(
+       struct xfs_ioend        *ioend =
+               container_of(work, struct xfs_ioend, io_work);
+       struct xfs_inode        *ip = XFS_I(ioend->io_inode);
++      xfs_off_t               offset = ioend->io_offset;
++      size_t                  size = ioend->io_size;
+       int                     error = ioend->io_bio->bi_error;
+       /*
+-       * Set an error if the mount has shut down and proceed with end I/O
+-       * processing so it can perform whatever cleanups are necessary.
++       * Just clean up the in-memory strutures if the fs has been shut down.
+        */
+-      if (XFS_FORCED_SHUTDOWN(ip->i_mount))
++      if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
+               error = -EIO;
++              goto done;
++      }
+       /*
+-       * For a CoW extent, we need to move the mapping from the CoW fork
+-       * to the data fork.  If instead an error happened, just dump the
+-       * new blocks.
++       * Clean up any COW blocks on an I/O error.
+        */
+-      if (ioend->io_type == XFS_IO_COW) {
+-              if (error)
+-                      goto done;
+-              if (ioend->io_bio->bi_error) {
+-                      error = xfs_reflink_cancel_cow_range(ip,
+-                                      ioend->io_offset, ioend->io_size, true);
+-                      goto done;
++      if (unlikely(error)) {
++              switch (ioend->io_type) {
++              case XFS_IO_COW:
++                      xfs_reflink_cancel_cow_range(ip, offset, size, true);
++                      break;
+               }
+-              error = xfs_reflink_end_cow(ip, ioend->io_offset,
+-                              ioend->io_size);
+-              if (error)
+-                      goto done;
++
++              goto done;
+       }
+       /*
+-       * For unwritten extents we need to issue transactions to convert a
+-       * range to normal written extens after the data I/O has finished.
+-       * Detecting and handling completion IO errors is done individually
+-       * for each case as different cleanup operations need to be performed
+-       * on error.
++       * Success:  commit the COW or unwritten blocks if needed.
+        */
+-      if (ioend->io_type == XFS_IO_UNWRITTEN) {
+-              if (error)
+-                      goto done;
+-              error = xfs_iomap_write_unwritten(ip, ioend->io_offset,
+-                                                ioend->io_size);
+-      } else if (ioend->io_append_trans) {
+-              error = xfs_setfilesize_ioend(ioend, error);
+-      } else {
+-              ASSERT(!xfs_ioend_is_append(ioend) ||
+-                     ioend->io_type == XFS_IO_COW);
++      switch (ioend->io_type) {
++      case XFS_IO_COW:
++              error = xfs_reflink_end_cow(ip, offset, size);
++              break;
++      case XFS_IO_UNWRITTEN:
++              error = xfs_iomap_write_unwritten(ip, offset, size);
++              break;
++      default:
++              ASSERT(!xfs_ioend_is_append(ioend) || ioend->io_append_trans);
++              break;
+       }
+ done:
++      if (ioend->io_append_trans)
++              error = xfs_setfilesize_ioend(ioend, error);
+       xfs_destroy_ioend(ioend, error);
+ }
diff --git a/queue-4.10/xfs-fix-eofblocks-race-with-file-extending-async-dio-writes.patch b/queue-4.10/xfs-fix-eofblocks-race-with-file-extending-async-dio-writes.patch
new file mode 100644 (file)
index 0000000..b67003b
--- /dev/null
@@ -0,0 +1,61 @@
+From e4229d6b0bc9280f29624faf170cf76a9f1ca60e Mon Sep 17 00:00:00 2001
+From: Brian Foster <bfoster@redhat.com>
+Date: Fri, 27 Jan 2017 23:22:57 -0800
+Subject: xfs: fix eofblocks race with file extending async dio writes
+
+From: Brian Foster <bfoster@redhat.com>
+
+commit e4229d6b0bc9280f29624faf170cf76a9f1ca60e upstream.
+
+It's possible for post-eof blocks to end up being used for direct I/O
+writes. dio write performs an upfront unwritten extent allocation, sends
+the dio and then updates the inode size (if necessary) on write
+completion. If a file release occurs while a file extending dio write is
+in flight, it is possible to mistake the post-eof blocks for speculative
+preallocation and incorrectly truncate them from the inode. This means
+that the resulting dio write completion can discover a hole and allocate
+new blocks rather than perform unwritten extent conversion.
+
+This requires a strange mix of I/O and is thus not likely to reproduce
+in real world workloads. It is intermittently reproduced by generic/299.
+The error manifests as an assert failure due to transaction overrun
+because the aforementioned write completion transaction has only
+reserved enough blocks for btree operations:
+
+  XFS: Assertion failed: tp->t_blk_res_used <= tp->t_blk_res, \
+   file: fs/xfs//xfs_trans.c, line: 309
+
+The root cause is that xfs_free_eofblocks() uses i_size to truncate
+post-eof blocks from the inode, but async, file extending direct writes
+do not update i_size until write completion, long after inode locks are
+dropped. Therefore, xfs_free_eofblocks() effectively truncates the inode
+to the incorrect size.
+
+Update xfs_free_eofblocks() to serialize against dio similar to how
+extending writes are serialized against i_size updates before post-eof
+block zeroing. Specifically, wait on dio while under the iolock. This
+ensures that dio write completions have updated i_size before post-eof
+blocks are processed.
+
+Signed-off-by: Brian Foster <bfoster@redhat.com>
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/xfs/xfs_bmap_util.c |    3 +++
+ 1 file changed, 3 insertions(+)
+
+--- a/fs/xfs/xfs_bmap_util.c
++++ b/fs/xfs/xfs_bmap_util.c
+@@ -959,6 +959,9 @@ xfs_free_eofblocks(
+               if (error)
+                       return error;
++              /* wait on dio to ensure i_size has settled */
++              inode_dio_wait(VFS_I(ip));
++
+               error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0,
+                               &tp);
+               if (error) {
diff --git a/queue-4.10/xfs-fix-toctou-race-when-locking-an-inode-to-access-the-data-map.patch b/queue-4.10/xfs-fix-toctou-race-when-locking-an-inode-to-access-the-data-map.patch
new file mode 100644 (file)
index 0000000..a35bb54
--- /dev/null
@@ -0,0 +1,48 @@
+From 4b5bd5bf3fb182dc504b1b64e0331300f156e756 Mon Sep 17 00:00:00 2001
+From: "Darrick J. Wong" <darrick.wong@oracle.com>
+Date: Thu, 2 Feb 2017 15:13:57 -0800
+Subject: xfs: fix toctou race when locking an inode to access the data map
+
+From: Darrick J. Wong <darrick.wong@oracle.com>
+
+commit 4b5bd5bf3fb182dc504b1b64e0331300f156e756 upstream.
+
+We use di_format and if_flags to decide whether we're grabbing the ilock
+in btree mode (btree extents not loaded) or shared mode (anything else),
+but the state of those fields can be changed by other threads that are
+also trying to load the btree extents -- IFEXTENTS gets set before the
+_bmap_read_extents call and cleared if it fails.
+
+We don't actually need to have IFEXTENTS set until after the bmbt
+records are successfully loaded and validated, which will fix the race
+between multiple threads trying to read the same directory.  The next
+patch strengthens directory bmbt validation by refusing to open the
+directory if reading the bmbt to start directory readahead fails.
+
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/xfs/libxfs/xfs_inode_fork.c |    3 +--
+ 1 file changed, 1 insertion(+), 2 deletions(-)
+
+--- a/fs/xfs/libxfs/xfs_inode_fork.c
++++ b/fs/xfs/libxfs/xfs_inode_fork.c
+@@ -497,15 +497,14 @@ xfs_iread_extents(
+        * We know that the size is valid (it's checked in iformat_btree)
+        */
+       ifp->if_bytes = ifp->if_real_bytes = 0;
+-      ifp->if_flags |= XFS_IFEXTENTS;
+       xfs_iext_add(ifp, 0, nextents);
+       error = xfs_bmap_read_extents(tp, ip, whichfork);
+       if (error) {
+               xfs_iext_destroy(ifp);
+-              ifp->if_flags &= ~XFS_IFEXTENTS;
+               return error;
+       }
+       xfs_validate_extents(ifp, nextents, XFS_EXTFMT_INODE(ip));
++      ifp->if_flags |= XFS_IFEXTENTS;
+       return 0;
+ }
+ /*
diff --git a/queue-4.10/xfs-fix-uninitialized-variable-in-_reflink_convert_cow.patch b/queue-4.10/xfs-fix-uninitialized-variable-in-_reflink_convert_cow.patch
new file mode 100644 (file)
index 0000000..aab6041
--- /dev/null
@@ -0,0 +1,31 @@
+From 93aaead52a9eebdc20dc8fa673c350e592a06949 Mon Sep 17 00:00:00 2001
+From: "Darrick J. Wong" <darrick.wong@oracle.com>
+Date: Mon, 13 Feb 2017 22:52:27 -0800
+Subject: xfs: fix uninitialized variable in _reflink_convert_cow
+
+From: Darrick J. Wong <darrick.wong@oracle.com>
+
+commit 93aaead52a9eebdc20dc8fa673c350e592a06949 upstream.
+
+Fix an uninitialized variable.
+
+Reported-by: Dan Carpenter <dan.carpenter@oracle.com>
+Reviewed-by: Brian Foster <bfoster@redhat.com>
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/xfs/xfs_reflink.c |    2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/fs/xfs/xfs_reflink.c
++++ b/fs/xfs/xfs_reflink.c
+@@ -363,7 +363,7 @@ xfs_reflink_convert_cow(
+       xfs_fileoff_t           end_fsb = XFS_B_TO_FSB(mp, offset + count);
+       xfs_extnum_t            idx;
+       bool                    found;
+-      int                     error;
++      int                     error = 0;
+       xfs_ilock(ip, XFS_ILOCK_EXCL);
diff --git a/queue-4.10/xfs-handle-indlen-shortage-on-delalloc-extent-merge.patch b/queue-4.10/xfs-handle-indlen-shortage-on-delalloc-extent-merge.patch
new file mode 100644 (file)
index 0000000..a01bc1c
--- /dev/null
@@ -0,0 +1,76 @@
+From 0e339ef8556d9e567aa7925f8892c263d79430d9 Mon Sep 17 00:00:00 2001
+From: Brian Foster <bfoster@redhat.com>
+Date: Mon, 13 Feb 2017 22:48:18 -0800
+Subject: xfs: handle indlen shortage on delalloc extent merge
+
+From: Brian Foster <bfoster@redhat.com>
+
+commit 0e339ef8556d9e567aa7925f8892c263d79430d9 upstream.
+
+When a delalloc extent is created, it can be merged with pre-existing,
+contiguous, delalloc extents. When this occurs,
+xfs_bmap_add_extent_hole_delay() merges the extents along with the
+associated indirect block reservations. The expectation here is that the
+combined worst case indlen reservation is always less than or equal to
+the indlen reservation for the individual extents.
+
+This is not always the case, however, as existing extents can be less than
+the expected indlen reservation if the extent was previously split due
+to a hole punch. If a new extent merges with such an extent, the total
+indlen requirement may be larger than the sum of the indlen reservations
+held by both extents.
+
+xfs_bmap_add_extent_hole_delay() assumes that the worst case indlen
+reservation is always available and assigns it to the merged extent
+without consideration for the indlen held by the pre-existing extent. As
+a result, the subsequent xfs_mod_fdblocks() call can attempt an
+unintentional allocation rather than a free (indicated by an ASSERT()
+failure). Further, if the allocation happens to fail in this context,
+the failure goes unhandled and creates a filesystem wide block
+accounting inconsistency.
+
+Fix xfs_bmap_add_extent_hole_delay() to function as designed. Cap the
+indlen reservation assigned to the merged extent to the sum of the
+indlen reservations held by each of the individual extents.
+
+Signed-off-by: Brian Foster <bfoster@redhat.com>
+Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/xfs/libxfs/xfs_bmap.c |    9 ++++++---
+ 1 file changed, 6 insertions(+), 3 deletions(-)
+
+--- a/fs/xfs/libxfs/xfs_bmap.c
++++ b/fs/xfs/libxfs/xfs_bmap.c
+@@ -2816,7 +2816,8 @@ xfs_bmap_add_extent_hole_delay(
+               oldlen = startblockval(left.br_startblock) +
+                       startblockval(new->br_startblock) +
+                       startblockval(right.br_startblock);
+-              newlen = xfs_bmap_worst_indlen(ip, temp);
++              newlen = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
++                                       oldlen);
+               xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, *idx),
+                       nullstartblock((int)newlen));
+               trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+@@ -2837,7 +2838,8 @@ xfs_bmap_add_extent_hole_delay(
+               xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx), temp);
+               oldlen = startblockval(left.br_startblock) +
+                       startblockval(new->br_startblock);
+-              newlen = xfs_bmap_worst_indlen(ip, temp);
++              newlen = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
++                                       oldlen);
+               xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, *idx),
+                       nullstartblock((int)newlen));
+               trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+@@ -2853,7 +2855,8 @@ xfs_bmap_add_extent_hole_delay(
+               temp = new->br_blockcount + right.br_blockcount;
+               oldlen = startblockval(new->br_startblock) +
+                       startblockval(right.br_startblock);
+-              newlen = xfs_bmap_worst_indlen(ip, temp);
++              newlen = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
++                                       oldlen);
+               xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, *idx),
+                       new->br_startoff,
+                       nullstartblock((int)newlen), temp, right.br_state);
diff --git a/queue-4.10/xfs-mark-speculative-prealloc-cow-fork-extents-unwritten.patch b/queue-4.10/xfs-mark-speculative-prealloc-cow-fork-extents-unwritten.patch
new file mode 100644 (file)
index 0000000..6556b51
--- /dev/null
@@ -0,0 +1,339 @@
+From 5eda43000064a69a39fb7869cc63c9571535ad29 Mon Sep 17 00:00:00 2001
+From: "Darrick J. Wong" <darrick.wong@oracle.com>
+Date: Thu, 2 Feb 2017 15:14:02 -0800
+Subject: xfs: mark speculative prealloc CoW fork extents unwritten
+
+From: Darrick J. Wong <darrick.wong@oracle.com>
+
+commit 5eda43000064a69a39fb7869cc63c9571535ad29 upstream.
+
+Christoph Hellwig pointed out that there's a potentially nasty race when
+performing simultaneous nearby directio cow writes:
+
+"Thread 1 writes a range from B to c
+
+"                    B --------- C
+                           p
+
+"a little later thread 2 writes from A to B
+
+"        A --------- B
+               p
+
+[editor's note: the 'p' denote cowextsize boundaries, which I added to
+make this more clear]
+
+"but the code preallocates beyond B into the range where thread
+"1 has just written, but ->end_io hasn't been called yet.
+"But once ->end_io is called thread 2 has already allocated
+"up to the extent size hint into the write range of thread 1,
+"so the end_io handler will splice the unintialized blocks from
+"that preallocation back into the file right after B."
+
+We can avoid this race by ensuring that thread 1 cannot accidentally
+remap the blocks that thread 2 allocated (as part of speculative
+preallocation) as part of t2's write preparation in t1's end_io handler.
+The way we make this happen is by taking advantage of the unwritten
+extent flag as an intermediate step.
+
+Recall that when we begin the process of writing data to shared blocks,
+we create a delayed allocation extent in the CoW fork:
+
+D: --RRRRRRSSSRRRRRRRR---
+C: ------DDDDDDD---------
+
+When a thread prepares to CoW some dirty data out to disk, it will now
+convert the delalloc reservation into an /unwritten/ allocated extent in
+the cow fork.  The da conversion code tries to opportunistically
+allocate as much of a (speculatively prealloc'd) extent as possible, so
+we may end up allocating a larger extent than we're actually writing
+out:
+
+D: --RRRRRRSSSRRRRRRRR---
+U: ------UUUUUUU---------
+
+Next, we convert only the part of the extent that we're actively
+planning to write to normal (i.e. not unwritten) status:
+
+D: --RRRRRRSSSRRRRRRRR---
+U: ------UURRUUU---------
+
+If the write succeeds, the end_cow function will now scan the relevant
+range of the CoW fork for real extents and remap only the real extents
+into the data fork:
+
+D: --RRRRRRRRSRRRRRRRR---
+U: ------UU--UUU---------
+
+This ensures that we never obliterate valid data fork extents with
+unwritten blocks from the CoW fork.
+
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/xfs/xfs_aops.c    |    6 ++
+ fs/xfs/xfs_iomap.c   |    2 
+ fs/xfs/xfs_reflink.c |  116 +++++++++++++++++++++++++++++++++++++++++++++++----
+ fs/xfs/xfs_reflink.h |    2 
+ fs/xfs/xfs_trace.h   |    8 ++-
+ 5 files changed, 123 insertions(+), 11 deletions(-)
+
+--- a/fs/xfs/xfs_aops.c
++++ b/fs/xfs/xfs_aops.c
+@@ -481,6 +481,12 @@ xfs_submit_ioend(
+       struct xfs_ioend        *ioend,
+       int                     status)
+ {
++      /* Convert CoW extents to regular */
++      if (!status && ioend->io_type == XFS_IO_COW) {
++              status = xfs_reflink_convert_cow(XFS_I(ioend->io_inode),
++                              ioend->io_offset, ioend->io_size);
++      }
++
+       /* Reserve log space if we might write beyond the on-disk inode size. */
+       if (!status &&
+           ioend->io_type != XFS_IO_UNWRITTEN &&
+--- a/fs/xfs/xfs_iomap.c
++++ b/fs/xfs/xfs_iomap.c
+@@ -685,7 +685,7 @@ xfs_iomap_write_allocate(
+       int             nres;
+       if (whichfork == XFS_COW_FORK)
+-              flags |= XFS_BMAPI_COWFORK;
++              flags |= XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC;
+       /*
+        * Make sure that the dquots are there.
+--- a/fs/xfs/xfs_reflink.c
++++ b/fs/xfs/xfs_reflink.c
+@@ -82,11 +82,22 @@
+  * mappings are a reservation against the free space in the filesystem;
+  * adjacent mappings can also be combined into fewer larger mappings.
+  *
++ * As an optimization, the CoW extent size hint (cowextsz) creates
++ * outsized aligned delalloc reservations in the hope of landing out of
++ * order nearby CoW writes in a single extent on disk, thereby reducing
++ * fragmentation and improving future performance.
++ *
++ * D: --RRRRRRSSSRRRRRRRR--- (data fork)
++ * C: ------DDDDDDD--------- (CoW fork)
++ *
+  * When dirty pages are being written out (typically in writepage), the
+- * delalloc reservations are converted into real mappings by allocating
+- * blocks and replacing the delalloc mapping with real ones.  A delalloc
+- * mapping can be replaced by several real ones if the free space is
+- * fragmented.
++ * delalloc reservations are converted into unwritten mappings by
++ * allocating blocks and replacing the delalloc mapping with real ones.
++ * A delalloc mapping can be replaced by several unwritten ones if the
++ * free space is fragmented.
++ *
++ * D: --RRRRRRSSSRRRRRRRR---
++ * C: ------UUUUUUU---------
+  *
+  * We want to adapt the delalloc mechanism for copy-on-write, since the
+  * write paths are similar.  The first two steps (creating the reservation
+@@ -101,13 +112,29 @@
+  * Block-aligned directio writes will use the same mechanism as buffered
+  * writes.
+  *
++ * Just prior to submitting the actual disk write requests, we convert
++ * the extents representing the range of the file actually being written
++ * (as opposed to extra pieces created for the cowextsize hint) to real
++ * extents.  This will become important in the next step:
++ *
++ * D: --RRRRRRSSSRRRRRRRR---
++ * C: ------UUrrUUU---------
++ *
+  * CoW remapping must be done after the data block write completes,
+  * because we don't want to destroy the old data fork map until we're sure
+  * the new block has been written.  Since the new mappings are kept in a
+  * separate fork, we can simply iterate these mappings to find the ones
+  * that cover the file blocks that we just CoW'd.  For each extent, simply
+  * unmap the corresponding range in the data fork, map the new range into
+- * the data fork, and remove the extent from the CoW fork.
++ * the data fork, and remove the extent from the CoW fork.  Because of
++ * the presence of the cowextsize hint, however, we must be careful
++ * only to remap the blocks that we've actually written out --  we must
++ * never remap delalloc reservations nor CoW staging blocks that have
++ * yet to be written.  This corresponds exactly to the real extents in
++ * the CoW fork:
++ *
++ * D: --RRRRRRrrSRRRRRRRR---
++ * C: ------UU--UUU---------
+  *
+  * Since the remapping operation can be applied to an arbitrary file
+  * range, we record the need for the remap step as a flag in the ioend
+@@ -296,6 +323,65 @@ xfs_reflink_reserve_cow(
+       return 0;
+ }
++/* Convert part of an unwritten CoW extent to a real one. */
++STATIC int
++xfs_reflink_convert_cow_extent(
++      struct xfs_inode                *ip,
++      struct xfs_bmbt_irec            *imap,
++      xfs_fileoff_t                   offset_fsb,
++      xfs_filblks_t                   count_fsb,
++      struct xfs_defer_ops            *dfops)
++{
++      struct xfs_bmbt_irec            irec = *imap;
++      xfs_fsblock_t                   first_block;
++      int                             nimaps = 1;
++
++      if (imap->br_state == XFS_EXT_NORM)
++              return 0;
++
++      xfs_trim_extent(&irec, offset_fsb, count_fsb);
++      trace_xfs_reflink_convert_cow(ip, &irec);
++      if (irec.br_blockcount == 0)
++              return 0;
++      return xfs_bmapi_write(NULL, ip, irec.br_startoff, irec.br_blockcount,
++                      XFS_BMAPI_COWFORK | XFS_BMAPI_CONVERT, &first_block,
++                      0, &irec, &nimaps, dfops);
++}
++
++/* Convert all of the unwritten CoW extents in a file's range to real ones. */
++int
++xfs_reflink_convert_cow(
++      struct xfs_inode        *ip,
++      xfs_off_t               offset,
++      xfs_off_t               count)
++{
++      struct xfs_bmbt_irec    got;
++      struct xfs_defer_ops    dfops;
++      struct xfs_mount        *mp = ip->i_mount;
++      struct xfs_ifork        *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
++      xfs_fileoff_t           offset_fsb = XFS_B_TO_FSBT(mp, offset);
++      xfs_fileoff_t           end_fsb = XFS_B_TO_FSB(mp, offset + count);
++      xfs_extnum_t            idx;
++      bool                    found;
++      int                     error;
++
++      xfs_ilock(ip, XFS_ILOCK_EXCL);
++
++      /* Convert all the extents to real from unwritten. */
++      for (found = xfs_iext_lookup_extent(ip, ifp, offset_fsb, &idx, &got);
++           found && got.br_startoff < end_fsb;
++           found = xfs_iext_get_extent(ifp, ++idx, &got)) {
++              error = xfs_reflink_convert_cow_extent(ip, &got, offset_fsb,
++                              end_fsb - offset_fsb, &dfops);
++              if (error)
++                      break;
++      }
++
++      /* Finish up. */
++      xfs_iunlock(ip, XFS_ILOCK_EXCL);
++      return error;
++}
++
+ /* Allocate all CoW reservations covering a range of blocks in a file. */
+ static int
+ __xfs_reflink_allocate_cow(
+@@ -328,6 +414,7 @@ __xfs_reflink_allocate_cow(
+               goto out_unlock;
+       ASSERT(nimaps == 1);
++      /* Make sure there's a CoW reservation for it. */
+       error = xfs_reflink_reserve_cow(ip, &imap, &shared);
+       if (error)
+               goto out_trans_cancel;
+@@ -337,14 +424,16 @@ __xfs_reflink_allocate_cow(
+               goto out_trans_cancel;
+       }
++      /* Allocate the entire reservation as unwritten blocks. */
+       xfs_trans_ijoin(tp, ip, 0);
+       error = xfs_bmapi_write(tp, ip, imap.br_startoff, imap.br_blockcount,
+-                      XFS_BMAPI_COWFORK, &first_block,
++                      XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC, &first_block,
+                       XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK),
+                       &imap, &nimaps, &dfops);
+       if (error)
+               goto out_trans_cancel;
++      /* Finish up. */
+       error = xfs_defer_finish(&tp, &dfops, NULL);
+       if (error)
+               goto out_trans_cancel;
+@@ -389,11 +478,12 @@ xfs_reflink_allocate_cow_range(
+               if (error) {
+                       trace_xfs_reflink_allocate_cow_range_error(ip, error,
+                                       _RET_IP_);
+-                      break;
++                      return error;
+               }
+       }
+-      return error;
++      /* Convert the CoW extents to regular. */
++      return xfs_reflink_convert_cow(ip, offset, count);
+ }
+ /*
+@@ -641,6 +731,16 @@ xfs_reflink_end_cow(
+               ASSERT(!isnullstartblock(got.br_startblock));
++              /*
++               * Don't remap unwritten extents; these are
++               * speculatively preallocated CoW extents that have been
++               * allocated but have not yet been involved in a write.
++               */
++              if (got.br_state == XFS_EXT_UNWRITTEN) {
++                      idx--;
++                      goto next_extent;
++              }
++
+               /* Unmap the old blocks in the data fork. */
+               xfs_defer_init(&dfops, &firstfsb);
+               rlen = del.br_blockcount;
+--- a/fs/xfs/xfs_reflink.h
++++ b/fs/xfs/xfs_reflink.h
+@@ -30,6 +30,8 @@ extern int xfs_reflink_reserve_cow(struc
+               struct xfs_bmbt_irec *imap, bool *shared);
+ extern int xfs_reflink_allocate_cow_range(struct xfs_inode *ip,
+               xfs_off_t offset, xfs_off_t count);
++extern int xfs_reflink_convert_cow(struct xfs_inode *ip, xfs_off_t offset,
++              xfs_off_t count);
+ extern bool xfs_reflink_find_cow_mapping(struct xfs_inode *ip, xfs_off_t offset,
+               struct xfs_bmbt_irec *imap);
+ extern void xfs_reflink_trim_irec_to_next_cow(struct xfs_inode *ip,
+--- a/fs/xfs/xfs_trace.h
++++ b/fs/xfs/xfs_trace.h
+@@ -3089,6 +3089,7 @@ DECLARE_EVENT_CLASS(xfs_inode_irec_class
+               __field(xfs_fileoff_t, lblk)
+               __field(xfs_extlen_t, len)
+               __field(xfs_fsblock_t, pblk)
++              __field(int, state)
+       ),
+       TP_fast_assign(
+               __entry->dev = VFS_I(ip)->i_sb->s_dev;
+@@ -3096,13 +3097,15 @@ DECLARE_EVENT_CLASS(xfs_inode_irec_class
+               __entry->lblk = irec->br_startoff;
+               __entry->len = irec->br_blockcount;
+               __entry->pblk = irec->br_startblock;
++              __entry->state = irec->br_state;
+       ),
+-      TP_printk("dev %d:%d ino 0x%llx lblk 0x%llx len 0x%x pblk %llu",
++      TP_printk("dev %d:%d ino 0x%llx lblk 0x%llx len 0x%x pblk %llu st %d",
+                 MAJOR(__entry->dev), MINOR(__entry->dev),
+                 __entry->ino,
+                 __entry->lblk,
+                 __entry->len,
+-                __entry->pblk)
++                __entry->pblk,
++                __entry->state)
+ );
+ #define DEFINE_INODE_IREC_EVENT(name) \
+ DEFINE_EVENT(xfs_inode_irec_class, name, \
+@@ -3242,6 +3245,7 @@ DEFINE_INODE_IREC_EVENT(xfs_reflink_trim
+ DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_alloc);
+ DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_found);
+ DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_enospc);
++DEFINE_INODE_IREC_EVENT(xfs_reflink_convert_cow);
+ DEFINE_RW_EVENT(xfs_reflink_reserve_cow);
+ DEFINE_RW_EVENT(xfs_reflink_allocate_cow_range);
diff --git a/queue-4.10/xfs-only-reclaim-unwritten-cow-extents-periodically.patch b/queue-4.10/xfs-only-reclaim-unwritten-cow-extents-periodically.patch
new file mode 100644 (file)
index 0000000..2a4d24f
--- /dev/null
@@ -0,0 +1,159 @@
+From 3802a345321a08093ba2ddb1849e736f84e8d450 Mon Sep 17 00:00:00 2001
+From: Christoph Hellwig <hch@lst.de>
+Date: Tue, 7 Mar 2017 16:45:58 -0800
+Subject: xfs: only reclaim unwritten COW extents periodically
+
+From: Christoph Hellwig <hch@lst.de>
+
+commit 3802a345321a08093ba2ddb1849e736f84e8d450 upstream.
+
+We only want to reclaim preallocations from our periodic work item.
+Currently this is archived by looking for a dirty inode, but that check
+is rather fragile.  Instead add a flag to xfs_reflink_cancel_cow_* so
+that the caller can ask for just cancelling unwritten extents in the COW
+fork.
+
+Signed-off-by: Christoph Hellwig <hch@lst.de>
+Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
+[darrick: fix typos in commit message]
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/xfs/xfs_aops.c    |    2 +-
+ fs/xfs/xfs_icache.c  |    2 +-
+ fs/xfs/xfs_inode.c   |    2 +-
+ fs/xfs/xfs_reflink.c |   23 ++++++++++++++++-------
+ fs/xfs/xfs_reflink.h |    4 ++--
+ fs/xfs/xfs_super.c   |    2 +-
+ 6 files changed, 22 insertions(+), 13 deletions(-)
+
+--- a/fs/xfs/xfs_aops.c
++++ b/fs/xfs/xfs_aops.c
+@@ -293,7 +293,7 @@ xfs_end_io(
+                       goto done;
+               if (ioend->io_bio->bi_error) {
+                       error = xfs_reflink_cancel_cow_range(ip,
+-                                      ioend->io_offset, ioend->io_size);
++                                      ioend->io_offset, ioend->io_size, true);
+                       goto done;
+               }
+               error = xfs_reflink_end_cow(ip, ioend->io_offset,
+--- a/fs/xfs/xfs_icache.c
++++ b/fs/xfs/xfs_icache.c
+@@ -1608,7 +1608,7 @@ xfs_inode_free_cowblocks(
+       xfs_ilock(ip, XFS_IOLOCK_EXCL);
+       xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
+-      ret = xfs_reflink_cancel_cow_range(ip, 0, NULLFILEOFF);
++      ret = xfs_reflink_cancel_cow_range(ip, 0, NULLFILEOFF, false);
+       xfs_iunlock(ip, XFS_MMAPLOCK_EXCL);
+       xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+--- a/fs/xfs/xfs_inode.c
++++ b/fs/xfs/xfs_inode.c
+@@ -1615,7 +1615,7 @@ xfs_itruncate_extents(
+       /* Remove all pending CoW reservations. */
+       error = xfs_reflink_cancel_cow_blocks(ip, &tp, first_unmap_block,
+-                      last_block);
++                      last_block, true);
+       if (error)
+               goto out;
+--- a/fs/xfs/xfs_reflink.c
++++ b/fs/xfs/xfs_reflink.c
+@@ -549,14 +549,18 @@ xfs_reflink_trim_irec_to_next_cow(
+ }
+ /*
+- * Cancel all pending CoW reservations for some block range of an inode.
++ * Cancel CoW reservations for some block range of an inode.
++ *
++ * If cancel_real is true this function cancels all COW fork extents for the
++ * inode; if cancel_real is false, real extents are not cleared.
+  */
+ int
+ xfs_reflink_cancel_cow_blocks(
+       struct xfs_inode                *ip,
+       struct xfs_trans                **tpp,
+       xfs_fileoff_t                   offset_fsb,
+-      xfs_fileoff_t                   end_fsb)
++      xfs_fileoff_t                   end_fsb,
++      bool                            cancel_real)
+ {
+       struct xfs_ifork                *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
+       struct xfs_bmbt_irec            got, del;
+@@ -580,7 +584,7 @@ xfs_reflink_cancel_cow_blocks(
+                                       &idx, &got, &del);
+                       if (error)
+                               break;
+-              } else {
++              } else if (del.br_state == XFS_EXT_UNWRITTEN || cancel_real) {
+                       xfs_trans_ijoin(*tpp, ip, 0);
+                       xfs_defer_init(&dfops, &firstfsb);
+@@ -622,13 +626,17 @@ xfs_reflink_cancel_cow_blocks(
+ }
+ /*
+- * Cancel all pending CoW reservations for some byte range of an inode.
++ * Cancel CoW reservations for some byte range of an inode.
++ *
++ * If cancel_real is true this function cancels all COW fork extents for the
++ * inode; if cancel_real is false, real extents are not cleared.
+  */
+ int
+ xfs_reflink_cancel_cow_range(
+       struct xfs_inode        *ip,
+       xfs_off_t               offset,
+-      xfs_off_t               count)
++      xfs_off_t               count,
++      bool                    cancel_real)
+ {
+       struct xfs_trans        *tp;
+       xfs_fileoff_t           offset_fsb;
+@@ -654,7 +662,8 @@ xfs_reflink_cancel_cow_range(
+       xfs_trans_ijoin(tp, ip, 0);
+       /* Scrape out the old CoW reservations */
+-      error = xfs_reflink_cancel_cow_blocks(ip, &tp, offset_fsb, end_fsb);
++      error = xfs_reflink_cancel_cow_blocks(ip, &tp, offset_fsb, end_fsb,
++                      cancel_real);
+       if (error)
+               goto out_cancel;
+@@ -1451,7 +1460,7 @@ next:
+        * We didn't find any shared blocks so turn off the reflink flag.
+        * First, get rid of any leftover CoW mappings.
+        */
+-      error = xfs_reflink_cancel_cow_blocks(ip, tpp, 0, NULLFILEOFF);
++      error = xfs_reflink_cancel_cow_blocks(ip, tpp, 0, NULLFILEOFF, true);
+       if (error)
+               return error;
+--- a/fs/xfs/xfs_reflink.h
++++ b/fs/xfs/xfs_reflink.h
+@@ -39,9 +39,9 @@ extern void xfs_reflink_trim_irec_to_nex
+ extern int xfs_reflink_cancel_cow_blocks(struct xfs_inode *ip,
+               struct xfs_trans **tpp, xfs_fileoff_t offset_fsb,
+-              xfs_fileoff_t end_fsb);
++              xfs_fileoff_t end_fsb, bool cancel_real);
+ extern int xfs_reflink_cancel_cow_range(struct xfs_inode *ip, xfs_off_t offset,
+-              xfs_off_t count);
++              xfs_off_t count, bool cancel_real);
+ extern int xfs_reflink_end_cow(struct xfs_inode *ip, xfs_off_t offset,
+               xfs_off_t count);
+ extern int xfs_reflink_recover_cow(struct xfs_mount *mp);
+--- a/fs/xfs/xfs_super.c
++++ b/fs/xfs/xfs_super.c
+@@ -953,7 +953,7 @@ xfs_fs_destroy_inode(
+       XFS_STATS_INC(ip->i_mount, vn_remove);
+       if (xfs_is_reflink_inode(ip)) {
+-              error = xfs_reflink_cancel_cow_range(ip, 0, NULLFILEOFF);
++              error = xfs_reflink_cancel_cow_range(ip, 0, NULLFILEOFF, true);
+               if (error && !XFS_FORCED_SHUTDOWN(ip->i_mount))
+                       xfs_warn(ip->i_mount,
+ "Error %d while evicting CoW blocks for inode %llu.",
diff --git a/queue-4.10/xfs-reject-all-unaligned-direct-writes-to-reflinked-files.patch b/queue-4.10/xfs-reject-all-unaligned-direct-writes-to-reflinked-files.patch
new file mode 100644 (file)
index 0000000..596ac9d
--- /dev/null
@@ -0,0 +1,84 @@
+From 54a4ef8af4e0dc5c983d17fcb9cf5fd25666d94e Mon Sep 17 00:00:00 2001
+From: Christoph Hellwig <hch@lst.de>
+Date: Mon, 6 Feb 2017 13:00:54 -0800
+Subject: xfs: reject all unaligned direct writes to reflinked files
+
+From: Christoph Hellwig <hch@lst.de>
+
+commit 54a4ef8af4e0dc5c983d17fcb9cf5fd25666d94e upstream.
+
+We currently fall back from direct to buffered writes if we detect a
+remaining shared extent in the iomap_begin callback.  But by the time
+iomap_begin is called for the potentially unaligned end block we might
+have already written most of the data to disk, which we'd now write
+again using buffered I/O.  To avoid this reject all writes to reflinked
+files before starting I/O so that we are guaranteed to only write the
+data once.
+
+The alternative would be to unshare the unaligned start and/or end block
+before doing the I/O. I think that's doable, and will actually be
+required to support reflinks on DAX file system.  But it will take a
+little more time and I'd rather get rid of the double write ASAP.
+
+Signed-off-by: Christoph Hellwig <hch@lst.de>
+Reviewed-by: Brian Foster <bfoster@redhat.com>
+Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/xfs/xfs_file.c  |    9 +++++++++
+ fs/xfs/xfs_iomap.c |   12 +-----------
+ fs/xfs/xfs_trace.h |    2 +-
+ 3 files changed, 11 insertions(+), 12 deletions(-)
+
+--- a/fs/xfs/xfs_file.c
++++ b/fs/xfs/xfs_file.c
+@@ -527,6 +527,15 @@ xfs_file_dio_aio_write(
+       if ((iocb->ki_pos & mp->m_blockmask) ||
+           ((iocb->ki_pos + count) & mp->m_blockmask)) {
+               unaligned_io = 1;
++
++              /*
++               * We can't properly handle unaligned direct I/O to reflink
++               * files yet, as we can't unshare a partial block.
++               */
++              if (xfs_is_reflink_inode(ip)) {
++                      trace_xfs_reflink_bounce_dio_write(ip, iocb->ki_pos, count);
++                      return -EREMCHG;
++              }
+               iolock = XFS_IOLOCK_EXCL;
+       } else {
+               iolock = XFS_IOLOCK_SHARED;
+--- a/fs/xfs/xfs_iomap.c
++++ b/fs/xfs/xfs_iomap.c
+@@ -1026,17 +1026,7 @@ xfs_file_iomap_begin(
+               if (error)
+                       goto out_unlock;
+-              /*
+-               * We're here because we're trying to do a directio write to a
+-               * region that isn't aligned to a filesystem block.  If the
+-               * extent is shared, fall back to buffered mode to handle the
+-               * RMW.
+-               */
+-              if (!(flags & IOMAP_REPORT) && shared) {
+-                      trace_xfs_reflink_bounce_dio_write(ip, &imap);
+-                      error = -EREMCHG;
+-                      goto out_unlock;
+-              }
++              ASSERT((flags & IOMAP_REPORT) || !shared);
+       }
+       if ((flags & (IOMAP_WRITE | IOMAP_ZERO)) && xfs_is_reflink_inode(ip)) {
+--- a/fs/xfs/xfs_trace.h
++++ b/fs/xfs/xfs_trace.h
+@@ -3250,7 +3250,7 @@ DEFINE_INODE_IREC_EVENT(xfs_reflink_conv
+ DEFINE_RW_EVENT(xfs_reflink_reserve_cow);
+ DEFINE_RW_EVENT(xfs_reflink_allocate_cow_range);
+-DEFINE_INODE_IREC_EVENT(xfs_reflink_bounce_dio_write);
++DEFINE_SIMPLE_IO_EVENT(xfs_reflink_bounce_dio_write);
+ DEFINE_IOMAP_EVENT(xfs_reflink_find_cow_mapping);
+ DEFINE_INODE_IREC_EVENT(xfs_reflink_trim_irec);
diff --git a/queue-4.10/xfs-reset-b_first_retry_time-when-clear-the-retry-status-of-xfs_buf_t.patch b/queue-4.10/xfs-reset-b_first_retry_time-when-clear-the-retry-status-of-xfs_buf_t.patch
new file mode 100644 (file)
index 0000000..970cc93
--- /dev/null
@@ -0,0 +1,32 @@
+From 4dd2eb633598cb6a5a0be2fd9a2be0819f5eeb5f Mon Sep 17 00:00:00 2001
+From: Hou Tao <houtao1@huawei.com>
+Date: Fri, 3 Feb 2017 14:39:07 -0800
+Subject: xfs: reset b_first_retry_time when clear the retry status of xfs_buf_t
+
+From: Hou Tao <houtao1@huawei.com>
+
+commit 4dd2eb633598cb6a5a0be2fd9a2be0819f5eeb5f upstream.
+
+After successful IO or permanent error, b_first_retry_time also
+needs to be cleared, else the invalid first retry time will be
+used by the next retry check.
+
+Signed-off-by: Hou Tao <houtao1@huawei.com>
+Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/xfs/xfs_buf_item.c |    1 +
+ 1 file changed, 1 insertion(+)
+
+--- a/fs/xfs/xfs_buf_item.c
++++ b/fs/xfs/xfs_buf_item.c
+@@ -1162,6 +1162,7 @@ xfs_buf_iodone_callbacks(
+        */
+       bp->b_last_error = 0;
+       bp->b_retries = 0;
++      bp->b_first_retry_time = 0;
+       xfs_buf_do_callbacks(bp);
+       bp->b_fspriv = NULL;
diff --git a/queue-4.10/xfs-split-indlen-reservations-fairly-when-under-reserved.patch b/queue-4.10/xfs-split-indlen-reservations-fairly-when-under-reserved.patch
new file mode 100644 (file)
index 0000000..bfcf762
--- /dev/null
@@ -0,0 +1,118 @@
+From 75d65361cf3c0dae2af970c305e19c727b28a510 Mon Sep 17 00:00:00 2001
+From: Brian Foster <bfoster@redhat.com>
+Date: Mon, 13 Feb 2017 22:48:30 -0800
+Subject: xfs: split indlen reservations fairly when under reserved
+
+From: Brian Foster <bfoster@redhat.com>
+
+commit 75d65361cf3c0dae2af970c305e19c727b28a510 upstream.
+
+Certain workoads that punch holes into speculative preallocation can
+cause delalloc indirect reservation splits when the delalloc extent is
+split in two. If further splits occur, an already short-handed extent
+can be split into two in a manner that leaves zero indirect blocks for
+one of the two new extents. This occurs because the shortage is large
+enough that the xfs_bmap_split_indlen() algorithm completely drains the
+requested indlen of one of the extents before it honors the existing
+reservation.
+
+This ultimately results in a warning from xfs_bmap_del_extent(). This
+has been observed during file copies of large, sparse files using 'cp
+--sparse=always.'
+
+To avoid this problem, update xfs_bmap_split_indlen() to explicitly
+apply the reservation shortage fairly between both extents. This smooths
+out the overall indlen shortage and defers the situation where we end up
+with a delalloc extent with zero indlen reservation to extreme
+circumstances.
+
+Reported-by: Patrick Dung <mpatdung@gmail.com>
+Signed-off-by: Brian Foster <bfoster@redhat.com>
+Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/xfs/libxfs/xfs_bmap.c |   61 +++++++++++++++++++++++++++++++++--------------
+ 1 file changed, 43 insertions(+), 18 deletions(-)
+
+--- a/fs/xfs/libxfs/xfs_bmap.c
++++ b/fs/xfs/libxfs/xfs_bmap.c
+@@ -4805,34 +4805,59 @@ xfs_bmap_split_indlen(
+       xfs_filblks_t                   len2 = *indlen2;
+       xfs_filblks_t                   nres = len1 + len2; /* new total res. */
+       xfs_filblks_t                   stolen = 0;
++      xfs_filblks_t                   resfactor;
+       /*
+        * Steal as many blocks as we can to try and satisfy the worst case
+        * indlen for both new extents.
+        */
+-      while (nres > ores && avail) {
+-              nres--;
+-              avail--;
+-              stolen++;
+-      }
++      if (ores < nres && avail)
++              stolen = XFS_FILBLKS_MIN(nres - ores, avail);
++      ores += stolen;
++
++       /* nothing else to do if we've satisfied the new reservation */
++      if (ores >= nres)
++              return stolen;
++
++      /*
++       * We can't meet the total required reservation for the two extents.
++       * Calculate the percent of the overall shortage between both extents
++       * and apply this percentage to each of the requested indlen values.
++       * This distributes the shortage fairly and reduces the chances that one
++       * of the two extents is left with nothing when extents are repeatedly
++       * split.
++       */
++      resfactor = (ores * 100);
++      do_div(resfactor, nres);
++      len1 *= resfactor;
++      do_div(len1, 100);
++      len2 *= resfactor;
++      do_div(len2, 100);
++      ASSERT(len1 + len2 <= ores);
++      ASSERT(len1 < *indlen1 && len2 < *indlen2);
+       /*
+-       * The only blocks available are those reserved for the original
+-       * extent and what we can steal from the extent being removed.
+-       * If this still isn't enough to satisfy the combined
+-       * requirements for the two new extents, skim blocks off of each
+-       * of the new reservations until they match what is available.
++       * Hand out the remainder to each extent. If one of the two reservations
++       * is zero, we want to make sure that one gets a block first. The loop
++       * below starts with len1, so hand len2 a block right off the bat if it
++       * is zero.
+        */
+-      while (nres > ores) {
+-              if (len1) {
+-                      len1--;
+-                      nres--;
++      ores -= (len1 + len2);
++      ASSERT((*indlen1 - len1) + (*indlen2 - len2) >= ores);
++      if (ores && !len2 && *indlen2) {
++              len2++;
++              ores--;
++      }
++      while (ores) {
++              if (len1 < *indlen1) {
++                      len1++;
++                      ores--;
+               }
+-              if (nres == ores)
++              if (!ores)
+                       break;
+-              if (len2) {
+-                      len2--;
+-                      nres--;
++              if (len2 < *indlen2) {
++                      len2++;
++                      ores--;
+               }
+       }
diff --git a/queue-4.10/xfs-try-any-ag-when-allocating-the-first-btree-block-when-reflinking.patch b/queue-4.10/xfs-try-any-ag-when-allocating-the-first-btree-block-when-reflinking.patch
new file mode 100644 (file)
index 0000000..2cd8d04
--- /dev/null
@@ -0,0 +1,97 @@
+From 2fcc319d2467a5f5b78f35f79fd6e22741a31b1e Mon Sep 17 00:00:00 2001
+From: Christoph Hellwig <hch@lst.de>
+Date: Wed, 8 Mar 2017 10:38:53 -0800
+Subject: xfs: try any AG when allocating the first btree block when reflinking
+
+From: Christoph Hellwig <hch@lst.de>
+
+commit 2fcc319d2467a5f5b78f35f79fd6e22741a31b1e upstream.
+
+When a reflink operation causes the bmap code to allocate a btree block
+we're currently doing single-AG allocations due to having ->firstblock
+set and then try any higher AG due a little reflink quirk we've put in
+when adding the reflink code.  But given that we do not have a minleft
+reservation of any kind in this AG we can still not have any space in
+the same or higher AG even if the file system has enough free space.
+To fix this use a XFS_ALLOCTYPE_FIRST_AG allocation in this fall back
+path instead.
+
+[And yes, we need to redo this properly instead of piling hacks over
+ hacks.  I'm working on that, but it's not going to be a small series.
+ In the meantime this fixes the customer reported issue]
+
+Also add a warning for failing allocations to make it easier to debug.
+
+Signed-off-by: Christoph Hellwig <hch@lst.de>
+Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/xfs/libxfs/xfs_bmap.c       |   10 +++++++---
+ fs/xfs/libxfs/xfs_bmap_btree.c |    6 +++---
+ 2 files changed, 10 insertions(+), 6 deletions(-)
+
+--- a/fs/xfs/libxfs/xfs_bmap.c
++++ b/fs/xfs/libxfs/xfs_bmap.c
+@@ -769,8 +769,8 @@ xfs_bmap_extents_to_btree(
+               args.type = XFS_ALLOCTYPE_START_BNO;
+               args.fsbno = XFS_INO_TO_FSB(mp, ip->i_ino);
+       } else if (dfops->dop_low) {
+-try_another_ag:
+               args.type = XFS_ALLOCTYPE_START_BNO;
++try_another_ag:
+               args.fsbno = *firstblock;
+       } else {
+               args.type = XFS_ALLOCTYPE_NEAR_BNO;
+@@ -796,13 +796,17 @@ try_another_ag:
+       if (xfs_sb_version_hasreflink(&cur->bc_mp->m_sb) &&
+           args.fsbno == NULLFSBLOCK &&
+           args.type == XFS_ALLOCTYPE_NEAR_BNO) {
+-              dfops->dop_low = true;
++              args.type = XFS_ALLOCTYPE_FIRST_AG;
+               goto try_another_ag;
+       }
++      if (WARN_ON_ONCE(args.fsbno == NULLFSBLOCK)) {
++              xfs_iroot_realloc(ip, -1, whichfork);
++              xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
++              return -ENOSPC;
++      }
+       /*
+        * Allocation can't fail, the space was reserved.
+        */
+-      ASSERT(args.fsbno != NULLFSBLOCK);
+       ASSERT(*firstblock == NULLFSBLOCK ||
+              args.agno >= XFS_FSB_TO_AGNO(mp, *firstblock));
+       *firstblock = cur->bc_private.b.firstblock = args.fsbno;
+--- a/fs/xfs/libxfs/xfs_bmap_btree.c
++++ b/fs/xfs/libxfs/xfs_bmap_btree.c
+@@ -453,8 +453,8 @@ xfs_bmbt_alloc_block(
+       if (args.fsbno == NULLFSBLOCK) {
+               args.fsbno = be64_to_cpu(start->l);
+-try_another_ag:
+               args.type = XFS_ALLOCTYPE_START_BNO;
++try_another_ag:
+               /*
+                * Make sure there is sufficient room left in the AG to
+                * complete a full tree split for an extent insert.  If
+@@ -494,8 +494,8 @@ try_another_ag:
+       if (xfs_sb_version_hasreflink(&cur->bc_mp->m_sb) &&
+           args.fsbno == NULLFSBLOCK &&
+           args.type == XFS_ALLOCTYPE_NEAR_BNO) {
+-              cur->bc_private.b.dfops->dop_low = true;
+               args.fsbno = cur->bc_private.b.firstblock;
++              args.type = XFS_ALLOCTYPE_FIRST_AG;
+               goto try_another_ag;
+       }
+@@ -512,7 +512,7 @@ try_another_ag:
+                       goto error0;
+               cur->bc_private.b.dfops->dop_low = true;
+       }
+-      if (args.fsbno == NULLFSBLOCK) {
++      if (WARN_ON_ONCE(args.fsbno == NULLFSBLOCK)) {
+               XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+               *stat = 0;
+               return 0;
diff --git a/queue-4.10/xfs-tune-down-agno-asserts-in-the-bmap-code.patch b/queue-4.10/xfs-tune-down-agno-asserts-in-the-bmap-code.patch
new file mode 100644 (file)
index 0000000..12f6ed8
--- /dev/null
@@ -0,0 +1,83 @@
+From 410d17f67e583559be3a922f8b6cc336331893f3 Mon Sep 17 00:00:00 2001
+From: Christoph Hellwig <hch@lst.de>
+Date: Thu, 16 Feb 2017 17:12:51 -0800
+Subject: xfs: tune down agno asserts in the bmap code
+
+From: Christoph Hellwig <hch@lst.de>
+
+commit 410d17f67e583559be3a922f8b6cc336331893f3 upstream.
+
+In various places we currently assert that xfs_bmap_btalloc allocates
+from the same as the firstblock value passed in, unless it's either
+NULLAGNO or the dop_low flag is set.  But the reflink code does not
+fully follow this convention as it passes in firstblock purely as
+a hint for the allocator without actually having previous allocations
+in the transaction, and without having a minleft check on the current
+AG, leading to the assert firing on a very full and heavily used
+file system.  As even the reflink code only allocates from equal or
+higher AGs for now we can simply the check to always allow for equal
+or higher AGs.
+
+Note that we need to eventually split the two meanings of the firstblock
+value.  At that point we can also allow the reflink code to allocate
+from any AG instead of limiting it in any way.
+
+Signed-off-by: Christoph Hellwig <hch@lst.de>
+Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/xfs/libxfs/xfs_bmap.c |   22 ++++++----------------
+ 1 file changed, 6 insertions(+), 16 deletions(-)
+
+--- a/fs/xfs/libxfs/xfs_bmap.c
++++ b/fs/xfs/libxfs/xfs_bmap.c
+@@ -804,9 +804,7 @@ try_another_ag:
+        */
+       ASSERT(args.fsbno != NULLFSBLOCK);
+       ASSERT(*firstblock == NULLFSBLOCK ||
+-             args.agno == XFS_FSB_TO_AGNO(mp, *firstblock) ||
+-             (dfops->dop_low &&
+-              args.agno > XFS_FSB_TO_AGNO(mp, *firstblock)));
++             args.agno >= XFS_FSB_TO_AGNO(mp, *firstblock));
+       *firstblock = cur->bc_private.b.firstblock = args.fsbno;
+       cur->bc_private.b.allocated++;
+       ip->i_d.di_nblocks++;
+@@ -3832,17 +3830,13 @@ xfs_bmap_btalloc(
+                * the first block that was allocated.
+                */
+               ASSERT(*ap->firstblock == NULLFSBLOCK ||
+-                     XFS_FSB_TO_AGNO(mp, *ap->firstblock) ==
+-                     XFS_FSB_TO_AGNO(mp, args.fsbno) ||
+-                     (ap->dfops->dop_low &&
+-                      XFS_FSB_TO_AGNO(mp, *ap->firstblock) <
+-                      XFS_FSB_TO_AGNO(mp, args.fsbno)));
++                     XFS_FSB_TO_AGNO(mp, *ap->firstblock) <=
++                     XFS_FSB_TO_AGNO(mp, args.fsbno));
+               ap->blkno = args.fsbno;
+               if (*ap->firstblock == NULLFSBLOCK)
+                       *ap->firstblock = args.fsbno;
+-              ASSERT(nullfb || fb_agno == args.agno ||
+-                     (ap->dfops->dop_low && fb_agno < args.agno));
++              ASSERT(nullfb || fb_agno <= args.agno);
+               ap->length = args.len;
+               if (!(ap->flags & XFS_BMAPI_COWFORK))
+                       ap->ip->i_d.di_nblocks += args.len;
+@@ -4764,13 +4758,9 @@ error0:
+       if (bma.cur) {
+               if (!error) {
+                       ASSERT(*firstblock == NULLFSBLOCK ||
+-                             XFS_FSB_TO_AGNO(mp, *firstblock) ==
++                             XFS_FSB_TO_AGNO(mp, *firstblock) <=
+                              XFS_FSB_TO_AGNO(mp,
+-                                     bma.cur->bc_private.b.firstblock) ||
+-                             (dfops->dop_low &&
+-                              XFS_FSB_TO_AGNO(mp, *firstblock) <
+-                              XFS_FSB_TO_AGNO(mp,
+-                                      bma.cur->bc_private.b.firstblock)));
++                                     bma.cur->bc_private.b.firstblock));
+                       *firstblock = bma.cur->bc_private.b.firstblock;
+               }
+               xfs_btree_del_cursor(bma.cur,
diff --git a/queue-4.10/xfs-update-ctime-and-mtime-on-clone-destinatation-inodes.patch b/queue-4.10/xfs-update-ctime-and-mtime-on-clone-destinatation-inodes.patch
new file mode 100644 (file)
index 0000000..1bf295f
--- /dev/null
@@ -0,0 +1,66 @@
+From c5ecb42342852892f978572ddc6dca703460f25a Mon Sep 17 00:00:00 2001
+From: Christoph Hellwig <hch@lst.de>
+Date: Mon, 6 Feb 2017 17:45:51 -0800
+Subject: xfs: update ctime and mtime on clone destinatation inodes
+
+From: Christoph Hellwig <hch@lst.de>
+
+commit c5ecb42342852892f978572ddc6dca703460f25a upstream.
+
+We're changing both metadata and data, so we need to update the
+timestamps for clone operations.  Dedupe on the other hand does
+not change file data, and only changes invisible metadata so the
+timestamps should not be updated.
+
+This follows existing btrfs behavior.
+
+Signed-off-by: Christoph Hellwig <hch@lst.de>
+Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
+[darrick: remove redundant is_dedupe test]
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/xfs/xfs_reflink.c |   12 +++++++++---
+ 1 file changed, 9 insertions(+), 3 deletions(-)
+
+--- a/fs/xfs/xfs_reflink.c
++++ b/fs/xfs/xfs_reflink.c
+@@ -955,13 +955,14 @@ STATIC int
+ xfs_reflink_update_dest(
+       struct xfs_inode        *dest,
+       xfs_off_t               newlen,
+-      xfs_extlen_t            cowextsize)
++      xfs_extlen_t            cowextsize,
++      bool                    is_dedupe)
+ {
+       struct xfs_mount        *mp = dest->i_mount;
+       struct xfs_trans        *tp;
+       int                     error;
+-      if (newlen <= i_size_read(VFS_I(dest)) && cowextsize == 0)
++      if (is_dedupe && newlen <= i_size_read(VFS_I(dest)) && cowextsize == 0)
+               return 0;
+       error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp);
+@@ -982,6 +983,10 @@ xfs_reflink_update_dest(
+               dest->i_d.di_flags2 |= XFS_DIFLAG2_COWEXTSIZE;
+       }
++      if (!is_dedupe) {
++              xfs_trans_ichgtime(tp, dest,
++                                 XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
++      }
+       xfs_trans_log_inode(tp, dest, XFS_ILOG_CORE);
+       error = xfs_trans_commit(tp);
+@@ -1295,7 +1300,8 @@ xfs_reflink_remap_range(
+           !(dest->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE))
+               cowextsize = src->i_d.di_cowextsize;
+-      ret = xfs_reflink_update_dest(dest, pos_out + len, cowextsize);
++      ret = xfs_reflink_update_dest(dest, pos_out + len, cowextsize,
++                      is_dedupe);
+ out_unlock:
+       xfs_iunlock(src, XFS_MMAPLOCK_EXCL);
diff --git a/queue-4.10/xfs-use-iomap-new-flag-for-newly-allocated-delalloc-blocks.patch b/queue-4.10/xfs-use-iomap-new-flag-for-newly-allocated-delalloc-blocks.patch
new file mode 100644 (file)
index 0000000..0de9993
--- /dev/null
@@ -0,0 +1,150 @@
+From f65e6fad293b3a5793b7fa2044800506490e7a2e Mon Sep 17 00:00:00 2001
+From: Brian Foster <bfoster@redhat.com>
+Date: Wed, 8 Mar 2017 09:58:08 -0800
+Subject: xfs: use iomap new flag for newly allocated delalloc blocks
+
+From: Brian Foster <bfoster@redhat.com>
+
+commit f65e6fad293b3a5793b7fa2044800506490e7a2e upstream.
+
+Commit fa7f138 ("xfs: clear delalloc and cache on buffered write
+failure") fixed one regression in the iomap error handling code and
+exposed another. The fundamental problem is that if a buffered write
+is a rewrite of preexisting delalloc blocks and the write fails, the
+failure handling code can punch out preexisting blocks with valid
+file data.
+
+This was reproduced directly by sub-block writes in the LTP
+kernel/syscalls/write/write03 test. A first 100 byte write allocates
+a single block in a file. A subsequent 100 byte write fails and
+punches out the block, including the data successfully written by
+the previous write.
+
+To address this problem, update the ->iomap_begin() handler to
+distinguish newly allocated delalloc blocks from preexisting
+delalloc blocks via the IOMAP_F_NEW flag. Use this flag in the
+->iomap_end() handler to decide when a failed or short write should
+punch out delalloc blocks.
+
+This introduces the subtle requirement that ->iomap_begin() should
+never combine newly allocated delalloc blocks with existing blocks
+in the resulting iomap descriptor. This can occur when a new
+delalloc reservation merges with a neighboring extent that is part
+of the current write, for example. Therefore, drop the
+post-allocation extent lookup from xfs_bmapi_reserve_delalloc() and
+just return the record inserted into the fork. This ensures only new
+blocks are returned and thus that preexisting delalloc blocks are
+always handled as "found" blocks and not punched out on a failed
+rewrite.
+
+Reported-by: Xiong Zhou <xzhou@redhat.com>
+Signed-off-by: Brian Foster <bfoster@redhat.com>
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/xfs/libxfs/xfs_bmap.c |   24 ++++++++++++++----------
+ fs/xfs/xfs_iomap.c       |   16 +++++++++++-----
+ 2 files changed, 25 insertions(+), 15 deletions(-)
+
+--- a/fs/xfs/libxfs/xfs_bmap.c
++++ b/fs/xfs/libxfs/xfs_bmap.c
+@@ -4160,6 +4160,19 @@ xfs_bmapi_read(
+       return 0;
+ }
++/*
++ * Add a delayed allocation extent to an inode. Blocks are reserved from the
++ * global pool and the extent inserted into the inode in-core extent tree.
++ *
++ * On entry, got refers to the first extent beyond the offset of the extent to
++ * allocate or eof is specified if no such extent exists. On return, got refers
++ * to the extent record that was inserted to the inode fork.
++ *
++ * Note that the allocated extent may have been merged with contiguous extents
++ * during insertion into the inode fork. Thus, got does not reflect the current
++ * state of the inode fork on return. If necessary, the caller can use lastx to
++ * look up the updated record in the inode fork.
++ */
+ int
+ xfs_bmapi_reserve_delalloc(
+       struct xfs_inode        *ip,
+@@ -4246,13 +4259,8 @@ xfs_bmapi_reserve_delalloc(
+       got->br_startblock = nullstartblock(indlen);
+       got->br_blockcount = alen;
+       got->br_state = XFS_EXT_NORM;
+-      xfs_bmap_add_extent_hole_delay(ip, whichfork, lastx, got);
+-      /*
+-       * Update our extent pointer, given that xfs_bmap_add_extent_hole_delay
+-       * might have merged it into one of the neighbouring ones.
+-       */
+-      xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *lastx), got);
++      xfs_bmap_add_extent_hole_delay(ip, whichfork, lastx, got);
+       /*
+        * Tag the inode if blocks were preallocated. Note that COW fork
+@@ -4264,10 +4272,6 @@ xfs_bmapi_reserve_delalloc(
+       if (whichfork == XFS_COW_FORK && (prealloc || aoff < off || alen > len))
+               xfs_inode_set_cowblocks_tag(ip);
+-      ASSERT(got->br_startoff <= aoff);
+-      ASSERT(got->br_startoff + got->br_blockcount >= aoff + alen);
+-      ASSERT(isnullstartblock(got->br_startblock));
+-      ASSERT(got->br_state == XFS_EXT_NORM);
+       return 0;
+ out_unreserve_blocks:
+--- a/fs/xfs/xfs_iomap.c
++++ b/fs/xfs/xfs_iomap.c
+@@ -637,6 +637,11 @@ retry:
+               goto out_unlock;
+       }
++      /*
++       * Flag newly allocated delalloc blocks with IOMAP_F_NEW so we punch
++       * them out if the write happens to fail.
++       */
++      iomap->flags = IOMAP_F_NEW;
+       trace_xfs_iomap_alloc(ip, offset, count, 0, &got);
+ done:
+       if (isnullstartblock(got.br_startblock))
+@@ -1085,7 +1090,8 @@ xfs_file_iomap_end_delalloc(
+       struct xfs_inode        *ip,
+       loff_t                  offset,
+       loff_t                  length,
+-      ssize_t                 written)
++      ssize_t                 written,
++      struct iomap            *iomap)
+ {
+       struct xfs_mount        *mp = ip->i_mount;
+       xfs_fileoff_t           start_fsb;
+@@ -1104,14 +1110,14 @@ xfs_file_iomap_end_delalloc(
+       end_fsb = XFS_B_TO_FSB(mp, offset + length);
+       /*
+-       * Trim back delalloc blocks if we didn't manage to write the whole
+-       * range reserved.
++       * Trim delalloc blocks if they were allocated by this write and we
++       * didn't manage to write the whole range.
+        *
+        * We don't need to care about racing delalloc as we hold i_mutex
+        * across the reserve/allocate/unreserve calls. If there are delalloc
+        * blocks in the range, they are ours.
+        */
+-      if (start_fsb < end_fsb) {
++      if ((iomap->flags & IOMAP_F_NEW) && start_fsb < end_fsb) {
+               truncate_pagecache_range(VFS_I(ip), XFS_FSB_TO_B(mp, start_fsb),
+                                        XFS_FSB_TO_B(mp, end_fsb) - 1);
+@@ -1141,7 +1147,7 @@ xfs_file_iomap_end(
+ {
+       if ((flags & IOMAP_WRITE) && iomap->type == IOMAP_DELALLOC)
+               return xfs_file_iomap_end_delalloc(XFS_I(inode), offset,
+-                              length, written);
++                              length, written, iomap);
+       return 0;
+ }
diff --git a/queue-4.10/xfs-use-xfs_icluster_size_fsb-to-calculate-inode-alignment-mask.patch b/queue-4.10/xfs-use-xfs_icluster_size_fsb-to-calculate-inode-alignment-mask.patch
new file mode 100644 (file)
index 0000000..a73615e
--- /dev/null
@@ -0,0 +1,44 @@
+From d5825712ee98d68a2c17bc89dad2c30276894cba Mon Sep 17 00:00:00 2001
+From: Chandan Rajendra <chandan@linux.vnet.ibm.com>
+Date: Thu, 2 Mar 2017 15:06:33 -0800
+Subject: xfs: Use xfs_icluster_size_fsb() to calculate inode alignment mask
+
+From: Chandan Rajendra <chandan@linux.vnet.ibm.com>
+
+commit d5825712ee98d68a2c17bc89dad2c30276894cba upstream.
+
+When block size is larger than inode cluster size, the call to
+XFS_B_TO_FSBT(mp, mp->m_inode_cluster_size) returns 0. Also, mkfs.xfs
+would have set xfs_sb->sb_inoalignmt to 0. Hence in
+xfs_set_inoalignment(), xfs_mount->m_inoalign_mask gets initialized to
+-1 instead of 0. However, xfs_mount->m_sinoalign would get correctly
+intialized to 0 because for every positive value of xfs_mount->m_dalign,
+the condition "!(mp->m_dalign & mp->m_inoalign_mask)" would evaluate to
+false.
+
+Also, xfs_imap() worked fine even with xfs_mount->m_inoalign_mask having
+-1 as the value because blks_per_cluster variable would have the value 1
+and hence we would never have a need to use xfs_mount->m_inoalign_mask
+to compute the inode chunk's agbno and offset within the chunk.
+
+Signed-off-by: Chandan Rajendra <chandan@linux.vnet.ibm.com>
+Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/xfs/xfs_mount.c |    3 +--
+ 1 file changed, 1 insertion(+), 2 deletions(-)
+
+--- a/fs/xfs/xfs_mount.c
++++ b/fs/xfs/xfs_mount.c
+@@ -511,8 +511,7 @@ STATIC void
+ xfs_set_inoalignment(xfs_mount_t *mp)
+ {
+       if (xfs_sb_version_hasalign(&mp->m_sb) &&
+-          mp->m_sb.sb_inoalignmt >=
+-          XFS_B_TO_FSBT(mp, mp->m_inode_cluster_size))
++              mp->m_sb.sb_inoalignmt >= xfs_icluster_size_fsb(mp))
+               mp->m_inoalign_mask = mp->m_sb.sb_inoalignmt - 1;
+       else
+               mp->m_inoalign_mask = 0;
diff --git a/queue-4.10/xfs-use-xfs_icluster_size_fsb-to-calculate-inode-chunk-alignment.patch b/queue-4.10/xfs-use-xfs_icluster_size_fsb-to-calculate-inode-chunk-alignment.patch
new file mode 100644 (file)
index 0000000..6bde120
--- /dev/null
@@ -0,0 +1,91 @@
+From 8ee9fdbebc84b39f1d1c201c5e32277c61d034aa Mon Sep 17 00:00:00 2001
+From: Chandan Rajendra <chandan@linux.vnet.ibm.com>
+Date: Thu, 16 Feb 2017 17:12:16 -0800
+Subject: xfs: Use xfs_icluster_size_fsb() to calculate inode chunk alignment
+
+From: Chandan Rajendra <chandan@linux.vnet.ibm.com>
+
+commit 8ee9fdbebc84b39f1d1c201c5e32277c61d034aa upstream.
+
+On a ppc64 system, executing generic/256 test with 32k block size gives the following call trace,
+
+XFS: Assertion failed: args->maxlen > 0, file: /root/repos/linux/fs/xfs/libxfs/xfs_alloc.c, line: 2026
+
+kernel BUG at /root/repos/linux/fs/xfs/xfs_message.c:113!
+Oops: Exception in kernel mode, sig: 5 [#1]
+SMP NR_CPUS=2048
+DEBUG_PAGEALLOC
+NUMA
+pSeries
+Modules linked in:
+CPU: 2 PID: 19361 Comm: mkdir Not tainted 4.10.0-rc5 #58
+task: c000000102606d80 task.stack: c0000001026b8000
+NIP: c0000000004ef798 LR: c0000000004ef798 CTR: c00000000082b290
+REGS: c0000001026bb090 TRAP: 0700   Not tainted  (4.10.0-rc5)
+MSR: 8000000000029032 <SF,EE,ME,IR,DR,RI>
+CR: 28004428  XER: 00000000
+CFAR: c0000000004ef180 SOFTE: 1
+GPR00: c0000000004ef798 c0000001026bb310 c000000001157300 ffffffffffffffea
+GPR04: 000000000000000a c0000001026bb130 0000000000000000 ffffffffffffffc0
+GPR08: 00000000000000d1 0000000000000021 00000000ffffffd1 c000000000dd4990
+GPR12: 0000000022004444 c00000000fe00800 0000000020000000 0000000000000000
+GPR16: 0000000000000000 0000000043a606fc 0000000043a76c08 0000000043a1b3d0
+GPR20: 000001002a35cd60 c0000001026bbb80 0000000000000000 0000000000000001
+GPR24: 0000000000000240 0000000000000004 c00000062dc55000 0000000000000000
+GPR28: 0000000000000004 c00000062ecd9200 0000000000000000 c0000001026bb6c0
+NIP [c0000000004ef798] .assfail+0x28/0x30
+LR [c0000000004ef798] .assfail+0x28/0x30
+Call Trace:
+[c0000001026bb310] [c0000000004ef798] .assfail+0x28/0x30 (unreliable)
+[c0000001026bb380] [c000000000455d74] .xfs_alloc_space_available+0x194/0x1b0
+[c0000001026bb410] [c00000000045b914] .xfs_alloc_fix_freelist+0x144/0x480
+[c0000001026bb580] [c00000000045c368] .xfs_alloc_vextent+0x698/0xa90
+[c0000001026bb650] [c0000000004a6200] .xfs_ialloc_ag_alloc+0x170/0x820
+[c0000001026bb7c0] [c0000000004a9098] .xfs_dialloc+0x158/0x320
+[c0000001026bb8a0] [c0000000004e628c] .xfs_ialloc+0x7c/0x610
+[c0000001026bb990] [c0000000004e8138] .xfs_dir_ialloc+0xa8/0x2f0
+[c0000001026bbaa0] [c0000000004e8814] .xfs_create+0x494/0x790
+[c0000001026bbbf0] [c0000000004e5ebc] .xfs_generic_create+0x2bc/0x410
+[c0000001026bbce0] [c0000000002b4a34] .vfs_mkdir+0x154/0x230
+[c0000001026bbd70] [c0000000002bc444] .SyS_mkdirat+0x94/0x120
+[c0000001026bbe30] [c00000000000b760] system_call+0x38/0xfc
+Instruction dump:
+4e800020 60000000 7c0802a6 7c862378 3c82ffca 7ca72b78 38841c18 7c651b78
+38600000 f8010010 f821ff91 4bfff94d <0fe00000> 60000000 7c0802a6 7c892378
+
+When block size is larger than inode cluster size, the call to
+XFS_B_TO_FSBT(mp, mp->m_inode_cluster_size) returns 0. Also, mkfs.xfs
+would have set xfs_sb->sb_inoalignmt to 0. This causes
+xfs_ialloc_cluster_alignment() to return 0.  Due to this
+args.minalignslop (in xfs_ialloc_ag_alloc()) gets the unsigned
+equivalent of -1 assigned to it. This later causes alloc_len in
+xfs_alloc_space_available() to have a value of 0. In such a scenario
+when args.total is also 0, the assert statement "ASSERT(args->maxlen >
+0);" fails.
+
+This commit fixes the bug by replacing the call to XFS_B_TO_FSBT() in
+xfs_ialloc_cluster_alignment() with a call to xfs_icluster_size_fsb().
+
+Suggested-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Chandan Rajendra <chandan@linux.vnet.ibm.com>
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/xfs/libxfs/xfs_ialloc.c |    3 +--
+ 1 file changed, 1 insertion(+), 2 deletions(-)
+
+--- a/fs/xfs/libxfs/xfs_ialloc.c
++++ b/fs/xfs/libxfs/xfs_ialloc.c
+@@ -51,8 +51,7 @@ xfs_ialloc_cluster_alignment(
+       struct xfs_mount        *mp)
+ {
+       if (xfs_sb_version_hasalign(&mp->m_sb) &&
+-          mp->m_sb.sb_inoalignmt >=
+-                      XFS_B_TO_FSBT(mp, mp->m_inode_cluster_size))
++          mp->m_sb.sb_inoalignmt >= xfs_icluster_size_fsb(mp))
+               return mp->m_sb.sb_inoalignmt;
+       return 1;
+ }
diff --git a/queue-4.10/xfs-verify-free-block-header-fields.patch b/queue-4.10/xfs-verify-free-block-header-fields.patch
new file mode 100644 (file)
index 0000000..7e99527
--- /dev/null
@@ -0,0 +1,93 @@
+From de14c5f541e78c59006bee56f6c5c2ef1ca07272 Mon Sep 17 00:00:00 2001
+From: "Darrick J. Wong" <darrick.wong@oracle.com>
+Date: Thu, 2 Feb 2017 15:14:00 -0800
+Subject: xfs: verify free block header fields
+
+From: Darrick J. Wong <darrick.wong@oracle.com>
+
+commit de14c5f541e78c59006bee56f6c5c2ef1ca07272 upstream.
+
+Perform basic sanity checking of the directory free block header
+fields so that we avoid hanging the system on invalid data.
+
+(Granted that just means that now we shutdown on directory write,
+but that seems better than hanging...)
+
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/xfs/libxfs/xfs_dir2_node.c |   51 ++++++++++++++++++++++++++++++++++++++++--
+ 1 file changed, 49 insertions(+), 2 deletions(-)
+
+--- a/fs/xfs/libxfs/xfs_dir2_node.c
++++ b/fs/xfs/libxfs/xfs_dir2_node.c
+@@ -155,6 +155,42 @@ const struct xfs_buf_ops xfs_dir3_free_b
+       .verify_write = xfs_dir3_free_write_verify,
+ };
++/* Everything ok in the free block header? */
++static bool
++xfs_dir3_free_header_check(
++      struct xfs_inode        *dp,
++      xfs_dablk_t             fbno,
++      struct xfs_buf          *bp)
++{
++      struct xfs_mount        *mp = dp->i_mount;
++      unsigned int            firstdb;
++      int                     maxbests;
++
++      maxbests = dp->d_ops->free_max_bests(mp->m_dir_geo);
++      firstdb = (xfs_dir2_da_to_db(mp->m_dir_geo, fbno) -
++                 xfs_dir2_byte_to_db(mp->m_dir_geo, XFS_DIR2_FREE_OFFSET)) *
++                      maxbests;
++      if (xfs_sb_version_hascrc(&mp->m_sb)) {
++              struct xfs_dir3_free_hdr *hdr3 = bp->b_addr;
++
++              if (be32_to_cpu(hdr3->firstdb) != firstdb)
++                      return false;
++              if (be32_to_cpu(hdr3->nvalid) > maxbests)
++                      return false;
++              if (be32_to_cpu(hdr3->nvalid) < be32_to_cpu(hdr3->nused))
++                      return false;
++      } else {
++              struct xfs_dir2_free_hdr *hdr = bp->b_addr;
++
++              if (be32_to_cpu(hdr->firstdb) != firstdb)
++                      return false;
++              if (be32_to_cpu(hdr->nvalid) > maxbests)
++                      return false;
++              if (be32_to_cpu(hdr->nvalid) < be32_to_cpu(hdr->nused))
++                      return false;
++      }
++      return true;
++}
+ static int
+ __xfs_dir3_free_read(
+@@ -168,11 +204,22 @@ __xfs_dir3_free_read(
+       err = xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp,
+                               XFS_DATA_FORK, &xfs_dir3_free_buf_ops);
++      if (err || !*bpp)
++              return err;
++
++      /* Check things that we can't do in the verifier. */
++      if (!xfs_dir3_free_header_check(dp, fbno, *bpp)) {
++              xfs_buf_ioerror(*bpp, -EFSCORRUPTED);
++              xfs_verifier_error(*bpp);
++              xfs_trans_brelse(tp, *bpp);
++              return -EFSCORRUPTED;
++      }
+       /* try read returns without an error or *bpp if it lands in a hole */
+-      if (!err && tp && *bpp)
++      if (tp)
+               xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_FREE_BUF);
+-      return err;
++
++      return 0;
+ }
+ int