git.ipfire.org Git - thirdparty/kernel/stable-queue.git/commitdiff
4.9-stable patches
author: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Mon, 5 Jun 2017 15:08:43 +0000 (17:08 +0200)
committer: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Mon, 5 Jun 2017 15:08:43 +0000 (17:08 +0200)
added patches:
xfs-actually-report-xattr-extents-via-iomap.patch
xfs-avoid-mount-time-deadlock-in-cow-extent-recovery.patch
xfs-bad-assertion-for-delalloc-an-extent-that-start-at-i_size.patch
xfs-bmapx-shouldn-t-barf-on-inline-format-directories.patch
xfs-drop-iolock-from-reclaim-context-to-appease-lockdep.patch
xfs-fix-indlen-accounting-error-on-partial-delalloc-conversion.patch
xfs-fix-integer-truncation-in-xfs_bmap_remap_alloc.patch
xfs-fix-off-by-in-in-loop-termination-in-xfs_find_get_desired_pgoff.patch
xfs-fix-over-copying-of-getbmap-parameters-from-userspace.patch
xfs-fix-unaligned-access-in-xfs_btree_visit_blocks.patch
xfs-fix-up-quotacheck-buffer-list-error-handling.patch
xfs-fix-use-after-free-in-xfs_finish_page_writeback.patch
xfs-handle-array-index-overrun-in-xfs_dir2_leaf_readbuf.patch
xfs-prevent-multi-fsb-dir-readahead-from-reading-random-blocks.patch
xfs-reserve-enough-blocks-to-handle-btree-splits-when-remapping.patch
xfs-support-ability-to-wait-on-new-inodes.patch
xfs-update-ag-iterator-to-support-wait-on-new-inodes.patch
xfs-wait-on-new-inodes-during-quotaoff-dquot-release.patch
xfs-xfs_trans_alloc_empty.patch

20 files changed:
queue-4.9/series
queue-4.9/xfs-actually-report-xattr-extents-via-iomap.patch [new file with mode: 0644]
queue-4.9/xfs-avoid-mount-time-deadlock-in-cow-extent-recovery.patch [new file with mode: 0644]
queue-4.9/xfs-bad-assertion-for-delalloc-an-extent-that-start-at-i_size.patch [new file with mode: 0644]
queue-4.9/xfs-bmapx-shouldn-t-barf-on-inline-format-directories.patch [new file with mode: 0644]
queue-4.9/xfs-drop-iolock-from-reclaim-context-to-appease-lockdep.patch [new file with mode: 0644]
queue-4.9/xfs-fix-indlen-accounting-error-on-partial-delalloc-conversion.patch [new file with mode: 0644]
queue-4.9/xfs-fix-integer-truncation-in-xfs_bmap_remap_alloc.patch [new file with mode: 0644]
queue-4.9/xfs-fix-off-by-in-in-loop-termination-in-xfs_find_get_desired_pgoff.patch [new file with mode: 0644]
queue-4.9/xfs-fix-over-copying-of-getbmap-parameters-from-userspace.patch [new file with mode: 0644]
queue-4.9/xfs-fix-unaligned-access-in-xfs_btree_visit_blocks.patch [new file with mode: 0644]
queue-4.9/xfs-fix-up-quotacheck-buffer-list-error-handling.patch [new file with mode: 0644]
queue-4.9/xfs-fix-use-after-free-in-xfs_finish_page_writeback.patch [new file with mode: 0644]
queue-4.9/xfs-handle-array-index-overrun-in-xfs_dir2_leaf_readbuf.patch [new file with mode: 0644]
queue-4.9/xfs-prevent-multi-fsb-dir-readahead-from-reading-random-blocks.patch [new file with mode: 0644]
queue-4.9/xfs-reserve-enough-blocks-to-handle-btree-splits-when-remapping.patch [new file with mode: 0644]
queue-4.9/xfs-support-ability-to-wait-on-new-inodes.patch [new file with mode: 0644]
queue-4.9/xfs-update-ag-iterator-to-support-wait-on-new-inodes.patch [new file with mode: 0644]
queue-4.9/xfs-wait-on-new-inodes-during-quotaoff-dquot-release.patch [new file with mode: 0644]
queue-4.9/xfs-xfs_trans_alloc_empty.patch [new file with mode: 0644]

index 02663fc2301dea6fdf109e3a0932828f2e94ac7e..8b897a2932493d58b8759663666ea80520e7e67a 100644 (file)
@@ -73,3 +73,22 @@ xfs-verify-inline-directory-data-forks.patch
 xfs-rework-the-inline-directory-verifiers.patch
 xfs-fix-kernel-memory-exposure-problems.patch
 xfs-use-dedicated-log-worker-wq-to-avoid-deadlock-with-cil-wq.patch
+xfs-fix-over-copying-of-getbmap-parameters-from-userspace.patch
+xfs-actually-report-xattr-extents-via-iomap.patch
+xfs-drop-iolock-from-reclaim-context-to-appease-lockdep.patch
+xfs-fix-integer-truncation-in-xfs_bmap_remap_alloc.patch
+xfs-handle-array-index-overrun-in-xfs_dir2_leaf_readbuf.patch
+xfs-prevent-multi-fsb-dir-readahead-from-reading-random-blocks.patch
+xfs-fix-up-quotacheck-buffer-list-error-handling.patch
+xfs-support-ability-to-wait-on-new-inodes.patch
+xfs-update-ag-iterator-to-support-wait-on-new-inodes.patch
+xfs-wait-on-new-inodes-during-quotaoff-dquot-release.patch
+xfs-reserve-enough-blocks-to-handle-btree-splits-when-remapping.patch
+xfs-fix-use-after-free-in-xfs_finish_page_writeback.patch
+xfs-fix-indlen-accounting-error-on-partial-delalloc-conversion.patch
+xfs-bmapx-shouldn-t-barf-on-inline-format-directories.patch
+xfs-bad-assertion-for-delalloc-an-extent-that-start-at-i_size.patch
+xfs-xfs_trans_alloc_empty.patch
+xfs-avoid-mount-time-deadlock-in-cow-extent-recovery.patch
+xfs-fix-unaligned-access-in-xfs_btree_visit_blocks.patch
+xfs-fix-off-by-in-in-loop-termination-in-xfs_find_get_desired_pgoff.patch
diff --git a/queue-4.9/xfs-actually-report-xattr-extents-via-iomap.patch b/queue-4.9/xfs-actually-report-xattr-extents-via-iomap.patch
new file mode 100644 (file)
index 0000000..be4ce1a
--- /dev/null
@@ -0,0 +1,37 @@
+From 84358536dc355a9c8978ee425f87e116186bed16 Mon Sep 17 00:00:00 2001
+From: "Darrick J. Wong" <darrick.wong@oracle.com>
+Date: Thu, 6 Apr 2017 16:00:39 -0700
+Subject: xfs: actually report xattr extents via iomap
+
+From: Darrick J. Wong <darrick.wong@oracle.com>
+
+commit 84358536dc355a9c8978ee425f87e116186bed16 upstream.
+
+Apparently FIEMAP for xattrs has been broken since we switched to
+the iomap backend because of an incorrect check for xattr presence.
+Also fix the broken locking.
+
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Reviewed-by: Brian Foster <bfoster@redhat.com>
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/xfs/xfs_iomap.c |    4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+--- a/fs/xfs/xfs_iomap.c
++++ b/fs/xfs/xfs_iomap.c
+@@ -1151,10 +1151,10 @@ xfs_xattr_iomap_begin(
+       if (XFS_FORCED_SHUTDOWN(mp))
+               return -EIO;
+-      lockmode = xfs_ilock_data_map_shared(ip);
++      lockmode = xfs_ilock_attr_map_shared(ip);
+       /* if there are no attribute fork or extents, return ENOENT */
+-      if (XFS_IFORK_Q(ip) || !ip->i_d.di_anextents) {
++      if (!XFS_IFORK_Q(ip) || !ip->i_d.di_anextents) {
+               error = -ENOENT;
+               goto out_unlock;
+       }
diff --git a/queue-4.9/xfs-avoid-mount-time-deadlock-in-cow-extent-recovery.patch b/queue-4.9/xfs-avoid-mount-time-deadlock-in-cow-extent-recovery.patch
new file mode 100644 (file)
index 0000000..ea577bb
--- /dev/null
@@ -0,0 +1,102 @@
+From 3ecb3ac7b950ff8f6c6a61e8b7b0d6e3546429a0 Mon Sep 17 00:00:00 2001
+From: "Darrick J. Wong" <darrick.wong@oracle.com>
+Date: Mon, 15 May 2017 19:16:15 -0700
+Subject: xfs: avoid mount-time deadlock in CoW extent recovery
+
+From: Darrick J. Wong <darrick.wong@oracle.com>
+
+commit 3ecb3ac7b950ff8f6c6a61e8b7b0d6e3546429a0 upstream.
+
+If a malicious user corrupts the refcount btree to cause a cycle between
+different levels of the tree, the next mount attempt will deadlock in
+the CoW recovery routine while grabbing buffer locks.  We can use the
+ability to re-grab a buffer that was previously locked to a transaction to
+avoid deadlocks, so do that here.
+
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Reviewed-by: Brian Foster <bfoster@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/xfs/libxfs/xfs_refcount.c |   43 +++++++++++++++++++++++++++++++------------
+ 1 file changed, 31 insertions(+), 12 deletions(-)
+
+--- a/fs/xfs/libxfs/xfs_refcount.c
++++ b/fs/xfs/libxfs/xfs_refcount.c
+@@ -1629,13 +1629,28 @@ xfs_refcount_recover_cow_leftovers(
+       if (mp->m_sb.sb_agblocks >= XFS_REFC_COW_START)
+               return -EOPNOTSUPP;
+-      error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agbp);
++      INIT_LIST_HEAD(&debris);
++
++      /*
++       * In this first part, we use an empty transaction to gather up
++       * all the leftover CoW extents so that we can subsequently
++       * delete them.  The empty transaction is used to avoid
++       * a buffer lock deadlock if there happens to be a loop in the
++       * refcountbt because we're allowed to re-grab a buffer that is
++       * already attached to our transaction.  When we're done
++       * recording the CoW debris we cancel the (empty) transaction
++       * and everything goes away cleanly.
++       */
++      error = xfs_trans_alloc_empty(mp, &tp);
+       if (error)
+               return error;
+-      cur = xfs_refcountbt_init_cursor(mp, NULL, agbp, agno, NULL);
++
++      error = xfs_alloc_read_agf(mp, tp, agno, 0, &agbp);
++      if (error)
++              goto out_trans;
++      cur = xfs_refcountbt_init_cursor(mp, tp, agbp, agno, NULL);
+       /* Find all the leftover CoW staging extents. */
+-      INIT_LIST_HEAD(&debris);
+       memset(&low, 0, sizeof(low));
+       memset(&high, 0, sizeof(high));
+       low.rc.rc_startblock = XFS_REFC_COW_START;
+@@ -1645,10 +1660,11 @@ xfs_refcount_recover_cow_leftovers(
+       if (error)
+               goto out_cursor;
+       xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+-      xfs_buf_relse(agbp);
++      xfs_trans_brelse(tp, agbp);
++      xfs_trans_cancel(tp);
+       /* Now iterate the list to free the leftovers */
+-      list_for_each_entry(rr, &debris, rr_list) {
++      list_for_each_entry_safe(rr, n, &debris, rr_list) {
+               /* Set up transaction. */
+               error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, 0, 0, 0, &tp);
+               if (error)
+@@ -1676,8 +1692,16 @@ xfs_refcount_recover_cow_leftovers(
+               error = xfs_trans_commit(tp);
+               if (error)
+                       goto out_free;
++
++              list_del(&rr->rr_list);
++              kmem_free(rr);
+       }
++      return error;
++out_defer:
++      xfs_defer_cancel(&dfops);
++out_trans:
++      xfs_trans_cancel(tp);
+ out_free:
+       /* Free the leftover list */
+       list_for_each_entry_safe(rr, n, &debris, rr_list) {
+@@ -1688,11 +1712,6 @@ out_free:
+ out_cursor:
+       xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
+-      xfs_buf_relse(agbp);
+-      goto out_free;
+-
+-out_defer:
+-      xfs_defer_cancel(&dfops);
+-      xfs_trans_cancel(tp);
+-      goto out_free;
++      xfs_trans_brelse(tp, agbp);
++      goto out_trans;
+ }
diff --git a/queue-4.9/xfs-bad-assertion-for-delalloc-an-extent-that-start-at-i_size.patch b/queue-4.9/xfs-bad-assertion-for-delalloc-an-extent-that-start-at-i_size.patch
new file mode 100644 (file)
index 0000000..3d9b6b6
--- /dev/null
@@ -0,0 +1,46 @@
+From 892d2a5f705723b2cb488bfb38bcbdcf83273184 Mon Sep 17 00:00:00 2001
+From: Zorro Lang <zlang@redhat.com>
+Date: Mon, 15 May 2017 08:40:02 -0700
+Subject: xfs: bad assertion for delalloc an extent that start at i_size
+
+From: Zorro Lang <zlang@redhat.com>
+
+commit 892d2a5f705723b2cb488bfb38bcbdcf83273184 upstream.
+
+By running fsstress for a long enough time in RHEL-7, I find an
+assertion failure (harder to reproduce on linux-4.11, but problem
+is still there):
+
+  XFS: Assertion failed: (iflags & BMV_IF_DELALLOC) != 0, file: fs/xfs/xfs_bmap_util.c
+
+The assertion is in xfs_getbmap() function:
+
+  if (map[i].br_startblock == DELAYSTARTBLOCK &&
+-->   map[i].br_startoff <= XFS_B_TO_FSB(mp, XFS_ISIZE(ip)))
+          ASSERT((iflags & BMV_IF_DELALLOC) != 0);
+
+When map[i].br_startoff == XFS_B_TO_FSB(mp, XFS_ISIZE(ip)), the
+startoff is just at EOF. But we only need to check delalloc
+extents that are within EOF, not those that start at EOF.
+
+Signed-off-by: Zorro Lang <zlang@redhat.com>
+Reviewed-by: Brian Foster <bfoster@redhat.com>
+Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/xfs/xfs_bmap_util.c |    2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/fs/xfs/xfs_bmap_util.c
++++ b/fs/xfs/xfs_bmap_util.c
+@@ -722,7 +722,7 @@ xfs_getbmap(
+                        * extents.
+                        */
+                       if (map[i].br_startblock == DELAYSTARTBLOCK &&
+-                          map[i].br_startoff <= XFS_B_TO_FSB(mp, XFS_ISIZE(ip)))
++                          map[i].br_startoff < XFS_B_TO_FSB(mp, XFS_ISIZE(ip)))
+                               ASSERT((iflags & BMV_IF_DELALLOC) != 0);
+                         if (map[i].br_startblock == HOLESTARTBLOCK &&
diff --git a/queue-4.9/xfs-bmapx-shouldn-t-barf-on-inline-format-directories.patch b/queue-4.9/xfs-bmapx-shouldn-t-barf-on-inline-format-directories.patch
new file mode 100644 (file)
index 0000000..0d191d3
--- /dev/null
@@ -0,0 +1,42 @@
+From 6eadbf4c8ba816c10d1c97bed9aa861d9fd17809 Mon Sep 17 00:00:00 2001
+From: "Darrick J. Wong" <darrick.wong@oracle.com>
+Date: Fri, 12 May 2017 10:44:08 -0700
+Subject: xfs: BMAPX shouldn't barf on inline-format directories
+
+From: Darrick J. Wong <darrick.wong@oracle.com>
+
+commit 6eadbf4c8ba816c10d1c97bed9aa861d9fd17809 upstream.
+
+When we're fulfilling a BMAPX request, jump out early if the data fork
+is in local format.  This prevents us from hitting a debugging check in
+bmapi_read and barfing errors back to userspace.  The on-disk extent
+count check later isn't sufficient for IF_DELALLOC mode because da
+extents are in memory and not on disk.
+
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Reviewed-by: Brian Foster <bfoster@redhat.com>
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/xfs/xfs_bmap_util.c |    8 ++++++--
+ 1 file changed, 6 insertions(+), 2 deletions(-)
+
+--- a/fs/xfs/xfs_bmap_util.c
++++ b/fs/xfs/xfs_bmap_util.c
+@@ -588,9 +588,13 @@ xfs_getbmap(
+               }
+               break;
+       default:
++              /* Local format data forks report no extents. */
++              if (ip->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
++                      bmv->bmv_entries = 0;
++                      return 0;
++              }
+               if (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS &&
+-                  ip->i_d.di_format != XFS_DINODE_FMT_BTREE &&
+-                  ip->i_d.di_format != XFS_DINODE_FMT_LOCAL)
++                  ip->i_d.di_format != XFS_DINODE_FMT_BTREE)
+                       return -EINVAL;
+               if (xfs_get_extsz_hint(ip) ||
diff --git a/queue-4.9/xfs-drop-iolock-from-reclaim-context-to-appease-lockdep.patch b/queue-4.9/xfs-drop-iolock-from-reclaim-context-to-appease-lockdep.patch
new file mode 100644 (file)
index 0000000..c90e721
--- /dev/null
@@ -0,0 +1,74 @@
+From 3b4683c294095b5f777c03307ef8c60f47320e12 Mon Sep 17 00:00:00 2001
+From: Brian Foster <bfoster@redhat.com>
+Date: Tue, 11 Apr 2017 10:50:05 -0700
+Subject: xfs: drop iolock from reclaim context to appease lockdep
+
+From: Brian Foster <bfoster@redhat.com>
+
+commit 3b4683c294095b5f777c03307ef8c60f47320e12 upstream.
+
+Lockdep complains about use of the iolock in inode reclaim context
+because it doesn't understand that reclaim has the last reference to
+the inode, and thus an iolock->reclaim->iolock deadlock is not
+possible.
+
+The iolock is technically not necessary in xfs_inactive() and was
+only added to appease an assert in xfs_free_eofblocks(), which can
+be called from other non-reclaim contexts. Therefore, just kill the
+assert and drop the use of the iolock from reclaim context to quiet
+lockdep.
+
+Signed-off-by: Brian Foster <bfoster@redhat.com>
+Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/xfs/xfs_bmap_util.c |    8 +++-----
+ fs/xfs/xfs_inode.c     |    9 +++++----
+ 2 files changed, 8 insertions(+), 9 deletions(-)
+
+--- a/fs/xfs/xfs_bmap_util.c
++++ b/fs/xfs/xfs_bmap_util.c
+@@ -911,9 +911,9 @@ xfs_can_free_eofblocks(struct xfs_inode
+ }
+ /*
+- * This is called by xfs_inactive to free any blocks beyond eof
+- * when the link count isn't zero and by xfs_dm_punch_hole() when
+- * punching a hole to EOF.
++ * This is called to free any blocks beyond eof. The caller must hold
++ * IOLOCK_EXCL unless we are in the inode reclaim path and have the only
++ * reference to the inode.
+  */
+ int
+ xfs_free_eofblocks(
+@@ -928,8 +928,6 @@ xfs_free_eofblocks(
+       struct xfs_bmbt_irec    imap;
+       struct xfs_mount        *mp = ip->i_mount;
+-      ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
+-
+       /*
+        * Figure out if there are any blocks beyond the end
+        * of the file.  If not, then there is nothing to do.
+--- a/fs/xfs/xfs_inode.c
++++ b/fs/xfs/xfs_inode.c
+@@ -1915,12 +1915,13 @@ xfs_inactive(
+                * force is true because we are evicting an inode from the
+                * cache. Post-eof blocks must be freed, lest we end up with
+                * broken free space accounting.
++               *
++               * Note: don't bother with iolock here since lockdep complains
++               * about acquiring it in reclaim context. We have the only
++               * reference to the inode at this point anyways.
+                */
+-              if (xfs_can_free_eofblocks(ip, true)) {
+-                      xfs_ilock(ip, XFS_IOLOCK_EXCL);
++              if (xfs_can_free_eofblocks(ip, true))
+                       xfs_free_eofblocks(ip);
+-                      xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+-              }
+               return;
+       }
diff --git a/queue-4.9/xfs-fix-indlen-accounting-error-on-partial-delalloc-conversion.patch b/queue-4.9/xfs-fix-indlen-accounting-error-on-partial-delalloc-conversion.patch
new file mode 100644 (file)
index 0000000..ed0591c
--- /dev/null
@@ -0,0 +1,71 @@
+From 0daaecacb83bc6b656a56393ab77a31c28139bc7 Mon Sep 17 00:00:00 2001
+From: Brian Foster <bfoster@redhat.com>
+Date: Fri, 12 May 2017 10:44:08 -0700
+Subject: xfs: fix indlen accounting error on partial delalloc conversion
+
+From: Brian Foster <bfoster@redhat.com>
+
+commit 0daaecacb83bc6b656a56393ab77a31c28139bc7 upstream.
+
+The delalloc -> real block conversion path uses an incorrect
+calculation in the case where the middle part of a delalloc extent
+is being converted. This is documented as a rare situation because
+XFS generally attempts to maximize contiguity by converting as much
+of a delalloc extent as possible.
+
+If this situation does occur, the indlen reservation for the two new
+delalloc extents left behind by the conversion of the middle range
+is calculated and compared with the original reservation. If more
+blocks are required, the delta is allocated from the global block
+pool. This delta value can be characterized as the difference
+between the new total requirement (temp + temp2) and the currently
+available reservation minus those blocks that have already been
+allocated (startblockval(PREV.br_startblock) - allocated).
+
+The problem is that the current code does not account for previously
+allocated blocks correctly. It subtracts the current allocation
+count from the (new - old) delta rather than the old indlen
+reservation. This means that more indlen blocks than have been
+allocated end up stashed in the remaining extents and free space
+accounting is broken as a result.
+
+Fix up the calculation to subtract the allocated block count from
+the original extent indlen and thus correctly allocate the
+reservation delta based on the difference between the new total
+requirement and the unused blocks from the original reservation.
+Also remove a bogus assert that contradicts the fact that the new
+indlen reservation can be larger than the original indlen
+reservation.
+
+Signed-off-by: Brian Foster <bfoster@redhat.com>
+Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/xfs/libxfs/xfs_bmap.c |    7 ++++---
+ 1 file changed, 4 insertions(+), 3 deletions(-)
+
+--- a/fs/xfs/libxfs/xfs_bmap.c
++++ b/fs/xfs/libxfs/xfs_bmap.c
+@@ -2208,8 +2208,10 @@ xfs_bmap_add_extent_delay_real(
+               }
+               temp = xfs_bmap_worst_indlen(bma->ip, temp);
+               temp2 = xfs_bmap_worst_indlen(bma->ip, temp2);
+-              diff = (int)(temp + temp2 - startblockval(PREV.br_startblock) -
+-                      (bma->cur ? bma->cur->bc_private.b.allocated : 0));
++              diff = (int)(temp + temp2 -
++                           (startblockval(PREV.br_startblock) -
++                            (bma->cur ?
++                             bma->cur->bc_private.b.allocated : 0)));
+               if (diff > 0) {
+                       error = xfs_mod_fdblocks(bma->ip->i_mount,
+                                                -((int64_t)diff), false);
+@@ -2266,7 +2268,6 @@ xfs_bmap_add_extent_delay_real(
+               temp = da_new;
+               if (bma->cur)
+                       temp += bma->cur->bc_private.b.allocated;
+-              ASSERT(temp <= da_old);
+               if (temp < da_old)
+                       xfs_mod_fdblocks(bma->ip->i_mount,
+                                       (int64_t)(da_old - temp), false);
diff --git a/queue-4.9/xfs-fix-integer-truncation-in-xfs_bmap_remap_alloc.patch b/queue-4.9/xfs-fix-integer-truncation-in-xfs_bmap_remap_alloc.patch
new file mode 100644 (file)
index 0000000..bfa8c61
--- /dev/null
@@ -0,0 +1,32 @@
+From 52813fb13ff90bd9c39a93446cbf1103c290b6e9 Mon Sep 17 00:00:00 2001
+From: Christoph Hellwig <hch@lst.de>
+Date: Tue, 11 Apr 2017 16:45:52 -0700
+Subject: xfs: fix integer truncation in xfs_bmap_remap_alloc
+
+From: Christoph Hellwig <hch@lst.de>
+
+commit 52813fb13ff90bd9c39a93446cbf1103c290b6e9 upstream.
+
+bno should be an xfs_fsblock_t, which is 64 bits wide, instead of an
+xfs_agblock_t, which truncates the value to 32 bits.
+
+Signed-off-by: Christoph Hellwig <hch@lst.de>
+Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/xfs/libxfs/xfs_bmap.c |    2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/fs/xfs/libxfs/xfs_bmap.c
++++ b/fs/xfs/libxfs/xfs_bmap.c
+@@ -3964,7 +3964,7 @@ xfs_bmap_remap_alloc(
+ {
+       struct xfs_trans        *tp = ap->tp;
+       struct xfs_mount        *mp = tp->t_mountp;
+-      xfs_agblock_t           bno;
++      xfs_fsblock_t           bno;
+       struct xfs_alloc_arg    args;
+       int                     error;
diff --git a/queue-4.9/xfs-fix-off-by-in-in-loop-termination-in-xfs_find_get_desired_pgoff.patch b/queue-4.9/xfs-fix-off-by-in-in-loop-termination-in-xfs_find_get_desired_pgoff.patch
new file mode 100644 (file)
index 0000000..9a71a44
--- /dev/null
@@ -0,0 +1,34 @@
+From d7fd24257aa60316bf81093f7f909dc9475ae974 Mon Sep 17 00:00:00 2001
+From: Jan Kara <jack@suse.cz>
+Date: Thu, 18 May 2017 16:36:23 -0700
+Subject: xfs: Fix off-by-in in loop termination in xfs_find_get_desired_pgoff()
+
+From: Jan Kara <jack@suse.cz>
+
+commit d7fd24257aa60316bf81093f7f909dc9475ae974 upstream.
+
+There is an off-by-one error in loop termination conditions in
+xfs_find_get_desired_pgoff() since 'end' may index a page beyond end of
+desired range if 'endoff' is page aligned. It doesn't have any visible
+effects but still it is good to fix it.
+
+Signed-off-by: Jan Kara <jack@suse.cz>
+Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/xfs/xfs_file.c |    2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/fs/xfs/xfs_file.c
++++ b/fs/xfs/xfs_file.c
+@@ -1130,7 +1130,7 @@ xfs_find_get_desired_pgoff(
+       index = startoff >> PAGE_SHIFT;
+       endoff = XFS_FSB_TO_B(mp, map->br_startoff + map->br_blockcount);
+-      end = endoff >> PAGE_SHIFT;
++      end = (endoff - 1) >> PAGE_SHIFT;
+       do {
+               int             want;
+               unsigned        nr_pages;
diff --git a/queue-4.9/xfs-fix-over-copying-of-getbmap-parameters-from-userspace.patch b/queue-4.9/xfs-fix-over-copying-of-getbmap-parameters-from-userspace.patch
new file mode 100644 (file)
index 0000000..c66d150
--- /dev/null
@@ -0,0 +1,38 @@
+From be6324c00c4d1e0e665f03ed1fc18863a88da119 Mon Sep 17 00:00:00 2001
+From: "Darrick J. Wong" <darrick.wong@oracle.com>
+Date: Mon, 3 Apr 2017 15:17:57 -0700
+Subject: xfs: fix over-copying of getbmap parameters from userspace
+
+From: Darrick J. Wong <darrick.wong@oracle.com>
+
+commit be6324c00c4d1e0e665f03ed1fc18863a88da119 upstream.
+
+In xfs_ioc_getbmap, we should only copy the fields of struct getbmap
+from userspace, or else we end up copying random stack contents into the
+kernel.  struct getbmap is a strict subset of getbmapx, so a partial
+structure copy should work fine.
+
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/xfs/xfs_ioctl.c |    5 +++--
+ 1 file changed, 3 insertions(+), 2 deletions(-)
+
+--- a/fs/xfs/xfs_ioctl.c
++++ b/fs/xfs/xfs_ioctl.c
+@@ -1542,10 +1542,11 @@ xfs_ioc_getbmap(
+       unsigned int            cmd,
+       void                    __user *arg)
+ {
+-      struct getbmapx         bmx;
++      struct getbmapx         bmx = { 0 };
+       int                     error;
+-      if (copy_from_user(&bmx, arg, sizeof(struct getbmapx)))
++      /* struct getbmap is a strict subset of struct getbmapx. */
++      if (copy_from_user(&bmx, arg, offsetof(struct getbmapx, bmv_iflags)))
+               return -EFAULT;
+       if (bmx.bmv_count < 2)
diff --git a/queue-4.9/xfs-fix-unaligned-access-in-xfs_btree_visit_blocks.patch b/queue-4.9/xfs-fix-unaligned-access-in-xfs_btree_visit_blocks.patch
new file mode 100644 (file)
index 0000000..751f9e1
--- /dev/null
@@ -0,0 +1,35 @@
+From a4d768e702de224cc85e0c8eac9311763403b368 Mon Sep 17 00:00:00 2001
+From: Eric Sandeen <sandeen@sandeen.net>
+Date: Mon, 22 May 2017 19:54:10 -0700
+Subject: xfs: fix unaligned access in xfs_btree_visit_blocks
+
+From: Eric Sandeen <sandeen@sandeen.net>
+
+commit a4d768e702de224cc85e0c8eac9311763403b368 upstream.
+
+This structure copy was throwing unaligned access warnings on sparc64:
+
+Kernel unaligned access at TPC[1043c088] xfs_btree_visit_blocks+0x88/0xe0 [xfs]
+
+xfs_btree_copy_ptrs does a memcpy, which avoids it.
+
+Signed-off-by: Eric Sandeen <sandeen@redhat.com>
+Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/xfs/libxfs/xfs_btree.c |    2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/fs/xfs/libxfs/xfs_btree.c
++++ b/fs/xfs/libxfs/xfs_btree.c
+@@ -4376,7 +4376,7 @@ xfs_btree_visit_blocks(
+                       xfs_btree_readahead_ptr(cur, ptr, 1);
+                       /* save for the next iteration of the loop */
+-                      lptr = *ptr;
++                      xfs_btree_copy_ptrs(cur, &lptr, ptr, 1);
+               }
+               /* for each buffer in the level */
diff --git a/queue-4.9/xfs-fix-up-quotacheck-buffer-list-error-handling.patch b/queue-4.9/xfs-fix-up-quotacheck-buffer-list-error-handling.patch
new file mode 100644 (file)
index 0000000..6307207
--- /dev/null
@@ -0,0 +1,96 @@
+From 20e8a063786050083fe05b4f45be338c60b49126 Mon Sep 17 00:00:00 2001
+From: Brian Foster <bfoster@redhat.com>
+Date: Fri, 21 Apr 2017 12:40:44 -0700
+Subject: xfs: fix up quotacheck buffer list error handling
+
+From: Brian Foster <bfoster@redhat.com>
+
+commit 20e8a063786050083fe05b4f45be338c60b49126 upstream.
+
+The quotacheck error handling of the delwri buffer list assumes the
+resident buffers are locked and doesn't clear the _XBF_DELWRI_Q flag
+on the buffers that are dequeued. This can lead to assert failures
+on buffer release and possibly other locking problems.
+
+Move this code to a delwri queue cancel helper function to
+encapsulate the logic required to properly release buffers from a
+delwri queue. Update the helper to clear the delwri queue flag and
+call it from quotacheck.
+
+Signed-off-by: Brian Foster <bfoster@redhat.com>
+Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/xfs/xfs_buf.c |   24 ++++++++++++++++++++++++
+ fs/xfs/xfs_buf.h |    1 +
+ fs/xfs/xfs_qm.c  |    7 +------
+ 3 files changed, 26 insertions(+), 6 deletions(-)
+
+--- a/fs/xfs/xfs_buf.c
++++ b/fs/xfs/xfs_buf.c
+@@ -1066,6 +1066,8 @@ void
+ xfs_buf_unlock(
+       struct xfs_buf          *bp)
+ {
++      ASSERT(xfs_buf_islocked(bp));
++
+       XB_CLEAR_OWNER(bp);
+       up(&bp->b_sema);
+@@ -1804,6 +1806,28 @@ error:
+ }
+ /*
++ * Cancel a delayed write list.
++ *
++ * Remove each buffer from the list, clear the delwri queue flag and drop the
++ * associated buffer reference.
++ */
++void
++xfs_buf_delwri_cancel(
++      struct list_head        *list)
++{
++      struct xfs_buf          *bp;
++
++      while (!list_empty(list)) {
++              bp = list_first_entry(list, struct xfs_buf, b_list);
++
++              xfs_buf_lock(bp);
++              bp->b_flags &= ~_XBF_DELWRI_Q;
++              list_del_init(&bp->b_list);
++              xfs_buf_relse(bp);
++      }
++}
++
++/*
+  * Add a buffer to the delayed write list.
+  *
+  * This queues a buffer for writeout if it hasn't already been.  Note that
+--- a/fs/xfs/xfs_buf.h
++++ b/fs/xfs/xfs_buf.h
+@@ -329,6 +329,7 @@ extern void *xfs_buf_offset(struct xfs_b
+ extern void xfs_buf_stale(struct xfs_buf *bp);
+ /* Delayed Write Buffer Routines */
++extern void xfs_buf_delwri_cancel(struct list_head *);
+ extern bool xfs_buf_delwri_queue(struct xfs_buf *, struct list_head *);
+ extern int xfs_buf_delwri_submit(struct list_head *);
+ extern int xfs_buf_delwri_submit_nowait(struct list_head *);
+--- a/fs/xfs/xfs_qm.c
++++ b/fs/xfs/xfs_qm.c
+@@ -1384,12 +1384,7 @@ xfs_qm_quotacheck(
+       mp->m_qflags |= flags;
+  error_return:
+-      while (!list_empty(&buffer_list)) {
+-              struct xfs_buf *bp =
+-                      list_first_entry(&buffer_list, struct xfs_buf, b_list);
+-              list_del_init(&bp->b_list);
+-              xfs_buf_relse(bp);
+-      }
++      xfs_buf_delwri_cancel(&buffer_list);
+       if (error) {
+               xfs_warn(mp,
diff --git a/queue-4.9/xfs-fix-use-after-free-in-xfs_finish_page_writeback.patch b/queue-4.9/xfs-fix-use-after-free-in-xfs_finish_page_writeback.patch
new file mode 100644 (file)
index 0000000..b77227e
--- /dev/null
@@ -0,0 +1,113 @@
+From 161f55efba5ddccc690139fae9373cafc3447a97 Mon Sep 17 00:00:00 2001
+From: Eryu Guan <eguan@redhat.com>
+Date: Tue, 2 May 2017 13:54:47 -0700
+Subject: xfs: fix use-after-free in xfs_finish_page_writeback
+
+From: Eryu Guan <eguan@redhat.com>
+
+commit 161f55efba5ddccc690139fae9373cafc3447a97 upstream.
+
+Commit 28b783e47ad7 ("xfs: bufferhead chains are invalid after
+end_page_writeback") fixed one use-after-free issue by
+pre-calculating the loop conditionals before calling bh->b_end_io()
+in the end_io processing loop, but it assigned the 'next' pointer before
+checking the end offset boundary and breaking out of the loop, at which
+point the bh might already have been freed, causing a use-after-free.
+
+This is caught by KASAN when running fstests generic/127 on sub-page
+block size XFS.
+
+[ 2517.244502] run fstests generic/127 at 2017-04-27 07:30:50
+[ 2747.868840] ==================================================================
+[ 2747.876949] BUG: KASAN: use-after-free in xfs_destroy_ioend+0x3d3/0x4e0 [xfs] at addr ffff8801395ae698
+...
+[ 2747.918245] Call Trace:
+[ 2747.920975]  dump_stack+0x63/0x84
+[ 2747.924673]  kasan_object_err+0x21/0x70
+[ 2747.928950]  kasan_report+0x271/0x530
+[ 2747.933064]  ? xfs_destroy_ioend+0x3d3/0x4e0 [xfs]
+[ 2747.938409]  ? end_page_writeback+0xce/0x110
+[ 2747.943171]  __asan_report_load8_noabort+0x19/0x20
+[ 2747.948545]  xfs_destroy_ioend+0x3d3/0x4e0 [xfs]
+[ 2747.953724]  xfs_end_io+0x1af/0x2b0 [xfs]
+[ 2747.958197]  process_one_work+0x5ff/0x1000
+[ 2747.962766]  worker_thread+0xe4/0x10e0
+[ 2747.966946]  kthread+0x2d3/0x3d0
+[ 2747.970546]  ? process_one_work+0x1000/0x1000
+[ 2747.975405]  ? kthread_create_on_node+0xc0/0xc0
+[ 2747.980457]  ? syscall_return_slowpath+0xe6/0x140
+[ 2747.985706]  ? do_page_fault+0x30/0x80
+[ 2747.989887]  ret_from_fork+0x2c/0x40
+[ 2747.993874] Object at ffff8801395ae690, in cache buffer_head size: 104
+[ 2748.001155] Allocated:
+[ 2748.003782] PID = 8327
+[ 2748.006411]  save_stack_trace+0x1b/0x20
+[ 2748.010688]  save_stack+0x46/0xd0
+[ 2748.014383]  kasan_kmalloc+0xad/0xe0
+[ 2748.018370]  kasan_slab_alloc+0x12/0x20
+[ 2748.022648]  kmem_cache_alloc+0xb8/0x1b0
+[ 2748.027024]  alloc_buffer_head+0x22/0xc0
+[ 2748.031399]  alloc_page_buffers+0xd1/0x250
+[ 2748.035968]  create_empty_buffers+0x30/0x410
+[ 2748.040730]  create_page_buffers+0x120/0x1b0
+[ 2748.045493]  __block_write_begin_int+0x17a/0x1800
+[ 2748.050740]  iomap_write_begin+0x100/0x2f0
+[ 2748.055308]  iomap_zero_range_actor+0x253/0x5c0
+[ 2748.060362]  iomap_apply+0x157/0x270
+[ 2748.064347]  iomap_zero_range+0x5a/0x80
+[ 2748.068624]  iomap_truncate_page+0x6b/0xa0
+[ 2748.073227]  xfs_setattr_size+0x1f7/0xa10 [xfs]
+[ 2748.078312]  xfs_vn_setattr_size+0x68/0x140 [xfs]
+[ 2748.083589]  xfs_file_fallocate+0x4ac/0x820 [xfs]
+[ 2748.088838]  vfs_fallocate+0x2cf/0x780
+[ 2748.093021]  SyS_fallocate+0x48/0x80
+[ 2748.097006]  do_syscall_64+0x18a/0x430
+[ 2748.101186]  return_from_SYSCALL_64+0x0/0x6a
+[ 2748.105948] Freed:
+[ 2748.108189] PID = 8327
+[ 2748.110816]  save_stack_trace+0x1b/0x20
+[ 2748.115093]  save_stack+0x46/0xd0
+[ 2748.118788]  kasan_slab_free+0x73/0xc0
+[ 2748.122969]  kmem_cache_free+0x7a/0x200
+[ 2748.127247]  free_buffer_head+0x41/0x80
+[ 2748.131524]  try_to_free_buffers+0x178/0x250
+[ 2748.136316]  xfs_vm_releasepage+0x2e9/0x3d0 [xfs]
+[ 2748.141563]  try_to_release_page+0x100/0x180
+[ 2748.146325]  invalidate_inode_pages2_range+0x7da/0xcf0
+[ 2748.152087]  xfs_shift_file_space+0x37d/0x6e0 [xfs]
+[ 2748.157557]  xfs_collapse_file_space+0x49/0x120 [xfs]
+[ 2748.163223]  xfs_file_fallocate+0x2a7/0x820 [xfs]
+[ 2748.168462]  vfs_fallocate+0x2cf/0x780
+[ 2748.172642]  SyS_fallocate+0x48/0x80
+[ 2748.176629]  do_syscall_64+0x18a/0x430
+[ 2748.180810]  return_from_SYSCALL_64+0x0/0x6a
+
+Fix this by checking the offset against end and breaking out first, and
+dereferencing bh only if there are still bufferheads to process.
+
+Signed-off-by: Eryu Guan <eguan@redhat.com>
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/xfs/xfs_aops.c |    4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+--- a/fs/xfs/xfs_aops.c
++++ b/fs/xfs/xfs_aops.c
+@@ -116,11 +116,11 @@ xfs_finish_page_writeback(
+       bsize = bh->b_size;
+       do {
++              if (off > end)
++                      break;
+               next = bh->b_this_page;
+               if (off < bvec->bv_offset)
+                       goto next_bh;
+-              if (off > end)
+-                      break;
+               bh->b_end_io(bh, !error);
+ next_bh:
+               off += bsize;
diff --git a/queue-4.9/xfs-handle-array-index-overrun-in-xfs_dir2_leaf_readbuf.patch b/queue-4.9/xfs-handle-array-index-overrun-in-xfs_dir2_leaf_readbuf.patch
new file mode 100644 (file)
index 0000000..4c2e7b3
--- /dev/null
@@ -0,0 +1,103 @@
+From 023cc840b40fad95c6fe26fff1d380a8c9d45939 Mon Sep 17 00:00:00 2001
+From: Eric Sandeen <sandeen@redhat.com>
+Date: Thu, 13 Apr 2017 15:15:47 -0700
+Subject: xfs: handle array index overrun in xfs_dir2_leaf_readbuf()
+
+From: Eric Sandeen <sandeen@redhat.com>
+
+commit 023cc840b40fad95c6fe26fff1d380a8c9d45939 upstream.
+
+Carlos had a case where "find" seemed to start spinning
+forever and never return.
+
+This was on a filesystem with non-default multi-fsb (8k)
+directory blocks, and a fragmented directory with extents
+like this:
+
+0:[0,133646,2,0]
+1:[2,195888,1,0]
+2:[3,195890,1,0]
+3:[4,195892,1,0]
+4:[5,195894,1,0]
+5:[6,195896,1,0]
+6:[7,195898,1,0]
+7:[8,195900,1,0]
+8:[9,195902,1,0]
+9:[10,195908,1,0]
+10:[11,195910,1,0]
+11:[12,195912,1,0]
+12:[13,195914,1,0]
+...
+
+i.e. the first extent is a contiguous 2-fsb dir block, but
+after that it is fragmented into 1 block extents.
+
+At the top of the readdir path, we allocate a mapping array
+which (for this filesystem geometry) can hold 10 extents; see
+the assignment to map_info->map_size.  During readdir, we are
+therefore able to map extents 0 through 9 above into the array
+for readahead purposes.  If we count by 2, we see that the last
+mapped index (9) is the first block of a 2-fsb directory block.
+
+At the end of xfs_dir2_leaf_readbuf() we have 2 loops to fill
+more readahead; the outer loop assumes one full dir block is
+processed each loop iteration, and the inner loop ensures
+that this is so by advancing to the next extent until a full
+directory block is mapped.
+
+The problem is that this inner loop may step past the last
+extent in the mapping array as it tries to reach the end of
+the directory block.  This will read garbage for the extent
+length, and as a result the loop control variable 'j' may
+become corrupted and never fail the loop conditional.
+
+The number of valid mappings we have in our array is stored
+in map->map_valid, so stop this inner loop based on that limit.
+
+There is an ASSERT at the top of the outer loop for this
+same condition, but we never made it out of the inner loop,
+so the ASSERT never fired.
+
+Huge appreciation for Carlos for debugging and isolating
+the problem.
+
+Debugged-and-analyzed-by: Carlos Maiolino <cmaiolino@redhat.com>
+Signed-off-by: Eric Sandeen <sandeen@redhat.com>
+Tested-by: Carlos Maiolino <cmaiolino@redhat.com>
+Reviewed-by: Carlos Maiolino <cmaiolino@redhat.com>
+Reviewed-by: Bill O'Donnell <billodo@redhat.com>
+Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/xfs/xfs_dir2_readdir.c |   10 ++++++++--
+ 1 file changed, 8 insertions(+), 2 deletions(-)
+
+--- a/fs/xfs/xfs_dir2_readdir.c
++++ b/fs/xfs/xfs_dir2_readdir.c
+@@ -394,6 +394,7 @@ xfs_dir2_leaf_readbuf(
+       /*
+        * Do we need more readahead?
++       * Each loop tries to process 1 full dir blk; last may be partial.
+        */
+       blk_start_plug(&plug);
+       for (mip->ra_index = mip->ra_offset = i = 0;
+@@ -425,9 +426,14 @@ xfs_dir2_leaf_readbuf(
+               }
+               /*
+-               * Advance offset through the mapping table.
++               * Advance offset through the mapping table, processing a full
++               * dir block even if it is fragmented into several extents.
++               * But stop if we have consumed all valid mappings, even if
++               * it's not yet a full directory block.
+                */
+-              for (j = 0; j < geo->fsbcount; j += length ) {
++              for (j = 0;
++                   j < geo->fsbcount && mip->ra_index < mip->map_valid;
++                   j += length ) {
+                       /*
+                        * The rest of this extent but not more than a dir
+                        * block.
diff --git a/queue-4.9/xfs-prevent-multi-fsb-dir-readahead-from-reading-random-blocks.patch b/queue-4.9/xfs-prevent-multi-fsb-dir-readahead-from-reading-random-blocks.patch
new file mode 100644 (file)
index 0000000..4a8cc69
--- /dev/null
@@ -0,0 +1,80 @@
+From cb52ee334a45ae6c78a3999e4b473c43ddc528f4 Mon Sep 17 00:00:00 2001
+From: Brian Foster <bfoster@redhat.com>
+Date: Thu, 20 Apr 2017 08:06:47 -0700
+Subject: xfs: prevent multi-fsb dir readahead from reading random blocks
+
+From: Brian Foster <bfoster@redhat.com>
+
+commit cb52ee334a45ae6c78a3999e4b473c43ddc528f4 upstream.
+
+Directory block readahead uses a complex iteration mechanism to map
+between high-level directory blocks and underlying physical extents.
+This mechanism attempts to traverse the higher-level dir blocks in a
+manner that handles multi-fsb directory blocks and simultaneously
+maintains a reference to the corresponding physical blocks.
+
+This logic doesn't handle certain (discontiguous) physical extent
+layouts correctly with multi-fsb directory blocks. For example,
+consider the case of a 4k FSB filesystem with a 2 FSB (8k) directory
+block size and a directory with the following extent layout:
+
+ EXT: FILE-OFFSET      BLOCK-RANGE      AG AG-OFFSET        TOTAL
+   0: [0..7]:          88..95            0 (88..95)             8
+   1: [8..15]:         80..87            0 (80..87)             8
+   2: [16..39]:        168..191          0 (168..191)          24
+   3: [40..63]:        5242952..5242975  1 (72..95)            24
+
+Directory block 0 spans physical extents 0 and 1, dirblk 1 lies
+entirely within extent 2 and dirblk 2 spans extents 2 and 3. Because
+extent 2 is larger than the directory block size, the readahead code
+erroneously assumes the block is contiguous and issues a readahead
+based on the physical mapping of the first fsb of the dirblk. This
+results in read verifier failure and a spurious corruption or crc
+failure, depending on the filesystem format.
+
+Further, the subsequent readahead code responsible for walking
+through the physical table doesn't correctly advance the physical
+block reference for dirblk 2. Instead of advancing two physical
+filesystem blocks, the first iteration of the loop advances 1 block
+(correctly), but the subsequent iteration advances 2 more physical
+blocks because the next physical extent (extent 3, above) happens to
+cover more than dirblk 2. At this point, the higher-level directory
+block walking is completely off the rails of the actual physical
+layout of the directory for the respective mapping table.
+
+Update the contiguous dirblock logic to consider the current offset
+in the physical extent to avoid issuing directory readahead to
+unrelated blocks. Also, update the mapping table advancing code to
+consider the current offset within the current dirblock to avoid
+advancing the mapping reference too far beyond the dirblock.
+
+Signed-off-by: Brian Foster <bfoster@redhat.com>
+Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/xfs/xfs_dir2_readdir.c |    5 +++--
+ 1 file changed, 3 insertions(+), 2 deletions(-)
+
+--- a/fs/xfs/xfs_dir2_readdir.c
++++ b/fs/xfs/xfs_dir2_readdir.c
+@@ -405,7 +405,8 @@ xfs_dir2_leaf_readbuf(
+                * Read-ahead a contiguous directory block.
+                */
+               if (i > mip->ra_current &&
+-                  map[mip->ra_index].br_blockcount >= geo->fsbcount) {
++                  (map[mip->ra_index].br_blockcount - mip->ra_offset) >=
++                  geo->fsbcount) {
+                       xfs_dir3_data_readahead(dp,
+                               map[mip->ra_index].br_startoff + mip->ra_offset,
+                               XFS_FSB_TO_DADDR(dp->i_mount,
+@@ -438,7 +439,7 @@ xfs_dir2_leaf_readbuf(
+                        * The rest of this extent but not more than a dir
+                        * block.
+                        */
+-                      length = min_t(int, geo->fsbcount,
++                      length = min_t(int, geo->fsbcount - j,
+                                       map[mip->ra_index].br_blockcount -
+                                                       mip->ra_offset);
+                       mip->ra_offset += length;
diff --git a/queue-4.9/xfs-reserve-enough-blocks-to-handle-btree-splits-when-remapping.patch b/queue-4.9/xfs-reserve-enough-blocks-to-handle-btree-splits-when-remapping.patch
new file mode 100644 (file)
index 0000000..fe5eb33
--- /dev/null
@@ -0,0 +1,123 @@
+From fe0be23e68200573de027de9b8cc2b27e7fce35e Mon Sep 17 00:00:00 2001
+From: "Darrick J. Wong" <darrick.wong@oracle.com>
+Date: Wed, 12 Apr 2017 12:26:07 -0700
+Subject: xfs: reserve enough blocks to handle btree splits when remapping
+
+From: Darrick J. Wong <darrick.wong@oracle.com>
+
+commit fe0be23e68200573de027de9b8cc2b27e7fce35e upstream.
+
+In xfs_reflink_end_cow, we erroneously reserve only enough blocks to
+handle adding 1 extent.  This is problematic if we fragment free space,
+have to do CoW, and then have to perform multiple bmap btree expansions.
+Furthermore, the BUI recovery routine doesn't reserve /any/ blocks to
+handle btree splits, so log recovery fails after our first error causes
+the filesystem to go down.
+
+Therefore, refactor the transaction block reservation macros until we
+have a macro that works for our deferred (re)mapping activities, and fix
+both problems by using that macro.
+
+With 1k blocks we can hit this fairly often in g/187 if the scratch fs
+is big enough.
+
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/xfs/libxfs/xfs_trans_space.h |   23 +++++++++++++++++------
+ fs/xfs/xfs_bmap_item.c          |    5 ++++-
+ fs/xfs/xfs_reflink.c            |   18 ++++++++++++++++--
+ 3 files changed, 37 insertions(+), 9 deletions(-)
+
+--- a/fs/xfs/libxfs/xfs_trans_space.h
++++ b/fs/xfs/libxfs/xfs_trans_space.h
+@@ -21,8 +21,20 @@
+ /*
+  * Components of space reservations.
+  */
++
++/* Worst case number of rmaps that can be held in a block. */
+ #define XFS_MAX_CONTIG_RMAPS_PER_BLOCK(mp)    \
+               (((mp)->m_rmap_mxr[0]) - ((mp)->m_rmap_mnr[0]))
++
++/* Adding one rmap could split every level up to the top of the tree. */
++#define XFS_RMAPADD_SPACE_RES(mp) ((mp)->m_rmap_maxlevels)
++
++/* Blocks we might need to add "b" rmaps to a tree. */
++#define XFS_NRMAPADD_SPACE_RES(mp, b)\
++      (((b + XFS_MAX_CONTIG_RMAPS_PER_BLOCK(mp) - 1) / \
++        XFS_MAX_CONTIG_RMAPS_PER_BLOCK(mp)) * \
++        XFS_RMAPADD_SPACE_RES(mp))
++
+ #define XFS_MAX_CONTIG_EXTENTS_PER_BLOCK(mp)    \
+               (((mp)->m_alloc_mxr[0]) - ((mp)->m_alloc_mnr[0]))
+ #define       XFS_EXTENTADD_SPACE_RES(mp,w)   (XFS_BM_MAXLEVELS(mp,w) - 1)
+@@ -30,13 +42,12 @@
+       (((b + XFS_MAX_CONTIG_EXTENTS_PER_BLOCK(mp) - 1) / \
+         XFS_MAX_CONTIG_EXTENTS_PER_BLOCK(mp)) * \
+         XFS_EXTENTADD_SPACE_RES(mp,w))
++
++/* Blocks we might need to add "b" mappings & rmappings to a file. */
+ #define XFS_SWAP_RMAP_SPACE_RES(mp,b,w)\
+-      (((b + XFS_MAX_CONTIG_EXTENTS_PER_BLOCK(mp) - 1) / \
+-        XFS_MAX_CONTIG_EXTENTS_PER_BLOCK(mp)) * \
+-        XFS_EXTENTADD_SPACE_RES(mp,w) + \
+-       ((b + XFS_MAX_CONTIG_RMAPS_PER_BLOCK(mp) - 1) / \
+-        XFS_MAX_CONTIG_RMAPS_PER_BLOCK(mp)) * \
+-        (mp)->m_rmap_maxlevels)
++      (XFS_NEXTENTADD_SPACE_RES((mp), (b), (w)) + \
++       XFS_NRMAPADD_SPACE_RES((mp), (b)))
++
+ #define       XFS_DAENTER_1B(mp,w)    \
+       ((w) == XFS_DATA_FORK ? (mp)->m_dir_geo->fsbcount : 1)
+ #define       XFS_DAENTER_DBS(mp,w)   \
+--- a/fs/xfs/xfs_bmap_item.c
++++ b/fs/xfs/xfs_bmap_item.c
+@@ -34,6 +34,8 @@
+ #include "xfs_bmap.h"
+ #include "xfs_icache.h"
+ #include "xfs_trace.h"
++#include "xfs_bmap_btree.h"
++#include "xfs_trans_space.h"
+ kmem_zone_t   *xfs_bui_zone;
+@@ -446,7 +448,8 @@ xfs_bui_recover(
+               return -EIO;
+       }
+-      error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp);
++      error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate,
++                      XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK), 0, 0, &tp);
+       if (error)
+               return error;
+       budp = xfs_trans_get_bud(tp, buip);
+--- a/fs/xfs/xfs_reflink.c
++++ b/fs/xfs/xfs_reflink.c
+@@ -736,8 +736,22 @@ xfs_reflink_end_cow(
+       offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset);
+       end_fsb = XFS_B_TO_FSB(ip->i_mount, offset + count);
+-      /* Start a rolling transaction to switch the mappings */
+-      resblks = XFS_EXTENTADD_SPACE_RES(ip->i_mount, XFS_DATA_FORK);
++      /*
++       * Start a rolling transaction to switch the mappings.  We're
++       * unlikely ever to have to remap 16T worth of single-block
++       * extents, so just cap the worst case extent count to 2^32-1.
++       * Stick a warning in just in case, and avoid 64-bit division.
++       */
++      BUILD_BUG_ON(MAX_RW_COUNT > UINT_MAX);
++      if (end_fsb - offset_fsb > UINT_MAX) {
++              error = -EFSCORRUPTED;
++              xfs_force_shutdown(ip->i_mount, SHUTDOWN_CORRUPT_INCORE);
++              ASSERT(0);
++              goto out;
++      }
++      resblks = XFS_NEXTENTADD_SPACE_RES(ip->i_mount,
++                      (unsigned int)(end_fsb - offset_fsb),
++                      XFS_DATA_FORK);
+       error = xfs_trans_alloc(ip->i_mount, &M_RES(ip->i_mount)->tr_write,
+                       resblks, 0, 0, &tp);
+       if (error)
diff --git a/queue-4.9/xfs-support-ability-to-wait-on-new-inodes.patch b/queue-4.9/xfs-support-ability-to-wait-on-new-inodes.patch
new file mode 100644 (file)
index 0000000..21a2e6f
--- /dev/null
@@ -0,0 +1,71 @@
+From 756baca27fff3ecaeab9dbc7a5ee35a1d7bc0c7f Mon Sep 17 00:00:00 2001
+From: Brian Foster <bfoster@redhat.com>
+Date: Wed, 26 Apr 2017 08:30:39 -0700
+Subject: xfs: support ability to wait on new inodes
+
+From: Brian Foster <bfoster@redhat.com>
+
+commit 756baca27fff3ecaeab9dbc7a5ee35a1d7bc0c7f upstream.
+
+Inodes that are inserted into the perag tree but still under
+construction are flagged with the XFS_INEW bit. Most contexts either
+skip such inodes when they are encountered or have the ability to
+handle them.
+
+The runtime quotaoff sequence introduces a context that must wait
+for construction of such inodes to correctly ensure that all dquots
+in the fs are released. In anticipation of this, support the ability
+to wait on new inodes. Wake the appropriate bit when XFS_INEW is
+cleared.
+
+Signed-off-by: Brian Foster <bfoster@redhat.com>
+Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/xfs/xfs_icache.c |    5 ++++-
+ fs/xfs/xfs_inode.h  |    4 +++-
+ 2 files changed, 7 insertions(+), 2 deletions(-)
+
+--- a/fs/xfs/xfs_icache.c
++++ b/fs/xfs/xfs_icache.c
+@@ -368,14 +368,17 @@ xfs_iget_cache_hit(
+               error = xfs_reinit_inode(mp, inode);
+               if (error) {
++                      bool wake;
+                       /*
+                        * Re-initializing the inode failed, and we are in deep
+                        * trouble.  Try to re-add it to the reclaim list.
+                        */
+                       rcu_read_lock();
+                       spin_lock(&ip->i_flags_lock);
+-
++                      wake = !!__xfs_iflags_test(ip, XFS_INEW);
+                       ip->i_flags &= ~(XFS_INEW | XFS_IRECLAIM);
++                      if (wake)
++                              wake_up_bit(&ip->i_flags, __XFS_INEW_BIT);
+                       ASSERT(ip->i_flags & XFS_IRECLAIMABLE);
+                       trace_xfs_iget_reclaim_fail(ip);
+                       goto out_error;
+--- a/fs/xfs/xfs_inode.h
++++ b/fs/xfs/xfs_inode.h
+@@ -217,7 +217,8 @@ static inline bool xfs_is_reflink_inode(
+ #define XFS_IRECLAIM          (1 << 0) /* started reclaiming this inode */
+ #define XFS_ISTALE            (1 << 1) /* inode has been staled */
+ #define XFS_IRECLAIMABLE      (1 << 2) /* inode can be reclaimed */
+-#define XFS_INEW              (1 << 3) /* inode has just been allocated */
++#define __XFS_INEW_BIT                3        /* inode has just been allocated */
++#define XFS_INEW              (1 << __XFS_INEW_BIT)
+ #define XFS_ITRUNCATED                (1 << 5) /* truncated down so flush-on-close */
+ #define XFS_IDIRTY_RELEASE    (1 << 6) /* dirty release already seen */
+ #define __XFS_IFLOCK_BIT      7        /* inode is being flushed right now */
+@@ -467,6 +468,7 @@ static inline void xfs_finish_inode_setu
+       xfs_iflags_clear(ip, XFS_INEW);
+       barrier();
+       unlock_new_inode(VFS_I(ip));
++      wake_up_bit(&ip->i_flags, __XFS_INEW_BIT);
+ }
+ static inline void xfs_setup_existing_inode(struct xfs_inode *ip)
diff --git a/queue-4.9/xfs-update-ag-iterator-to-support-wait-on-new-inodes.patch b/queue-4.9/xfs-update-ag-iterator-to-support-wait-on-new-inodes.patch
new file mode 100644 (file)
index 0000000..891c3fa
--- /dev/null
@@ -0,0 +1,188 @@
+From ae2c4ac2dd39b23a87ddb14ceddc3f2872c6aef5 Mon Sep 17 00:00:00 2001
+From: Brian Foster <bfoster@redhat.com>
+Date: Wed, 26 Apr 2017 08:30:39 -0700
+Subject: xfs: update ag iterator to support wait on new inodes
+
+From: Brian Foster <bfoster@redhat.com>
+
+commit ae2c4ac2dd39b23a87ddb14ceddc3f2872c6aef5 upstream.
+
+The AG inode iterator currently skips new inodes as such inodes are
+inserted into the inode radix tree before they are fully
+constructed. Certain contexts require the ability to wait on the
+construction of new inodes, however. The fs-wide dquot release from
+the quotaoff sequence is an example of this.
+
+Update the AG inode iterator to support the ability to wait on
+inodes flagged with XFS_INEW upon request. Create a new
+xfs_inode_ag_iterator_flags() interface and support a set of
+iteration flags to modify the iteration behavior. When the
+XFS_AGITER_INEW_WAIT flag is set, include XFS_INEW flags in the
+radix tree inode lookup and wait on them before the callback is
+executed.
+
+Signed-off-by: Brian Foster <bfoster@redhat.com>
+Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/xfs/xfs_icache.c |   53 ++++++++++++++++++++++++++++++++++++++++++++--------
+ fs/xfs/xfs_icache.h |    8 +++++++
+ 2 files changed, 53 insertions(+), 8 deletions(-)
+
+--- a/fs/xfs/xfs_icache.c
++++ b/fs/xfs/xfs_icache.c
+@@ -264,6 +264,22 @@ xfs_inode_clear_reclaim_tag(
+       xfs_perag_clear_reclaim_tag(pag);
+ }
++static void
++xfs_inew_wait(
++      struct xfs_inode        *ip)
++{
++      wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_INEW_BIT);
++      DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_INEW_BIT);
++
++      do {
++              prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
++              if (!xfs_iflags_test(ip, XFS_INEW))
++                      break;
++              schedule();
++      } while (true);
++      finish_wait(wq, &wait.wait);
++}
++
+ /*
+  * When we recycle a reclaimable inode, we need to re-initialise the VFS inode
+  * part of the structure. This is made more complex by the fact we store
+@@ -628,9 +644,11 @@ out_error_or_again:
+ STATIC int
+ xfs_inode_ag_walk_grab(
+-      struct xfs_inode        *ip)
++      struct xfs_inode        *ip,
++      int                     flags)
+ {
+       struct inode            *inode = VFS_I(ip);
++      bool                    newinos = !!(flags & XFS_AGITER_INEW_WAIT);
+       ASSERT(rcu_read_lock_held());
+@@ -648,7 +666,8 @@ xfs_inode_ag_walk_grab(
+               goto out_unlock_noent;
+       /* avoid new or reclaimable inodes. Leave for reclaim code to flush */
+-      if (__xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM))
++      if ((!newinos && __xfs_iflags_test(ip, XFS_INEW)) ||
++          __xfs_iflags_test(ip, XFS_IRECLAIMABLE | XFS_IRECLAIM))
+               goto out_unlock_noent;
+       spin_unlock(&ip->i_flags_lock);
+@@ -676,7 +695,8 @@ xfs_inode_ag_walk(
+                                          void *args),
+       int                     flags,
+       void                    *args,
+-      int                     tag)
++      int                     tag,
++      int                     iter_flags)
+ {
+       uint32_t                first_index;
+       int                     last_error = 0;
+@@ -718,7 +738,7 @@ restart:
+               for (i = 0; i < nr_found; i++) {
+                       struct xfs_inode *ip = batch[i];
+-                      if (done || xfs_inode_ag_walk_grab(ip))
++                      if (done || xfs_inode_ag_walk_grab(ip, iter_flags))
+                               batch[i] = NULL;
+                       /*
+@@ -746,6 +766,9 @@ restart:
+               for (i = 0; i < nr_found; i++) {
+                       if (!batch[i])
+                               continue;
++                      if ((iter_flags & XFS_AGITER_INEW_WAIT) &&
++                          xfs_iflags_test(batch[i], XFS_INEW))
++                              xfs_inew_wait(batch[i]);
+                       error = execute(batch[i], flags, args);
+                       IRELE(batch[i]);
+                       if (error == -EAGAIN) {
+@@ -825,12 +848,13 @@ xfs_cowblocks_worker(
+ }
+ int
+-xfs_inode_ag_iterator(
++xfs_inode_ag_iterator_flags(
+       struct xfs_mount        *mp,
+       int                     (*execute)(struct xfs_inode *ip, int flags,
+                                          void *args),
+       int                     flags,
+-      void                    *args)
++      void                    *args,
++      int                     iter_flags)
+ {
+       struct xfs_perag        *pag;
+       int                     error = 0;
+@@ -840,7 +864,8 @@ xfs_inode_ag_iterator(
+       ag = 0;
+       while ((pag = xfs_perag_get(mp, ag))) {
+               ag = pag->pag_agno + 1;
+-              error = xfs_inode_ag_walk(mp, pag, execute, flags, args, -1);
++              error = xfs_inode_ag_walk(mp, pag, execute, flags, args, -1,
++                                        iter_flags);
+               xfs_perag_put(pag);
+               if (error) {
+                       last_error = error;
+@@ -852,6 +877,17 @@ xfs_inode_ag_iterator(
+ }
+ int
++xfs_inode_ag_iterator(
++      struct xfs_mount        *mp,
++      int                     (*execute)(struct xfs_inode *ip, int flags,
++                                         void *args),
++      int                     flags,
++      void                    *args)
++{
++      return xfs_inode_ag_iterator_flags(mp, execute, flags, args, 0);
++}
++
++int
+ xfs_inode_ag_iterator_tag(
+       struct xfs_mount        *mp,
+       int                     (*execute)(struct xfs_inode *ip, int flags,
+@@ -868,7 +904,8 @@ xfs_inode_ag_iterator_tag(
+       ag = 0;
+       while ((pag = xfs_perag_get_tag(mp, ag, tag))) {
+               ag = pag->pag_agno + 1;
+-              error = xfs_inode_ag_walk(mp, pag, execute, flags, args, tag);
++              error = xfs_inode_ag_walk(mp, pag, execute, flags, args, tag,
++                                        0);
+               xfs_perag_put(pag);
+               if (error) {
+                       last_error = error;
+--- a/fs/xfs/xfs_icache.h
++++ b/fs/xfs/xfs_icache.h
+@@ -48,6 +48,11 @@ struct xfs_eofblocks {
+ #define XFS_IGET_UNTRUSTED    0x2
+ #define XFS_IGET_DONTCACHE    0x4
++/*
++ * flags for AG inode iterator
++ */
++#define XFS_AGITER_INEW_WAIT  0x1     /* wait on new inodes */
++
+ int xfs_iget(struct xfs_mount *mp, struct xfs_trans *tp, xfs_ino_t ino,
+            uint flags, uint lock_flags, xfs_inode_t **ipp);
+@@ -79,6 +84,9 @@ void xfs_cowblocks_worker(struct work_st
+ int xfs_inode_ag_iterator(struct xfs_mount *mp,
+       int (*execute)(struct xfs_inode *ip, int flags, void *args),
+       int flags, void *args);
++int xfs_inode_ag_iterator_flags(struct xfs_mount *mp,
++      int (*execute)(struct xfs_inode *ip, int flags, void *args),
++      int flags, void *args, int iter_flags);
+ int xfs_inode_ag_iterator_tag(struct xfs_mount *mp,
+       int (*execute)(struct xfs_inode *ip, int flags, void *args),
+       int flags, void *args, int tag);
diff --git a/queue-4.9/xfs-wait-on-new-inodes-during-quotaoff-dquot-release.patch b/queue-4.9/xfs-wait-on-new-inodes-during-quotaoff-dquot-release.patch
new file mode 100644 (file)
index 0000000..fd321dc
--- /dev/null
@@ -0,0 +1,54 @@
+From e20c8a517f259cb4d258e10b0cd5d4b30d4167a0 Mon Sep 17 00:00:00 2001
+From: Brian Foster <bfoster@redhat.com>
+Date: Wed, 26 Apr 2017 08:30:40 -0700
+Subject: xfs: wait on new inodes during quotaoff dquot release
+
+From: Brian Foster <bfoster@redhat.com>
+
+commit e20c8a517f259cb4d258e10b0cd5d4b30d4167a0 upstream.
+
+The quotaoff operation has a race with inode allocation that results
+in a livelock. An inode allocation that occurs before the quota
+status flags are updated acquires the appropriate dquots for the
+inode via xfs_qm_vop_dqalloc(). It then inserts the XFS_INEW inode
+into the perag radix tree, sometime later attaches the dquots to the
+inode and finally clears the XFS_INEW flag. Quotaoff expects to
+release the dquots from all inodes in the filesystem via
+xfs_qm_dqrele_all_inodes(). This invokes the AG inode iterator,
+which skips inodes in the XFS_INEW state because they are not fully
+constructed. If the scan occurs after dquots have been attached to
+an inode, but before XFS_INEW is cleared, the newly allocated inode
+will continue to hold a reference to the applicable dquots. When
+quotaoff invokes xfs_qm_dqpurge_all(), the reference count of those
+dquot(s) remains elevated and the dqpurge scan spins indefinitely.
+
+To address this problem, update the xfs_qm_dqrele_all_inodes() scan
+to wait on inodes marked in the XFS_INEW state. We wait on the
+inodes explicitly rather than skip and retry to avoid continuous
+retry loops due to a parallel inode allocation workload. Since
+quotaoff updates the quota state flags and uses a synchronous
+transaction before the dqrele scan, and dquots are attached to
+inodes after radix tree insertion iff quota is enabled, one INEW
+waiting pass through the AG guarantees that the scan has processed
+all inodes that could possibly hold dquot references.
+
+Reported-by: Eryu Guan <eguan@redhat.com>
+Signed-off-by: Brian Foster <bfoster@redhat.com>
+Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/xfs/xfs_qm_syscalls.c |    3 ++-
+ 1 file changed, 2 insertions(+), 1 deletion(-)
+
+--- a/fs/xfs/xfs_qm_syscalls.c
++++ b/fs/xfs/xfs_qm_syscalls.c
+@@ -759,5 +759,6 @@ xfs_qm_dqrele_all_inodes(
+       uint             flags)
+ {
+       ASSERT(mp->m_quotainfo);
+-      xfs_inode_ag_iterator(mp, xfs_dqrele_inode, flags, NULL);
++      xfs_inode_ag_iterator_flags(mp, xfs_dqrele_inode, flags, NULL,
++                                  XFS_AGITER_INEW_WAIT);
+ }
diff --git a/queue-4.9/xfs-xfs_trans_alloc_empty.patch b/queue-4.9/xfs-xfs_trans_alloc_empty.patch
new file mode 100644 (file)
index 0000000..6fd4ffb
--- /dev/null
@@ -0,0 +1,65 @@
+From hch@lst.de  Mon Jun  5 17:05:12 2017
+From: Christoph Hellwig <hch@lst.de>
+Date: Sat,  3 Jun 2017 15:18:31 +0200
+Subject: xfs: xfs_trans_alloc_empty
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, "Darrick J . Wong" <darrick.wong@oracle.com>
+Message-ID: <20170603131836.26661-21-hch@lst.de>
+
+From: Christoph Hellwig <hch@lst.de>
+
+This is a partial cherry-pick of commit e89c041338
+("xfs: implement the GETFSMAP ioctl"), which also adds this helper, and
+is a great example of why feature patches should be properly split into
+their parts.
+
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+[hch: split from the larger patch for -stable]
+Signed-off-by: Christoph Hellwig <hch@lst.de>
+---
+ fs/xfs/xfs_trans.c |   22 ++++++++++++++++++++++
+ fs/xfs/xfs_trans.h |    2 ++
+ 2 files changed, 24 insertions(+)
+
+--- a/fs/xfs/xfs_trans.c
++++ b/fs/xfs/xfs_trans.c
+@@ -263,6 +263,28 @@ xfs_trans_alloc(
+ }
+ /*
++ * Create an empty transaction with no reservation.  This is a defensive
++ * mechanism for routines that query metadata without actually modifying
++ * them -- if the metadata being queried is somehow cross-linked (think a
++ * btree block pointer that points higher in the tree), we risk deadlock.
++ * However, blocks grabbed as part of a transaction can be re-grabbed.
++ * The verifiers will notice the corrupt block and the operation will fail
++ * back to userspace without deadlocking.
++ *
++ * Note the zero-length reservation; this transaction MUST be cancelled
++ * without any dirty data.
++ */
++int
++xfs_trans_alloc_empty(
++      struct xfs_mount                *mp,
++      struct xfs_trans                **tpp)
++{
++      struct xfs_trans_res            resv = {0};
++
++      return xfs_trans_alloc(mp, &resv, 0, 0, XFS_TRANS_NO_WRITECOUNT, tpp);
++}
++
++/*
+  * Record the indicated change to the given field for application
+  * to the file system's superblock when the transaction commits.
+  * For now, just store the change in the transaction structure.
+--- a/fs/xfs/xfs_trans.h
++++ b/fs/xfs/xfs_trans.h
+@@ -159,6 +159,8 @@ typedef struct xfs_trans {
+ int           xfs_trans_alloc(struct xfs_mount *mp, struct xfs_trans_res *resp,
+                       uint blocks, uint rtextents, uint flags,
+                       struct xfs_trans **tpp);
++int           xfs_trans_alloc_empty(struct xfs_mount *mp,
++                      struct xfs_trans **tpp);
+ void          xfs_trans_mod_sb(xfs_trans_t *, uint, int64_t);
+ struct xfs_buf        *xfs_trans_get_buf_map(struct xfs_trans *tp,