4.9-stable patches
author     Greg Kroah-Hartman <gregkh@linuxfoundation.org>
           Mon, 18 Sep 2017 08:11:55 +0000 (10:11 +0200)
committer  Greg Kroah-Hartman <gregkh@linuxfoundation.org>
           Mon, 18 Sep 2017 08:11:55 +0000 (10:11 +0200)
added patches:
xfs-check-_alloc_read_agf-buffer-pointer-before-using.patch
xfs-check-_btree_check_block-value.patch
xfs-don-t-allow-bmap-on-rt-files.patch
xfs-don-t-crash-on-unexpected-holes-in-dir-attr-btrees.patch
xfs-fix-inobt-inode-allocation-search-optimization.patch
xfs-fix-multi-ag-deadlock-in-xfs_bunmapi.patch
xfs-fix-per-inode-dax-flag-inheritance.patch
xfs-fix-quotacheck-dquot-id-overflow-infinite-loop.patch
xfs-fix-spurious-spin_is_locked-assert-failures-on-non-smp-kernels.patch
xfs-free-cowblocks-and-retry-on-buffered-write-enospc.patch
xfs-free-uncommitted-transactions-during-log-recovery.patch
xfs-move-handling-of-missing-page-into-one-place-in-xfs_find_get_desired_pgoff.patch
xfs-push-buffer-of-flush-locked-dquot-to-avoid-quotacheck-deadlock.patch
xfs-release-bli-from-transaction-properly-on-fs-shutdown.patch
xfs-remove-bli-from-ail-before-release-on-transaction-abort.patch
xfs-set-firstfsb-to-nullfsblock-before-feeding-it-to-_bmapi_write.patch
xfs-try-to-avoid-blowing-out-the-transaction-reservation-when-bunmaping-a-shared-extent.patch

18 files changed:
queue-4.9/series
queue-4.9/xfs-check-_alloc_read_agf-buffer-pointer-before-using.patch [new file with mode: 0644]
queue-4.9/xfs-check-_btree_check_block-value.patch [new file with mode: 0644]
queue-4.9/xfs-don-t-allow-bmap-on-rt-files.patch [new file with mode: 0644]
queue-4.9/xfs-don-t-crash-on-unexpected-holes-in-dir-attr-btrees.patch [new file with mode: 0644]
queue-4.9/xfs-fix-inobt-inode-allocation-search-optimization.patch [new file with mode: 0644]
queue-4.9/xfs-fix-multi-ag-deadlock-in-xfs_bunmapi.patch [new file with mode: 0644]
queue-4.9/xfs-fix-per-inode-dax-flag-inheritance.patch [new file with mode: 0644]
queue-4.9/xfs-fix-quotacheck-dquot-id-overflow-infinite-loop.patch [new file with mode: 0644]
queue-4.9/xfs-fix-spurious-spin_is_locked-assert-failures-on-non-smp-kernels.patch [new file with mode: 0644]
queue-4.9/xfs-free-cowblocks-and-retry-on-buffered-write-enospc.patch [new file with mode: 0644]
queue-4.9/xfs-free-uncommitted-transactions-during-log-recovery.patch [new file with mode: 0644]
queue-4.9/xfs-move-handling-of-missing-page-into-one-place-in-xfs_find_get_desired_pgoff.patch [new file with mode: 0644]
queue-4.9/xfs-push-buffer-of-flush-locked-dquot-to-avoid-quotacheck-deadlock.patch [new file with mode: 0644]
queue-4.9/xfs-release-bli-from-transaction-properly-on-fs-shutdown.patch [new file with mode: 0644]
queue-4.9/xfs-remove-bli-from-ail-before-release-on-transaction-abort.patch [new file with mode: 0644]
queue-4.9/xfs-set-firstfsb-to-nullfsblock-before-feeding-it-to-_bmapi_write.patch [new file with mode: 0644]
queue-4.9/xfs-try-to-avoid-blowing-out-the-transaction-reservation-when-bunmaping-a-shared-extent.patch [new file with mode: 0644]

diff --git a/queue-4.9/series b/queue-4.9/series
index e5d0b0047c4f307318a9158b8677abc2aaf42554..cc556c6deeafee4c2a1edb008fd70249dc0599a4 100644
@@ -28,3 +28,20 @@ f2fs-check-hot_data-for-roll-forward-recovery.patch
 x86-fsgsbase-64-fully-initialize-fs-and-gs-state-in-start_thread_common.patch
 x86-fsgsbase-64-report-fsbase-and-gsbase-correctly-in-core-dumps.patch
 x86-switch_to-64-rewrite-fs-gs-switching-yet-again-to-fix-amd-cpus.patch
+xfs-move-handling-of-missing-page-into-one-place-in-xfs_find_get_desired_pgoff.patch
+xfs-fix-spurious-spin_is_locked-assert-failures-on-non-smp-kernels.patch
+xfs-push-buffer-of-flush-locked-dquot-to-avoid-quotacheck-deadlock.patch
+xfs-try-to-avoid-blowing-out-the-transaction-reservation-when-bunmaping-a-shared-extent.patch
+xfs-release-bli-from-transaction-properly-on-fs-shutdown.patch
+xfs-remove-bli-from-ail-before-release-on-transaction-abort.patch
+xfs-don-t-allow-bmap-on-rt-files.patch
+xfs-free-uncommitted-transactions-during-log-recovery.patch
+xfs-free-cowblocks-and-retry-on-buffered-write-enospc.patch
+xfs-don-t-crash-on-unexpected-holes-in-dir-attr-btrees.patch
+xfs-check-_btree_check_block-value.patch
+xfs-set-firstfsb-to-nullfsblock-before-feeding-it-to-_bmapi_write.patch
+xfs-check-_alloc_read_agf-buffer-pointer-before-using.patch
+xfs-fix-quotacheck-dquot-id-overflow-infinite-loop.patch
+xfs-fix-multi-ag-deadlock-in-xfs_bunmapi.patch
+xfs-fix-per-inode-dax-flag-inheritance.patch
+xfs-fix-inobt-inode-allocation-search-optimization.patch
diff --git a/queue-4.9/xfs-check-_alloc_read_agf-buffer-pointer-before-using.patch b/queue-4.9/xfs-check-_alloc_read_agf-buffer-pointer-before-using.patch
new file mode 100644
index 0000000..1947cb9
--- /dev/null
@@ -0,0 +1,50 @@
+From hch@lst.de  Mon Sep 18 10:08:45 2017
+From: Christoph Hellwig <hch@lst.de>
+Date: Sun, 17 Sep 2017 14:06:38 -0700
+Subject: xfs: check _alloc_read_agf buffer pointer before using
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, "Darrick J. Wong" <darrick.wong@oracle.com>
+Message-ID: <20170917210712.10804-14-hch@lst.de>
+
+
+From: "Darrick J. Wong" <darrick.wong@oracle.com>
+
+commit 10479e2dea83d4c421ad05dfc55d918aa8dfc0cd upstream.
+
+In some circumstances, _alloc_read_agf can return an error code of zero
+but also a null AGF buffer pointer.  Check for this and jump out.
+
+Fixes-coverity-id: 1415250
+Fixes-coverity-id: 1415320
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Reviewed-by: Brian Foster <bfoster@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/libxfs/xfs_refcount.c |    4 ++++
+ fs/xfs/xfs_reflink.c         |    2 ++
+ 2 files changed, 6 insertions(+)
+
+--- a/fs/xfs/libxfs/xfs_refcount.c
++++ b/fs/xfs/libxfs/xfs_refcount.c
+@@ -1640,6 +1640,10 @@ xfs_refcount_recover_cow_leftovers(
+       error = xfs_alloc_read_agf(mp, tp, agno, 0, &agbp);
+       if (error)
+               goto out_trans;
++      if (!agbp) {
++              error = -ENOMEM;
++              goto out_trans;
++      }
+       cur = xfs_refcountbt_init_cursor(mp, tp, agbp, agno, NULL);
+       /* Find all the leftover CoW staging extents. */
+--- a/fs/xfs/xfs_reflink.c
++++ b/fs/xfs/xfs_reflink.c
+@@ -169,6 +169,8 @@ xfs_reflink_find_shared(
+       error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agbp);
+       if (error)
+               return error;
++      if (!agbp)
++              return -ENOMEM;
+       cur = xfs_refcountbt_init_cursor(mp, NULL, agbp, agno, NULL);
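
A minimal userspace sketch of the bug class fixed above: a lookup that reports
success through its return code can still hand back a NULL object through its
out parameter, and a caller that only tests the return code then dereferences
NULL. Hypothetical names, not XFS code:

#include <stdio.h>
#include <stdlib.h>

struct buf { int data; };

/* Returns 0 ("success") but may still leave *out NULL, e.g. on a miss. */
static int read_buf(int hit, struct buf **out)
{
    *out = hit ? malloc(sizeof(**out)) : NULL;
    return 0;                   /* no error reported either way */
}

int main(void)
{
    struct buf *bp;

    if (read_buf(0, &bp))
        return 1;
    if (!bp)                    /* the added guard: map NULL to an error */
        return 1;               /* without it: NULL dereference below */
    bp->data = 42;
    printf("%d\n", bp->data);
    free(bp);
    return 0;
}
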
diff --git a/queue-4.9/xfs-check-_btree_check_block-value.patch b/queue-4.9/xfs-check-_btree_check_block-value.patch
new file mode 100644
index 0000000..f4e3ccc
--- /dev/null
@@ -0,0 +1,48 @@
+From hch@lst.de  Mon Sep 18 10:08:11 2017
+From: Christoph Hellwig <hch@lst.de>
+Date: Sun, 17 Sep 2017 14:06:36 -0700
+Subject: xfs: check _btree_check_block value
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, "Darrick J. Wong" <darrick.wong@oracle.com>
+Message-ID: <20170917210712.10804-12-hch@lst.de>
+
+
+From: "Darrick J. Wong" <darrick.wong@oracle.com>
+
+commit 1e86eabe73b73c82e1110c746ed3ec6d5e1c0a0d upstream.
+
+Check the _btree_check_block return value for the firstrec and lastrec
+functions, since we have the ability to signal that the repositioning
+did not succeed.
+
+Fixes-coverity-id: 114067
+Fixes-coverity-id: 114068
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Reviewed-by: Brian Foster <bfoster@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/libxfs/xfs_btree.c |    6 ++++--
+ 1 file changed, 4 insertions(+), 2 deletions(-)
+
+--- a/fs/xfs/libxfs/xfs_btree.c
++++ b/fs/xfs/libxfs/xfs_btree.c
+@@ -714,7 +714,8 @@ xfs_btree_firstrec(
+        * Get the block pointer for this level.
+        */
+       block = xfs_btree_get_block(cur, level, &bp);
+-      xfs_btree_check_block(cur, block, level, bp);
++      if (xfs_btree_check_block(cur, block, level, bp))
++              return 0;
+       /*
+        * It's empty, there is no such record.
+        */
+@@ -743,7 +744,8 @@ xfs_btree_lastrec(
+        * Get the block pointer for this level.
+        */
+       block = xfs_btree_get_block(cur, level, &bp);
+-      xfs_btree_check_block(cur, block, level, bp);
++      if (xfs_btree_check_block(cur, block, level, bp))
++              return 0;
+       /*
+        * It's empty, there is no such record.
+        */
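
The mistake being fixed here, calling a checking function and discarding its
verdict, can also be caught at build time. A generic C sketch using a compiler
attribute (a general GCC/Clang technique; the kernel's equivalent is
__must_check, which xfs_btree_check_block does not carry in this tree):

/* Build with: gcc -Wall -c sketch.c -- the bare call below warns. */
#include <stdio.h>

__attribute__((warn_unused_result))
static int check_block(int block)
{
    return block < 0 ? -1 : 0;    /* nonzero means a bad block */
}

int main(void)
{
    check_block(-5);              /* warning: ignoring return value */

    if (check_block(-5))          /* the fixed pattern: act on the verdict */
        return 1;
    puts("block ok");
    return 0;
}
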
diff --git a/queue-4.9/xfs-don-t-allow-bmap-on-rt-files.patch b/queue-4.9/xfs-don-t-allow-bmap-on-rt-files.patch
new file mode 100644
index 0000000..b0d1f4d
--- /dev/null
@@ -0,0 +1,42 @@
+From 61d819e7bcb7f33da710bf3f5dcb2bcf1e48203c Mon Sep 17 00:00:00 2001
+From: "Darrick J. Wong" <darrick.wong@oracle.com>
+Date: Mon, 19 Jun 2017 13:19:08 -0700
+Subject: xfs: don't allow bmap on rt files
+
+From: Darrick J. Wong <darrick.wong@oracle.com>
+
+commit 61d819e7bcb7f33da710bf3f5dcb2bcf1e48203c upstream.
+
+bmap returns a dumb LBA address but not the block device that goes with
+that LBA.  Swapfiles don't care about this and will blindly assume that
+the data volume is the correct blockdev, which is totally bogus for
+files on the rt subvolume.  This results in the swap code doing IOs to
+arbitrary locations on the data device(!) if the passed in mapping is a
+realtime file, so just turn off bmap for rt files.
+
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+
+---
+ fs/xfs/xfs_aops.c |    7 +++++--
+ 1 file changed, 5 insertions(+), 2 deletions(-)
+
+--- a/fs/xfs/xfs_aops.c
++++ b/fs/xfs/xfs_aops.c
+@@ -1566,9 +1566,12 @@ xfs_vm_bmap(
+        * The swap code (ab-)uses ->bmap to get a block mapping and then
+        * bypasses the file system for actual I/O.  We really can't allow
+        * that on reflinks inodes, so we have to skip out here.  And yes,
+-       * 0 is the magic code for a bmap error..
++       * 0 is the magic code for a bmap error.
++       *
++       * Since we don't pass back blockdev info, we can't return bmap
++       * information for rt files either.
+        */
+-      if (xfs_is_reflink_inode(ip)) {
++      if (xfs_is_reflink_inode(ip) || XFS_IS_REALTIME_INODE(ip)) {
+               xfs_iunlock(ip, XFS_IOLOCK_SHARED);
+               return 0;
+       }
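
Userspace reaches this same ->bmap() path through the FIBMAP ioctl, which
makes the commit message concrete: the caller gets back a bare block number
and nothing identifying which device it belongs to. A small Linux sketch
(needs root/CAP_SYS_RAWIO; after this patch it reports block 0, the
error/hole value, for realtime files):

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/fs.h>

int main(int argc, char **argv)
{
    int fd, block = 0;    /* in: logical block 0; out: physical block */

    if (argc < 2 || (fd = open(argv[1], O_RDONLY)) < 0)
        return 1;
    if (ioctl(fd, FIBMAP, &block) < 0) {
        perror("FIBMAP");
        close(fd);
        return 1;
    }
    /* A bare LBA -- nothing says data device vs. rt device. */
    printf("physical block %d\n", block);
    close(fd);
    return 0;
}
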
diff --git a/queue-4.9/xfs-don-t-crash-on-unexpected-holes-in-dir-attr-btrees.patch b/queue-4.9/xfs-don-t-crash-on-unexpected-holes-in-dir-attr-btrees.patch
new file mode 100644
index 0000000..f3bf13b
--- /dev/null
@@ -0,0 +1,87 @@
+From hch@lst.de  Mon Sep 18 10:07:57 2017
+From: Christoph Hellwig <hch@lst.de>
+Date: Sun, 17 Sep 2017 14:06:35 -0700
+Subject: xfs: don't crash on unexpected holes in dir/attr btrees
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, "Darrick J. Wong" <darrick.wong@oracle.com>
+Message-ID: <20170917210712.10804-11-hch@lst.de>
+
+
+From: "Darrick J. Wong" <darrick.wong@oracle.com>
+
+commit cd87d867920155911d0d2e6485b769d853547750 upstream.
+
+In quite a few places we call xfs_da_read_buf with a mappedbno that we
+don't control, then assume that the function passes back either an error
+code or a buffer pointer.  Unfortunately, if mappedbno == -2 and bno
+maps to a hole, we get a return code of zero and a NULL buffer, which
+means that we crash if we actually try to use that buffer pointer.  This
+happens immediately when we set the buffer type for transaction context.
+
+Therefore, check that we have no error code and a non-NULL bp before
+trying to use bp.  This patch is a follow-up to an incomplete fix in
+96a3aefb8ffde231 ("xfs: don't crash if reading a directory results in an
+unexpected hole").
+
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/libxfs/xfs_attr_leaf.c  |    2 +-
+ fs/xfs/libxfs/xfs_da_btree.c   |    2 +-
+ fs/xfs/libxfs/xfs_dir2_block.c |    2 +-
+ fs/xfs/libxfs/xfs_dir2_leaf.c  |    4 ++--
+ 4 files changed, 5 insertions(+), 5 deletions(-)
+
+--- a/fs/xfs/libxfs/xfs_attr_leaf.c
++++ b/fs/xfs/libxfs/xfs_attr_leaf.c
+@@ -351,7 +351,7 @@ xfs_attr3_leaf_read(
+       err = xfs_da_read_buf(tp, dp, bno, mappedbno, bpp,
+                               XFS_ATTR_FORK, &xfs_attr3_leaf_buf_ops);
+-      if (!err && tp)
++      if (!err && tp && *bpp)
+               xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_ATTR_LEAF_BUF);
+       return err;
+ }
+--- a/fs/xfs/libxfs/xfs_da_btree.c
++++ b/fs/xfs/libxfs/xfs_da_btree.c
+@@ -263,7 +263,7 @@ xfs_da3_node_read(
+       err = xfs_da_read_buf(tp, dp, bno, mappedbno, bpp,
+                                       which_fork, &xfs_da3_node_buf_ops);
+-      if (!err && tp) {
++      if (!err && tp && *bpp) {
+               struct xfs_da_blkinfo   *info = (*bpp)->b_addr;
+               int                     type;
+--- a/fs/xfs/libxfs/xfs_dir2_block.c
++++ b/fs/xfs/libxfs/xfs_dir2_block.c
+@@ -139,7 +139,7 @@ xfs_dir3_block_read(
+       err = xfs_da_read_buf(tp, dp, mp->m_dir_geo->datablk, -1, bpp,
+                               XFS_DATA_FORK, &xfs_dir3_block_buf_ops);
+-      if (!err && tp)
++      if (!err && tp && *bpp)
+               xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_BLOCK_BUF);
+       return err;
+ }
+--- a/fs/xfs/libxfs/xfs_dir2_leaf.c
++++ b/fs/xfs/libxfs/xfs_dir2_leaf.c
+@@ -268,7 +268,7 @@ xfs_dir3_leaf_read(
+       err = xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp,
+                               XFS_DATA_FORK, &xfs_dir3_leaf1_buf_ops);
+-      if (!err && tp)
++      if (!err && tp && *bpp)
+               xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_LEAF1_BUF);
+       return err;
+ }
+@@ -285,7 +285,7 @@ xfs_dir3_leafn_read(
+       err = xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp,
+                               XFS_DATA_FORK, &xfs_dir3_leafn_buf_ops);
+-      if (!err && tp)
++      if (!err && tp && *bpp)
+               xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_LEAFN_BUF);
+       return err;
+ }
diff --git a/queue-4.9/xfs-fix-inobt-inode-allocation-search-optimization.patch b/queue-4.9/xfs-fix-inobt-inode-allocation-search-optimization.patch
new file mode 100644
index 0000000..9b8eea3
--- /dev/null
@@ -0,0 +1,58 @@
+From hch@lst.de  Mon Sep 18 10:09:48 2017
+From: Christoph Hellwig <hch@lst.de>
+Date: Sun, 17 Sep 2017 14:06:42 -0700
+Subject: xfs: fix inobt inode allocation search optimization
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, Omar Sandoval <osandov@fb.com>, "Darrick J . Wong" <darrick.wong@oracle.com>
+Message-ID: <20170917210712.10804-18-hch@lst.de>
+
+
+From: Omar Sandoval <osandov@fb.com>
+
+commit c44245b3d5435f533ca8346ece65918f84c057f9 upstream.
+
+When we try to allocate a free inode by searching the inobt, we try to
+find the inode nearest the parent inode by searching chunks both left
+and right of the chunk containing the parent. As an optimization, we
+cache the leftmost and rightmost records that we previously searched; if
+we do another allocation with the same parent inode, we'll pick up the
+search where it last left off.
+
+There's a bug in the case where we found a free inode to the left of the
+parent's chunk: we need to update the cached left and right records, but
+because we already reassigned the right record to point to the left, we
+end up assigning the left record to both the cached left and right
+records.
+
+This isn't a correctness problem strictly, but it can result in the next
+allocation rechecking chunks unnecessarily or allocating inodes further
+away from the parent than it needs to. Fix it by swapping the record
+pointer after we update the cached left and right records.
+
+Fixes: bd169565993b ("xfs: speed up free inode search")
+Signed-off-by: Omar Sandoval <osandov@fb.com>
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/libxfs/xfs_ialloc.c |    2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/fs/xfs/libxfs/xfs_ialloc.c
++++ b/fs/xfs/libxfs/xfs_ialloc.c
+@@ -1236,13 +1236,13 @@ xfs_dialloc_ag_inobt(
+                       /* free inodes to the left? */
+                       if (useleft && trec.ir_freecount) {
+-                              rec = trec;
+                               xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+                               cur = tcur;
+                               pag->pagl_leftrec = trec.ir_startino;
+                               pag->pagl_rightrec = rec.ir_startino;
+                               pag->pagl_pagino = pagino;
++                              rec = trec;
+                               goto alloc_inode;
+                       }
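
The one-line move is easier to see outside the kernel: the cached right-hand
record must capture rec's old value, so rec may only be overwritten with trec
after both cache updates. A reduced sketch with hypothetical fields:

#include <stdio.h>

struct irec { int startino; };

int main(void)
{
    struct irec rec  = { 200 };    /* right-hand record */
    struct irec trec = { 100 };    /* left-hand record (free inode found) */

    /* Pre-patch order: rec is clobbered before the cache reads it. */
    rec = trec;
    printf("buggy: left=%d right=%d\n", trec.startino, rec.startino);

    /* Post-patch order: update the cache first, then reassign. */
    rec = (struct irec){ 200 };
    printf("fixed: left=%d right=%d\n", trec.startino, rec.startino);
    rec = trec;    /* now safe */
    return 0;
}
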
diff --git a/queue-4.9/xfs-fix-multi-ag-deadlock-in-xfs_bunmapi.patch b/queue-4.9/xfs-fix-multi-ag-deadlock-in-xfs_bunmapi.patch
new file mode 100644
index 0000000..6014e22
--- /dev/null
@@ -0,0 +1,52 @@
+From hch@lst.de  Mon Sep 18 10:09:24 2017
+From: Christoph Hellwig <hch@lst.de>
+Date: Sun, 17 Sep 2017 14:06:40 -0700
+Subject: xfs: fix multi-AG deadlock in xfs_bunmapi
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, "Darrick J . Wong" <darrick.wong@oracle.com>
+Message-ID: <20170917210712.10804-16-hch@lst.de>
+
+
+commit 5b094d6dac0451ad89b1dc088395c7b399b7e9e8 upstream.
+
+Just like in the allocator we must avoid touching multiple AGs out of
+order when freeing blocks, as freeing still locks the AGF and can cause
+the same AB-BA deadlocks as in the allocation path.
+
+Signed-off-by: Christoph Hellwig <hch@lst.de>
+Reported-by: Nikolay Borisov <n.borisov.lkml@gmail.com>
+Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/libxfs/xfs_bmap.c |   12 ++++++++++++
+ 1 file changed, 12 insertions(+)
+
+--- a/fs/xfs/libxfs/xfs_bmap.c
++++ b/fs/xfs/libxfs/xfs_bmap.c
+@@ -5556,6 +5556,7 @@ __xfs_bunmapi(
+       xfs_fsblock_t           sum;
+       xfs_filblks_t           len = *rlen;    /* length to unmap in file */
+       xfs_fileoff_t           max_len;
++      xfs_agnumber_t          prev_agno = NULLAGNUMBER, agno;
+       trace_xfs_bunmap(ip, bno, len, flags, _RET_IP_);
+@@ -5658,6 +5659,17 @@ __xfs_bunmapi(
+               ASSERT(ep != NULL);
+               del = got;
+               wasdel = isnullstartblock(del.br_startblock);
++
++              /*
++               * Make sure we don't touch multiple AGF headers out of order
++               * in a single transaction, as that could cause AB-BA deadlocks.
++               */
++              if (!wasdel) {
++                      agno = XFS_FSB_TO_AGNO(mp, del.br_startblock);
++                      if (prev_agno != NULLAGNUMBER && prev_agno > agno)
++                              break;
++                      prev_agno = agno;
++              }
+               if (got.br_startoff < start) {
+                       del.br_startoff = start;
+                       del.br_blockcount -= start - got.br_startoff;
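
The guard reduces to a classic lock-ordering rule: within one transaction,
only take per-AG locks in ascending order, and if the next extent would go
backwards, stop so the caller can commit and start a fresh transaction. A
generic sketch of that rule, not the XFS code itself:

#include <stdio.h>

#define NULLAG (-1)

/* Returns 0 when locking agno now would invert the order. */
static int lock_agf_in_order(int agno, int *prev_agno)
{
    if (*prev_agno != NULLAG && *prev_agno > agno)
        return 0;    /* stop; caller rolls the transaction */
    *prev_agno = agno;
    printf("locked AGF %d\n", agno);
    return 1;
}

int main(void)
{
    int extents[] = { 1, 3, 2 };    /* third extent goes backwards */
    int prev = NULLAG;

    for (int i = 0; i < 3; i++) {
        if (!lock_agf_in_order(extents[i], &prev)) {
            printf("deferring AG %d to the next transaction\n", extents[i]);
            break;
        }
    }
    return 0;
}
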
diff --git a/queue-4.9/xfs-fix-per-inode-dax-flag-inheritance.patch b/queue-4.9/xfs-fix-per-inode-dax-flag-inheritance.patch
new file mode 100644
index 0000000..018c7fd
--- /dev/null
@@ -0,0 +1,72 @@
+From hch@lst.de  Mon Sep 18 10:09:34 2017
+From: Christoph Hellwig <hch@lst.de>
+Date: Sun, 17 Sep 2017 14:06:41 -0700
+Subject: xfs: Fix per-inode DAX flag inheritance
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, Lukas Czerner <lczerner@redhat.com>, "Darrick J . Wong" <darrick.wong@oracle.com>
+Message-ID: <20170917210712.10804-17-hch@lst.de>
+
+
+From: Lukas Czerner <lczerner@redhat.com>
+
+commit 56bdf855e676f1f2ed7033f288f57dfd315725ba upstream.
+
+According to the commit that implemented per-inode DAX flag:
+commit 58f88ca2df72 ("xfs: introduce per-inode DAX enablement")
+the flag is supposed to act as "inherit flag".
+
+Currently this only works in the situations where parent directory
+already has a flag in di_flags set, otherwise inheritance does not
+work. This is because setting the XFS_DIFLAG2_DAX flag is done in a
+wrong branch designated for di_flags, not di_flags2.
+
+Fix this by moving the code to branch designated for setting di_flags2,
+which does test for flags in di_flags2.
+
+Fixes: 58f88ca2df72 ("xfs: introduce per-inode DAX enablement")
+Signed-off-by: Lukas Czerner <lczerner@redhat.com>
+Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/xfs_inode.c |   12 +++++++-----
+ 1 file changed, 7 insertions(+), 5 deletions(-)
+
+--- a/fs/xfs/xfs_inode.c
++++ b/fs/xfs/xfs_inode.c
+@@ -881,7 +881,6 @@ xfs_ialloc(
+       case S_IFREG:
+       case S_IFDIR:
+               if (pip && (pip->i_d.di_flags & XFS_DIFLAG_ANY)) {
+-                      uint64_t        di_flags2 = 0;
+                       uint            di_flags = 0;
+                       if (S_ISDIR(mode)) {
+@@ -918,20 +917,23 @@ xfs_ialloc(
+                               di_flags |= XFS_DIFLAG_NODEFRAG;
+                       if (pip->i_d.di_flags & XFS_DIFLAG_FILESTREAM)
+                               di_flags |= XFS_DIFLAG_FILESTREAM;
+-                      if (pip->i_d.di_flags2 & XFS_DIFLAG2_DAX)
+-                              di_flags2 |= XFS_DIFLAG2_DAX;
+                       ip->i_d.di_flags |= di_flags;
+-                      ip->i_d.di_flags2 |= di_flags2;
+               }
+               if (pip &&
+                   (pip->i_d.di_flags2 & XFS_DIFLAG2_ANY) &&
+                   pip->i_d.di_version == 3 &&
+                   ip->i_d.di_version == 3) {
++                      uint64_t        di_flags2 = 0;
++
+                       if (pip->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE) {
+-                              ip->i_d.di_flags2 |= XFS_DIFLAG2_COWEXTSIZE;
++                              di_flags2 |= XFS_DIFLAG2_COWEXTSIZE;
+                               ip->i_d.di_cowextsize = pip->i_d.di_cowextsize;
+                       }
++                      if (pip->i_d.di_flags2 & XFS_DIFLAG2_DAX)
++                              di_flags2 |= XFS_DIFLAG2_DAX;
++
++                      ip->i_d.di_flags2 |= di_flags2;
+               }
+               /* FALLTHROUGH */
+       case S_IFLNK:
diff --git a/queue-4.9/xfs-fix-quotacheck-dquot-id-overflow-infinite-loop.patch b/queue-4.9/xfs-fix-quotacheck-dquot-id-overflow-infinite-loop.patch
new file mode 100644
index 0000000..5888b8c
--- /dev/null
@@ -0,0 +1,41 @@
+From hch@lst.de  Mon Sep 18 10:09:04 2017
+From: Christoph Hellwig <hch@lst.de>
+Date: Sun, 17 Sep 2017 14:06:39 -0700
+Subject: xfs: fix quotacheck dquot id overflow infinite loop
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, Brian Foster <bfoster@redhat.com>, "Darrick J . Wong" <darrick.wong@oracle.com>
+Message-ID: <20170917210712.10804-15-hch@lst.de>
+
+
+From: Brian Foster <bfoster@redhat.com>
+
+commit cfaf2d034360166e569a4929dd83ae9698bed856 upstream.
+
+If a dquot has an id of U32_MAX, the next lookup index increment
+overflows the uint32_t back to 0. This starts the lookup sequence
+over from the beginning, repeats indefinitely and results in a
+livelock.
+
+Update xfs_qm_dquot_walk() to explicitly check for the lookup
+overflow and exit the loop.
+
+Signed-off-by: Brian Foster <bfoster@redhat.com>
+Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/xfs_qm.c |    3 +++
+ 1 file changed, 3 insertions(+)
+
+--- a/fs/xfs/xfs_qm.c
++++ b/fs/xfs/xfs_qm.c
+@@ -111,6 +111,9 @@ restart:
+                       skipped = 0;
+                       break;
+               }
++              /* we're done if id overflows back to zero */
++              if (!next_index)
++                      break;
+       }
+       if (skipped) {
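
The livelock comes down to well-defined unsigned wraparound: incrementing a
uint32_t cursor past U32_MAX yields 0, which silently restarts the walk from
the first dquot. A few standalone lines show the wrap and the added check:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    uint32_t next_index = UINT32_MAX;    /* id of the last possible dquot */

    next_index += 1;    /* unsigned wrap: 0, not an overflow trap */
    if (!next_index) {  /* the added check: id space exhausted, stop */
        puts("lookup index wrapped; ending the walk");
        return 0;
    }
    return 1;           /* pre-patch: walk restarts at id 0 forever */
}
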
diff --git a/queue-4.9/xfs-fix-spurious-spin_is_locked-assert-failures-on-non-smp-kernels.patch b/queue-4.9/xfs-fix-spurious-spin_is_locked-assert-failures-on-non-smp-kernels.patch
new file mode 100644
index 0000000..c93869c
--- /dev/null
@@ -0,0 +1,73 @@
+From 95989c46d2a156365867b1d795fdefce71bce378 Mon Sep 17 00:00:00 2001
+From: Brian Foster <bfoster@redhat.com>
+Date: Thu, 8 Jun 2017 08:23:07 -0700
+Subject: xfs: fix spurious spin_is_locked() assert failures on non-smp kernels
+
+From: Brian Foster <bfoster@redhat.com>
+
+commit 95989c46d2a156365867b1d795fdefce71bce378 upstream.
+
+The 0-day kernel test robot reports assertion failures on
+!CONFIG_SMP kernels due to failed spin_is_locked() checks. As it
+turns out, spin_is_locked() is hardcoded to return zero on
+!CONFIG_SMP kernels and so this function cannot be relied on to
+verify spinlock state in this configuration.
+
+To avoid this problem, replace the associated asserts with lockdep
+variants that do the right thing regardless of kernel configuration.
+Drop the one assert that checks for an unlocked lock as there is no
+suitable lockdep variant for that case. This moves the spinlock
+checks from XFS debug code to lockdep, but generally provides the
+same level of protection.
+
+Reported-by: kbuild test robot <fengguang.wu@intel.com>
+Signed-off-by: Brian Foster <bfoster@redhat.com>
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/xfs/xfs_buf.c    |    2 +-
+ fs/xfs/xfs_icache.c |    5 ++---
+ 2 files changed, 3 insertions(+), 4 deletions(-)
+
+--- a/fs/xfs/xfs_buf.c
++++ b/fs/xfs/xfs_buf.c
+@@ -116,7 +116,7 @@ static inline void
+ __xfs_buf_ioacct_dec(
+       struct xfs_buf  *bp)
+ {
+-      ASSERT(spin_is_locked(&bp->b_lock));
++      lockdep_assert_held(&bp->b_lock);
+       if (bp->b_state & XFS_BSTATE_IN_FLIGHT) {
+               bp->b_state &= ~XFS_BSTATE_IN_FLIGHT;
+--- a/fs/xfs/xfs_icache.c
++++ b/fs/xfs/xfs_icache.c
+@@ -66,7 +66,6 @@ xfs_inode_alloc(
+       XFS_STATS_INC(mp, vn_active);
+       ASSERT(atomic_read(&ip->i_pincount) == 0);
+-      ASSERT(!spin_is_locked(&ip->i_flags_lock));
+       ASSERT(!xfs_isiflocked(ip));
+       ASSERT(ip->i_ino == 0);
+@@ -192,7 +191,7 @@ xfs_perag_set_reclaim_tag(
+ {
+       struct xfs_mount        *mp = pag->pag_mount;
+-      ASSERT(spin_is_locked(&pag->pag_ici_lock));
++      lockdep_assert_held(&pag->pag_ici_lock);
+       if (pag->pag_ici_reclaimable++)
+               return;
+@@ -214,7 +213,7 @@ xfs_perag_clear_reclaim_tag(
+ {
+       struct xfs_mount        *mp = pag->pag_mount;
+-      ASSERT(spin_is_locked(&pag->pag_ici_lock));
++      lockdep_assert_held(&pag->pag_ici_lock);
+       if (--pag->pag_ici_reclaimable)
+               return;
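
The root cause is that on uniprocessor builds the spinlock implementation is
compiled away and the "is it locked?" query becomes a constant. A simplified
model of that behavior (CONFIG_SMP here is just a demo macro, not the real
kernel plumbing):

#include <assert.h>
#include <stdio.h>

#ifdef CONFIG_SMP
static int lock_word;
#define spin_lock()        (lock_word = 1)
#define spin_is_locked()   (lock_word)
#else
#define spin_lock()        ((void)0)    /* no-op on UP */
#define spin_is_locked()   (0)          /* hardcoded "unlocked" */
#endif

int main(void)
{
    spin_lock();
    /* Passes with -DCONFIG_SMP, aborts without it: the spurious assert. */
    assert(spin_is_locked());
    puts("assertion held");
    return 0;
}
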
diff --git a/queue-4.9/xfs-free-cowblocks-and-retry-on-buffered-write-enospc.patch b/queue-4.9/xfs-free-cowblocks-and-retry-on-buffered-write-enospc.patch
new file mode 100644
index 0000000..201a257
--- /dev/null
@@ -0,0 +1,39 @@
+From hch@lst.de  Mon Sep 18 10:07:40 2017
+From: Christoph Hellwig <hch@lst.de>
+Date: Sun, 17 Sep 2017 14:06:34 -0700
+Subject: xfs: free cowblocks and retry on buffered write ENOSPC
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, Brian Foster <bfoster@redhat.com>, "Darrick J . Wong" <darrick.wong@oracle.com>
+Message-ID: <20170917210712.10804-10-hch@lst.de>
+
+
+From: Brian Foster <bfoster@redhat.com>
+
+commit cf2cb7845d6e101cb17bd62f8aa08cd514fc8988 upstream.
+
+XFS runs an eofblocks reclaim scan before returning an ENOSPC error to
+userspace for buffered writes. This facilitates aggressive speculative
+preallocation without causing user visible side effects such as
+premature ENOSPC.
+
+Run a cowblocks scan in the same situation to reclaim lingering COW fork
+preallocation throughout the filesystem.
+
+Signed-off-by: Brian Foster <bfoster@redhat.com>
+Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/xfs_file.c |    1 +
+ 1 file changed, 1 insertion(+)
+
+--- a/fs/xfs/xfs_file.c
++++ b/fs/xfs/xfs_file.c
+@@ -729,6 +729,7 @@ write_retry:
+               xfs_rw_iunlock(ip, iolock);
+               eofb.eof_flags = XFS_EOF_FLAGS_SYNC;
+               xfs_icache_free_eofblocks(ip->i_mount, &eofb);
++              xfs_icache_free_cowblocks(ip->i_mount, &eofb);
+               goto write_retry;
+       }
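
The surrounding (elided) logic is a one-shot retry: the first ENOSPC runs the
reclaim scans and jumps back to write_retry, and only a second ENOSPC reaches
the caller. A standalone sketch of that shape, with stub reclaim hooks
standing in for the eofblocks/cowblocks scans:

#include <errno.h>
#include <stdio.h>

static int space;    /* free blocks */

static int try_write(void)       { return space > 0 ? 0 : -ENOSPC; }
static void free_eofblocks(void) { }               /* found nothing     */
static void free_cowblocks(void) { space += 8; }   /* reclaims prealloc */

int main(void)
{
    int enospc_tried = 0;
    int error;

write_retry:
    error = try_write();
    if (error == -ENOSPC && !enospc_tried) {
        enospc_tried = 1;       /* retry at most once */
        free_eofblocks();
        free_cowblocks();       /* the scan this patch adds */
        goto write_retry;
    }
    printf("write %s\n", error ? "failed" : "succeeded");
    return error ? 1 : 0;
}
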
diff --git a/queue-4.9/xfs-free-uncommitted-transactions-during-log-recovery.patch b/queue-4.9/xfs-free-uncommitted-transactions-during-log-recovery.patch
new file mode 100644
index 0000000..e820bd0
--- /dev/null
@@ -0,0 +1,102 @@
+From hch@lst.de  Mon Sep 18 10:07:17 2017
+From: Christoph Hellwig <hch@lst.de>
+Date: Sun, 17 Sep 2017 14:06:33 -0700
+Subject: xfs: free uncommitted transactions during log recovery
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, Brian Foster <bfoster@redhat.com>, "Darrick J . Wong" <darrick.wong@oracle.com>
+
+From: Brian Foster <bfoster@redhat.com>
+
+commit 39775431f82f890f4aaa08860a30883d081bffc7 upstream.
+
+Log recovery allocates in-core transaction and member item data
+structures on-demand as it processes the on-disk log. Transactions
+are allocated on first encounter on-disk and stored in a hash table
+structure where they are easily accessible for subsequent lookups.
+Transaction items are also allocated on demand and are attached to
+the associated transactions.
+
+When a commit record is encountered in the log, the transaction is
+committed to the fs and the in-core structures are freed. If a
+filesystem crashes or shuts down before all in-core log buffers are
+flushed to the log, however, not all transactions may have commit
+records in the log. As expected, the modifications in such an
+incomplete transaction are not replayed to the fs. The in-core data
+structures for the partial transaction are never freed, however,
+resulting in a memory leak.
+
+Update xlog_do_recovery_pass() to first correctly initialize the
+hash table array so empty lists can be distinguished from populated
+lists on function exit. Update xlog_recover_free_trans() to always
+remove the transaction from the list prior to freeing the associated
+memory. Finally, walk the hash table of transaction lists as the
+last step before it goes out of scope and free any transactions that
+may remain on the lists. This prevents a memory leak of partial
+transactions in the log.
+
+Signed-off-by: Brian Foster <bfoster@redhat.com>
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/xfs_log_recover.c |   21 ++++++++++++++++++++-
+ 1 file changed, 20 insertions(+), 1 deletion(-)
+
+--- a/fs/xfs/xfs_log_recover.c
++++ b/fs/xfs/xfs_log_recover.c
+@@ -4152,7 +4152,7 @@ xlog_recover_commit_trans(
+       #define XLOG_RECOVER_COMMIT_QUEUE_MAX 100
+-      hlist_del(&trans->r_list);
++      hlist_del_init(&trans->r_list);
+       error = xlog_recover_reorder_trans(log, trans, pass);
+       if (error)
+@@ -4354,6 +4354,8 @@ xlog_recover_free_trans(
+       xlog_recover_item_t     *item, *n;
+       int                     i;
++      hlist_del_init(&trans->r_list);
++
+       list_for_each_entry_safe(item, n, &trans->r_itemq, ri_list) {
+               /* Free the regions in the item. */
+               list_del(&item->ri_list);
+@@ -5222,12 +5224,16 @@ xlog_do_recovery_pass(
+       int                     error2 = 0;
+       int                     bblks, split_bblks;
+       int                     hblks, split_hblks, wrapped_hblks;
++      int                     i;
+       struct hlist_head       rhash[XLOG_RHASH_SIZE];
+       LIST_HEAD               (buffer_list);
+       ASSERT(head_blk != tail_blk);
+       rhead_blk = 0;
++      for (i = 0; i < XLOG_RHASH_SIZE; i++)
++              INIT_HLIST_HEAD(&rhash[i]);
++
+       /*
+        * Read the header of the tail block and get the iclog buffer size from
+        * h_size.  Use this to tell how many sectors make up the log header.
+@@ -5464,6 +5470,19 @@ xlog_do_recovery_pass(
+       if (error && first_bad)
+               *first_bad = rhead_blk;
++      /*
++       * Transactions are freed at commit time but transactions without commit
++       * records on disk are never committed. Free any that may be left in the
++       * hash table.
++       */
++      for (i = 0; i < XLOG_RHASH_SIZE; i++) {
++              struct hlist_node       *tmp;
++              struct xlog_recover     *trans;
++
++              hlist_for_each_entry_safe(trans, tmp, &rhash[i], r_list)
++                      xlog_recover_free_trans(trans);
++      }
++
+       return error ? error : error2;
+ }
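
Both halves of the fix -- initializing every bucket before use and sweeping
the buckets for stragglers on exit -- are visible in a reduced hash table. A
userspace sketch with a singly linked list standing in for hlist:

#include <stdio.h>
#include <stdlib.h>

#define RHASH_SIZE 4

struct trans { int tid; struct trans *next; };

int main(void)
{
    struct trans *rhash[RHASH_SIZE];
    int i;

    /* Part 1: buckets start empty, not as stack garbage. */
    for (i = 0; i < RHASH_SIZE; i++)
        rhash[i] = NULL;

    /* A transaction whose commit record never made it to disk. */
    struct trans *t = malloc(sizeof(*t));
    t->tid = 7;
    t->next = rhash[t->tid % RHASH_SIZE];
    rhash[t->tid % RHASH_SIZE] = t;

    /* Part 2: free whatever is still queued before the table vanishes. */
    for (i = 0; i < RHASH_SIZE; i++) {
        struct trans *p = rhash[i];
        while (p) {
            struct trans *tmp = p->next;    /* safe-iteration idiom */
            printf("freeing uncommitted trans %d\n", p->tid);
            free(p);
            p = tmp;
        }
    }
    return 0;
}
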
diff --git a/queue-4.9/xfs-move-handling-of-missing-page-into-one-place-in-xfs_find_get_desired_pgoff.patch b/queue-4.9/xfs-move-handling-of-missing-page-into-one-place-in-xfs_find_get_desired_pgoff.patch
new file mode 100644
index 0000000..ac55c57
--- /dev/null
@@ -0,0 +1,85 @@
+From a54fba8f5a0dc36161cacdf2aa90f007f702ec1a Mon Sep 17 00:00:00 2001
+From: Jan Kara <jack@suse.cz>
+Date: Thu, 18 May 2017 16:36:24 -0700
+Subject: xfs: Move handling of missing page into one place in xfs_find_get_desired_pgoff()
+
+From: Jan Kara <jack@suse.cz>
+
+commit a54fba8f5a0dc36161cacdf2aa90f007f702ec1a upstream.
+
+Currently several places in xfs_find_get_desired_pgoff() handle the case
+of a missing page. Make them all handled in one place after the loop has
+terminated.
+
+Signed-off-by: Jan Kara <jack@suse.cz>
+Reviewed-by: Brian Foster <bfoster@redhat.com>
+Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/xfs/xfs_file.c |   38 ++++++++------------------------------
+ 1 file changed, 8 insertions(+), 30 deletions(-)
+
+--- a/fs/xfs/xfs_file.c
++++ b/fs/xfs/xfs_file.c
+@@ -1139,29 +1139,8 @@ xfs_find_get_desired_pgoff(
+               want = min_t(pgoff_t, end - index, PAGEVEC_SIZE - 1) + 1;
+               nr_pages = pagevec_lookup(&pvec, inode->i_mapping, index,
+                                         want);
+-              /*
+-               * No page mapped into given range.  If we are searching holes
+-               * and if this is the first time we got into the loop, it means
+-               * that the given offset is landed in a hole, return it.
+-               *
+-               * If we have already stepped through some block buffers to find
+-               * holes but they all contains data.  In this case, the last
+-               * offset is already updated and pointed to the end of the last
+-               * mapped page, if it does not reach the endpoint to search,
+-               * that means there should be a hole between them.
+-               */
+-              if (nr_pages == 0) {
+-                      /* Data search found nothing */
+-                      if (type == DATA_OFF)
+-                              break;
+-
+-                      ASSERT(type == HOLE_OFF);
+-                      if (lastoff == startoff || lastoff < endoff) {
+-                              found = true;
+-                              *offset = lastoff;
+-                      }
++              if (nr_pages == 0)
+                       break;
+-              }
+               for (i = 0; i < nr_pages; i++) {
+                       struct page     *page = pvec.pages[i];
+@@ -1227,21 +1206,20 @@ xfs_find_get_desired_pgoff(
+               /*
+                * The number of returned pages less than our desired, search
+-               * done.  In this case, nothing was found for searching data,
+-               * but we found a hole behind the last offset.
++               * done.
+                */
+-              if (nr_pages < want) {
+-                      if (type == HOLE_OFF) {
+-                              *offset = lastoff;
+-                              found = true;
+-                      }
++              if (nr_pages < want)
+                       break;
+-              }
+               index = pvec.pages[i - 1]->index + 1;
+               pagevec_release(&pvec);
+       } while (index <= end);
++      /* No page at lastoff and we are not done - we found a hole. */
++      if (type == HOLE_OFF && lastoff < endoff) {
++              *offset = lastoff;
++              found = true;
++      }
+ out:
+       pagevec_release(&pvec);
+       return found;
diff --git a/queue-4.9/xfs-push-buffer-of-flush-locked-dquot-to-avoid-quotacheck-deadlock.patch b/queue-4.9/xfs-push-buffer-of-flush-locked-dquot-to-avoid-quotacheck-deadlock.patch
new file mode 100644
index 0000000..8972473
--- /dev/null
@@ -0,0 +1,197 @@
+From 7912e7fef2aebe577f0b46d3cba261f2783c5695 Mon Sep 17 00:00:00 2001
+From: Brian Foster <bfoster@redhat.com>
+Date: Wed, 14 Jun 2017 21:21:45 -0700
+Subject: xfs: push buffer of flush locked dquot to avoid quotacheck deadlock
+
+From: Brian Foster <bfoster@redhat.com>
+
+commit 7912e7fef2aebe577f0b46d3cba261f2783c5695 upstream.
+
+Reclaim during quotacheck can lead to deadlocks on the dquot flush
+lock:
+
+ - Quotacheck populates a local delwri queue with the physical dquot
+   buffers.
+ - Quotacheck performs the xfs_qm_dqusage_adjust() bulkstat and
+   dirties all of the dquots.
+ - Reclaim kicks in and attempts to flush a dquot whose buffer is
+   already queud on the quotacheck queue. The flush succeeds but
+   queueing to the reclaim delwri queue fails as the backing buffer is
+   already queued. The flush unlock is now deferred to I/O completion
+   of the buffer from the quotacheck queue.
+ - The dqadjust bulkstat continues and dirties the recently flushed
+   dquot once again.
+ - Quotacheck proceeds to the xfs_qm_flush_one() walk which requires
+   the flush lock to update the backing buffers with the in-core
+   recalculated values. It deadlocks on the redirtied dquot as the
+   flush lock was already acquired by reclaim, but the buffer resides
+   on the local delwri queue which isn't submitted until the end of
+   quotacheck.
+
+This is reproduced by running quotacheck on a filesystem with a
+couple million inodes in low memory (512MB-1GB) situations. This is
+a regression as of commit 43ff2122e6 ("xfs: on-stack delayed write
+buffer lists"), which removed a trylock and buffer I/O submission
+from the quotacheck dquot flush sequence.
+
+Quotacheck first resets and collects the physical dquot buffers in a
+delwri queue. Then, it traverses the filesystem inodes via bulkstat,
+updates the in-core dquots, flushes the corrected dquots to the
+backing buffers and finally submits the delwri queue for I/O. Since
+the backing buffers are queued across the entire quotacheck
+operation, dquot reclaim cannot possibly complete a dquot flush
+before quotacheck completes.
+
+Therefore, quotacheck must submit the buffer for I/O in order to
+cycle the flush lock and flush the dirty in-core dquot to the
+buffer. Add a delwri queue buffer push mechanism to submit an
+individual buffer for I/O without losing the delwri queue status and
+use it from quotacheck to avoid the deadlock. This restores
+quotacheck behavior to as before the regression was introduced.
+
+Reported-by: Martin Svec <martin.svec@zoner.cz>
+Signed-off-by: Brian Foster <bfoster@redhat.com>
+Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/xfs/xfs_buf.c   |   60 +++++++++++++++++++++++++++++++++++++++++++++++++++++
+ fs/xfs/xfs_buf.h   |    1 
+ fs/xfs/xfs_qm.c    |   28 +++++++++++++++++++++++-
+ fs/xfs/xfs_trace.h |    1 
+ 4 files changed, 89 insertions(+), 1 deletion(-)
+
+--- a/fs/xfs/xfs_buf.c
++++ b/fs/xfs/xfs_buf.c
+@@ -2022,6 +2022,66 @@ xfs_buf_delwri_submit(
+       return error;
+ }
++/*
++ * Push a single buffer on a delwri queue.
++ *
++ * The purpose of this function is to submit a single buffer of a delwri queue
++ * and return with the buffer still on the original queue. The waiting delwri
++ * buffer submission infrastructure guarantees transfer of the delwri queue
++ * buffer reference to a temporary wait list. We reuse this infrastructure to
++ * transfer the buffer back to the original queue.
++ *
++ * Note the buffer transitions from the queued state, to the submitted and wait
++ * listed state and back to the queued state during this call. The buffer
++ * locking and queue management logic between _delwri_pushbuf() and
++ * _delwri_queue() guarantee that the buffer cannot be queued to another list
++ * before returning.
++ */
++int
++xfs_buf_delwri_pushbuf(
++      struct xfs_buf          *bp,
++      struct list_head        *buffer_list)
++{
++      LIST_HEAD               (submit_list);
++      int                     error;
++
++      ASSERT(bp->b_flags & _XBF_DELWRI_Q);
++
++      trace_xfs_buf_delwri_pushbuf(bp, _RET_IP_);
++
++      /*
++       * Isolate the buffer to a new local list so we can submit it for I/O
++       * independently from the rest of the original list.
++       */
++      xfs_buf_lock(bp);
++      list_move(&bp->b_list, &submit_list);
++      xfs_buf_unlock(bp);
++
++      /*
++       * Delwri submission clears the DELWRI_Q buffer flag and returns with
++       * the buffer on the wait list with an associated reference. Rather than
++       * bounce the buffer from a local wait list back to the original list
++       * after I/O completion, reuse the original list as the wait list.
++       */
++      xfs_buf_delwri_submit_buffers(&submit_list, buffer_list);
++
++      /*
++       * The buffer is now under I/O and wait listed as during typical delwri
++       * submission. Lock the buffer to wait for I/O completion. Rather than
++       * remove the buffer from the wait list and release the reference, we
++       * want to return with the buffer queued to the original list. The
++       * buffer already sits on the original list with a wait list reference,
++       * however. If we let the queue inherit that wait list reference, all we
++       * need to do is reset the DELWRI_Q flag.
++       */
++      xfs_buf_lock(bp);
++      error = bp->b_error;
++      bp->b_flags |= _XBF_DELWRI_Q;
++      xfs_buf_unlock(bp);
++
++      return error;
++}
++
+ int __init
+ xfs_buf_init(void)
+ {
+--- a/fs/xfs/xfs_buf.h
++++ b/fs/xfs/xfs_buf.h
+@@ -333,6 +333,7 @@ extern void xfs_buf_delwri_cancel(struct
+ extern bool xfs_buf_delwri_queue(struct xfs_buf *, struct list_head *);
+ extern int xfs_buf_delwri_submit(struct list_head *);
+ extern int xfs_buf_delwri_submit_nowait(struct list_head *);
++extern int xfs_buf_delwri_pushbuf(struct xfs_buf *, struct list_head *);
+ /* Buffer Daemon Setup Routines */
+ extern int xfs_buf_init(void);
+--- a/fs/xfs/xfs_qm.c
++++ b/fs/xfs/xfs_qm.c
+@@ -1247,6 +1247,7 @@ xfs_qm_flush_one(
+       struct xfs_dquot        *dqp,
+       void                    *data)
+ {
++      struct xfs_mount        *mp = dqp->q_mount;
+       struct list_head        *buffer_list = data;
+       struct xfs_buf          *bp = NULL;
+       int                     error = 0;
+@@ -1257,7 +1258,32 @@ xfs_qm_flush_one(
+       if (!XFS_DQ_IS_DIRTY(dqp))
+               goto out_unlock;
+-      xfs_dqflock(dqp);
++      /*
++       * The only way the dquot is already flush locked by the time quotacheck
++       * gets here is if reclaim flushed it before the dqadjust walk dirtied
++       * it for the final time. Quotacheck collects all dquot bufs in the
++       * local delwri queue before dquots are dirtied, so reclaim can't have
++       * possibly queued it for I/O. The only way out is to push the buffer to
++       * cycle the flush lock.
++       */
++      if (!xfs_dqflock_nowait(dqp)) {
++              /* buf is pinned in-core by delwri list */
++              DEFINE_SINGLE_BUF_MAP(map, dqp->q_blkno,
++                                    mp->m_quotainfo->qi_dqchunklen);
++              bp = _xfs_buf_find(mp->m_ddev_targp, &map, 1, 0, NULL);
++              if (!bp) {
++                      error = -EINVAL;
++                      goto out_unlock;
++              }
++              xfs_buf_unlock(bp);
++
++              xfs_buf_delwri_pushbuf(bp, buffer_list);
++              xfs_buf_rele(bp);
++
++              error = -EAGAIN;
++              goto out_unlock;
++      }
++
+       error = xfs_qm_dqflush(dqp, &bp);
+       if (error)
+               goto out_unlock;
+--- a/fs/xfs/xfs_trace.h
++++ b/fs/xfs/xfs_trace.h
+@@ -366,6 +366,7 @@ DEFINE_BUF_EVENT(xfs_buf_iowait_done);
+ DEFINE_BUF_EVENT(xfs_buf_delwri_queue);
+ DEFINE_BUF_EVENT(xfs_buf_delwri_queued);
+ DEFINE_BUF_EVENT(xfs_buf_delwri_split);
++DEFINE_BUF_EVENT(xfs_buf_delwri_pushbuf);
+ DEFINE_BUF_EVENT(xfs_buf_get_uncached);
+ DEFINE_BUF_EVENT(xfs_bdstrat_shut);
+ DEFINE_BUF_EVENT(xfs_buf_item_relse);
diff --git a/queue-4.9/xfs-release-bli-from-transaction-properly-on-fs-shutdown.patch b/queue-4.9/xfs-release-bli-from-transaction-properly-on-fs-shutdown.patch
new file mode 100644
index 0000000..2cd5655
--- /dev/null
@@ -0,0 +1,87 @@
+From 79e641ce29cfae5b8fc55fb77ac62d11d2d849c0 Mon Sep 17 00:00:00 2001
+From: Brian Foster <bfoster@redhat.com>
+Date: Wed, 14 Jun 2017 21:35:35 -0700
+Subject: xfs: release bli from transaction properly on fs shutdown
+
+From: Brian Foster <bfoster@redhat.com>
+
+commit 79e641ce29cfae5b8fc55fb77ac62d11d2d849c0 upstream.
+
+If a filesystem shutdown occurs with a buffer log item in the CIL
+and a log force occurs, the ->iop_unpin() handler is generally
+expected to tear down the bli properly. This entails freeing the bli
+memory and releasing the associated hold on the buffer so it can be
+released and the filesystem unmounted.
+
+If this sequence occurs while ->bli_refcount is elevated (i.e.,
+another transaction is open and attempting to modify the buffer),
+however, ->iop_unpin() may not be responsible for releasing the bli.
+Instead, the transaction may release the final ->bli_refcount
+reference and thus xfs_trans_brelse() is responsible for tearing
+down the bli.
+
+While xfs_trans_brelse() does drop the reference count, it only
+attempts to release the bli if it is clean (i.e., not in the
+CIL/AIL). If the filesystem is shutdown and the bli is sitting dirty
+in the CIL as noted above, this ends up skipping the last
+opportunity to release the bli. In turn, this leaves the hold on the
+buffer and causes an unmount hang. This can be reproduced by running
+generic/388 in repetition.
+
+Update xfs_trans_brelse() to handle this shutdown corner case
+correctly. If the final bli reference is dropped and the filesystem
+is shutdown, remove the bli from the AIL (if necessary) and release
+the bli to drop the buffer hold and ensure an unmount does not hang.
+
+Signed-off-by: Brian Foster <bfoster@redhat.com>
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Reviewed-by: Carlos Maiolino <cmaiolino@redhat.com>
+Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/xfs/xfs_trans_buf.c |   23 +++++++++++++++--------
+ 1 file changed, 15 insertions(+), 8 deletions(-)
+
+--- a/fs/xfs/xfs_trans_buf.c
++++ b/fs/xfs/xfs_trans_buf.c
+@@ -356,6 +356,7 @@ xfs_trans_brelse(xfs_trans_t       *tp,
+                xfs_buf_t      *bp)
+ {
+       xfs_buf_log_item_t      *bip;
++      int                     freed;
+       /*
+        * Default to a normal brelse() call if the tp is NULL.
+@@ -419,16 +420,22 @@ xfs_trans_brelse(xfs_trans_t     *tp,
+       /*
+        * Drop our reference to the buf log item.
+        */
+-      atomic_dec(&bip->bli_refcount);
++      freed = atomic_dec_and_test(&bip->bli_refcount);
+       /*
+-       * If the buf item is not tracking data in the log, then
+-       * we must free it before releasing the buffer back to the
+-       * free pool.  Before releasing the buffer to the free pool,
+-       * clear the transaction pointer in b_fsprivate2 to dissolve
+-       * its relation to this transaction.
+-       */
+-      if (!xfs_buf_item_dirty(bip)) {
++       * If the buf item is not tracking data in the log, then we must free it
++       * before releasing the buffer back to the free pool.
++       *
++       * If the fs has shutdown and we dropped the last reference, it may fall
++       * on us to release a (possibly dirty) bli if it never made it to the
++       * AIL (e.g., the aborted unpin already happened and didn't release it
++       * due to our reference). Since we're already shutdown and need xa_lock,
++       * just force remove from the AIL and release the bli here.
++       */
++      if (XFS_FORCED_SHUTDOWN(tp->t_mountp) && freed) {
++              xfs_trans_ail_remove(&bip->bli_item, SHUTDOWN_LOG_IO_ERROR);
++              xfs_buf_item_relse(bp);
++      } else if (!xfs_buf_item_dirty(bip)) {
+ /***
+               ASSERT(bp->b_pincount == 0);
+ ***/
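
The pivot of this fix is swapping atomic_dec() for atomic_dec_and_test(): the
former discards exactly the fact the new shutdown branch needs, namely whether
this caller dropped the final reference. The same idiom in portable C11
atomics:

#include <stdatomic.h>
#include <stdio.h>

int main(void)
{
    atomic_int refcount = 2;

    /* fetch_sub returns the old value; old == 1 means we hit zero. */
    int freed = atomic_fetch_sub(&refcount, 1) == 1;
    printf("first drop:  freed=%d\n", freed);   /* 0: still referenced */

    freed = atomic_fetch_sub(&refcount, 1) == 1;
    printf("second drop: freed=%d\n", freed);   /* 1: tear the object down */
    return 0;
}
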
diff --git a/queue-4.9/xfs-remove-bli-from-ail-before-release-on-transaction-abort.patch b/queue-4.9/xfs-remove-bli-from-ail-before-release-on-transaction-abort.patch
new file mode 100644
index 0000000..0c8d3ca
--- /dev/null
@@ -0,0 +1,79 @@
+From 3d4b4a3e30ae7a949c31e1e10268a3da4723d290 Mon Sep 17 00:00:00 2001
+From: Brian Foster <bfoster@redhat.com>
+Date: Wed, 14 Jun 2017 21:35:35 -0700
+Subject: xfs: remove bli from AIL before release on transaction abort
+
+From: Brian Foster <bfoster@redhat.com>
+
+commit 3d4b4a3e30ae7a949c31e1e10268a3da4723d290 upstream.
+
+When a buffer is modified, logged and committed, it ultimately ends
+up sitting on the AIL with a dirty bli waiting for metadata
+writeback. If another transaction locks and invalidates the buffer
+(freeing an inode chunk, for example) in the meantime, the bli is
+flagged as stale, the dirty state is cleared and the bli remains in
+the AIL.
+
+If a shutdown occurs before the transaction that has invalidated the
+buffer is committed, the transaction is ultimately aborted. The log
+items are flagged as such and ->iop_unlock() handles the aborted
+items. Because the bli is clean (due to the invalidation),
+->iop_unlock() unconditionally releases it. The log item may still
+reside in the AIL, however, which means the I/O completion handler
+may still run and attempt to access it. This results in assert
+failure due to the release of the bli while still present in the AIL
+and a subsequent NULL dereference and panic in the buffer I/O
+completion handling. This can be reproduced by running generic/388
+in repetition.
+
+To avoid this problem, update xfs_buf_item_unlock() to first check
+whether the bli is aborted and if so, remove it from the AIL before
+it is released. This ensures that the bli is no longer accessed
+during the shutdown sequence after it has been freed.
+
+Signed-off-by: Brian Foster <bfoster@redhat.com>
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Reviewed-by: Carlos Maiolino <cmaiolino@redhat.com>
+Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/xfs/xfs_buf_item.c |   21 ++++++++++++---------
+ 1 file changed, 12 insertions(+), 9 deletions(-)
+
+--- a/fs/xfs/xfs_buf_item.c
++++ b/fs/xfs/xfs_buf_item.c
+@@ -636,20 +636,23 @@ xfs_buf_item_unlock(
+       /*
+        * Clean buffers, by definition, cannot be in the AIL. However, aborted
+-       * buffers may be dirty and hence in the AIL. Therefore if we are
+-       * aborting a buffer and we've just taken the last refernce away, we
+-       * have to check if it is in the AIL before freeing it. We need to free
+-       * it in this case, because an aborted transaction has already shut the
+-       * filesystem down and this is the last chance we will have to do so.
++       * buffers may be in the AIL regardless of dirty state. An aborted
++       * transaction that invalidates a buffer already in the AIL may have
++       * marked it stale and cleared the dirty state, for example.
++       *
++       * Therefore if we are aborting a buffer and we've just taken the last
++       * reference away, we have to check if it is in the AIL before freeing
++       * it. We need to free it in this case, because an aborted transaction
++       * has already shut the filesystem down and this is the last chance we
++       * will have to do so.
+        */
+       if (atomic_dec_and_test(&bip->bli_refcount)) {
+-              if (clean)
+-                      xfs_buf_item_relse(bp);
+-              else if (aborted) {
++              if (aborted) {
+                       ASSERT(XFS_FORCED_SHUTDOWN(lip->li_mountp));
+                       xfs_trans_ail_remove(lip, SHUTDOWN_LOG_IO_ERROR);
+                       xfs_buf_item_relse(bp);
+-              }
++              } else if (clean)
++                      xfs_buf_item_relse(bp);
+       }
+       if (!(flags & XFS_BLI_HOLD))
diff --git a/queue-4.9/xfs-set-firstfsb-to-nullfsblock-before-feeding-it-to-_bmapi_write.patch b/queue-4.9/xfs-set-firstfsb-to-nullfsblock-before-feeding-it-to-_bmapi_write.patch
new file mode 100644
index 0000000..dc8ed59
--- /dev/null
@@ -0,0 +1,56 @@
+From hch@lst.de  Mon Sep 18 10:08:30 2017
+From: Christoph Hellwig <hch@lst.de>
+Date: Sun, 17 Sep 2017 14:06:37 -0700
+Subject: xfs: set firstfsb to NULLFSBLOCK before feeding it to _bmapi_write
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, "Darrick J. Wong" <darrick.wong@oracle.com>
+Message-ID: <20170917210712.10804-13-hch@lst.de>
+
+
+From: "Darrick J. Wong" <darrick.wong@oracle.com>
+
+commit 4c1a67bd3606540b9b42caff34a1d5cd94b1cf65 upstream.
+
+We must initialize the firstfsb parameter to _bmapi_write so that it
+doesn't incorrectly treat stack garbage as a restriction on which AGs
+it can search for free space.
+
+Fixes-coverity-id: 1402025
+Fixes-coverity-id: 1415167
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Reviewed-by: Brian Foster <bfoster@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/libxfs/xfs_bmap.c |    9 +++++++++
+ fs/xfs/xfs_reflink.c     |    2 +-
+ 2 files changed, 10 insertions(+), 1 deletion(-)
+
+--- a/fs/xfs/libxfs/xfs_bmap.c
++++ b/fs/xfs/libxfs/xfs_bmap.c
+@@ -6639,6 +6639,15 @@ xfs_bmap_finish_one(
+       bmap.br_blockcount = *blockcount;
+       bmap.br_state = state;
++      /*
++       * firstfsb is tied to the transaction lifetime and is used to
++       * ensure correct AG locking order and schedule work item
++       * continuations.  XFS_BUI_MAX_FAST_EXTENTS (== 1) restricts us
++       * to only making one bmap call per transaction, so it should
++       * be safe to have it as a local variable here.
++       */
++      firstfsb = NULLFSBLOCK;
++
+       trace_xfs_bmap_deferred(tp->t_mountp,
+                       XFS_FSB_TO_AGNO(tp->t_mountp, startblock), type,
+                       XFS_FSB_TO_AGBNO(tp->t_mountp, startblock),
+--- a/fs/xfs/xfs_reflink.c
++++ b/fs/xfs/xfs_reflink.c
+@@ -333,7 +333,7 @@ xfs_reflink_convert_cow_extent(
+       struct xfs_defer_ops            *dfops)
+ {
+       struct xfs_bmbt_irec            irec = *imap;
+-      xfs_fsblock_t                   first_block;
++      xfs_fsblock_t                   first_block = NULLFSBLOCK;
+       int                             nimaps = 1;
+       if (imap->br_state == XFS_EXT_NORM)
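
The bug class is an in/out parameter the caller never initializes, so the
callee reads stack garbage and treats it as a real constraint. The fix is to
start from the "no restriction" sentinel. A sketch with hypothetical names
echoing the firstfsb/NULLFSBLOCK convention:

#include <stdint.h>
#include <stdio.h>

#define NULLFSBLOCK UINT64_MAX    /* sentinel: no AG restriction */

static void bmapi_write(uint64_t *firstfsb)
{
    if (*firstfsb != NULLFSBLOCK)
        printf("search restricted by block %llu\n",
               (unsigned long long)*firstfsb);
    else
        printf("free to search every AG\n");
}

int main(void)
{
    uint64_t first_block = NULLFSBLOCK;    /* the fix: never pass garbage */

    bmapi_write(&first_block);
    return 0;
}
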
diff --git a/queue-4.9/xfs-try-to-avoid-blowing-out-the-transaction-reservation-when-bunmaping-a-shared-extent.patch b/queue-4.9/xfs-try-to-avoid-blowing-out-the-transaction-reservation-when-bunmaping-a-shared-extent.patch
new file mode 100644
index 0000000..3015622
--- /dev/null
@@ -0,0 +1,294 @@
+From e1a4e37cc7b665b6804fba812aca2f4d7402c249 Mon Sep 17 00:00:00 2001
+From: "Darrick J. Wong" <darrick.wong@oracle.com>
+Date: Wed, 14 Jun 2017 21:25:57 -0700
+Subject: xfs: try to avoid blowing out the transaction reservation when bunmaping a shared extent
+
+From: Darrick J. Wong <darrick.wong@oracle.com>
+
+commit e1a4e37cc7b665b6804fba812aca2f4d7402c249 upstream.
+
+In a pathological scenario where we are trying to bunmapi a single
+extent in which every other block is shared, it's possible that trying
+to unmap the entire large extent in a single transaction can generate so
+many EFIs that we overflow the transaction reservation.
+
+Therefore, use a heuristic to guess at the number of blocks we can
+safely unmap from a reflink file's data fork in a single transaction.
+This should prevent problems such as the log head slamming into the tail
+and ASSERTs that trigger because we've exceeded the transaction
+reservation.
+
+Note that since bunmapi can fail to unmap the entire range, we must also
+teach the deferred unmap code to roll into a new transaction whenever we
+get low on reservation.
+
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+[hch: random edits, all bugs are my fault]
+Signed-off-by: Christoph Hellwig <hch@lst.de>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/libxfs/xfs_bmap.c     |   37 ++++++++++++++++++++++++++++---------
+ fs/xfs/libxfs/xfs_bmap.h     |    2 +-
+ fs/xfs/libxfs/xfs_refcount.c |   10 +---------
+ fs/xfs/libxfs/xfs_refcount.h |   16 ++++++++++++++++
+ fs/xfs/xfs_bmap_item.c       |   17 +++++++++++++++--
+ fs/xfs/xfs_trans.h           |    2 +-
+ fs/xfs/xfs_trans_bmap.c      |   11 +++++++++--
+ 7 files changed, 71 insertions(+), 24 deletions(-)
+
+--- a/fs/xfs/libxfs/xfs_bmap.c
++++ b/fs/xfs/libxfs/xfs_bmap.c
+@@ -5555,6 +5555,7 @@ __xfs_bunmapi(
+       int                     whichfork;      /* data or attribute fork */
+       xfs_fsblock_t           sum;
+       xfs_filblks_t           len = *rlen;    /* length to unmap in file */
++      xfs_fileoff_t           max_len;
+       trace_xfs_bunmap(ip, bno, len, flags, _RET_IP_);
+@@ -5576,6 +5577,16 @@ __xfs_bunmapi(
+       ASSERT(len > 0);
+       ASSERT(nexts >= 0);
++      /*
++       * Guesstimate how many blocks we can unmap without running the risk of
++       * blowing out the transaction with a mix of EFIs and reflink
++       * adjustments.
++       */
++      if (xfs_is_reflink_inode(ip) && whichfork == XFS_DATA_FORK)
++              max_len = min(len, xfs_refcount_max_unmap(tp->t_log_res));
++      else
++              max_len = len;
++
+       if (!(ifp->if_flags & XFS_IFEXTENTS) &&
+           (error = xfs_iread_extents(tp, ip, whichfork)))
+               return error;
+@@ -5621,7 +5632,7 @@ __xfs_bunmapi(
+       extno = 0;
+       while (bno != (xfs_fileoff_t)-1 && bno >= start && lastx >= 0 &&
+-             (nexts == 0 || extno < nexts)) {
++             (nexts == 0 || extno < nexts) && max_len > 0) {
+               /*
+                * Is the found extent after a hole in which bno lives?
+                * Just back up to the previous extent, if so.
+@@ -5655,6 +5666,15 @@ __xfs_bunmapi(
+               }
+               if (del.br_startoff + del.br_blockcount > bno + 1)
+                       del.br_blockcount = bno + 1 - del.br_startoff;
++
++              /* How much can we safely unmap? */
++              if (max_len < del.br_blockcount) {
++                      del.br_startoff += del.br_blockcount - max_len;
++                      if (!wasdel)
++                              del.br_startblock += del.br_blockcount - max_len;
++                      del.br_blockcount = max_len;
++              }
++
+               sum = del.br_startblock + del.br_blockcount;
+               if (isrt &&
+                   (mod = do_mod(sum, mp->m_sb.sb_rextsize))) {
+@@ -5835,6 +5855,7 @@ __xfs_bunmapi(
+               if (!isrt && wasdel)
+                       xfs_mod_fdblocks(mp, (int64_t)del.br_blockcount, false);
++              max_len -= del.br_blockcount;
+               bno = del.br_startoff - 1;
+ nodelete:
+               /*
+@@ -6604,25 +6625,24 @@ xfs_bmap_finish_one(
+       int                             whichfork,
+       xfs_fileoff_t                   startoff,
+       xfs_fsblock_t                   startblock,
+-      xfs_filblks_t                   blockcount,
++      xfs_filblks_t                   *blockcount,
+       xfs_exntst_t                    state)
+ {
+       struct xfs_bmbt_irec            bmap;
+       int                             nimaps = 1;
+       xfs_fsblock_t                   firstfsb;
+       int                             flags = XFS_BMAPI_REMAP;
+-      int                             done;
+       int                             error = 0;
+       bmap.br_startblock = startblock;
+       bmap.br_startoff = startoff;
+-      bmap.br_blockcount = blockcount;
++      bmap.br_blockcount = *blockcount;
+       bmap.br_state = state;
+       trace_xfs_bmap_deferred(tp->t_mountp,
+                       XFS_FSB_TO_AGNO(tp->t_mountp, startblock), type,
+                       XFS_FSB_TO_AGBNO(tp->t_mountp, startblock),
+-                      ip->i_ino, whichfork, startoff, blockcount, state);
++                      ip->i_ino, whichfork, startoff, *blockcount, state);
+       if (whichfork != XFS_DATA_FORK && whichfork != XFS_ATTR_FORK)
+               return -EFSCORRUPTED;
+@@ -6641,12 +6661,11 @@ xfs_bmap_finish_one(
+                                       bmap.br_blockcount, flags, &firstfsb,
+                                       bmap.br_blockcount, &bmap, &nimaps,
+                                       dfops);
++              *blockcount = 0;
+               break;
+       case XFS_BMAP_UNMAP:
+-              error = xfs_bunmapi(tp, ip, bmap.br_startoff,
+-                              bmap.br_blockcount, flags, 1, &firstfsb,
+-                              dfops, &done);
+-              ASSERT(done);
++              error = __xfs_bunmapi(tp, ip, startoff, blockcount,
++                              XFS_BMAPI_REMAP, 1, &firstfsb, dfops);
+               break;
+       default:
+               ASSERT(0);
+--- a/fs/xfs/libxfs/xfs_bmap.h
++++ b/fs/xfs/libxfs/xfs_bmap.h
+@@ -265,7 +265,7 @@ struct xfs_bmap_intent {
+ int   xfs_bmap_finish_one(struct xfs_trans *tp, struct xfs_defer_ops *dfops,
+               struct xfs_inode *ip, enum xfs_bmap_intent_type type,
+               int whichfork, xfs_fileoff_t startoff, xfs_fsblock_t startblock,
+-              xfs_filblks_t blockcount, xfs_exntst_t state);
++              xfs_filblks_t *blockcount, xfs_exntst_t state);
+ int   xfs_bmap_map_extent(struct xfs_mount *mp, struct xfs_defer_ops *dfops,
+               struct xfs_inode *ip, struct xfs_bmbt_irec *imap);
+ int   xfs_bmap_unmap_extent(struct xfs_mount *mp, struct xfs_defer_ops *dfops,
+--- a/fs/xfs/libxfs/xfs_refcount.c
++++ b/fs/xfs/libxfs/xfs_refcount.c
+@@ -784,14 +784,6 @@ xfs_refcount_merge_extents(
+ }
+ /*
+- * While we're adjusting the refcounts records of an extent, we have
+- * to keep an eye on the number of extents we're dirtying -- run too
+- * many in a single transaction and we'll exceed the transaction's
+- * reservation and crash the fs.  Each record adds 12 bytes to the
+- * log (plus any key updates) so we'll conservatively assume 24 bytes
+- * per record.  We must also leave space for btree splits on both ends
+- * of the range and space for the CUD and a new CUI.
+- *
+  * XXX: This is a pretty hand-wavy estimate.  The penalty for guessing
+  * true incorrectly is a shutdown FS; the penalty for guessing false
+  * incorrectly is more transaction rolls than might be necessary.
+@@ -822,7 +814,7 @@ xfs_refcount_still_have_space(
+       else if (overhead > cur->bc_tp->t_log_res)
+               return false;
+       return  cur->bc_tp->t_log_res - overhead >
+-              cur->bc_private.a.priv.refc.nr_ops * 32;
++              cur->bc_private.a.priv.refc.nr_ops * XFS_REFCOUNT_ITEM_OVERHEAD;
+ }
+ /*
+--- a/fs/xfs/libxfs/xfs_refcount.h
++++ b/fs/xfs/libxfs/xfs_refcount.h
+@@ -67,4 +67,20 @@ extern int xfs_refcount_free_cow_extent(
+ extern int xfs_refcount_recover_cow_leftovers(struct xfs_mount *mp,
+               xfs_agnumber_t agno);
++/*
++ * While we're adjusting the refcounts records of an extent, we have
++ * to keep an eye on the number of extents we're dirtying -- run too
++ * many in a single transaction and we'll exceed the transaction's
++ * reservation and crash the fs.  Each record adds 12 bytes to the
++ * log (plus any key updates) so we'll conservatively assume 32 bytes
++ * per record.  We must also leave space for btree splits on both ends
++ * of the range and space for the CUD and a new CUI.
++ */
++#define XFS_REFCOUNT_ITEM_OVERHEAD    32
++
++static inline xfs_fileoff_t xfs_refcount_max_unmap(int log_res)
++{
++      return (log_res * 3 / 4) / XFS_REFCOUNT_ITEM_OVERHEAD;
++}
++
+ #endif        /* __XFS_REFCOUNT_H__ */
+--- a/fs/xfs/xfs_bmap_item.c
++++ b/fs/xfs/xfs_bmap_item.c
+@@ -395,6 +395,7 @@ xfs_bui_recover(
+       struct xfs_map_extent           *bmap;
+       xfs_fsblock_t                   startblock_fsb;
+       xfs_fsblock_t                   inode_fsb;
++      xfs_filblks_t                   count;
+       bool                            op_ok;
+       struct xfs_bud_log_item         *budp;
+       enum xfs_bmap_intent_type       type;
+@@ -403,6 +404,7 @@ xfs_bui_recover(
+       struct xfs_trans                *tp;
+       struct xfs_inode                *ip = NULL;
+       struct xfs_defer_ops            dfops;
++      struct xfs_bmbt_irec            irec;
+       xfs_fsblock_t                   firstfsb;
+       ASSERT(!test_bit(XFS_BUI_RECOVERED, &buip->bui_flags));
+@@ -480,13 +482,24 @@ xfs_bui_recover(
+       }
+       xfs_trans_ijoin(tp, ip, 0);
++      count = bmap->me_len;
+       error = xfs_trans_log_finish_bmap_update(tp, budp, &dfops, type,
+                       ip, whichfork, bmap->me_startoff,
+-                      bmap->me_startblock, bmap->me_len,
+-                      state);
++                      bmap->me_startblock, &count, state);
+       if (error)
+               goto err_dfops;
++      if (count > 0) {
++              ASSERT(type == XFS_BMAP_UNMAP);
++              irec.br_startblock = bmap->me_startblock;
++              irec.br_blockcount = count;
++              irec.br_startoff = bmap->me_startoff;
++              irec.br_state = state;
++              error = xfs_bmap_unmap_extent(tp->t_mountp, &dfops, ip, &irec);
++              if (error)
++                      goto err_dfops;
++      }
++
+       /* Finish transaction, free inodes. */
+       error = xfs_defer_finish(&tp, &dfops, NULL);
+       if (error)
+--- a/fs/xfs/xfs_trans.h
++++ b/fs/xfs/xfs_trans.h
+@@ -277,6 +277,6 @@ int xfs_trans_log_finish_bmap_update(str
+               struct xfs_bud_log_item *rudp, struct xfs_defer_ops *dfops,
+               enum xfs_bmap_intent_type type, struct xfs_inode *ip,
+               int whichfork, xfs_fileoff_t startoff, xfs_fsblock_t startblock,
+-              xfs_filblks_t blockcount, xfs_exntst_t state);
++              xfs_filblks_t *blockcount, xfs_exntst_t state);
+ #endif        /* __XFS_TRANS_H__ */
+--- a/fs/xfs/xfs_trans_bmap.c
++++ b/fs/xfs/xfs_trans_bmap.c
+@@ -63,7 +63,7 @@ xfs_trans_log_finish_bmap_update(
+       int                             whichfork,
+       xfs_fileoff_t                   startoff,
+       xfs_fsblock_t                   startblock,
+-      xfs_filblks_t                   blockcount,
++      xfs_filblks_t                   *blockcount,
+       xfs_exntst_t                    state)
+ {
+       int                             error;
+@@ -196,16 +196,23 @@ xfs_bmap_update_finish_item(
+       void                            **state)
+ {
+       struct xfs_bmap_intent          *bmap;
++      xfs_filblks_t                   count;
+       int                             error;
+       bmap = container_of(item, struct xfs_bmap_intent, bi_list);
++      count = bmap->bi_bmap.br_blockcount;
+       error = xfs_trans_log_finish_bmap_update(tp, done_item, dop,
+                       bmap->bi_type,
+                       bmap->bi_owner, bmap->bi_whichfork,
+                       bmap->bi_bmap.br_startoff,
+                       bmap->bi_bmap.br_startblock,
+-                      bmap->bi_bmap.br_blockcount,
++                      &count,
+                       bmap->bi_bmap.br_state);
++      if (!error && count > 0) {
++              ASSERT(bmap->bi_type == XFS_BMAP_UNMAP);
++              bmap->bi_bmap.br_blockcount = count;
++              return -EAGAIN;
++      }
+       kmem_free(bmap);
+       return error;
+ }
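Taken together, the pieces above cap each unmap and retry: __xfs_bunmapi() trims every candidate extent to max_len, xfs_bmap_finish_one() reports how much is left through *blockcount, and a nonzero leftover count makes xfs_bmap_update_finish_item() return -EAGAIN so the deferred-op machinery rolls into a fresh transaction and comes back for the remainder. A self-contained sketch of the reservation heuristic the patch adds to xfs_refcount.h, which keeps a quarter of the log reservation as headroom for btree splits and the CUD plus a new CUI, at 32 bytes per dirtied record (the sample t_log_res values are made up for illustration):

    #include <stdio.h>

    #define XFS_REFCOUNT_ITEM_OVERHEAD 32   /* bytes of log space per record */

    /* mirrors xfs_refcount_max_unmap() from the hunk above */
    static long max_unmap(long log_res)
    {
            return (log_res * 3 / 4) / XFS_REFCOUNT_ITEM_OVERHEAD;
    }

    int main(void)
    {
            long res[] = { 65536, 131072, 262144 };  /* hypothetical reservations */

            for (int i = 0; i < 3; i++)
                    printf("t_log_res=%6ld -> unmap at most %ld blocks per roll\n",
                           res[i], max_unmap(res[i]));
            return 0;
    }

With a 64 KiB reservation, for example, the cap works out to 49152 / 32 = 1536 blocks per transaction roll.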