From: Greg Kroah-Hartman Date: Mon, 18 Sep 2017 08:11:55 +0000 (+0200) Subject: 4.9-stable patches X-Git-Tag: v4.9.51~12 X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=60a196c617841e32455d50264a7b72ab74480489;p=thirdparty%2Fkernel%2Fstable-queue.git 4.9-stable patches added patches: xfs-check-_alloc_read_agf-buffer-pointer-before-using.patch xfs-check-_btree_check_block-value.patch xfs-don-t-allow-bmap-on-rt-files.patch xfs-don-t-crash-on-unexpected-holes-in-dir-attr-btrees.patch xfs-fix-inobt-inode-allocation-search-optimization.patch xfs-fix-multi-ag-deadlock-in-xfs_bunmapi.patch xfs-fix-per-inode-dax-flag-inheritance.patch xfs-fix-quotacheck-dquot-id-overflow-infinite-loop.patch xfs-fix-spurious-spin_is_locked-assert-failures-on-non-smp-kernels.patch xfs-free-cowblocks-and-retry-on-buffered-write-enospc.patch xfs-free-uncommitted-transactions-during-log-recovery.patch xfs-move-handling-of-missing-page-into-one-place-in-xfs_find_get_desired_pgoff.patch xfs-push-buffer-of-flush-locked-dquot-to-avoid-quotacheck-deadlock.patch xfs-release-bli-from-transaction-properly-on-fs-shutdown.patch xfs-remove-bli-from-ail-before-release-on-transaction-abort.patch xfs-set-firstfsb-to-nullfsblock-before-feeding-it-to-_bmapi_write.patch xfs-try-to-avoid-blowing-out-the-transaction-reservation-when-bunmaping-a-shared-extent.patch --- diff --git a/queue-4.9/series b/queue-4.9/series index e5d0b0047c4..cc556c6deea 100644 --- a/queue-4.9/series +++ b/queue-4.9/series @@ -28,3 +28,20 @@ f2fs-check-hot_data-for-roll-forward-recovery.patch x86-fsgsbase-64-fully-initialize-fs-and-gs-state-in-start_thread_common.patch x86-fsgsbase-64-report-fsbase-and-gsbase-correctly-in-core-dumps.patch x86-switch_to-64-rewrite-fs-gs-switching-yet-again-to-fix-amd-cpus.patch +xfs-move-handling-of-missing-page-into-one-place-in-xfs_find_get_desired_pgoff.patch +xfs-fix-spurious-spin_is_locked-assert-failures-on-non-smp-kernels.patch +xfs-push-buffer-of-flush-locked-dquot-to-avoid-quotacheck-deadlock.patch +xfs-try-to-avoid-blowing-out-the-transaction-reservation-when-bunmaping-a-shared-extent.patch +xfs-release-bli-from-transaction-properly-on-fs-shutdown.patch +xfs-remove-bli-from-ail-before-release-on-transaction-abort.patch +xfs-don-t-allow-bmap-on-rt-files.patch +xfs-free-uncommitted-transactions-during-log-recovery.patch +xfs-free-cowblocks-and-retry-on-buffered-write-enospc.patch +xfs-don-t-crash-on-unexpected-holes-in-dir-attr-btrees.patch +xfs-check-_btree_check_block-value.patch +xfs-set-firstfsb-to-nullfsblock-before-feeding-it-to-_bmapi_write.patch +xfs-check-_alloc_read_agf-buffer-pointer-before-using.patch +xfs-fix-quotacheck-dquot-id-overflow-infinite-loop.patch +xfs-fix-multi-ag-deadlock-in-xfs_bunmapi.patch +xfs-fix-per-inode-dax-flag-inheritance.patch +xfs-fix-inobt-inode-allocation-search-optimization.patch diff --git a/queue-4.9/xfs-check-_alloc_read_agf-buffer-pointer-before-using.patch b/queue-4.9/xfs-check-_alloc_read_agf-buffer-pointer-before-using.patch new file mode 100644 index 00000000000..1947cb9447d --- /dev/null +++ b/queue-4.9/xfs-check-_alloc_read_agf-buffer-pointer-before-using.patch @@ -0,0 +1,50 @@ +From hch@lst.de Mon Sep 18 10:08:45 2017 +From: Christoph Hellwig +Date: Sun, 17 Sep 2017 14:06:38 -0700 +Subject: xfs: check _alloc_read_agf buffer pointer before using +To: stable@vger.kernel.org +Cc: linux-xfs@vger.kernel.org, "Darrick J. Wong" +Message-ID: <20170917210712.10804-14-hch@lst.de> + + +From: "Darrick J. 
Wong" + +commit 10479e2dea83d4c421ad05dfc55d918aa8dfc0cd upstream. + +In some circumstances, _alloc_read_agf can return an error code of zero +but also a null AGF buffer pointer. Check for this and jump out. + +Fixes-coverity-id: 1415250 +Fixes-coverity-id: 1415320 +Signed-off-by: Darrick J. Wong +Reviewed-by: Brian Foster +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/libxfs/xfs_refcount.c | 4 ++++ + fs/xfs/xfs_reflink.c | 2 ++ + 2 files changed, 6 insertions(+) + +--- a/fs/xfs/libxfs/xfs_refcount.c ++++ b/fs/xfs/libxfs/xfs_refcount.c +@@ -1640,6 +1640,10 @@ xfs_refcount_recover_cow_leftovers( + error = xfs_alloc_read_agf(mp, tp, agno, 0, &agbp); + if (error) + goto out_trans; ++ if (!agbp) { ++ error = -ENOMEM; ++ goto out_trans; ++ } + cur = xfs_refcountbt_init_cursor(mp, tp, agbp, agno, NULL); + + /* Find all the leftover CoW staging extents. */ +--- a/fs/xfs/xfs_reflink.c ++++ b/fs/xfs/xfs_reflink.c +@@ -169,6 +169,8 @@ xfs_reflink_find_shared( + error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agbp); + if (error) + return error; ++ if (!agbp) ++ return -ENOMEM; + + cur = xfs_refcountbt_init_cursor(mp, NULL, agbp, agno, NULL); + diff --git a/queue-4.9/xfs-check-_btree_check_block-value.patch b/queue-4.9/xfs-check-_btree_check_block-value.patch new file mode 100644 index 00000000000..f4e3ccc74da --- /dev/null +++ b/queue-4.9/xfs-check-_btree_check_block-value.patch @@ -0,0 +1,48 @@ +From hch@lst.de Mon Sep 18 10:08:11 2017 +From: Christoph Hellwig +Date: Sun, 17 Sep 2017 14:06:36 -0700 +Subject: xfs: check _btree_check_block value +To: stable@vger.kernel.org +Cc: linux-xfs@vger.kernel.org, "Darrick J. Wong" +Message-ID: <20170917210712.10804-12-hch@lst.de> + + +From: "Darrick J. Wong" + +commit 1e86eabe73b73c82e1110c746ed3ec6d5e1c0a0d upstream. + +Check the _btree_check_block return value for the firstrec and lastrec +functions, since we have the ability to signal that the repositioning +did not succeed. + +Fixes-coverity-id: 114067 +Fixes-coverity-id: 114068 +Signed-off-by: Darrick J. Wong +Reviewed-by: Brian Foster +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/libxfs/xfs_btree.c | 6 ++++-- + 1 file changed, 4 insertions(+), 2 deletions(-) + +--- a/fs/xfs/libxfs/xfs_btree.c ++++ b/fs/xfs/libxfs/xfs_btree.c +@@ -714,7 +714,8 @@ xfs_btree_firstrec( + * Get the block pointer for this level. + */ + block = xfs_btree_get_block(cur, level, &bp); +- xfs_btree_check_block(cur, block, level, bp); ++ if (xfs_btree_check_block(cur, block, level, bp)) ++ return 0; + /* + * It's empty, there is no such record. + */ +@@ -743,7 +744,8 @@ xfs_btree_lastrec( + * Get the block pointer for this level. + */ + block = xfs_btree_get_block(cur, level, &bp); +- xfs_btree_check_block(cur, block, level, bp); ++ if (xfs_btree_check_block(cur, block, level, bp)) ++ return 0; + /* + * It's empty, there is no such record. + */ diff --git a/queue-4.9/xfs-don-t-allow-bmap-on-rt-files.patch b/queue-4.9/xfs-don-t-allow-bmap-on-rt-files.patch new file mode 100644 index 00000000000..b0d1f4dac3b --- /dev/null +++ b/queue-4.9/xfs-don-t-allow-bmap-on-rt-files.patch @@ -0,0 +1,42 @@ +From 61d819e7bcb7f33da710bf3f5dcb2bcf1e48203c Mon Sep 17 00:00:00 2001 +From: "Darrick J. Wong" +Date: Mon, 19 Jun 2017 13:19:08 -0700 +Subject: xfs: don't allow bmap on rt files + +From: Darrick J. Wong + +commit 61d819e7bcb7f33da710bf3f5dcb2bcf1e48203c upstream. + +bmap returns a dumb LBA address but not the block device that goes with +that LBA. 
Swapfiles don't care about this and will blindly assume that +the data volume is the correct blockdev, which is totally bogus for +files on the rt subvolume. This results in the swap code doing IOs to +arbitrary locations on the data device(!) if the passed in mapping is a +realtime file, so just turn off bmap for rt files. + +Signed-off-by: Darrick J. Wong +Reviewed-by: Christoph Hellwig +Signed-off-by: Greg Kroah-Hartman + + +--- + fs/xfs/xfs_aops.c | 7 +++++-- + 1 file changed, 5 insertions(+), 2 deletions(-) + +--- a/fs/xfs/xfs_aops.c ++++ b/fs/xfs/xfs_aops.c +@@ -1566,9 +1566,12 @@ xfs_vm_bmap( + * The swap code (ab-)uses ->bmap to get a block mapping and then + * bypasseѕ the file system for actual I/O. We really can't allow + * that on reflinks inodes, so we have to skip out here. And yes, +- * 0 is the magic code for a bmap error.. ++ * 0 is the magic code for a bmap error. ++ * ++ * Since we don't pass back blockdev info, we can't return bmap ++ * information for rt files either. + */ +- if (xfs_is_reflink_inode(ip)) { ++ if (xfs_is_reflink_inode(ip) || XFS_IS_REALTIME_INODE(ip)) { + xfs_iunlock(ip, XFS_IOLOCK_SHARED); + return 0; + } diff --git a/queue-4.9/xfs-don-t-crash-on-unexpected-holes-in-dir-attr-btrees.patch b/queue-4.9/xfs-don-t-crash-on-unexpected-holes-in-dir-attr-btrees.patch new file mode 100644 index 00000000000..f3bf13baced --- /dev/null +++ b/queue-4.9/xfs-don-t-crash-on-unexpected-holes-in-dir-attr-btrees.patch @@ -0,0 +1,87 @@ +From hch@lst.de Mon Sep 18 10:07:57 2017 +From: Christoph Hellwig +Date: Sun, 17 Sep 2017 14:06:35 -0700 +Subject: xfs: don't crash on unexpected holes in dir/attr btrees +To: stable@vger.kernel.org +Cc: linux-xfs@vger.kernel.org, "Darrick J. Wong" +Message-ID: <20170917210712.10804-11-hch@lst.de> + + +From: "Darrick J. Wong" + +commit cd87d867920155911d0d2e6485b769d853547750 upstream. + +In quite a few places we call xfs_da_read_buf with a mappedbno that we +don't control, then assume that the function passes back either an error +code or a buffer pointer. Unfortunately, if mappedbno == -2 and bno +maps to a hole, we get a return code of zero and a NULL buffer, which +means that we crash if we actually try to use that buffer pointer. This +happens immediately when we set the buffer type for transaction context. + +Therefore, check that we have no error code and a non-NULL bp before +trying to use bp. This patch is a follow-up to an incomplete fix in +96a3aefb8ffde231 ("xfs: don't crash if reading a directory results in an +unexpected hole"). + +Signed-off-by: Darrick J. 
Wong +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/libxfs/xfs_attr_leaf.c | 2 +- + fs/xfs/libxfs/xfs_da_btree.c | 2 +- + fs/xfs/libxfs/xfs_dir2_block.c | 2 +- + fs/xfs/libxfs/xfs_dir2_leaf.c | 4 ++-- + 4 files changed, 5 insertions(+), 5 deletions(-) + +--- a/fs/xfs/libxfs/xfs_attr_leaf.c ++++ b/fs/xfs/libxfs/xfs_attr_leaf.c +@@ -351,7 +351,7 @@ xfs_attr3_leaf_read( + + err = xfs_da_read_buf(tp, dp, bno, mappedbno, bpp, + XFS_ATTR_FORK, &xfs_attr3_leaf_buf_ops); +- if (!err && tp) ++ if (!err && tp && *bpp) + xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_ATTR_LEAF_BUF); + return err; + } +--- a/fs/xfs/libxfs/xfs_da_btree.c ++++ b/fs/xfs/libxfs/xfs_da_btree.c +@@ -263,7 +263,7 @@ xfs_da3_node_read( + + err = xfs_da_read_buf(tp, dp, bno, mappedbno, bpp, + which_fork, &xfs_da3_node_buf_ops); +- if (!err && tp) { ++ if (!err && tp && *bpp) { + struct xfs_da_blkinfo *info = (*bpp)->b_addr; + int type; + +--- a/fs/xfs/libxfs/xfs_dir2_block.c ++++ b/fs/xfs/libxfs/xfs_dir2_block.c +@@ -139,7 +139,7 @@ xfs_dir3_block_read( + + err = xfs_da_read_buf(tp, dp, mp->m_dir_geo->datablk, -1, bpp, + XFS_DATA_FORK, &xfs_dir3_block_buf_ops); +- if (!err && tp) ++ if (!err && tp && *bpp) + xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_BLOCK_BUF); + return err; + } +--- a/fs/xfs/libxfs/xfs_dir2_leaf.c ++++ b/fs/xfs/libxfs/xfs_dir2_leaf.c +@@ -268,7 +268,7 @@ xfs_dir3_leaf_read( + + err = xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp, + XFS_DATA_FORK, &xfs_dir3_leaf1_buf_ops); +- if (!err && tp) ++ if (!err && tp && *bpp) + xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_LEAF1_BUF); + return err; + } +@@ -285,7 +285,7 @@ xfs_dir3_leafn_read( + + err = xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp, + XFS_DATA_FORK, &xfs_dir3_leafn_buf_ops); +- if (!err && tp) ++ if (!err && tp && *bpp) + xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_LEAFN_BUF); + return err; + } diff --git a/queue-4.9/xfs-fix-inobt-inode-allocation-search-optimization.patch b/queue-4.9/xfs-fix-inobt-inode-allocation-search-optimization.patch new file mode 100644 index 00000000000..9b8eea399aa --- /dev/null +++ b/queue-4.9/xfs-fix-inobt-inode-allocation-search-optimization.patch @@ -0,0 +1,58 @@ +From hch@lst.de Mon Sep 18 10:09:48 2017 +From: Christoph Hellwig +Date: Sun, 17 Sep 2017 14:06:42 -0700 +Subject: xfs: fix inobt inode allocation search optimization +To: stable@vger.kernel.org +Cc: linux-xfs@vger.kernel.org, Omar Sandoval , "Darrick J . Wong" +Message-ID: <20170917210712.10804-18-hch@lst.de> + + +From: Omar Sandoval + +commit c44245b3d5435f533ca8346ece65918f84c057f9 upstream. + +When we try to allocate a free inode by searching the inobt, we try to +find the inode nearest the parent inode by searching chunks both left +and right of the chunk containing the parent. As an optimization, we +cache the leftmost and rightmost records that we previously searched; if +we do another allocation with the same parent inode, we'll pick up the +search where it last left off. + +There's a bug in the case where we found a free inode to the left of the +parent's chunk: we need to update the cached left and right records, but +because we already reassigned the right record to point to the left, we +end up assigning the left record to both the cached left and right +records. + +This isn't a correctness problem strictly, but it can result in the next +allocation rechecking chunks unnecessarily or allocating inodes further +away from the parent than it needs to. Fix it by swapping the record +pointer after we update the cached left and right records. 
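As a rough standalone illustration of the ordering hazard (plain C, with a
simplified stand-in for the inobt record and the cached fields; none of the
xfs types or locking are modeled):

	#include <assert.h>

	struct irec { unsigned int ir_startino; };

	int main(void)
	{
		struct irec rec  = { .ir_startino = 200 };	/* right record */
		struct irec trec = { .ir_startino = 100 };	/* left record  */
		unsigned int pagl_leftrec, pagl_rightrec;

		/* Buggy order: rec is overwritten before the caches are
		 * read, so both caches end up naming the left record. */
		rec = trec;
		pagl_leftrec  = trec.ir_startino;
		pagl_rightrec = rec.ir_startino;	/* 100, should be 200 */
		assert(pagl_leftrec == pagl_rightrec);

		/* Fixed order: fill the caches first, then swap. */
		rec.ir_startino = 200;			/* reset for the demo */
		pagl_leftrec  = trec.ir_startino;
		pagl_rightrec = rec.ir_startino;	/* correctly 200 */
		rec = trec;
		assert(pagl_leftrec == 100 && pagl_rightrec == 200);
		return 0;
	}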
+ +Fixes: bd169565993b ("xfs: speed up free inode search") +Signed-off-by: Omar Sandoval +Reviewed-by: Christoph Hellwig +Reviewed-by: Darrick J. Wong +Signed-off-by: Darrick J. Wong +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/libxfs/xfs_ialloc.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/fs/xfs/libxfs/xfs_ialloc.c ++++ b/fs/xfs/libxfs/xfs_ialloc.c +@@ -1236,13 +1236,13 @@ xfs_dialloc_ag_inobt( + + /* free inodes to the left? */ + if (useleft && trec.ir_freecount) { +- rec = trec; + xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); + cur = tcur; + + pag->pagl_leftrec = trec.ir_startino; + pag->pagl_rightrec = rec.ir_startino; + pag->pagl_pagino = pagino; ++ rec = trec; + goto alloc_inode; + } + diff --git a/queue-4.9/xfs-fix-multi-ag-deadlock-in-xfs_bunmapi.patch b/queue-4.9/xfs-fix-multi-ag-deadlock-in-xfs_bunmapi.patch new file mode 100644 index 00000000000..6014e22db42 --- /dev/null +++ b/queue-4.9/xfs-fix-multi-ag-deadlock-in-xfs_bunmapi.patch @@ -0,0 +1,52 @@ +From hch@lst.de Mon Sep 18 10:09:24 2017 +From: Christoph Hellwig +Date: Sun, 17 Sep 2017 14:06:40 -0700 +Subject: xfs: fix multi-AG deadlock in xfs_bunmapi +To: stable@vger.kernel.org +Cc: linux-xfs@vger.kernel.org, "Darrick J . Wong" +Message-ID: <20170917210712.10804-16-hch@lst.de> + + +commit 5b094d6dac0451ad89b1dc088395c7b399b7e9e8 upstream. + +Just like in the allocator we must avoid touching multiple AGs out of +order when freeing blocks, as freeing still locks the AGF and can cause +the same AB-BA deadlocks as in the allocation path. + +Signed-off-by: Christoph Hellwig +Reported-by: Nikolay Borisov +Reviewed-by: Darrick J. Wong +Signed-off-by: Darrick J. Wong +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/libxfs/xfs_bmap.c | 12 ++++++++++++ + 1 file changed, 12 insertions(+) + +--- a/fs/xfs/libxfs/xfs_bmap.c ++++ b/fs/xfs/libxfs/xfs_bmap.c +@@ -5556,6 +5556,7 @@ __xfs_bunmapi( + xfs_fsblock_t sum; + xfs_filblks_t len = *rlen; /* length to unmap in file */ + xfs_fileoff_t max_len; ++ xfs_agnumber_t prev_agno = NULLAGNUMBER, agno; + + trace_xfs_bunmap(ip, bno, len, flags, _RET_IP_); + +@@ -5658,6 +5659,17 @@ __xfs_bunmapi( + ASSERT(ep != NULL); + del = got; + wasdel = isnullstartblock(del.br_startblock); ++ ++ /* ++ * Make sure we don't touch multiple AGF headers out of order ++ * in a single transaction, as that could cause AB-BA deadlocks. ++ */ ++ if (!wasdel) { ++ agno = XFS_FSB_TO_AGNO(mp, del.br_startblock); ++ if (prev_agno != NULLAGNUMBER && prev_agno > agno) ++ break; ++ prev_agno = agno; ++ } + if (got.br_startoff < start) { + del.br_startoff = start; + del.br_blockcount -= start - got.br_startoff; diff --git a/queue-4.9/xfs-fix-per-inode-dax-flag-inheritance.patch b/queue-4.9/xfs-fix-per-inode-dax-flag-inheritance.patch new file mode 100644 index 00000000000..018c7fda13e --- /dev/null +++ b/queue-4.9/xfs-fix-per-inode-dax-flag-inheritance.patch @@ -0,0 +1,72 @@ +From hch@lst.de Mon Sep 18 10:09:34 2017 +From: Christoph Hellwig +Date: Sun, 17 Sep 2017 14:06:41 -0700 +Subject: xfs: Fix per-inode DAX flag inheritance +To: stable@vger.kernel.org +Cc: linux-xfs@vger.kernel.org, Lukas Czerner , "Darrick J . Wong" +Message-ID: <20170917210712.10804-17-hch@lst.de> + + +From: Lukas Czerner + +commit 56bdf855e676f1f2ed7033f288f57dfd315725ba upstream. + +According to the commit that implemented per-inode DAX flag: +commit 58f88ca2df72 ("xfs: introduce per-inode DAX enablement") +the flag is supposed to act as "inherit flag". 
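In this context an "inherit flag" is a bit set on a directory that is copied
onto newly created children. A toy sketch of that behaviour (the flag bit
value and the helper are invented for the demo):

	#include <assert.h>
	#include <stdint.h>

	#define DIFLAG2_DAX	(1u << 0)	/* made-up bit for the demo */

	/* On inode creation, inheritable di_flags2 bits propagate from
	 * the parent directory to the child. */
	static uint64_t inherit_flags2(uint64_t parent_flags2)
	{
		uint64_t child_flags2 = 0;

		if (parent_flags2 & DIFLAG2_DAX)
			child_flags2 |= DIFLAG2_DAX;
		return child_flags2;
	}

	int main(void)
	{
		assert(inherit_flags2(DIFLAG2_DAX) == DIFLAG2_DAX);
		assert(inherit_flags2(0) == 0);
		return 0;
	}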
+ +Currently this only works in the situations where parent directory +already has a flag in di_flags set, otherwise inheritance does not +work. This is because setting the XFS_DIFLAG2_DAX flag is done in a +wrong branch designated for di_flags, not di_flags2. + +Fix this by moving the code to branch designated for setting di_flags2, +which does test for flags in di_flags2. + +Fixes: 58f88ca2df72 ("xfs: introduce per-inode DAX enablement") +Signed-off-by: Lukas Czerner +Reviewed-by: Darrick J. Wong +Signed-off-by: Darrick J. Wong +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/xfs_inode.c | 12 +++++++----- + 1 file changed, 7 insertions(+), 5 deletions(-) + +--- a/fs/xfs/xfs_inode.c ++++ b/fs/xfs/xfs_inode.c +@@ -881,7 +881,6 @@ xfs_ialloc( + case S_IFREG: + case S_IFDIR: + if (pip && (pip->i_d.di_flags & XFS_DIFLAG_ANY)) { +- uint64_t di_flags2 = 0; + uint di_flags = 0; + + if (S_ISDIR(mode)) { +@@ -918,20 +917,23 @@ xfs_ialloc( + di_flags |= XFS_DIFLAG_NODEFRAG; + if (pip->i_d.di_flags & XFS_DIFLAG_FILESTREAM) + di_flags |= XFS_DIFLAG_FILESTREAM; +- if (pip->i_d.di_flags2 & XFS_DIFLAG2_DAX) +- di_flags2 |= XFS_DIFLAG2_DAX; + + ip->i_d.di_flags |= di_flags; +- ip->i_d.di_flags2 |= di_flags2; + } + if (pip && + (pip->i_d.di_flags2 & XFS_DIFLAG2_ANY) && + pip->i_d.di_version == 3 && + ip->i_d.di_version == 3) { ++ uint64_t di_flags2 = 0; ++ + if (pip->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE) { +- ip->i_d.di_flags2 |= XFS_DIFLAG2_COWEXTSIZE; ++ di_flags2 |= XFS_DIFLAG2_COWEXTSIZE; + ip->i_d.di_cowextsize = pip->i_d.di_cowextsize; + } ++ if (pip->i_d.di_flags2 & XFS_DIFLAG2_DAX) ++ di_flags2 |= XFS_DIFLAG2_DAX; ++ ++ ip->i_d.di_flags2 |= di_flags2; + } + /* FALLTHROUGH */ + case S_IFLNK: diff --git a/queue-4.9/xfs-fix-quotacheck-dquot-id-overflow-infinite-loop.patch b/queue-4.9/xfs-fix-quotacheck-dquot-id-overflow-infinite-loop.patch new file mode 100644 index 00000000000..5888b8cf152 --- /dev/null +++ b/queue-4.9/xfs-fix-quotacheck-dquot-id-overflow-infinite-loop.patch @@ -0,0 +1,41 @@ +From hch@lst.de Mon Sep 18 10:09:04 2017 +From: Christoph Hellwig +Date: Sun, 17 Sep 2017 14:06:39 -0700 +Subject: xfs: fix quotacheck dquot id overflow infinite loop +To: stable@vger.kernel.org +Cc: linux-xfs@vger.kernel.org, Brian Foster , "Darrick J . Wong" +Message-ID: <20170917210712.10804-15-hch@lst.de> + + +From: Brian Foster + +commit cfaf2d034360166e569a4929dd83ae9698bed856 upstream. + +If a dquot has an id of U32_MAX, the next lookup index increment +overflows the uint32_t back to 0. This starts the lookup sequence +over from the beginning, repeats indefinitely and results in a +livelock. + +Update xfs_qm_dquot_walk() to explicitly check for the lookup +overflow and exit the loop. + +Signed-off-by: Brian Foster +Reviewed-by: Darrick J. Wong +Signed-off-by: Darrick J. 
Wong +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/xfs_qm.c | 3 +++ + 1 file changed, 3 insertions(+) + +--- a/fs/xfs/xfs_qm.c ++++ b/fs/xfs/xfs_qm.c +@@ -111,6 +111,9 @@ restart: + skipped = 0; + break; + } ++ /* we're done if id overflows back to zero */ ++ if (!next_index) ++ break; + } + + if (skipped) { diff --git a/queue-4.9/xfs-fix-spurious-spin_is_locked-assert-failures-on-non-smp-kernels.patch b/queue-4.9/xfs-fix-spurious-spin_is_locked-assert-failures-on-non-smp-kernels.patch new file mode 100644 index 00000000000..c93869c496d --- /dev/null +++ b/queue-4.9/xfs-fix-spurious-spin_is_locked-assert-failures-on-non-smp-kernels.patch @@ -0,0 +1,73 @@ +From 95989c46d2a156365867b1d795fdefce71bce378 Mon Sep 17 00:00:00 2001 +From: Brian Foster +Date: Thu, 8 Jun 2017 08:23:07 -0700 +Subject: xfs: fix spurious spin_is_locked() assert failures on non-smp kernels + +From: Brian Foster + +commit 95989c46d2a156365867b1d795fdefce71bce378 upstream. + +The 0-day kernel test robot reports assertion failures on +!CONFIG_SMP kernels due to failed spin_is_locked() checks. As it +turns out, spin_is_locked() is hardcoded to return zero on +!CONFIG_SMP kernels and so this function cannot be relied on to +verify spinlock state in this configuration. + +To avoid this problem, replace the associated asserts with lockdep +variants that do the right thing regardless of kernel configuration. +Drop the one assert that checks for an unlocked lock as there is no +suitable lockdep variant for that case. This moves the spinlock +checks from XFS debug code to lockdep, but generally provides the +same level of protection. + +Reported-by: kbuild test robot +Signed-off-by: Brian Foster +Reviewed-by: Christoph Hellwig +Reviewed-by: Darrick J. Wong +Signed-off-by: Darrick J. Wong +Signed-off-by: Greg Kroah-Hartman + +--- + fs/xfs/xfs_buf.c | 2 +- + fs/xfs/xfs_icache.c | 5 ++--- + 2 files changed, 3 insertions(+), 4 deletions(-) + +--- a/fs/xfs/xfs_buf.c ++++ b/fs/xfs/xfs_buf.c +@@ -116,7 +116,7 @@ static inline void + __xfs_buf_ioacct_dec( + struct xfs_buf *bp) + { +- ASSERT(spin_is_locked(&bp->b_lock)); ++ lockdep_assert_held(&bp->b_lock); + + if (bp->b_state & XFS_BSTATE_IN_FLIGHT) { + bp->b_state &= ~XFS_BSTATE_IN_FLIGHT; +--- a/fs/xfs/xfs_icache.c ++++ b/fs/xfs/xfs_icache.c +@@ -66,7 +66,6 @@ xfs_inode_alloc( + + XFS_STATS_INC(mp, vn_active); + ASSERT(atomic_read(&ip->i_pincount) == 0); +- ASSERT(!spin_is_locked(&ip->i_flags_lock)); + ASSERT(!xfs_isiflocked(ip)); + ASSERT(ip->i_ino == 0); + +@@ -192,7 +191,7 @@ xfs_perag_set_reclaim_tag( + { + struct xfs_mount *mp = pag->pag_mount; + +- ASSERT(spin_is_locked(&pag->pag_ici_lock)); ++ lockdep_assert_held(&pag->pag_ici_lock); + if (pag->pag_ici_reclaimable++) + return; + +@@ -214,7 +213,7 @@ xfs_perag_clear_reclaim_tag( + { + struct xfs_mount *mp = pag->pag_mount; + +- ASSERT(spin_is_locked(&pag->pag_ici_lock)); ++ lockdep_assert_held(&pag->pag_ici_lock); + if (--pag->pag_ici_reclaimable) + return; + diff --git a/queue-4.9/xfs-free-cowblocks-and-retry-on-buffered-write-enospc.patch b/queue-4.9/xfs-free-cowblocks-and-retry-on-buffered-write-enospc.patch new file mode 100644 index 00000000000..201a257d9c0 --- /dev/null +++ b/queue-4.9/xfs-free-cowblocks-and-retry-on-buffered-write-enospc.patch @@ -0,0 +1,39 @@ +From hch@lst.de Mon Sep 18 10:07:40 2017 +From: Christoph Hellwig +Date: Sun, 17 Sep 2017 14:06:34 -0700 +Subject: xfs: free cowblocks and retry on buffered write ENOSPC +To: stable@vger.kernel.org +Cc: linux-xfs@vger.kernel.org, Brian Foster , "Darrick J . 
Wong" +Message-ID: <20170917210712.10804-10-hch@lst.de> + + +From: Brian Foster + +commit cf2cb7845d6e101cb17bd62f8aa08cd514fc8988 upstream. + +XFS runs an eofblocks reclaim scan before returning an ENOSPC error to +userspace for buffered writes. This facilitates aggressive speculative +preallocation without causing user visible side effects such as +premature ENOSPC. + +Run a cowblocks scan in the same situation to reclaim lingering COW fork +preallocation throughout the filesystem. + +Signed-off-by: Brian Foster +Reviewed-by: Darrick J. Wong +Signed-off-by: Darrick J. Wong +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/xfs_file.c | 1 + + 1 file changed, 1 insertion(+) + +--- a/fs/xfs/xfs_file.c ++++ b/fs/xfs/xfs_file.c +@@ -729,6 +729,7 @@ write_retry: + xfs_rw_iunlock(ip, iolock); + eofb.eof_flags = XFS_EOF_FLAGS_SYNC; + xfs_icache_free_eofblocks(ip->i_mount, &eofb); ++ xfs_icache_free_cowblocks(ip->i_mount, &eofb); + goto write_retry; + } + diff --git a/queue-4.9/xfs-free-uncommitted-transactions-during-log-recovery.patch b/queue-4.9/xfs-free-uncommitted-transactions-during-log-recovery.patch new file mode 100644 index 00000000000..e820bd04e9b --- /dev/null +++ b/queue-4.9/xfs-free-uncommitted-transactions-during-log-recovery.patch @@ -0,0 +1,102 @@ +From hch@lst.de Mon Sep 18 10:07:17 2017 +From: Christoph Hellwig +Date: Sun, 17 Sep 2017 14:06:33 -0700 +Subject: xfs: free uncommitted transactions during log recovery +To: stable@vger.kernel.org +Cc: linux-xfs@vger.kernel.org, Brian Foster , "Darrick J . Wong" + +From: Brian Foster + +commit 39775431f82f890f4aaa08860a30883d081bffc7 upstream. + +Log recovery allocates in-core transaction and member item data +structures on-demand as it processes the on-disk log. Transactions +are allocated on first encounter on-disk and stored in a hash table +structure where they are easily accessible for subsequent lookups. +Transaction items are also allocated on demand and are attached to +the associated transactions. + +When a commit record is encountered in the log, the transaction is +committed to the fs and the in-core structures are freed. If a +filesystem crashes or shuts down before all in-core log buffers are +flushed to the log, however, not all transactions may have commit +records in the log. As expected, the modifications in such an +incomplete transaction are not replayed to the fs. The in-core data +structures for the partial transaction are never freed, however, +resulting in a memory leak. + +Update xlog_do_recovery_pass() to first correctly initialize the +hash table array so empty lists can be distinguished from populated +lists on function exit. Update xlog_recover_free_trans() to always +remove the transaction from the list prior to freeing the associated +memory. Finally, walk the hash table of transaction lists as the +last step before it goes out of scope and free any transactions that +may remain on the lists. This prevents a memory leak of partial +transactions in the log. + +Signed-off-by: Brian Foster +Reviewed-by: Christoph Hellwig +Reviewed-by: Darrick J. Wong +Signed-off-by: Darrick J. 
Wong +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/xfs_log_recover.c | 21 ++++++++++++++++++++- + 1 file changed, 20 insertions(+), 1 deletion(-) + +--- a/fs/xfs/xfs_log_recover.c ++++ b/fs/xfs/xfs_log_recover.c +@@ -4152,7 +4152,7 @@ xlog_recover_commit_trans( + + #define XLOG_RECOVER_COMMIT_QUEUE_MAX 100 + +- hlist_del(&trans->r_list); ++ hlist_del_init(&trans->r_list); + + error = xlog_recover_reorder_trans(log, trans, pass); + if (error) +@@ -4354,6 +4354,8 @@ xlog_recover_free_trans( + xlog_recover_item_t *item, *n; + int i; + ++ hlist_del_init(&trans->r_list); ++ + list_for_each_entry_safe(item, n, &trans->r_itemq, ri_list) { + /* Free the regions in the item. */ + list_del(&item->ri_list); +@@ -5222,12 +5224,16 @@ xlog_do_recovery_pass( + int error2 = 0; + int bblks, split_bblks; + int hblks, split_hblks, wrapped_hblks; ++ int i; + struct hlist_head rhash[XLOG_RHASH_SIZE]; + LIST_HEAD (buffer_list); + + ASSERT(head_blk != tail_blk); + rhead_blk = 0; + ++ for (i = 0; i < XLOG_RHASH_SIZE; i++) ++ INIT_HLIST_HEAD(&rhash[i]); ++ + /* + * Read the header of the tail block and get the iclog buffer size from + * h_size. Use this to tell how many sectors make up the log header. +@@ -5464,6 +5470,19 @@ xlog_do_recovery_pass( + if (error && first_bad) + *first_bad = rhead_blk; + ++ /* ++ * Transactions are freed at commit time but transactions without commit ++ * records on disk are never committed. Free any that may be left in the ++ * hash table. ++ */ ++ for (i = 0; i < XLOG_RHASH_SIZE; i++) { ++ struct hlist_node *tmp; ++ struct xlog_recover *trans; ++ ++ hlist_for_each_entry_safe(trans, tmp, &rhash[i], r_list) ++ xlog_recover_free_trans(trans); ++ } ++ + return error ? error : error2; + } + diff --git a/queue-4.9/xfs-move-handling-of-missing-page-into-one-place-in-xfs_find_get_desired_pgoff.patch b/queue-4.9/xfs-move-handling-of-missing-page-into-one-place-in-xfs_find_get_desired_pgoff.patch new file mode 100644 index 00000000000..ac55c579047 --- /dev/null +++ b/queue-4.9/xfs-move-handling-of-missing-page-into-one-place-in-xfs_find_get_desired_pgoff.patch @@ -0,0 +1,85 @@ +From a54fba8f5a0dc36161cacdf2aa90f007f702ec1a Mon Sep 17 00:00:00 2001 +From: Jan Kara +Date: Thu, 18 May 2017 16:36:24 -0700 +Subject: xfs: Move handling of missing page into one place in xfs_find_get_desired_pgoff() + +From: Jan Kara + +commit a54fba8f5a0dc36161cacdf2aa90f007f702ec1a upstream. + +Currently several places in xfs_find_get_desired_pgoff() handle the case +of a missing page. Make them all handled in one place after the loop has +terminated. + +Signed-off-by: Jan Kara +Reviewed-by: Brian Foster +Reviewed-by: Darrick J. Wong +Signed-off-by: Darrick J. Wong +Signed-off-by: Greg Kroah-Hartman + +--- + fs/xfs/xfs_file.c | 38 ++++++++------------------------------ + 1 file changed, 8 insertions(+), 30 deletions(-) + +--- a/fs/xfs/xfs_file.c ++++ b/fs/xfs/xfs_file.c +@@ -1139,29 +1139,8 @@ xfs_find_get_desired_pgoff( + want = min_t(pgoff_t, end - index, PAGEVEC_SIZE - 1) + 1; + nr_pages = pagevec_lookup(&pvec, inode->i_mapping, index, + want); +- /* +- * No page mapped into given range. If we are searching holes +- * and if this is the first time we got into the loop, it means +- * that the given offset is landed in a hole, return it. +- * +- * If we have already stepped through some block buffers to find +- * holes but they all contains data. 
In this case, the last +- * offset is already updated and pointed to the end of the last +- * mapped page, if it does not reach the endpoint to search, +- * that means there should be a hole between them. +- */ +- if (nr_pages == 0) { +- /* Data search found nothing */ +- if (type == DATA_OFF) +- break; +- +- ASSERT(type == HOLE_OFF); +- if (lastoff == startoff || lastoff < endoff) { +- found = true; +- *offset = lastoff; +- } ++ if (nr_pages == 0) + break; +- } + + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; +@@ -1227,21 +1206,20 @@ xfs_find_get_desired_pgoff( + + /* + * The number of returned pages less than our desired, search +- * done. In this case, nothing was found for searching data, +- * but we found a hole behind the last offset. ++ * done. + */ +- if (nr_pages < want) { +- if (type == HOLE_OFF) { +- *offset = lastoff; +- found = true; +- } ++ if (nr_pages < want) + break; +- } + + index = pvec.pages[i - 1]->index + 1; + pagevec_release(&pvec); + } while (index <= end); + ++ /* No page at lastoff and we are not done - we found a hole. */ ++ if (type == HOLE_OFF && lastoff < endoff) { ++ *offset = lastoff; ++ found = true; ++ } + out: + pagevec_release(&pvec); + return found; diff --git a/queue-4.9/xfs-push-buffer-of-flush-locked-dquot-to-avoid-quotacheck-deadlock.patch b/queue-4.9/xfs-push-buffer-of-flush-locked-dquot-to-avoid-quotacheck-deadlock.patch new file mode 100644 index 00000000000..8972473bf64 --- /dev/null +++ b/queue-4.9/xfs-push-buffer-of-flush-locked-dquot-to-avoid-quotacheck-deadlock.patch @@ -0,0 +1,197 @@ +From 7912e7fef2aebe577f0b46d3cba261f2783c5695 Mon Sep 17 00:00:00 2001 +From: Brian Foster +Date: Wed, 14 Jun 2017 21:21:45 -0700 +Subject: xfs: push buffer of flush locked dquot to avoid quotacheck deadlock + +From: Brian Foster + +commit 7912e7fef2aebe577f0b46d3cba261f2783c5695 upstream. + +Reclaim during quotacheck can lead to deadlocks on the dquot flush +lock: + + - Quotacheck populates a local delwri queue with the physical dquot + buffers. + - Quotacheck performs the xfs_qm_dqusage_adjust() bulkstat and + dirties all of the dquots. + - Reclaim kicks in and attempts to flush a dquot whose buffer is + already queud on the quotacheck queue. The flush succeeds but + queueing to the reclaim delwri queue fails as the backing buffer is + already queued. The flush unlock is now deferred to I/O completion + of the buffer from the quotacheck queue. + - The dqadjust bulkstat continues and dirties the recently flushed + dquot once again. + - Quotacheck proceeds to the xfs_qm_flush_one() walk which requires + the flush lock to update the backing buffers with the in-core + recalculated values. It deadlocks on the redirtied dquot as the + flush lock was already acquired by reclaim, but the buffer resides + on the local delwri queue which isn't submitted until the end of + quotacheck. + +This is reproduced by running quotacheck on a filesystem with a +couple million inodes in low memory (512MB-1GB) situations. This is +a regression as of commit 43ff2122e6 ("xfs: on-stack delayed write +buffer lists"), which removed a trylock and buffer I/O submission +from the quotacheck dquot flush sequence. + +Quotacheck first resets and collects the physical dquot buffers in a +delwri queue. Then, it traverses the filesystem inodes via bulkstat, +updates the in-core dquots, flushes the corrected dquots to the +backing buffers and finally submits the delwri queue for I/O. 
Since +the backing buffers are queued across the entire quotacheck +operation, dquot reclaim cannot possibly complete a dquot flush +before quotacheck completes. + +Therefore, quotacheck must submit the buffer for I/O in order to +cycle the flush lock and flush the dirty in-core dquot to the +buffer. Add a delwri queue buffer push mechanism to submit an +individual buffer for I/O without losing the delwri queue status and +use it from quotacheck to avoid the deadlock. This restores +quotacheck behavior to as before the regression was introduced. + +Reported-by: Martin Svec +Signed-off-by: Brian Foster +Reviewed-by: Darrick J. Wong +Signed-off-by: Darrick J. Wong +Signed-off-by: Greg Kroah-Hartman + +--- + fs/xfs/xfs_buf.c | 60 +++++++++++++++++++++++++++++++++++++++++++++++++++++ + fs/xfs/xfs_buf.h | 1 + fs/xfs/xfs_qm.c | 28 +++++++++++++++++++++++- + fs/xfs/xfs_trace.h | 1 + 4 files changed, 89 insertions(+), 1 deletion(-) + +--- a/fs/xfs/xfs_buf.c ++++ b/fs/xfs/xfs_buf.c +@@ -2022,6 +2022,66 @@ xfs_buf_delwri_submit( + return error; + } + ++/* ++ * Push a single buffer on a delwri queue. ++ * ++ * The purpose of this function is to submit a single buffer of a delwri queue ++ * and return with the buffer still on the original queue. The waiting delwri ++ * buffer submission infrastructure guarantees transfer of the delwri queue ++ * buffer reference to a temporary wait list. We reuse this infrastructure to ++ * transfer the buffer back to the original queue. ++ * ++ * Note the buffer transitions from the queued state, to the submitted and wait ++ * listed state and back to the queued state during this call. The buffer ++ * locking and queue management logic between _delwri_pushbuf() and ++ * _delwri_queue() guarantee that the buffer cannot be queued to another list ++ * before returning. ++ */ ++int ++xfs_buf_delwri_pushbuf( ++ struct xfs_buf *bp, ++ struct list_head *buffer_list) ++{ ++ LIST_HEAD (submit_list); ++ int error; ++ ++ ASSERT(bp->b_flags & _XBF_DELWRI_Q); ++ ++ trace_xfs_buf_delwri_pushbuf(bp, _RET_IP_); ++ ++ /* ++ * Isolate the buffer to a new local list so we can submit it for I/O ++ * independently from the rest of the original list. ++ */ ++ xfs_buf_lock(bp); ++ list_move(&bp->b_list, &submit_list); ++ xfs_buf_unlock(bp); ++ ++ /* ++ * Delwri submission clears the DELWRI_Q buffer flag and returns with ++ * the buffer on the wait list with an associated reference. Rather than ++ * bounce the buffer from a local wait list back to the original list ++ * after I/O completion, reuse the original list as the wait list. ++ */ ++ xfs_buf_delwri_submit_buffers(&submit_list, buffer_list); ++ ++ /* ++ * The buffer is now under I/O and wait listed as during typical delwri ++ * submission. Lock the buffer to wait for I/O completion. Rather than ++ * remove the buffer from the wait list and release the reference, we ++ * want to return with the buffer queued to the original list. The ++ * buffer already sits on the original list with a wait list reference, ++ * however. If we let the queue inherit that wait list reference, all we ++ * need to do is reset the DELWRI_Q flag. 
++ */ ++ xfs_buf_lock(bp); ++ error = bp->b_error; ++ bp->b_flags |= _XBF_DELWRI_Q; ++ xfs_buf_unlock(bp); ++ ++ return error; ++} ++ + int __init + xfs_buf_init(void) + { +--- a/fs/xfs/xfs_buf.h ++++ b/fs/xfs/xfs_buf.h +@@ -333,6 +333,7 @@ extern void xfs_buf_delwri_cancel(struct + extern bool xfs_buf_delwri_queue(struct xfs_buf *, struct list_head *); + extern int xfs_buf_delwri_submit(struct list_head *); + extern int xfs_buf_delwri_submit_nowait(struct list_head *); ++extern int xfs_buf_delwri_pushbuf(struct xfs_buf *, struct list_head *); + + /* Buffer Daemon Setup Routines */ + extern int xfs_buf_init(void); +--- a/fs/xfs/xfs_qm.c ++++ b/fs/xfs/xfs_qm.c +@@ -1247,6 +1247,7 @@ xfs_qm_flush_one( + struct xfs_dquot *dqp, + void *data) + { ++ struct xfs_mount *mp = dqp->q_mount; + struct list_head *buffer_list = data; + struct xfs_buf *bp = NULL; + int error = 0; +@@ -1257,7 +1258,32 @@ xfs_qm_flush_one( + if (!XFS_DQ_IS_DIRTY(dqp)) + goto out_unlock; + +- xfs_dqflock(dqp); ++ /* ++ * The only way the dquot is already flush locked by the time quotacheck ++ * gets here is if reclaim flushed it before the dqadjust walk dirtied ++ * it for the final time. Quotacheck collects all dquot bufs in the ++ * local delwri queue before dquots are dirtied, so reclaim can't have ++ * possibly queued it for I/O. The only way out is to push the buffer to ++ * cycle the flush lock. ++ */ ++ if (!xfs_dqflock_nowait(dqp)) { ++ /* buf is pinned in-core by delwri list */ ++ DEFINE_SINGLE_BUF_MAP(map, dqp->q_blkno, ++ mp->m_quotainfo->qi_dqchunklen); ++ bp = _xfs_buf_find(mp->m_ddev_targp, &map, 1, 0, NULL); ++ if (!bp) { ++ error = -EINVAL; ++ goto out_unlock; ++ } ++ xfs_buf_unlock(bp); ++ ++ xfs_buf_delwri_pushbuf(bp, buffer_list); ++ xfs_buf_rele(bp); ++ ++ error = -EAGAIN; ++ goto out_unlock; ++ } ++ + error = xfs_qm_dqflush(dqp, &bp); + if (error) + goto out_unlock; +--- a/fs/xfs/xfs_trace.h ++++ b/fs/xfs/xfs_trace.h +@@ -366,6 +366,7 @@ DEFINE_BUF_EVENT(xfs_buf_iowait_done); + DEFINE_BUF_EVENT(xfs_buf_delwri_queue); + DEFINE_BUF_EVENT(xfs_buf_delwri_queued); + DEFINE_BUF_EVENT(xfs_buf_delwri_split); ++DEFINE_BUF_EVENT(xfs_buf_delwri_pushbuf); + DEFINE_BUF_EVENT(xfs_buf_get_uncached); + DEFINE_BUF_EVENT(xfs_bdstrat_shut); + DEFINE_BUF_EVENT(xfs_buf_item_relse); diff --git a/queue-4.9/xfs-release-bli-from-transaction-properly-on-fs-shutdown.patch b/queue-4.9/xfs-release-bli-from-transaction-properly-on-fs-shutdown.patch new file mode 100644 index 00000000000..2cd5655f883 --- /dev/null +++ b/queue-4.9/xfs-release-bli-from-transaction-properly-on-fs-shutdown.patch @@ -0,0 +1,87 @@ +From 79e641ce29cfae5b8fc55fb77ac62d11d2d849c0 Mon Sep 17 00:00:00 2001 +From: Brian Foster +Date: Wed, 14 Jun 2017 21:35:35 -0700 +Subject: xfs: release bli from transaction properly on fs shutdown + +From: Brian Foster + +commit 79e641ce29cfae5b8fc55fb77ac62d11d2d849c0 upstream. + +If a filesystem shutdown occurs with a buffer log item in the CIL +and a log force occurs, the ->iop_unpin() handler is generally +expected to tear down the bli properly. This entails freeing the bli +memory and releasing the associated hold on the buffer so it can be +released and the filesystem unmounted. + +If this sequence occurs while ->bli_refcount is elevated (i.e., +another transaction is open and attempting to modify the buffer), +however, ->iop_unpin() may not be responsible for releasing the bli. +Instead, the transaction may release the final ->bli_refcount +reference and thus xfs_trans_brelse() is responsible for tearing +down the bli. 
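The underlying pattern is the usual "last reference out performs the
teardown" discipline; a minimal standalone sketch using C11 atomics in place
of the kernel's atomic_t (the ref_put() helper is invented for the demo):

	#include <assert.h>
	#include <stdatomic.h>
	#include <stdbool.h>

	/* Returns true only for the caller that dropped the last
	 * reference, i.e. the caller that must do the teardown. */
	static bool ref_put(atomic_int *refcount)
	{
		return atomic_fetch_sub(refcount, 1) == 1;
	}

	int main(void)
	{
		atomic_int bli_refcount = 2;	/* e.g. CIL pin + transaction */

		assert(!ref_put(&bli_refcount));	/* not the last holder */
		assert(ref_put(&bli_refcount));		/* last holder: tear down */
		return 0;
	}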
+ +While xfs_trans_brelse() does drop the reference count, it only +attempts to release the bli if it is clean (i.e., not in the +CIL/AIL). If the filesystem is shutdown and the bli is sitting dirty +in the CIL as noted above, this ends up skipping the last +opportunity to release the bli. In turn, this leaves the hold on the +buffer and causes an unmount hang. This can be reproduced by running +generic/388 in repetition. + +Update xfs_trans_brelse() to handle this shutdown corner case +correctly. If the final bli reference is dropped and the filesystem +is shutdown, remove the bli from the AIL (if necessary) and release +the bli to drop the buffer hold and ensure an unmount does not hang. + +Signed-off-by: Brian Foster +Reviewed-by: Christoph Hellwig +Reviewed-by: Carlos Maiolino +Reviewed-by: Darrick J. Wong +Signed-off-by: Darrick J. Wong +Signed-off-by: Greg Kroah-Hartman + +--- + fs/xfs/xfs_trans_buf.c | 23 +++++++++++++++-------- + 1 file changed, 15 insertions(+), 8 deletions(-) + +--- a/fs/xfs/xfs_trans_buf.c ++++ b/fs/xfs/xfs_trans_buf.c +@@ -356,6 +356,7 @@ xfs_trans_brelse(xfs_trans_t *tp, + xfs_buf_t *bp) + { + xfs_buf_log_item_t *bip; ++ int freed; + + /* + * Default to a normal brelse() call if the tp is NULL. +@@ -419,16 +420,22 @@ xfs_trans_brelse(xfs_trans_t *tp, + /* + * Drop our reference to the buf log item. + */ +- atomic_dec(&bip->bli_refcount); ++ freed = atomic_dec_and_test(&bip->bli_refcount); + + /* +- * If the buf item is not tracking data in the log, then +- * we must free it before releasing the buffer back to the +- * free pool. Before releasing the buffer to the free pool, +- * clear the transaction pointer in b_fsprivate2 to dissolve +- * its relation to this transaction. +- */ +- if (!xfs_buf_item_dirty(bip)) { ++ * If the buf item is not tracking data in the log, then we must free it ++ * before releasing the buffer back to the free pool. ++ * ++ * If the fs has shutdown and we dropped the last reference, it may fall ++ * on us to release a (possibly dirty) bli if it never made it to the ++ * AIL (e.g., the aborted unpin already happened and didn't release it ++ * due to our reference). Since we're already shutdown and need xa_lock, ++ * just force remove from the AIL and release the bli here. ++ */ ++ if (XFS_FORCED_SHUTDOWN(tp->t_mountp) && freed) { ++ xfs_trans_ail_remove(&bip->bli_item, SHUTDOWN_LOG_IO_ERROR); ++ xfs_buf_item_relse(bp); ++ } else if (!xfs_buf_item_dirty(bip)) { + /*** + ASSERT(bp->b_pincount == 0); + ***/ diff --git a/queue-4.9/xfs-remove-bli-from-ail-before-release-on-transaction-abort.patch b/queue-4.9/xfs-remove-bli-from-ail-before-release-on-transaction-abort.patch new file mode 100644 index 00000000000..0c8d3ca7b4a --- /dev/null +++ b/queue-4.9/xfs-remove-bli-from-ail-before-release-on-transaction-abort.patch @@ -0,0 +1,79 @@ +From 3d4b4a3e30ae7a949c31e1e10268a3da4723d290 Mon Sep 17 00:00:00 2001 +From: Brian Foster +Date: Wed, 14 Jun 2017 21:35:35 -0700 +Subject: xfs: remove bli from AIL before release on transaction abort + +From: Brian Foster + +commit 3d4b4a3e30ae7a949c31e1e10268a3da4723d290 upstream. + +When a buffer is modified, logged and committed, it ultimately ends +up sitting on the AIL with a dirty bli waiting for metadata +writeback. If another transaction locks and invalidates the buffer +(freeing an inode chunk, for example) in the meantime, the bli is +flagged as stale, the dirty state is cleared and the bli remains in +the AIL. 
+ +If a shutdown occurs before the transaction that has invalidated the +buffer is committed, the transaction is ultimately aborted. The log +items are flagged as such and ->iop_unlock() handles the aborted +items. Because the bli is clean (due to the invalidation), +->iop_unlock() unconditionally releases it. The log item may still +reside in the AIL, however, which means the I/O completion handler +may still run and attempt to access it. This results in assert +failure due to the release of the bli while still present in the AIL +and a subsequent NULL dereference and panic in the buffer I/O +completion handling. This can be reproduced by running generic/388 +in repetition. + +To avoid this problem, update xfs_buf_item_unlock() to first check +whether the bli is aborted and if so, remove it from the AIL before +it is released. This ensures that the bli is no longer accessed +during the shutdown sequence after it has been freed. + +Signed-off-by: Brian Foster +Reviewed-by: Christoph Hellwig +Reviewed-by: Carlos Maiolino +Reviewed-by: Darrick J. Wong +Signed-off-by: Darrick J. Wong +Signed-off-by: Greg Kroah-Hartman + +--- + fs/xfs/xfs_buf_item.c | 21 ++++++++++++--------- + 1 file changed, 12 insertions(+), 9 deletions(-) + +--- a/fs/xfs/xfs_buf_item.c ++++ b/fs/xfs/xfs_buf_item.c +@@ -636,20 +636,23 @@ xfs_buf_item_unlock( + + /* + * Clean buffers, by definition, cannot be in the AIL. However, aborted +- * buffers may be dirty and hence in the AIL. Therefore if we are +- * aborting a buffer and we've just taken the last refernce away, we +- * have to check if it is in the AIL before freeing it. We need to free +- * it in this case, because an aborted transaction has already shut the +- * filesystem down and this is the last chance we will have to do so. ++ * buffers may be in the AIL regardless of dirty state. An aborted ++ * transaction that invalidates a buffer already in the AIL may have ++ * marked it stale and cleared the dirty state, for example. ++ * ++ * Therefore if we are aborting a buffer and we've just taken the last ++ * reference away, we have to check if it is in the AIL before freeing ++ * it. We need to free it in this case, because an aborted transaction ++ * has already shut the filesystem down and this is the last chance we ++ * will have to do so. + */ + if (atomic_dec_and_test(&bip->bli_refcount)) { +- if (clean) +- xfs_buf_item_relse(bp); +- else if (aborted) { ++ if (aborted) { + ASSERT(XFS_FORCED_SHUTDOWN(lip->li_mountp)); + xfs_trans_ail_remove(lip, SHUTDOWN_LOG_IO_ERROR); + xfs_buf_item_relse(bp); +- } ++ } else if (clean) ++ xfs_buf_item_relse(bp); + } + + if (!(flags & XFS_BLI_HOLD)) diff --git a/queue-4.9/xfs-set-firstfsb-to-nullfsblock-before-feeding-it-to-_bmapi_write.patch b/queue-4.9/xfs-set-firstfsb-to-nullfsblock-before-feeding-it-to-_bmapi_write.patch new file mode 100644 index 00000000000..dc8ed599aef --- /dev/null +++ b/queue-4.9/xfs-set-firstfsb-to-nullfsblock-before-feeding-it-to-_bmapi_write.patch @@ -0,0 +1,56 @@ +From hch@lst.de Mon Sep 18 10:08:30 2017 +From: Christoph Hellwig +Date: Sun, 17 Sep 2017 14:06:37 -0700 +Subject: xfs: set firstfsb to NULLFSBLOCK before feeding it to _bmapi_write +To: stable@vger.kernel.org +Cc: linux-xfs@vger.kernel.org, "Darrick J. Wong" +Message-ID: <20170917210712.10804-13-hch@lst.de> + + +From: "Darrick J. Wong" + +commit 4c1a67bd3606540b9b42caff34a1d5cd94b1cf65 upstream. 
+ +We must initialize the firstfsb parameter to _bmapi_write so that it +doesn't incorrectly treat stack garbage as a restriction on which AGs +it can search for free space. + +Fixes-coverity-id: 1402025 +Fixes-coverity-id: 1415167 +Signed-off-by: Darrick J. Wong +Reviewed-by: Brian Foster +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/libxfs/xfs_bmap.c | 9 +++++++++ + fs/xfs/xfs_reflink.c | 2 +- + 2 files changed, 10 insertions(+), 1 deletion(-) + +--- a/fs/xfs/libxfs/xfs_bmap.c ++++ b/fs/xfs/libxfs/xfs_bmap.c +@@ -6639,6 +6639,15 @@ xfs_bmap_finish_one( + bmap.br_blockcount = *blockcount; + bmap.br_state = state; + ++ /* ++ * firstfsb is tied to the transaction lifetime and is used to ++ * ensure correct AG locking order and schedule work item ++ * continuations. XFS_BUI_MAX_FAST_EXTENTS (== 1) restricts us ++ * to only making one bmap call per transaction, so it should ++ * be safe to have it as a local variable here. ++ */ ++ firstfsb = NULLFSBLOCK; ++ + trace_xfs_bmap_deferred(tp->t_mountp, + XFS_FSB_TO_AGNO(tp->t_mountp, startblock), type, + XFS_FSB_TO_AGBNO(tp->t_mountp, startblock), +--- a/fs/xfs/xfs_reflink.c ++++ b/fs/xfs/xfs_reflink.c +@@ -333,7 +333,7 @@ xfs_reflink_convert_cow_extent( + struct xfs_defer_ops *dfops) + { + struct xfs_bmbt_irec irec = *imap; +- xfs_fsblock_t first_block; ++ xfs_fsblock_t first_block = NULLFSBLOCK; + int nimaps = 1; + + if (imap->br_state == XFS_EXT_NORM) diff --git a/queue-4.9/xfs-try-to-avoid-blowing-out-the-transaction-reservation-when-bunmaping-a-shared-extent.patch b/queue-4.9/xfs-try-to-avoid-blowing-out-the-transaction-reservation-when-bunmaping-a-shared-extent.patch new file mode 100644 index 00000000000..3015622b147 --- /dev/null +++ b/queue-4.9/xfs-try-to-avoid-blowing-out-the-transaction-reservation-when-bunmaping-a-shared-extent.patch @@ -0,0 +1,294 @@ +From e1a4e37cc7b665b6804fba812aca2f4d7402c249 Mon Sep 17 00:00:00 2001 +From: "Darrick J. Wong" +Date: Wed, 14 Jun 2017 21:25:57 -0700 +Subject: xfs: try to avoid blowing out the transaction reservation when bunmaping a shared extent + +From: Darrick J. Wong + +commit e1a4e37cc7b665b6804fba812aca2f4d7402c249 upstream. + +In a pathological scenario where we are trying to bunmapi a single +extent in which every other block is shared, it's possible that trying +to unmap the entire large extent in a single transaction can generate so +many EFIs that we overflow the transaction reservation. + +Therefore, use a heuristic to guess at the number of blocks we can +safely unmap from a reflink file's data fork in an single transaction. +This should prevent problems such as the log head slamming into the tail +and ASSERTs that trigger because we've exceeded the transaction +reservation. + +Note that since bunmapi can fail to unmap the entire range, we must also +teach the deferred unmap code to roll into a new transaction whenever we +get low on reservation. + +Signed-off-by: Darrick J. 
Wong +[hch: random edits, all bugs are my fault] +Signed-off-by: Christoph Hellwig +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/libxfs/xfs_bmap.c | 37 ++++++++++++++++++++++++++++--------- + fs/xfs/libxfs/xfs_bmap.h | 2 +- + fs/xfs/libxfs/xfs_refcount.c | 10 +--------- + fs/xfs/libxfs/xfs_refcount.h | 16 ++++++++++++++++ + fs/xfs/xfs_bmap_item.c | 17 +++++++++++++++-- + fs/xfs/xfs_trans.h | 2 +- + fs/xfs/xfs_trans_bmap.c | 11 +++++++++-- + 7 files changed, 71 insertions(+), 24 deletions(-) + +--- a/fs/xfs/libxfs/xfs_bmap.c ++++ b/fs/xfs/libxfs/xfs_bmap.c +@@ -5555,6 +5555,7 @@ __xfs_bunmapi( + int whichfork; /* data or attribute fork */ + xfs_fsblock_t sum; + xfs_filblks_t len = *rlen; /* length to unmap in file */ ++ xfs_fileoff_t max_len; + + trace_xfs_bunmap(ip, bno, len, flags, _RET_IP_); + +@@ -5576,6 +5577,16 @@ __xfs_bunmapi( + ASSERT(len > 0); + ASSERT(nexts >= 0); + ++ /* ++ * Guesstimate how many blocks we can unmap without running the risk of ++ * blowing out the transaction with a mix of EFIs and reflink ++ * adjustments. ++ */ ++ if (xfs_is_reflink_inode(ip) && whichfork == XFS_DATA_FORK) ++ max_len = min(len, xfs_refcount_max_unmap(tp->t_log_res)); ++ else ++ max_len = len; ++ + if (!(ifp->if_flags & XFS_IFEXTENTS) && + (error = xfs_iread_extents(tp, ip, whichfork))) + return error; +@@ -5621,7 +5632,7 @@ __xfs_bunmapi( + + extno = 0; + while (bno != (xfs_fileoff_t)-1 && bno >= start && lastx >= 0 && +- (nexts == 0 || extno < nexts)) { ++ (nexts == 0 || extno < nexts) && max_len > 0) { + /* + * Is the found extent after a hole in which bno lives? + * Just back up to the previous extent, if so. +@@ -5655,6 +5666,15 @@ __xfs_bunmapi( + } + if (del.br_startoff + del.br_blockcount > bno + 1) + del.br_blockcount = bno + 1 - del.br_startoff; ++ ++ /* How much can we safely unmap? 
*/ ++ if (max_len < del.br_blockcount) { ++ del.br_startoff += del.br_blockcount - max_len; ++ if (!wasdel) ++ del.br_startblock += del.br_blockcount - max_len; ++ del.br_blockcount = max_len; ++ } ++ + sum = del.br_startblock + del.br_blockcount; + if (isrt && + (mod = do_mod(sum, mp->m_sb.sb_rextsize))) { +@@ -5835,6 +5855,7 @@ __xfs_bunmapi( + if (!isrt && wasdel) + xfs_mod_fdblocks(mp, (int64_t)del.br_blockcount, false); + ++ max_len -= del.br_blockcount; + bno = del.br_startoff - 1; + nodelete: + /* +@@ -6604,25 +6625,24 @@ xfs_bmap_finish_one( + int whichfork, + xfs_fileoff_t startoff, + xfs_fsblock_t startblock, +- xfs_filblks_t blockcount, ++ xfs_filblks_t *blockcount, + xfs_exntst_t state) + { + struct xfs_bmbt_irec bmap; + int nimaps = 1; + xfs_fsblock_t firstfsb; + int flags = XFS_BMAPI_REMAP; +- int done; + int error = 0; + + bmap.br_startblock = startblock; + bmap.br_startoff = startoff; +- bmap.br_blockcount = blockcount; ++ bmap.br_blockcount = *blockcount; + bmap.br_state = state; + + trace_xfs_bmap_deferred(tp->t_mountp, + XFS_FSB_TO_AGNO(tp->t_mountp, startblock), type, + XFS_FSB_TO_AGBNO(tp->t_mountp, startblock), +- ip->i_ino, whichfork, startoff, blockcount, state); ++ ip->i_ino, whichfork, startoff, *blockcount, state); + + if (whichfork != XFS_DATA_FORK && whichfork != XFS_ATTR_FORK) + return -EFSCORRUPTED; +@@ -6641,12 +6661,11 @@ xfs_bmap_finish_one( + bmap.br_blockcount, flags, &firstfsb, + bmap.br_blockcount, &bmap, &nimaps, + dfops); ++ *blockcount = 0; + break; + case XFS_BMAP_UNMAP: +- error = xfs_bunmapi(tp, ip, bmap.br_startoff, +- bmap.br_blockcount, flags, 1, &firstfsb, +- dfops, &done); +- ASSERT(done); ++ error = __xfs_bunmapi(tp, ip, startoff, blockcount, ++ XFS_BMAPI_REMAP, 1, &firstfsb, dfops); + break; + default: + ASSERT(0); +--- a/fs/xfs/libxfs/xfs_bmap.h ++++ b/fs/xfs/libxfs/xfs_bmap.h +@@ -265,7 +265,7 @@ struct xfs_bmap_intent { + int xfs_bmap_finish_one(struct xfs_trans *tp, struct xfs_defer_ops *dfops, + struct xfs_inode *ip, enum xfs_bmap_intent_type type, + int whichfork, xfs_fileoff_t startoff, xfs_fsblock_t startblock, +- xfs_filblks_t blockcount, xfs_exntst_t state); ++ xfs_filblks_t *blockcount, xfs_exntst_t state); + int xfs_bmap_map_extent(struct xfs_mount *mp, struct xfs_defer_ops *dfops, + struct xfs_inode *ip, struct xfs_bmbt_irec *imap); + int xfs_bmap_unmap_extent(struct xfs_mount *mp, struct xfs_defer_ops *dfops, +--- a/fs/xfs/libxfs/xfs_refcount.c ++++ b/fs/xfs/libxfs/xfs_refcount.c +@@ -784,14 +784,6 @@ xfs_refcount_merge_extents( + } + + /* +- * While we're adjusting the refcounts records of an extent, we have +- * to keep an eye on the number of extents we're dirtying -- run too +- * many in a single transaction and we'll exceed the transaction's +- * reservation and crash the fs. Each record adds 12 bytes to the +- * log (plus any key updates) so we'll conservatively assume 24 bytes +- * per record. We must also leave space for btree splits on both ends +- * of the range and space for the CUD and a new CUI. +- * + * XXX: This is a pretty hand-wavy estimate. The penalty for guessing + * true incorrectly is a shutdown FS; the penalty for guessing false + * incorrectly is more transaction rolls than might be necessary. 
+@@ -822,7 +814,7 @@ xfs_refcount_still_have_space( + else if (overhead > cur->bc_tp->t_log_res) + return false; + return cur->bc_tp->t_log_res - overhead > +- cur->bc_private.a.priv.refc.nr_ops * 32; ++ cur->bc_private.a.priv.refc.nr_ops * XFS_REFCOUNT_ITEM_OVERHEAD; + } + + /* +--- a/fs/xfs/libxfs/xfs_refcount.h ++++ b/fs/xfs/libxfs/xfs_refcount.h +@@ -67,4 +67,20 @@ extern int xfs_refcount_free_cow_extent( + extern int xfs_refcount_recover_cow_leftovers(struct xfs_mount *mp, + xfs_agnumber_t agno); + ++/* ++ * While we're adjusting the refcounts records of an extent, we have ++ * to keep an eye on the number of extents we're dirtying -- run too ++ * many in a single transaction and we'll exceed the transaction's ++ * reservation and crash the fs. Each record adds 12 bytes to the ++ * log (plus any key updates) so we'll conservatively assume 32 bytes ++ * per record. We must also leave space for btree splits on both ends ++ * of the range and space for the CUD and a new CUI. ++ */ ++#define XFS_REFCOUNT_ITEM_OVERHEAD 32 ++ ++static inline xfs_fileoff_t xfs_refcount_max_unmap(int log_res) ++{ ++ return (log_res * 3 / 4) / XFS_REFCOUNT_ITEM_OVERHEAD; ++} ++ + #endif /* __XFS_REFCOUNT_H__ */ +--- a/fs/xfs/xfs_bmap_item.c ++++ b/fs/xfs/xfs_bmap_item.c +@@ -395,6 +395,7 @@ xfs_bui_recover( + struct xfs_map_extent *bmap; + xfs_fsblock_t startblock_fsb; + xfs_fsblock_t inode_fsb; ++ xfs_filblks_t count; + bool op_ok; + struct xfs_bud_log_item *budp; + enum xfs_bmap_intent_type type; +@@ -403,6 +404,7 @@ xfs_bui_recover( + struct xfs_trans *tp; + struct xfs_inode *ip = NULL; + struct xfs_defer_ops dfops; ++ struct xfs_bmbt_irec irec; + xfs_fsblock_t firstfsb; + + ASSERT(!test_bit(XFS_BUI_RECOVERED, &buip->bui_flags)); +@@ -480,13 +482,24 @@ xfs_bui_recover( + } + xfs_trans_ijoin(tp, ip, 0); + ++ count = bmap->me_len; + error = xfs_trans_log_finish_bmap_update(tp, budp, &dfops, type, + ip, whichfork, bmap->me_startoff, +- bmap->me_startblock, bmap->me_len, +- state); ++ bmap->me_startblock, &count, state); + if (error) + goto err_dfops; + ++ if (count > 0) { ++ ASSERT(type == XFS_BMAP_UNMAP); ++ irec.br_startblock = bmap->me_startblock; ++ irec.br_blockcount = count; ++ irec.br_startoff = bmap->me_startoff; ++ irec.br_state = state; ++ error = xfs_bmap_unmap_extent(tp->t_mountp, &dfops, ip, &irec); ++ if (error) ++ goto err_dfops; ++ } ++ + /* Finish transaction, free inodes. 
*/ + error = xfs_defer_finish(&tp, &dfops, NULL); + if (error) +--- a/fs/xfs/xfs_trans.h ++++ b/fs/xfs/xfs_trans.h +@@ -277,6 +277,6 @@ int xfs_trans_log_finish_bmap_update(str + struct xfs_bud_log_item *rudp, struct xfs_defer_ops *dfops, + enum xfs_bmap_intent_type type, struct xfs_inode *ip, + int whichfork, xfs_fileoff_t startoff, xfs_fsblock_t startblock, +- xfs_filblks_t blockcount, xfs_exntst_t state); ++ xfs_filblks_t *blockcount, xfs_exntst_t state); + + #endif /* __XFS_TRANS_H__ */ +--- a/fs/xfs/xfs_trans_bmap.c ++++ b/fs/xfs/xfs_trans_bmap.c +@@ -63,7 +63,7 @@ xfs_trans_log_finish_bmap_update( + int whichfork, + xfs_fileoff_t startoff, + xfs_fsblock_t startblock, +- xfs_filblks_t blockcount, ++ xfs_filblks_t *blockcount, + xfs_exntst_t state) + { + int error; +@@ -196,16 +196,23 @@ xfs_bmap_update_finish_item( + void **state) + { + struct xfs_bmap_intent *bmap; ++ xfs_filblks_t count; + int error; + + bmap = container_of(item, struct xfs_bmap_intent, bi_list); ++ count = bmap->bi_bmap.br_blockcount; + error = xfs_trans_log_finish_bmap_update(tp, done_item, dop, + bmap->bi_type, + bmap->bi_owner, bmap->bi_whichfork, + bmap->bi_bmap.br_startoff, + bmap->bi_bmap.br_startblock, +- bmap->bi_bmap.br_blockcount, ++ &count, + bmap->bi_bmap.br_state); ++ if (!error && count > 0) { ++ ASSERT(bmap->bi_type == XFS_BMAP_UNMAP); ++ bmap->bi_bmap.br_blockcount = count; ++ return -EAGAIN; ++ } + kmem_free(bmap); + return error; + }