x86-fsgsbase-64-fully-initialize-fs-and-gs-state-in-start_thread_common.patch
x86-fsgsbase-64-report-fsbase-and-gsbase-correctly-in-core-dumps.patch
x86-switch_to-64-rewrite-fs-gs-switching-yet-again-to-fix-amd-cpus.patch
+xfs-move-handling-of-missing-page-into-one-place-in-xfs_find_get_desired_pgoff.patch
+xfs-fix-spurious-spin_is_locked-assert-failures-on-non-smp-kernels.patch
+xfs-push-buffer-of-flush-locked-dquot-to-avoid-quotacheck-deadlock.patch
+xfs-try-to-avoid-blowing-out-the-transaction-reservation-when-bunmaping-a-shared-extent.patch
+xfs-release-bli-from-transaction-properly-on-fs-shutdown.patch
+xfs-remove-bli-from-ail-before-release-on-transaction-abort.patch
+xfs-don-t-allow-bmap-on-rt-files.patch
+xfs-free-uncommitted-transactions-during-log-recovery.patch
+xfs-free-cowblocks-and-retry-on-buffered-write-enospc.patch
+xfs-don-t-crash-on-unexpected-holes-in-dir-attr-btrees.patch
+xfs-check-_btree_check_block-value.patch
+xfs-set-firstfsb-to-nullfsblock-before-feeding-it-to-_bmapi_write.patch
+xfs-check-_alloc_read_agf-buffer-pointer-before-using.patch
+xfs-fix-quotacheck-dquot-id-overflow-infinite-loop.patch
+xfs-fix-multi-ag-deadlock-in-xfs_bunmapi.patch
+xfs-fix-per-inode-dax-flag-inheritance.patch
+xfs-fix-inobt-inode-allocation-search-optimization.patch
--- /dev/null
+From hch@lst.de Mon Sep 18 10:08:45 2017
+From: Christoph Hellwig <hch@lst.de>
+Date: Sun, 17 Sep 2017 14:06:38 -0700
+Subject: xfs: check _alloc_read_agf buffer pointer before using
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, "Darrick J. Wong" <darrick.wong@oracle.com>
+Message-ID: <20170917210712.10804-14-hch@lst.de>
+
+
+From: "Darrick J. Wong" <darrick.wong@oracle.com>
+
+commit 10479e2dea83d4c421ad05dfc55d918aa8dfc0cd upstream.
+
+In some circumstances, _alloc_read_agf can return an error code of zero
+but also a null AGF buffer pointer. Check for this and jump out.
+
+Fixes-coverity-id: 1415250
+Fixes-coverity-id: 1415320
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Reviewed-by: Brian Foster <bfoster@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/libxfs/xfs_refcount.c | 4 ++++
+ fs/xfs/xfs_reflink.c | 2 ++
+ 2 files changed, 6 insertions(+)
+
+--- a/fs/xfs/libxfs/xfs_refcount.c
++++ b/fs/xfs/libxfs/xfs_refcount.c
+@@ -1640,6 +1640,10 @@ xfs_refcount_recover_cow_leftovers(
+ error = xfs_alloc_read_agf(mp, tp, agno, 0, &agbp);
+ if (error)
+ goto out_trans;
++ if (!agbp) {
++ error = -ENOMEM;
++ goto out_trans;
++ }
+ cur = xfs_refcountbt_init_cursor(mp, tp, agbp, agno, NULL);
+
+ /* Find all the leftover CoW staging extents. */
+--- a/fs/xfs/xfs_reflink.c
++++ b/fs/xfs/xfs_reflink.c
+@@ -169,6 +169,8 @@ xfs_reflink_find_shared(
+ error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agbp);
+ if (error)
+ return error;
++ if (!agbp)
++ return -ENOMEM;
+
+ cur = xfs_refcountbt_init_cursor(mp, NULL, agbp, agno, NULL);
+
--- /dev/null
+From hch@lst.de Mon Sep 18 10:08:11 2017
+From: Christoph Hellwig <hch@lst.de>
+Date: Sun, 17 Sep 2017 14:06:36 -0700
+Subject: xfs: check _btree_check_block value
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, "Darrick J. Wong" <darrick.wong@oracle.com>
+Message-ID: <20170917210712.10804-12-hch@lst.de>
+
+
+From: "Darrick J. Wong" <darrick.wong@oracle.com>
+
+commit 1e86eabe73b73c82e1110c746ed3ec6d5e1c0a0d upstream.
+
+Check the _btree_check_block return value for the firstrec and lastrec
+functions, since we have the ability to signal that the repositioning
+did not succeed.
+
+Fixes-coverity-id: 114067
+Fixes-coverity-id: 114068
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Reviewed-by: Brian Foster <bfoster@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/libxfs/xfs_btree.c | 6 ++++--
+ 1 file changed, 4 insertions(+), 2 deletions(-)
+
+--- a/fs/xfs/libxfs/xfs_btree.c
++++ b/fs/xfs/libxfs/xfs_btree.c
+@@ -714,7 +714,8 @@ xfs_btree_firstrec(
+ * Get the block pointer for this level.
+ */
+ block = xfs_btree_get_block(cur, level, &bp);
+- xfs_btree_check_block(cur, block, level, bp);
++ if (xfs_btree_check_block(cur, block, level, bp))
++ return 0;
+ /*
+ * It's empty, there is no such record.
+ */
+@@ -743,7 +744,8 @@ xfs_btree_lastrec(
+ * Get the block pointer for this level.
+ */
+ block = xfs_btree_get_block(cur, level, &bp);
+- xfs_btree_check_block(cur, block, level, bp);
++ if (xfs_btree_check_block(cur, block, level, bp))
++ return 0;
+ /*
+ * It's empty, there is no such record.
+ */
--- /dev/null
+From 61d819e7bcb7f33da710bf3f5dcb2bcf1e48203c Mon Sep 17 00:00:00 2001
+From: "Darrick J. Wong" <darrick.wong@oracle.com>
+Date: Mon, 19 Jun 2017 13:19:08 -0700
+Subject: xfs: don't allow bmap on rt files
+
+From: Darrick J. Wong <darrick.wong@oracle.com>
+
+commit 61d819e7bcb7f33da710bf3f5dcb2bcf1e48203c upstream.
+
+bmap returns a dumb LBA address but not the block device that goes with
+that LBA. Swapfiles don't care about this and will blindly assume that
+the data volume is the correct blockdev, which is totally bogus for
+files on the rt subvolume. This results in the swap code doing IOs to
+arbitrary locations on the data device(!) if the passed in mapping is a
+realtime file, so just turn off bmap for rt files.
+
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+
+---
+ fs/xfs/xfs_aops.c | 7 +++++--
+ 1 file changed, 5 insertions(+), 2 deletions(-)
+
+--- a/fs/xfs/xfs_aops.c
++++ b/fs/xfs/xfs_aops.c
+@@ -1566,9 +1566,12 @@ xfs_vm_bmap(
+ * The swap code (ab-)uses ->bmap to get a block mapping and then
+ * bypasseѕ the file system for actual I/O. We really can't allow
+ * that on reflinks inodes, so we have to skip out here. And yes,
+- * 0 is the magic code for a bmap error..
++ * 0 is the magic code for a bmap error.
++ *
++ * Since we don't pass back blockdev info, we can't return bmap
++ * information for rt files either.
+ */
+- if (xfs_is_reflink_inode(ip)) {
++ if (xfs_is_reflink_inode(ip) || XFS_IS_REALTIME_INODE(ip)) {
+ xfs_iunlock(ip, XFS_IOLOCK_SHARED);
+ return 0;
+ }
--- /dev/null
+From hch@lst.de Mon Sep 18 10:07:57 2017
+From: Christoph Hellwig <hch@lst.de>
+Date: Sun, 17 Sep 2017 14:06:35 -0700
+Subject: xfs: don't crash on unexpected holes in dir/attr btrees
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, "Darrick J. Wong" <darrick.wong@oracle.com>
+Message-ID: <20170917210712.10804-11-hch@lst.de>
+
+
+From: "Darrick J. Wong" <darrick.wong@oracle.com>
+
+commit cd87d867920155911d0d2e6485b769d853547750 upstream.
+
+In quite a few places we call xfs_da_read_buf with a mappedbno that we
+don't control, then assume that the function passes back either an error
+code or a buffer pointer. Unfortunately, if mappedbno == -2 and bno
+maps to a hole, we get a return code of zero and a NULL buffer, which
+means that we crash if we actually try to use that buffer pointer. This
+happens immediately when we set the buffer type for transaction context.
+
+Therefore, check that we have no error code and a non-NULL bp before
+trying to use bp. This patch is a follow-up to an incomplete fix in
+96a3aefb8ffde231 ("xfs: don't crash if reading a directory results in an
+unexpected hole").
+
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/libxfs/xfs_attr_leaf.c | 2 +-
+ fs/xfs/libxfs/xfs_da_btree.c | 2 +-
+ fs/xfs/libxfs/xfs_dir2_block.c | 2 +-
+ fs/xfs/libxfs/xfs_dir2_leaf.c | 4 ++--
+ 4 files changed, 5 insertions(+), 5 deletions(-)
+
+--- a/fs/xfs/libxfs/xfs_attr_leaf.c
++++ b/fs/xfs/libxfs/xfs_attr_leaf.c
+@@ -351,7 +351,7 @@ xfs_attr3_leaf_read(
+
+ err = xfs_da_read_buf(tp, dp, bno, mappedbno, bpp,
+ XFS_ATTR_FORK, &xfs_attr3_leaf_buf_ops);
+- if (!err && tp)
++ if (!err && tp && *bpp)
+ xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_ATTR_LEAF_BUF);
+ return err;
+ }
+--- a/fs/xfs/libxfs/xfs_da_btree.c
++++ b/fs/xfs/libxfs/xfs_da_btree.c
+@@ -263,7 +263,7 @@ xfs_da3_node_read(
+
+ err = xfs_da_read_buf(tp, dp, bno, mappedbno, bpp,
+ which_fork, &xfs_da3_node_buf_ops);
+- if (!err && tp) {
++ if (!err && tp && *bpp) {
+ struct xfs_da_blkinfo *info = (*bpp)->b_addr;
+ int type;
+
+--- a/fs/xfs/libxfs/xfs_dir2_block.c
++++ b/fs/xfs/libxfs/xfs_dir2_block.c
+@@ -139,7 +139,7 @@ xfs_dir3_block_read(
+
+ err = xfs_da_read_buf(tp, dp, mp->m_dir_geo->datablk, -1, bpp,
+ XFS_DATA_FORK, &xfs_dir3_block_buf_ops);
+- if (!err && tp)
++ if (!err && tp && *bpp)
+ xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_BLOCK_BUF);
+ return err;
+ }
+--- a/fs/xfs/libxfs/xfs_dir2_leaf.c
++++ b/fs/xfs/libxfs/xfs_dir2_leaf.c
+@@ -268,7 +268,7 @@ xfs_dir3_leaf_read(
+
+ err = xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp,
+ XFS_DATA_FORK, &xfs_dir3_leaf1_buf_ops);
+- if (!err && tp)
++ if (!err && tp && *bpp)
+ xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_LEAF1_BUF);
+ return err;
+ }
+@@ -285,7 +285,7 @@ xfs_dir3_leafn_read(
+
+ err = xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp,
+ XFS_DATA_FORK, &xfs_dir3_leafn_buf_ops);
+- if (!err && tp)
++ if (!err && tp && *bpp)
+ xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_LEAFN_BUF);
+ return err;
+ }
--- /dev/null
+From hch@lst.de Mon Sep 18 10:09:48 2017
+From: Christoph Hellwig <hch@lst.de>
+Date: Sun, 17 Sep 2017 14:06:42 -0700
+Subject: xfs: fix inobt inode allocation search optimization
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, Omar Sandoval <osandov@fb.com>, "Darrick J . Wong" <darrick.wong@oracle.com>
+Message-ID: <20170917210712.10804-18-hch@lst.de>
+
+
+From: Omar Sandoval <osandov@fb.com>
+
+commit c44245b3d5435f533ca8346ece65918f84c057f9 upstream.
+
+When we try to allocate a free inode by searching the inobt, we try to
+find the inode nearest the parent inode by searching chunks both left
+and right of the chunk containing the parent. As an optimization, we
+cache the leftmost and rightmost records that we previously searched; if
+we do another allocation with the same parent inode, we'll pick up the
+search where it last left off.
+
+There's a bug in the case where we found a free inode to the left of the
+parent's chunk: we need to update the cached left and right records, but
+because we already reassigned the right record to point to the left, we
+end up assigning the left record to both the cached left and right
+records.
+
+This isn't strictly a correctness problem, but it can result in the next
+allocation rechecking chunks unnecessarily or allocating inodes further
+away from the parent than it needs to. Fix it by swapping the record
+pointer after we update the cached left and right records.
+
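+The ordering bug is easiest to see in isolation. Below is a minimal
+userspace sketch (not from the kernel sources; the variable names only
+mirror the patch) contrasting the buggy and fixed assignment order:
+
+    #include <stdio.h>
+
+    struct irec { unsigned int ir_startino; };
+
+    int main(void)
+    {
+        struct irec rec  = { .ir_startino = 200 };  /* rightmost chunk */
+        struct irec trec = { .ir_startino = 100 };  /* leftmost chunk  */
+        unsigned int pagl_leftrec, pagl_rightrec;
+
+        /* Buggy order: 'rec' is overwritten with 'trec' first, so the
+         * cached right record ends up equal to the left one. */
+        rec = trec;
+        pagl_leftrec  = trec.ir_startino;           /* 100 */
+        pagl_rightrec = rec.ir_startino;            /* 100, want 200 */
+        printf("buggy: left=%u right=%u\n", pagl_leftrec, pagl_rightrec);
+
+        /* Fixed order: update the cache first, then reuse 'rec'. */
+        rec.ir_startino = 200;                      /* reset for the demo */
+        pagl_leftrec  = trec.ir_startino;           /* 100 */
+        pagl_rightrec = rec.ir_startino;            /* 200 */
+        rec = trec;
+        printf("fixed: left=%u right=%u\n", pagl_leftrec, pagl_rightrec);
+        return 0;
+    }
+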
+Fixes: bd169565993b ("xfs: speed up free inode search")
+Signed-off-by: Omar Sandoval <osandov@fb.com>
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/libxfs/xfs_ialloc.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/fs/xfs/libxfs/xfs_ialloc.c
++++ b/fs/xfs/libxfs/xfs_ialloc.c
+@@ -1236,13 +1236,13 @@ xfs_dialloc_ag_inobt(
+
+ /* free inodes to the left? */
+ if (useleft && trec.ir_freecount) {
+- rec = trec;
+ xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+ cur = tcur;
+
+ pag->pagl_leftrec = trec.ir_startino;
+ pag->pagl_rightrec = rec.ir_startino;
+ pag->pagl_pagino = pagino;
++ rec = trec;
+ goto alloc_inode;
+ }
+
--- /dev/null
+From hch@lst.de Mon Sep 18 10:09:24 2017
+From: Christoph Hellwig <hch@lst.de>
+Date: Sun, 17 Sep 2017 14:06:40 -0700
+Subject: xfs: fix multi-AG deadlock in xfs_bunmapi
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, "Darrick J . Wong" <darrick.wong@oracle.com>
+Message-ID: <20170917210712.10804-16-hch@lst.de>
+
+
+commit 5b094d6dac0451ad89b1dc088395c7b399b7e9e8 upstream.
+
+Just like in the allocator we must avoid touching multiple AGs out of
+order when freeing blocks, as freeing still locks the AGF and can cause
+the same AB-BA deadlocks as in the allocation path.
+
+Signed-off-by: Christoph Hellwig <hch@lst.de>
+Reported-by: Nikolay Borisov <n.borisov.lkml@gmail.com>
+Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/libxfs/xfs_bmap.c | 12 ++++++++++++
+ 1 file changed, 12 insertions(+)
+
+--- a/fs/xfs/libxfs/xfs_bmap.c
++++ b/fs/xfs/libxfs/xfs_bmap.c
+@@ -5556,6 +5556,7 @@ __xfs_bunmapi(
+ xfs_fsblock_t sum;
+ xfs_filblks_t len = *rlen; /* length to unmap in file */
+ xfs_fileoff_t max_len;
++ xfs_agnumber_t prev_agno = NULLAGNUMBER, agno;
+
+ trace_xfs_bunmap(ip, bno, len, flags, _RET_IP_);
+
+@@ -5658,6 +5659,17 @@ __xfs_bunmapi(
+ ASSERT(ep != NULL);
+ del = got;
+ wasdel = isnullstartblock(del.br_startblock);
++
++ /*
++ * Make sure we don't touch multiple AGF headers out of order
++ * in a single transaction, as that could cause AB-BA deadlocks.
++ */
++ if (!wasdel) {
++ agno = XFS_FSB_TO_AGNO(mp, del.br_startblock);
++ if (prev_agno != NULLAGNUMBER && prev_agno > agno)
++ break;
++ prev_agno = agno;
++ }
+ if (got.br_startoff < start) {
+ del.br_startoff = start;
+ del.br_blockcount -= start - got.br_startoff;
--- /dev/null
+From hch@lst.de Mon Sep 18 10:09:34 2017
+From: Christoph Hellwig <hch@lst.de>
+Date: Sun, 17 Sep 2017 14:06:41 -0700
+Subject: xfs: Fix per-inode DAX flag inheritance
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, Lukas Czerner <lczerner@redhat.com>, "Darrick J . Wong" <darrick.wong@oracle.com>
+Message-ID: <20170917210712.10804-17-hch@lst.de>
+
+
+From: Lukas Czerner <lczerner@redhat.com>
+
+commit 56bdf855e676f1f2ed7033f288f57dfd315725ba upstream.
+
+According to the commit that implemented per-inode DAX flag:
+commit 58f88ca2df72 ("xfs: introduce per-inode DAX enablement")
+the flag is supposed to act as "inherit flag".
+
+Currently this only works when the parent directory already has a flag
+set in di_flags; otherwise inheritance does not work. This is because
+setting the XFS_DIFLAG2_DAX flag is done in the wrong branch, the one
+designated for di_flags rather than di_flags2.
+
+Fix this by moving the code to branch designated for setting di_flags2,
+which does test for flags in di_flags2.
+
+Fixes: 58f88ca2df72 ("xfs: introduce per-inode DAX enablement")
+Signed-off-by: Lukas Czerner <lczerner@redhat.com>
+Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/xfs_inode.c | 12 +++++++-----
+ 1 file changed, 7 insertions(+), 5 deletions(-)
+
+--- a/fs/xfs/xfs_inode.c
++++ b/fs/xfs/xfs_inode.c
+@@ -881,7 +881,6 @@ xfs_ialloc(
+ case S_IFREG:
+ case S_IFDIR:
+ if (pip && (pip->i_d.di_flags & XFS_DIFLAG_ANY)) {
+- uint64_t di_flags2 = 0;
+ uint di_flags = 0;
+
+ if (S_ISDIR(mode)) {
+@@ -918,20 +917,23 @@ xfs_ialloc(
+ di_flags |= XFS_DIFLAG_NODEFRAG;
+ if (pip->i_d.di_flags & XFS_DIFLAG_FILESTREAM)
+ di_flags |= XFS_DIFLAG_FILESTREAM;
+- if (pip->i_d.di_flags2 & XFS_DIFLAG2_DAX)
+- di_flags2 |= XFS_DIFLAG2_DAX;
+
+ ip->i_d.di_flags |= di_flags;
+- ip->i_d.di_flags2 |= di_flags2;
+ }
+ if (pip &&
+ (pip->i_d.di_flags2 & XFS_DIFLAG2_ANY) &&
+ pip->i_d.di_version == 3 &&
+ ip->i_d.di_version == 3) {
++ uint64_t di_flags2 = 0;
++
+ if (pip->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE) {
+- ip->i_d.di_flags2 |= XFS_DIFLAG2_COWEXTSIZE;
++ di_flags2 |= XFS_DIFLAG2_COWEXTSIZE;
+ ip->i_d.di_cowextsize = pip->i_d.di_cowextsize;
+ }
++ if (pip->i_d.di_flags2 & XFS_DIFLAG2_DAX)
++ di_flags2 |= XFS_DIFLAG2_DAX;
++
++ ip->i_d.di_flags2 |= di_flags2;
+ }
+ /* FALLTHROUGH */
+ case S_IFLNK:
--- /dev/null
+From hch@lst.de Mon Sep 18 10:09:04 2017
+From: Christoph Hellwig <hch@lst.de>
+Date: Sun, 17 Sep 2017 14:06:39 -0700
+Subject: xfs: fix quotacheck dquot id overflow infinite loop
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, Brian Foster <bfoster@redhat.com>, "Darrick J . Wong" <darrick.wong@oracle.com>
+Message-ID: <20170917210712.10804-15-hch@lst.de>
+
+
+From: Brian Foster <bfoster@redhat.com>
+
+commit cfaf2d034360166e569a4929dd83ae9698bed856 upstream.
+
+If a dquot has an id of U32_MAX, the next lookup index increment
+overflows the uint32_t back to 0. This starts the lookup sequence
+over from the beginning, repeats indefinitely and results in a
+livelock.
+
+Update xfs_qm_dquot_walk() to explicitly check for the lookup
+overflow and exit the loop.
+
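+The wraparound itself is plain unsigned arithmetic. Below is a minimal
+userspace sketch (not kernel code) of the overflow and of the kind of
+termination check the patch adds:
+
+    #include <stdint.h>
+    #include <stdio.h>
+
+    int main(void)
+    {
+        uint32_t next_index = UINT32_MAX;   /* id of the last dquot found */
+
+        next_index++;                       /* wraps back to 0 */
+        printf("next_index = %u\n", next_index);
+
+        /* Without an explicit check the walk would restart from id 0 and
+         * never terminate; detect the wrap and stop instead. */
+        if (!next_index)
+            printf("id overflowed, stop the walk\n");
+        return 0;
+    }
+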
+Signed-off-by: Brian Foster <bfoster@redhat.com>
+Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/xfs_qm.c | 3 +++
+ 1 file changed, 3 insertions(+)
+
+--- a/fs/xfs/xfs_qm.c
++++ b/fs/xfs/xfs_qm.c
+@@ -111,6 +111,9 @@ restart:
+ skipped = 0;
+ break;
+ }
++ /* we're done if id overflows back to zero */
++ if (!next_index)
++ break;
+ }
+
+ if (skipped) {
--- /dev/null
+From 95989c46d2a156365867b1d795fdefce71bce378 Mon Sep 17 00:00:00 2001
+From: Brian Foster <bfoster@redhat.com>
+Date: Thu, 8 Jun 2017 08:23:07 -0700
+Subject: xfs: fix spurious spin_is_locked() assert failures on non-smp kernels
+
+From: Brian Foster <bfoster@redhat.com>
+
+commit 95989c46d2a156365867b1d795fdefce71bce378 upstream.
+
+The 0-day kernel test robot reports assertion failures on
+!CONFIG_SMP kernels due to failed spin_is_locked() checks. As it
+turns out, spin_is_locked() is hardcoded to return zero on
+!CONFIG_SMP kernels and so this function cannot be relied on to
+verify spinlock state in this configuration.
+
+To avoid this problem, replace the associated asserts with lockdep
+variants that do the right thing regardless of kernel configuration.
+Drop the one assert that checks for an unlocked lock as there is no
+suitable lockdep variant for that case. This moves the spinlock
+checks from XFS debug code to lockdep, but generally provides the
+same level of protection.
+
+Reported-by: kbuild test robot <fengguang.wu@intel.com>
+Signed-off-by: Brian Foster <bfoster@redhat.com>
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/xfs/xfs_buf.c | 2 +-
+ fs/xfs/xfs_icache.c | 5 ++---
+ 2 files changed, 3 insertions(+), 4 deletions(-)
+
+--- a/fs/xfs/xfs_buf.c
++++ b/fs/xfs/xfs_buf.c
+@@ -116,7 +116,7 @@ static inline void
+ __xfs_buf_ioacct_dec(
+ struct xfs_buf *bp)
+ {
+- ASSERT(spin_is_locked(&bp->b_lock));
++ lockdep_assert_held(&bp->b_lock);
+
+ if (bp->b_state & XFS_BSTATE_IN_FLIGHT) {
+ bp->b_state &= ~XFS_BSTATE_IN_FLIGHT;
+--- a/fs/xfs/xfs_icache.c
++++ b/fs/xfs/xfs_icache.c
+@@ -66,7 +66,6 @@ xfs_inode_alloc(
+
+ XFS_STATS_INC(mp, vn_active);
+ ASSERT(atomic_read(&ip->i_pincount) == 0);
+- ASSERT(!spin_is_locked(&ip->i_flags_lock));
+ ASSERT(!xfs_isiflocked(ip));
+ ASSERT(ip->i_ino == 0);
+
+@@ -192,7 +191,7 @@ xfs_perag_set_reclaim_tag(
+ {
+ struct xfs_mount *mp = pag->pag_mount;
+
+- ASSERT(spin_is_locked(&pag->pag_ici_lock));
++ lockdep_assert_held(&pag->pag_ici_lock);
+ if (pag->pag_ici_reclaimable++)
+ return;
+
+@@ -214,7 +213,7 @@ xfs_perag_clear_reclaim_tag(
+ {
+ struct xfs_mount *mp = pag->pag_mount;
+
+- ASSERT(spin_is_locked(&pag->pag_ici_lock));
++ lockdep_assert_held(&pag->pag_ici_lock);
+ if (--pag->pag_ici_reclaimable)
+ return;
+
--- /dev/null
+From hch@lst.de Mon Sep 18 10:07:40 2017
+From: Christoph Hellwig <hch@lst.de>
+Date: Sun, 17 Sep 2017 14:06:34 -0700
+Subject: xfs: free cowblocks and retry on buffered write ENOSPC
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, Brian Foster <bfoster@redhat.com>, "Darrick J . Wong" <darrick.wong@oracle.com>
+Message-ID: <20170917210712.10804-10-hch@lst.de>
+
+
+From: Brian Foster <bfoster@redhat.com>
+
+commit cf2cb7845d6e101cb17bd62f8aa08cd514fc8988 upstream.
+
+XFS runs an eofblocks reclaim scan before returning an ENOSPC error to
+userspace for buffered writes. This facilitates aggressive speculative
+preallocation without causing user visible side effects such as
+premature ENOSPC.
+
+Run a cowblocks scan in the same situation to reclaim lingering COW fork
+preallocation throughout the filesystem.
+
+Signed-off-by: Brian Foster <bfoster@redhat.com>
+Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/xfs_file.c | 1 +
+ 1 file changed, 1 insertion(+)
+
+--- a/fs/xfs/xfs_file.c
++++ b/fs/xfs/xfs_file.c
+@@ -729,6 +729,7 @@ write_retry:
+ xfs_rw_iunlock(ip, iolock);
+ eofb.eof_flags = XFS_EOF_FLAGS_SYNC;
+ xfs_icache_free_eofblocks(ip->i_mount, &eofb);
++ xfs_icache_free_cowblocks(ip->i_mount, &eofb);
+ goto write_retry;
+ }
+
--- /dev/null
+From hch@lst.de Mon Sep 18 10:07:17 2017
+From: Christoph Hellwig <hch@lst.de>
+Date: Sun, 17 Sep 2017 14:06:33 -0700
+Subject: xfs: free uncommitted transactions during log recovery
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, Brian Foster <bfoster@redhat.com>, "Darrick J . Wong" <darrick.wong@oracle.com>
+
+From: Brian Foster <bfoster@redhat.com>
+
+commit 39775431f82f890f4aaa08860a30883d081bffc7 upstream.
+
+Log recovery allocates in-core transaction and member item data
+structures on-demand as it processes the on-disk log. Transactions
+are allocated on first encounter on-disk and stored in a hash table
+structure where they are easily accessible for subsequent lookups.
+Transaction items are also allocated on demand and are attached to
+the associated transactions.
+
+When a commit record is encountered in the log, the transaction is
+committed to the fs and the in-core structures are freed. If a
+filesystem crashes or shuts down before all in-core log buffers are
+flushed to the log, however, not all transactions may have commit
+records in the log. As expected, the modifications in such an
+incomplete transaction are not replayed to the fs. The in-core data
+structures for the partial transaction are never freed, however,
+resulting in a memory leak.
+
+Update xlog_do_recovery_pass() to first correctly initialize the
+hash table array so empty lists can be distinguished from populated
+lists on function exit. Update xlog_recover_free_trans() to always
+remove the transaction from the list prior to freeing the associated
+memory. Finally, walk the hash table of transaction lists as the
+last step before it goes out of scope and free any transactions that
+may remain on the lists. This prevents a memory leak of partial
+transactions in the log.
+
+Signed-off-by: Brian Foster <bfoster@redhat.com>
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/xfs_log_recover.c | 21 ++++++++++++++++++++-
+ 1 file changed, 20 insertions(+), 1 deletion(-)
+
+--- a/fs/xfs/xfs_log_recover.c
++++ b/fs/xfs/xfs_log_recover.c
+@@ -4152,7 +4152,7 @@ xlog_recover_commit_trans(
+
+ #define XLOG_RECOVER_COMMIT_QUEUE_MAX 100
+
+- hlist_del(&trans->r_list);
++ hlist_del_init(&trans->r_list);
+
+ error = xlog_recover_reorder_trans(log, trans, pass);
+ if (error)
+@@ -4354,6 +4354,8 @@ xlog_recover_free_trans(
+ xlog_recover_item_t *item, *n;
+ int i;
+
++ hlist_del_init(&trans->r_list);
++
+ list_for_each_entry_safe(item, n, &trans->r_itemq, ri_list) {
+ /* Free the regions in the item. */
+ list_del(&item->ri_list);
+@@ -5222,12 +5224,16 @@ xlog_do_recovery_pass(
+ int error2 = 0;
+ int bblks, split_bblks;
+ int hblks, split_hblks, wrapped_hblks;
++ int i;
+ struct hlist_head rhash[XLOG_RHASH_SIZE];
+ LIST_HEAD (buffer_list);
+
+ ASSERT(head_blk != tail_blk);
+ rhead_blk = 0;
+
++ for (i = 0; i < XLOG_RHASH_SIZE; i++)
++ INIT_HLIST_HEAD(&rhash[i]);
++
+ /*
+ * Read the header of the tail block and get the iclog buffer size from
+ * h_size. Use this to tell how many sectors make up the log header.
+@@ -5464,6 +5470,19 @@ xlog_do_recovery_pass(
+ if (error && first_bad)
+ *first_bad = rhead_blk;
+
++ /*
++ * Transactions are freed at commit time but transactions without commit
++ * records on disk are never committed. Free any that may be left in the
++ * hash table.
++ */
++ for (i = 0; i < XLOG_RHASH_SIZE; i++) {
++ struct hlist_node *tmp;
++ struct xlog_recover *trans;
++
++ hlist_for_each_entry_safe(trans, tmp, &rhash[i], r_list)
++ xlog_recover_free_trans(trans);
++ }
++
+ return error ? error : error2;
+ }
+
--- /dev/null
+From a54fba8f5a0dc36161cacdf2aa90f007f702ec1a Mon Sep 17 00:00:00 2001
+From: Jan Kara <jack@suse.cz>
+Date: Thu, 18 May 2017 16:36:24 -0700
+Subject: xfs: Move handling of missing page into one place in xfs_find_get_desired_pgoff()
+
+From: Jan Kara <jack@suse.cz>
+
+commit a54fba8f5a0dc36161cacdf2aa90f007f702ec1a upstream.
+
+Currently several places in xfs_find_get_desired_pgoff() handle the case
+of a missing page. Handle them all in one place after the loop has
+terminated.
+
+Signed-off-by: Jan Kara <jack@suse.cz>
+Reviewed-by: Brian Foster <bfoster@redhat.com>
+Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/xfs/xfs_file.c | 38 ++++++++------------------------------
+ 1 file changed, 8 insertions(+), 30 deletions(-)
+
+--- a/fs/xfs/xfs_file.c
++++ b/fs/xfs/xfs_file.c
+@@ -1139,29 +1139,8 @@ xfs_find_get_desired_pgoff(
+ want = min_t(pgoff_t, end - index, PAGEVEC_SIZE - 1) + 1;
+ nr_pages = pagevec_lookup(&pvec, inode->i_mapping, index,
+ want);
+- /*
+- * No page mapped into given range. If we are searching holes
+- * and if this is the first time we got into the loop, it means
+- * that the given offset is landed in a hole, return it.
+- *
+- * If we have already stepped through some block buffers to find
+- * holes but they all contains data. In this case, the last
+- * offset is already updated and pointed to the end of the last
+- * mapped page, if it does not reach the endpoint to search,
+- * that means there should be a hole between them.
+- */
+- if (nr_pages == 0) {
+- /* Data search found nothing */
+- if (type == DATA_OFF)
+- break;
+-
+- ASSERT(type == HOLE_OFF);
+- if (lastoff == startoff || lastoff < endoff) {
+- found = true;
+- *offset = lastoff;
+- }
++ if (nr_pages == 0)
+ break;
+- }
+
+ for (i = 0; i < nr_pages; i++) {
+ struct page *page = pvec.pages[i];
+@@ -1227,21 +1206,20 @@ xfs_find_get_desired_pgoff(
+
+ /*
+ * The number of returned pages less than our desired, search
+- * done. In this case, nothing was found for searching data,
+- * but we found a hole behind the last offset.
++ * done.
+ */
+- if (nr_pages < want) {
+- if (type == HOLE_OFF) {
+- *offset = lastoff;
+- found = true;
+- }
++ if (nr_pages < want)
+ break;
+- }
+
+ index = pvec.pages[i - 1]->index + 1;
+ pagevec_release(&pvec);
+ } while (index <= end);
+
++ /* No page at lastoff and we are not done - we found a hole. */
++ if (type == HOLE_OFF && lastoff < endoff) {
++ *offset = lastoff;
++ found = true;
++ }
+ out:
+ pagevec_release(&pvec);
+ return found;
--- /dev/null
+From 7912e7fef2aebe577f0b46d3cba261f2783c5695 Mon Sep 17 00:00:00 2001
+From: Brian Foster <bfoster@redhat.com>
+Date: Wed, 14 Jun 2017 21:21:45 -0700
+Subject: xfs: push buffer of flush locked dquot to avoid quotacheck deadlock
+
+From: Brian Foster <bfoster@redhat.com>
+
+commit 7912e7fef2aebe577f0b46d3cba261f2783c5695 upstream.
+
+Reclaim during quotacheck can lead to deadlocks on the dquot flush
+lock:
+
+ - Quotacheck populates a local delwri queue with the physical dquot
+ buffers.
+ - Quotacheck performs the xfs_qm_dqusage_adjust() bulkstat and
+ dirties all of the dquots.
+ - Reclaim kicks in and attempts to flush a dquot whose buffer is
+   already queued on the quotacheck queue. The flush succeeds but
+ queueing to the reclaim delwri queue fails as the backing buffer is
+ already queued. The flush unlock is now deferred to I/O completion
+ of the buffer from the quotacheck queue.
+ - The dqadjust bulkstat continues and dirties the recently flushed
+ dquot once again.
+ - Quotacheck proceeds to the xfs_qm_flush_one() walk which requires
+ the flush lock to update the backing buffers with the in-core
+ recalculated values. It deadlocks on the redirtied dquot as the
+ flush lock was already acquired by reclaim, but the buffer resides
+ on the local delwri queue which isn't submitted until the end of
+ quotacheck.
+
+This is reproduced by running quotacheck on a filesystem with a
+couple million inodes in low memory (512MB-1GB) situations. This is
+a regression as of commit 43ff2122e6 ("xfs: on-stack delayed write
+buffer lists"), which removed a trylock and buffer I/O submission
+from the quotacheck dquot flush sequence.
+
+Quotacheck first resets and collects the physical dquot buffers in a
+delwri queue. Then, it traverses the filesystem inodes via bulkstat,
+updates the in-core dquots, flushes the corrected dquots to the
+backing buffers and finally submits the delwri queue for I/O. Since
+the backing buffers are queued across the entire quotacheck
+operation, dquot reclaim cannot possibly complete a dquot flush
+before quotacheck completes.
+
+Therefore, quotacheck must submit the buffer for I/O in order to
+cycle the flush lock and flush the dirty in-core dquot to the
+buffer. Add a delwri queue buffer push mechanism to submit an
+individual buffer for I/O without losing the delwri queue status and
+use it from quotacheck to avoid the deadlock. This restores
+quotacheck behavior to as before the regression was introduced.
+
+Reported-by: Martin Svec <martin.svec@zoner.cz>
+Signed-off-by: Brian Foster <bfoster@redhat.com>
+Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/xfs/xfs_buf.c | 60 +++++++++++++++++++++++++++++++++++++++++++++++++++++
+ fs/xfs/xfs_buf.h | 1
+ fs/xfs/xfs_qm.c | 28 +++++++++++++++++++++++-
+ fs/xfs/xfs_trace.h | 1
+ 4 files changed, 89 insertions(+), 1 deletion(-)
+
+--- a/fs/xfs/xfs_buf.c
++++ b/fs/xfs/xfs_buf.c
+@@ -2022,6 +2022,66 @@ xfs_buf_delwri_submit(
+ return error;
+ }
+
++/*
++ * Push a single buffer on a delwri queue.
++ *
++ * The purpose of this function is to submit a single buffer of a delwri queue
++ * and return with the buffer still on the original queue. The waiting delwri
++ * buffer submission infrastructure guarantees transfer of the delwri queue
++ * buffer reference to a temporary wait list. We reuse this infrastructure to
++ * transfer the buffer back to the original queue.
++ *
++ * Note the buffer transitions from the queued state, to the submitted and wait
++ * listed state and back to the queued state during this call. The buffer
++ * locking and queue management logic between _delwri_pushbuf() and
++ * _delwri_queue() guarantee that the buffer cannot be queued to another list
++ * before returning.
++ */
++int
++xfs_buf_delwri_pushbuf(
++ struct xfs_buf *bp,
++ struct list_head *buffer_list)
++{
++ LIST_HEAD (submit_list);
++ int error;
++
++ ASSERT(bp->b_flags & _XBF_DELWRI_Q);
++
++ trace_xfs_buf_delwri_pushbuf(bp, _RET_IP_);
++
++ /*
++ * Isolate the buffer to a new local list so we can submit it for I/O
++ * independently from the rest of the original list.
++ */
++ xfs_buf_lock(bp);
++ list_move(&bp->b_list, &submit_list);
++ xfs_buf_unlock(bp);
++
++ /*
++ * Delwri submission clears the DELWRI_Q buffer flag and returns with
++ * the buffer on the wait list with an associated reference. Rather than
++ * bounce the buffer from a local wait list back to the original list
++ * after I/O completion, reuse the original list as the wait list.
++ */
++ xfs_buf_delwri_submit_buffers(&submit_list, buffer_list);
++
++ /*
++ * The buffer is now under I/O and wait listed as during typical delwri
++ * submission. Lock the buffer to wait for I/O completion. Rather than
++ * remove the buffer from the wait list and release the reference, we
++ * want to return with the buffer queued to the original list. The
++ * buffer already sits on the original list with a wait list reference,
++ * however. If we let the queue inherit that wait list reference, all we
++ * need to do is reset the DELWRI_Q flag.
++ */
++ xfs_buf_lock(bp);
++ error = bp->b_error;
++ bp->b_flags |= _XBF_DELWRI_Q;
++ xfs_buf_unlock(bp);
++
++ return error;
++}
++
+ int __init
+ xfs_buf_init(void)
+ {
+--- a/fs/xfs/xfs_buf.h
++++ b/fs/xfs/xfs_buf.h
+@@ -333,6 +333,7 @@ extern void xfs_buf_delwri_cancel(struct
+ extern bool xfs_buf_delwri_queue(struct xfs_buf *, struct list_head *);
+ extern int xfs_buf_delwri_submit(struct list_head *);
+ extern int xfs_buf_delwri_submit_nowait(struct list_head *);
++extern int xfs_buf_delwri_pushbuf(struct xfs_buf *, struct list_head *);
+
+ /* Buffer Daemon Setup Routines */
+ extern int xfs_buf_init(void);
+--- a/fs/xfs/xfs_qm.c
++++ b/fs/xfs/xfs_qm.c
+@@ -1247,6 +1247,7 @@ xfs_qm_flush_one(
+ struct xfs_dquot *dqp,
+ void *data)
+ {
++ struct xfs_mount *mp = dqp->q_mount;
+ struct list_head *buffer_list = data;
+ struct xfs_buf *bp = NULL;
+ int error = 0;
+@@ -1257,7 +1258,32 @@ xfs_qm_flush_one(
+ if (!XFS_DQ_IS_DIRTY(dqp))
+ goto out_unlock;
+
+- xfs_dqflock(dqp);
++ /*
++ * The only way the dquot is already flush locked by the time quotacheck
++ * gets here is if reclaim flushed it before the dqadjust walk dirtied
++ * it for the final time. Quotacheck collects all dquot bufs in the
++ * local delwri queue before dquots are dirtied, so reclaim can't have
++ * possibly queued it for I/O. The only way out is to push the buffer to
++ * cycle the flush lock.
++ */
++ if (!xfs_dqflock_nowait(dqp)) {
++ /* buf is pinned in-core by delwri list */
++ DEFINE_SINGLE_BUF_MAP(map, dqp->q_blkno,
++ mp->m_quotainfo->qi_dqchunklen);
++ bp = _xfs_buf_find(mp->m_ddev_targp, &map, 1, 0, NULL);
++ if (!bp) {
++ error = -EINVAL;
++ goto out_unlock;
++ }
++ xfs_buf_unlock(bp);
++
++ xfs_buf_delwri_pushbuf(bp, buffer_list);
++ xfs_buf_rele(bp);
++
++ error = -EAGAIN;
++ goto out_unlock;
++ }
++
+ error = xfs_qm_dqflush(dqp, &bp);
+ if (error)
+ goto out_unlock;
+--- a/fs/xfs/xfs_trace.h
++++ b/fs/xfs/xfs_trace.h
+@@ -366,6 +366,7 @@ DEFINE_BUF_EVENT(xfs_buf_iowait_done);
+ DEFINE_BUF_EVENT(xfs_buf_delwri_queue);
+ DEFINE_BUF_EVENT(xfs_buf_delwri_queued);
+ DEFINE_BUF_EVENT(xfs_buf_delwri_split);
++DEFINE_BUF_EVENT(xfs_buf_delwri_pushbuf);
+ DEFINE_BUF_EVENT(xfs_buf_get_uncached);
+ DEFINE_BUF_EVENT(xfs_bdstrat_shut);
+ DEFINE_BUF_EVENT(xfs_buf_item_relse);
--- /dev/null
+From 79e641ce29cfae5b8fc55fb77ac62d11d2d849c0 Mon Sep 17 00:00:00 2001
+From: Brian Foster <bfoster@redhat.com>
+Date: Wed, 14 Jun 2017 21:35:35 -0700
+Subject: xfs: release bli from transaction properly on fs shutdown
+
+From: Brian Foster <bfoster@redhat.com>
+
+commit 79e641ce29cfae5b8fc55fb77ac62d11d2d849c0 upstream.
+
+If a filesystem shutdown occurs with a buffer log item in the CIL
+and a log force occurs, the ->iop_unpin() handler is generally
+expected to tear down the bli properly. This entails freeing the bli
+memory and releasing the associated hold on the buffer so it can be
+released and the filesystem unmounted.
+
+If this sequence occurs while ->bli_refcount is elevated (i.e.,
+another transaction is open and attempting to modify the buffer),
+however, ->iop_unpin() may not be responsible for releasing the bli.
+Instead, the transaction may release the final ->bli_refcount
+reference and thus xfs_trans_brelse() is responsible for tearing
+down the bli.
+
+While xfs_trans_brelse() does drop the reference count, it only
+attempts to release the bli if it is clean (i.e., not in the
+CIL/AIL). If the filesystem is shutdown and the bli is sitting dirty
+in the CIL as noted above, this ends up skipping the last
+opportunity to release the bli. In turn, this leaves the hold on the
+buffer and causes an unmount hang. This can be reproduced by running
+generic/388 in repetition.
+
+Update xfs_trans_brelse() to handle this shutdown corner case
+correctly. If the final bli reference is dropped and the filesystem
+is shutdown, remove the bli from the AIL (if necessary) and release
+the bli to drop the buffer hold and ensure an unmount does not hang.
+
+Signed-off-by: Brian Foster <bfoster@redhat.com>
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Reviewed-by: Carlos Maiolino <cmaiolino@redhat.com>
+Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/xfs/xfs_trans_buf.c | 23 +++++++++++++++--------
+ 1 file changed, 15 insertions(+), 8 deletions(-)
+
+--- a/fs/xfs/xfs_trans_buf.c
++++ b/fs/xfs/xfs_trans_buf.c
+@@ -356,6 +356,7 @@ xfs_trans_brelse(xfs_trans_t *tp,
+ xfs_buf_t *bp)
+ {
+ xfs_buf_log_item_t *bip;
++ int freed;
+
+ /*
+ * Default to a normal brelse() call if the tp is NULL.
+@@ -419,16 +420,22 @@ xfs_trans_brelse(xfs_trans_t *tp,
+ /*
+ * Drop our reference to the buf log item.
+ */
+- atomic_dec(&bip->bli_refcount);
++ freed = atomic_dec_and_test(&bip->bli_refcount);
+
+ /*
+- * If the buf item is not tracking data in the log, then
+- * we must free it before releasing the buffer back to the
+- * free pool. Before releasing the buffer to the free pool,
+- * clear the transaction pointer in b_fsprivate2 to dissolve
+- * its relation to this transaction.
+- */
+- if (!xfs_buf_item_dirty(bip)) {
++ * If the buf item is not tracking data in the log, then we must free it
++ * before releasing the buffer back to the free pool.
++ *
++ * If the fs has shutdown and we dropped the last reference, it may fall
++ * on us to release a (possibly dirty) bli if it never made it to the
++ * AIL (e.g., the aborted unpin already happened and didn't release it
++ * due to our reference). Since we're already shutdown and need xa_lock,
++ * just force remove from the AIL and release the bli here.
++ */
++ if (XFS_FORCED_SHUTDOWN(tp->t_mountp) && freed) {
++ xfs_trans_ail_remove(&bip->bli_item, SHUTDOWN_LOG_IO_ERROR);
++ xfs_buf_item_relse(bp);
++ } else if (!xfs_buf_item_dirty(bip)) {
+ /***
+ ASSERT(bp->b_pincount == 0);
+ ***/
--- /dev/null
+From 3d4b4a3e30ae7a949c31e1e10268a3da4723d290 Mon Sep 17 00:00:00 2001
+From: Brian Foster <bfoster@redhat.com>
+Date: Wed, 14 Jun 2017 21:35:35 -0700
+Subject: xfs: remove bli from AIL before release on transaction abort
+
+From: Brian Foster <bfoster@redhat.com>
+
+commit 3d4b4a3e30ae7a949c31e1e10268a3da4723d290 upstream.
+
+When a buffer is modified, logged and committed, it ultimately ends
+up sitting on the AIL with a dirty bli waiting for metadata
+writeback. If another transaction locks and invalidates the buffer
+(freeing an inode chunk, for example) in the meantime, the bli is
+flagged as stale, the dirty state is cleared and the bli remains in
+the AIL.
+
+If a shutdown occurs before the transaction that has invalidated the
+buffer is committed, the transaction is ultimately aborted. The log
+items are flagged as such and ->iop_unlock() handles the aborted
+items. Because the bli is clean (due to the invalidation),
+->iop_unlock() unconditionally releases it. The log item may still
+reside in the AIL, however, which means the I/O completion handler
+may still run and attempt to access it. This results in assert
+failure due to the release of the bli while still present in the AIL
+and a subsequent NULL dereference and panic in the buffer I/O
+completion handling. This can be reproduced by running generic/388
+in repetition.
+
+To avoid this problem, update xfs_buf_item_unlock() to first check
+whether the bli is aborted and if so, remove it from the AIL before
+it is released. This ensures that the bli is no longer accessed
+during the shutdown sequence after it has been freed.
+
+Signed-off-by: Brian Foster <bfoster@redhat.com>
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Reviewed-by: Carlos Maiolino <cmaiolino@redhat.com>
+Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/xfs/xfs_buf_item.c | 21 ++++++++++++---------
+ 1 file changed, 12 insertions(+), 9 deletions(-)
+
+--- a/fs/xfs/xfs_buf_item.c
++++ b/fs/xfs/xfs_buf_item.c
+@@ -636,20 +636,23 @@ xfs_buf_item_unlock(
+
+ /*
+ * Clean buffers, by definition, cannot be in the AIL. However, aborted
+- * buffers may be dirty and hence in the AIL. Therefore if we are
+- * aborting a buffer and we've just taken the last refernce away, we
+- * have to check if it is in the AIL before freeing it. We need to free
+- * it in this case, because an aborted transaction has already shut the
+- * filesystem down and this is the last chance we will have to do so.
++ * buffers may be in the AIL regardless of dirty state. An aborted
++ * transaction that invalidates a buffer already in the AIL may have
++ * marked it stale and cleared the dirty state, for example.
++ *
++ * Therefore if we are aborting a buffer and we've just taken the last
++ * reference away, we have to check if it is in the AIL before freeing
++ * it. We need to free it in this case, because an aborted transaction
++ * has already shut the filesystem down and this is the last chance we
++ * will have to do so.
+ */
+ if (atomic_dec_and_test(&bip->bli_refcount)) {
+- if (clean)
+- xfs_buf_item_relse(bp);
+- else if (aborted) {
++ if (aborted) {
+ ASSERT(XFS_FORCED_SHUTDOWN(lip->li_mountp));
+ xfs_trans_ail_remove(lip, SHUTDOWN_LOG_IO_ERROR);
+ xfs_buf_item_relse(bp);
+- }
++ } else if (clean)
++ xfs_buf_item_relse(bp);
+ }
+
+ if (!(flags & XFS_BLI_HOLD))
--- /dev/null
+From hch@lst.de Mon Sep 18 10:08:30 2017
+From: Christoph Hellwig <hch@lst.de>
+Date: Sun, 17 Sep 2017 14:06:37 -0700
+Subject: xfs: set firstfsb to NULLFSBLOCK before feeding it to _bmapi_write
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, "Darrick J. Wong" <darrick.wong@oracle.com>
+Message-ID: <20170917210712.10804-13-hch@lst.de>
+
+
+From: "Darrick J. Wong" <darrick.wong@oracle.com>
+
+commit 4c1a67bd3606540b9b42caff34a1d5cd94b1cf65 upstream.
+
+We must initialize the firstfsb parameter to _bmapi_write so that it
+doesn't incorrectly treat stack garbage as a restriction on which AGs
+it can search for free space.
+
+Fixes-coverity-id: 1402025
+Fixes-coverity-id: 1415167
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Reviewed-by: Brian Foster <bfoster@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/libxfs/xfs_bmap.c | 9 +++++++++
+ fs/xfs/xfs_reflink.c | 2 +-
+ 2 files changed, 10 insertions(+), 1 deletion(-)
+
+--- a/fs/xfs/libxfs/xfs_bmap.c
++++ b/fs/xfs/libxfs/xfs_bmap.c
+@@ -6639,6 +6639,15 @@ xfs_bmap_finish_one(
+ bmap.br_blockcount = *blockcount;
+ bmap.br_state = state;
+
++ /*
++ * firstfsb is tied to the transaction lifetime and is used to
++ * ensure correct AG locking order and schedule work item
++ * continuations. XFS_BUI_MAX_FAST_EXTENTS (== 1) restricts us
++ * to only making one bmap call per transaction, so it should
++ * be safe to have it as a local variable here.
++ */
++ firstfsb = NULLFSBLOCK;
++
+ trace_xfs_bmap_deferred(tp->t_mountp,
+ XFS_FSB_TO_AGNO(tp->t_mountp, startblock), type,
+ XFS_FSB_TO_AGBNO(tp->t_mountp, startblock),
+--- a/fs/xfs/xfs_reflink.c
++++ b/fs/xfs/xfs_reflink.c
+@@ -333,7 +333,7 @@ xfs_reflink_convert_cow_extent(
+ struct xfs_defer_ops *dfops)
+ {
+ struct xfs_bmbt_irec irec = *imap;
+- xfs_fsblock_t first_block;
++ xfs_fsblock_t first_block = NULLFSBLOCK;
+ int nimaps = 1;
+
+ if (imap->br_state == XFS_EXT_NORM)
--- /dev/null
+From e1a4e37cc7b665b6804fba812aca2f4d7402c249 Mon Sep 17 00:00:00 2001
+From: "Darrick J. Wong" <darrick.wong@oracle.com>
+Date: Wed, 14 Jun 2017 21:25:57 -0700
+Subject: xfs: try to avoid blowing out the transaction reservation when bunmaping a shared extent
+
+From: Darrick J. Wong <darrick.wong@oracle.com>
+
+commit e1a4e37cc7b665b6804fba812aca2f4d7402c249 upstream.
+
+In a pathological scenario where we are trying to bunmapi a single
+extent in which every other block is shared, it's possible that trying
+to unmap the entire large extent in a single transaction can generate so
+many EFIs that we overflow the transaction reservation.
+
+Therefore, use a heuristic to guess at the number of blocks we can
+safely unmap from a reflink file's data fork in an single transaction.
+This should prevent problems such as the log head slamming into the tail
+and ASSERTs that trigger because we've exceeded the transaction
+reservation.
+
+Note that since bunmapi can fail to unmap the entire range, we must also
+teach the deferred unmap code to roll into a new transaction whenever we
+get low on reservation.
+
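+The heuristic below caps the unmap length at roughly three quarters of
+the transaction log reservation divided by a conservative 32-byte
+per-record overhead. A rough worked example as a userspace sketch (the
+200000-byte reservation is a made-up figure, not taken from the patch):
+
+    #include <stdio.h>
+
+    #define XFS_REFCOUNT_ITEM_OVERHEAD 32   /* bytes assumed per record */
+
+    static long max_unmap(int log_res)      /* mirrors xfs_refcount_max_unmap() */
+    {
+        return (log_res * 3 / 4) / XFS_REFCOUNT_ITEM_OVERHEAD;
+    }
+
+    int main(void)
+    {
+        /* 200000 * 3 / 4 = 150000; 150000 / 32 = 4687 blocks per transaction */
+        printf("%ld\n", max_unmap(200000));
+        return 0;
+    }
+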
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+[hch: random edits, all bugs are my fault]
+Signed-off-by: Christoph Hellwig <hch@lst.de>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/libxfs/xfs_bmap.c | 37 ++++++++++++++++++++++++++++---------
+ fs/xfs/libxfs/xfs_bmap.h | 2 +-
+ fs/xfs/libxfs/xfs_refcount.c | 10 +---------
+ fs/xfs/libxfs/xfs_refcount.h | 16 ++++++++++++++++
+ fs/xfs/xfs_bmap_item.c | 17 +++++++++++++++--
+ fs/xfs/xfs_trans.h | 2 +-
+ fs/xfs/xfs_trans_bmap.c | 11 +++++++++--
+ 7 files changed, 71 insertions(+), 24 deletions(-)
+
+--- a/fs/xfs/libxfs/xfs_bmap.c
++++ b/fs/xfs/libxfs/xfs_bmap.c
+@@ -5555,6 +5555,7 @@ __xfs_bunmapi(
+ int whichfork; /* data or attribute fork */
+ xfs_fsblock_t sum;
+ xfs_filblks_t len = *rlen; /* length to unmap in file */
++ xfs_fileoff_t max_len;
+
+ trace_xfs_bunmap(ip, bno, len, flags, _RET_IP_);
+
+@@ -5576,6 +5577,16 @@ __xfs_bunmapi(
+ ASSERT(len > 0);
+ ASSERT(nexts >= 0);
+
++ /*
++ * Guesstimate how many blocks we can unmap without running the risk of
++ * blowing out the transaction with a mix of EFIs and reflink
++ * adjustments.
++ */
++ if (xfs_is_reflink_inode(ip) && whichfork == XFS_DATA_FORK)
++ max_len = min(len, xfs_refcount_max_unmap(tp->t_log_res));
++ else
++ max_len = len;
++
+ if (!(ifp->if_flags & XFS_IFEXTENTS) &&
+ (error = xfs_iread_extents(tp, ip, whichfork)))
+ return error;
+@@ -5621,7 +5632,7 @@ __xfs_bunmapi(
+
+ extno = 0;
+ while (bno != (xfs_fileoff_t)-1 && bno >= start && lastx >= 0 &&
+- (nexts == 0 || extno < nexts)) {
++ (nexts == 0 || extno < nexts) && max_len > 0) {
+ /*
+ * Is the found extent after a hole in which bno lives?
+ * Just back up to the previous extent, if so.
+@@ -5655,6 +5666,15 @@ __xfs_bunmapi(
+ }
+ if (del.br_startoff + del.br_blockcount > bno + 1)
+ del.br_blockcount = bno + 1 - del.br_startoff;
++
++ /* How much can we safely unmap? */
++ if (max_len < del.br_blockcount) {
++ del.br_startoff += del.br_blockcount - max_len;
++ if (!wasdel)
++ del.br_startblock += del.br_blockcount - max_len;
++ del.br_blockcount = max_len;
++ }
++
+ sum = del.br_startblock + del.br_blockcount;
+ if (isrt &&
+ (mod = do_mod(sum, mp->m_sb.sb_rextsize))) {
+@@ -5835,6 +5855,7 @@ __xfs_bunmapi(
+ if (!isrt && wasdel)
+ xfs_mod_fdblocks(mp, (int64_t)del.br_blockcount, false);
+
++ max_len -= del.br_blockcount;
+ bno = del.br_startoff - 1;
+ nodelete:
+ /*
+@@ -6604,25 +6625,24 @@ xfs_bmap_finish_one(
+ int whichfork,
+ xfs_fileoff_t startoff,
+ xfs_fsblock_t startblock,
+- xfs_filblks_t blockcount,
++ xfs_filblks_t *blockcount,
+ xfs_exntst_t state)
+ {
+ struct xfs_bmbt_irec bmap;
+ int nimaps = 1;
+ xfs_fsblock_t firstfsb;
+ int flags = XFS_BMAPI_REMAP;
+- int done;
+ int error = 0;
+
+ bmap.br_startblock = startblock;
+ bmap.br_startoff = startoff;
+- bmap.br_blockcount = blockcount;
++ bmap.br_blockcount = *blockcount;
+ bmap.br_state = state;
+
+ trace_xfs_bmap_deferred(tp->t_mountp,
+ XFS_FSB_TO_AGNO(tp->t_mountp, startblock), type,
+ XFS_FSB_TO_AGBNO(tp->t_mountp, startblock),
+- ip->i_ino, whichfork, startoff, blockcount, state);
++ ip->i_ino, whichfork, startoff, *blockcount, state);
+
+ if (whichfork != XFS_DATA_FORK && whichfork != XFS_ATTR_FORK)
+ return -EFSCORRUPTED;
+@@ -6641,12 +6661,11 @@ xfs_bmap_finish_one(
+ bmap.br_blockcount, flags, &firstfsb,
+ bmap.br_blockcount, &bmap, &nimaps,
+ dfops);
++ *blockcount = 0;
+ break;
+ case XFS_BMAP_UNMAP:
+- error = xfs_bunmapi(tp, ip, bmap.br_startoff,
+- bmap.br_blockcount, flags, 1, &firstfsb,
+- dfops, &done);
+- ASSERT(done);
++ error = __xfs_bunmapi(tp, ip, startoff, blockcount,
++ XFS_BMAPI_REMAP, 1, &firstfsb, dfops);
+ break;
+ default:
+ ASSERT(0);
+--- a/fs/xfs/libxfs/xfs_bmap.h
++++ b/fs/xfs/libxfs/xfs_bmap.h
+@@ -265,7 +265,7 @@ struct xfs_bmap_intent {
+ int xfs_bmap_finish_one(struct xfs_trans *tp, struct xfs_defer_ops *dfops,
+ struct xfs_inode *ip, enum xfs_bmap_intent_type type,
+ int whichfork, xfs_fileoff_t startoff, xfs_fsblock_t startblock,
+- xfs_filblks_t blockcount, xfs_exntst_t state);
++ xfs_filblks_t *blockcount, xfs_exntst_t state);
+ int xfs_bmap_map_extent(struct xfs_mount *mp, struct xfs_defer_ops *dfops,
+ struct xfs_inode *ip, struct xfs_bmbt_irec *imap);
+ int xfs_bmap_unmap_extent(struct xfs_mount *mp, struct xfs_defer_ops *dfops,
+--- a/fs/xfs/libxfs/xfs_refcount.c
++++ b/fs/xfs/libxfs/xfs_refcount.c
+@@ -784,14 +784,6 @@ xfs_refcount_merge_extents(
+ }
+
+ /*
+- * While we're adjusting the refcounts records of an extent, we have
+- * to keep an eye on the number of extents we're dirtying -- run too
+- * many in a single transaction and we'll exceed the transaction's
+- * reservation and crash the fs. Each record adds 12 bytes to the
+- * log (plus any key updates) so we'll conservatively assume 24 bytes
+- * per record. We must also leave space for btree splits on both ends
+- * of the range and space for the CUD and a new CUI.
+- *
+ * XXX: This is a pretty hand-wavy estimate. The penalty for guessing
+ * true incorrectly is a shutdown FS; the penalty for guessing false
+ * incorrectly is more transaction rolls than might be necessary.
+@@ -822,7 +814,7 @@ xfs_refcount_still_have_space(
+ else if (overhead > cur->bc_tp->t_log_res)
+ return false;
+ return cur->bc_tp->t_log_res - overhead >
+- cur->bc_private.a.priv.refc.nr_ops * 32;
++ cur->bc_private.a.priv.refc.nr_ops * XFS_REFCOUNT_ITEM_OVERHEAD;
+ }
+
+ /*
+--- a/fs/xfs/libxfs/xfs_refcount.h
++++ b/fs/xfs/libxfs/xfs_refcount.h
+@@ -67,4 +67,20 @@ extern int xfs_refcount_free_cow_extent(
+ extern int xfs_refcount_recover_cow_leftovers(struct xfs_mount *mp,
+ xfs_agnumber_t agno);
+
++/*
++ * While we're adjusting the refcounts records of an extent, we have
++ * to keep an eye on the number of extents we're dirtying -- run too
++ * many in a single transaction and we'll exceed the transaction's
++ * reservation and crash the fs. Each record adds 12 bytes to the
++ * log (plus any key updates) so we'll conservatively assume 32 bytes
++ * per record. We must also leave space for btree splits on both ends
++ * of the range and space for the CUD and a new CUI.
++ */
++#define XFS_REFCOUNT_ITEM_OVERHEAD 32
++
++static inline xfs_fileoff_t xfs_refcount_max_unmap(int log_res)
++{
++ return (log_res * 3 / 4) / XFS_REFCOUNT_ITEM_OVERHEAD;
++}
++
+ #endif /* __XFS_REFCOUNT_H__ */
+--- a/fs/xfs/xfs_bmap_item.c
++++ b/fs/xfs/xfs_bmap_item.c
+@@ -395,6 +395,7 @@ xfs_bui_recover(
+ struct xfs_map_extent *bmap;
+ xfs_fsblock_t startblock_fsb;
+ xfs_fsblock_t inode_fsb;
++ xfs_filblks_t count;
+ bool op_ok;
+ struct xfs_bud_log_item *budp;
+ enum xfs_bmap_intent_type type;
+@@ -403,6 +404,7 @@ xfs_bui_recover(
+ struct xfs_trans *tp;
+ struct xfs_inode *ip = NULL;
+ struct xfs_defer_ops dfops;
++ struct xfs_bmbt_irec irec;
+ xfs_fsblock_t firstfsb;
+
+ ASSERT(!test_bit(XFS_BUI_RECOVERED, &buip->bui_flags));
+@@ -480,13 +482,24 @@ xfs_bui_recover(
+ }
+ xfs_trans_ijoin(tp, ip, 0);
+
++ count = bmap->me_len;
+ error = xfs_trans_log_finish_bmap_update(tp, budp, &dfops, type,
+ ip, whichfork, bmap->me_startoff,
+- bmap->me_startblock, bmap->me_len,
+- state);
++ bmap->me_startblock, &count, state);
+ if (error)
+ goto err_dfops;
+
++ if (count > 0) {
++ ASSERT(type == XFS_BMAP_UNMAP);
++ irec.br_startblock = bmap->me_startblock;
++ irec.br_blockcount = count;
++ irec.br_startoff = bmap->me_startoff;
++ irec.br_state = state;
++ error = xfs_bmap_unmap_extent(tp->t_mountp, &dfops, ip, &irec);
++ if (error)
++ goto err_dfops;
++ }
++
+ /* Finish transaction, free inodes. */
+ error = xfs_defer_finish(&tp, &dfops, NULL);
+ if (error)
+--- a/fs/xfs/xfs_trans.h
++++ b/fs/xfs/xfs_trans.h
+@@ -277,6 +277,6 @@ int xfs_trans_log_finish_bmap_update(str
+ struct xfs_bud_log_item *rudp, struct xfs_defer_ops *dfops,
+ enum xfs_bmap_intent_type type, struct xfs_inode *ip,
+ int whichfork, xfs_fileoff_t startoff, xfs_fsblock_t startblock,
+- xfs_filblks_t blockcount, xfs_exntst_t state);
++ xfs_filblks_t *blockcount, xfs_exntst_t state);
+
+ #endif /* __XFS_TRANS_H__ */
+--- a/fs/xfs/xfs_trans_bmap.c
++++ b/fs/xfs/xfs_trans_bmap.c
+@@ -63,7 +63,7 @@ xfs_trans_log_finish_bmap_update(
+ int whichfork,
+ xfs_fileoff_t startoff,
+ xfs_fsblock_t startblock,
+- xfs_filblks_t blockcount,
++ xfs_filblks_t *blockcount,
+ xfs_exntst_t state)
+ {
+ int error;
+@@ -196,16 +196,23 @@ xfs_bmap_update_finish_item(
+ void **state)
+ {
+ struct xfs_bmap_intent *bmap;
++ xfs_filblks_t count;
+ int error;
+
+ bmap = container_of(item, struct xfs_bmap_intent, bi_list);
++ count = bmap->bi_bmap.br_blockcount;
+ error = xfs_trans_log_finish_bmap_update(tp, done_item, dop,
+ bmap->bi_type,
+ bmap->bi_owner, bmap->bi_whichfork,
+ bmap->bi_bmap.br_startoff,
+ bmap->bi_bmap.br_startblock,
+- bmap->bi_bmap.br_blockcount,
++ &count,
+ bmap->bi_bmap.br_state);
++ if (!error && count > 0) {
++ ASSERT(bmap->bi_type == XFS_BMAP_UNMAP);
++ bmap->bi_bmap.br_blockcount = count;
++ return -EAGAIN;
++ }
+ kmem_free(bmap);
+ return error;
+ }