From: Greg Kroah-Hartman Date: Mon, 5 Jun 2017 15:07:47 +0000 (+0200) Subject: 4.11-stable patches X-Git-Tag: v3.18.56~7 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=7de83b38ddf0c5864aaaaa5b8aa36c78fa2d2c60;p=thirdparty%2Fkernel%2Fstable-queue.git 4.11-stable patches added patches: xfs-actually-report-xattr-extents-via-iomap.patch xfs-avoid-mount-time-deadlock-in-cow-extent-recovery.patch xfs-bad-assertion-for-delalloc-an-extent-that-start-at-i_size.patch xfs-bmapx-shouldn-t-barf-on-inline-format-directories.patch xfs-drop-iolock-from-reclaim-context-to-appease-lockdep.patch xfs-fix-indlen-accounting-error-on-partial-delalloc-conversion.patch xfs-fix-integer-truncation-in-xfs_bmap_remap_alloc.patch xfs-fix-off-by-in-in-loop-termination-in-xfs_find_get_desired_pgoff.patch xfs-fix-over-copying-of-getbmap-parameters-from-userspace.patch xfs-fix-unaligned-access-in-xfs_btree_visit_blocks.patch xfs-fix-up-quotacheck-buffer-list-error-handling.patch xfs-fix-use-after-free-in-xfs_finish_page_writeback.patch xfs-handle-array-index-overrun-in-xfs_dir2_leaf_readbuf.patch xfs-prevent-multi-fsb-dir-readahead-from-reading-random-blocks.patch xfs-reserve-enough-blocks-to-handle-btree-splits-when-remapping.patch xfs-support-ability-to-wait-on-new-inodes.patch xfs-update-ag-iterator-to-support-wait-on-new-inodes.patch xfs-wait-on-new-inodes-during-quotaoff-dquot-release.patch xfs-xfs_trans_alloc_empty.patch --- diff --git a/queue-4.11/series b/queue-4.11/series index afca4905ead..7886be6169f 100644 --- a/queue-4.11/series +++ b/queue-4.11/series @@ -94,3 +94,22 @@ xfs-fix-missed-holes-in-seek_hole-implementation.patch xfs-use-b_state-to-fix-buffer-i-o-accounting-release-race.patch xfs-fix-off-by-one-on-max-nr_pages-in-xfs_find_get_desired_pgoff.patch xfs-use-dedicated-log-worker-wq-to-avoid-deadlock-with-cil-wq.patch +xfs-fix-over-copying-of-getbmap-parameters-from-userspace.patch +xfs-actually-report-xattr-extents-via-iomap.patch 
+xfs-drop-iolock-from-reclaim-context-to-appease-lockdep.patch +xfs-fix-integer-truncation-in-xfs_bmap_remap_alloc.patch +xfs-handle-array-index-overrun-in-xfs_dir2_leaf_readbuf.patch +xfs-prevent-multi-fsb-dir-readahead-from-reading-random-blocks.patch +xfs-fix-up-quotacheck-buffer-list-error-handling.patch +xfs-support-ability-to-wait-on-new-inodes.patch +xfs-update-ag-iterator-to-support-wait-on-new-inodes.patch +xfs-wait-on-new-inodes-during-quotaoff-dquot-release.patch +xfs-reserve-enough-blocks-to-handle-btree-splits-when-remapping.patch +xfs-fix-use-after-free-in-xfs_finish_page_writeback.patch +xfs-fix-indlen-accounting-error-on-partial-delalloc-conversion.patch +xfs-bmapx-shouldn-t-barf-on-inline-format-directories.patch +xfs-bad-assertion-for-delalloc-an-extent-that-start-at-i_size.patch +xfs-xfs_trans_alloc_empty.patch +xfs-avoid-mount-time-deadlock-in-cow-extent-recovery.patch +xfs-fix-unaligned-access-in-xfs_btree_visit_blocks.patch +xfs-fix-off-by-in-in-loop-termination-in-xfs_find_get_desired_pgoff.patch diff --git a/queue-4.11/xfs-actually-report-xattr-extents-via-iomap.patch b/queue-4.11/xfs-actually-report-xattr-extents-via-iomap.patch new file mode 100644 index 00000000000..3fc5230e234 --- /dev/null +++ b/queue-4.11/xfs-actually-report-xattr-extents-via-iomap.patch @@ -0,0 +1,37 @@ +From 84358536dc355a9c8978ee425f87e116186bed16 Mon Sep 17 00:00:00 2001 +From: "Darrick J. Wong" +Date: Thu, 6 Apr 2017 16:00:39 -0700 +Subject: xfs: actually report xattr extents via iomap + +From: Darrick J. Wong + +commit 84358536dc355a9c8978ee425f87e116186bed16 upstream. + +Apparently FIEMAP for xattrs has been broken since we switched to +the iomap backend because of an incorrect check for xattr presence. +Also fix the broken locking. + +Signed-off-by: Darrick J. 
Wong +Reviewed-by: Brian Foster +Reviewed-by: Christoph Hellwig +Signed-off-by: Greg Kroah-Hartman + +--- + fs/xfs/xfs_iomap.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +--- a/fs/xfs/xfs_iomap.c ++++ b/fs/xfs/xfs_iomap.c +@@ -1170,10 +1170,10 @@ xfs_xattr_iomap_begin( + if (XFS_FORCED_SHUTDOWN(mp)) + return -EIO; + +- lockmode = xfs_ilock_data_map_shared(ip); ++ lockmode = xfs_ilock_attr_map_shared(ip); + + /* if there are no attribute fork or extents, return ENOENT */ +- if (XFS_IFORK_Q(ip) || !ip->i_d.di_anextents) { ++ if (!XFS_IFORK_Q(ip) || !ip->i_d.di_anextents) { + error = -ENOENT; + goto out_unlock; + } diff --git a/queue-4.11/xfs-avoid-mount-time-deadlock-in-cow-extent-recovery.patch b/queue-4.11/xfs-avoid-mount-time-deadlock-in-cow-extent-recovery.patch new file mode 100644 index 00000000000..ea577bb1b31 --- /dev/null +++ b/queue-4.11/xfs-avoid-mount-time-deadlock-in-cow-extent-recovery.patch @@ -0,0 +1,102 @@ +From 3ecb3ac7b950ff8f6c6a61e8b7b0d6e3546429a0 Mon Sep 17 00:00:00 2001 +From: "Darrick J. Wong" +Date: Mon, 15 May 2017 19:16:15 -0700 +Subject: xfs: avoid mount-time deadlock in CoW extent recovery + +From: Darrick J. Wong + +commit 3ecb3ac7b950ff8f6c6a61e8b7b0d6e3546429a0 upstream. + +If a malicious user corrupts the refcount btree to cause a cycle between +different levels of the tree, the next mount attempt will deadlock in +the CoW recovery routine while grabbing buffer locks. We can use the +ability to re-grab a buffer that was previously locked to a transaction to +avoid deadlocks, so do that here. + +Signed-off-by: Darrick J. 
Wong +Reviewed-by: Brian Foster +Signed-off-by: Greg Kroah-Hartman + +--- + fs/xfs/libxfs/xfs_refcount.c | 43 +++++++++++++++++++++++++++++++------------ + 1 file changed, 31 insertions(+), 12 deletions(-) + +--- a/fs/xfs/libxfs/xfs_refcount.c ++++ b/fs/xfs/libxfs/xfs_refcount.c +@@ -1629,13 +1629,28 @@ xfs_refcount_recover_cow_leftovers( + if (mp->m_sb.sb_agblocks >= XFS_REFC_COW_START) + return -EOPNOTSUPP; + +- error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agbp); ++ INIT_LIST_HEAD(&debris); ++ ++ /* ++ * In this first part, we use an empty transaction to gather up ++ * all the leftover CoW extents so that we can subsequently ++ * delete them. The empty transaction is used to avoid ++ * a buffer lock deadlock if there happens to be a loop in the ++ * refcountbt because we're allowed to re-grab a buffer that is ++ * already attached to our transaction. When we're done ++ * recording the CoW debris we cancel the (empty) transaction ++ * and everything goes away cleanly. ++ */ ++ error = xfs_trans_alloc_empty(mp, &tp); + if (error) + return error; +- cur = xfs_refcountbt_init_cursor(mp, NULL, agbp, agno, NULL); ++ ++ error = xfs_alloc_read_agf(mp, tp, agno, 0, &agbp); ++ if (error) ++ goto out_trans; ++ cur = xfs_refcountbt_init_cursor(mp, tp, agbp, agno, NULL); + + /* Find all the leftover CoW staging extents. */ +- INIT_LIST_HEAD(&debris); + memset(&low, 0, sizeof(low)); + memset(&high, 0, sizeof(high)); + low.rc.rc_startblock = XFS_REFC_COW_START; +@@ -1645,10 +1660,11 @@ xfs_refcount_recover_cow_leftovers( + if (error) + goto out_cursor; + xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); +- xfs_buf_relse(agbp); ++ xfs_trans_brelse(tp, agbp); ++ xfs_trans_cancel(tp); + + /* Now iterate the list to free the leftovers */ +- list_for_each_entry(rr, &debris, rr_list) { ++ list_for_each_entry_safe(rr, n, &debris, rr_list) { + /* Set up transaction. 
*/ + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, 0, 0, 0, &tp); + if (error) +@@ -1676,8 +1692,16 @@ xfs_refcount_recover_cow_leftovers( + error = xfs_trans_commit(tp); + if (error) + goto out_free; ++ ++ list_del(&rr->rr_list); ++ kmem_free(rr); + } + ++ return error; ++out_defer: ++ xfs_defer_cancel(&dfops); ++out_trans: ++ xfs_trans_cancel(tp); + out_free: + /* Free the leftover list */ + list_for_each_entry_safe(rr, n, &debris, rr_list) { +@@ -1688,11 +1712,6 @@ out_free: + + out_cursor: + xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); +- xfs_buf_relse(agbp); +- goto out_free; +- +-out_defer: +- xfs_defer_cancel(&dfops); +- xfs_trans_cancel(tp); +- goto out_free; ++ xfs_trans_brelse(tp, agbp); ++ goto out_trans; + } diff --git a/queue-4.11/xfs-bad-assertion-for-delalloc-an-extent-that-start-at-i_size.patch b/queue-4.11/xfs-bad-assertion-for-delalloc-an-extent-that-start-at-i_size.patch new file mode 100644 index 00000000000..02cb9e8d394 --- /dev/null +++ b/queue-4.11/xfs-bad-assertion-for-delalloc-an-extent-that-start-at-i_size.patch @@ -0,0 +1,46 @@ +From 892d2a5f705723b2cb488bfb38bcbdcf83273184 Mon Sep 17 00:00:00 2001 +From: Zorro Lang +Date: Mon, 15 May 2017 08:40:02 -0700 +Subject: xfs: bad assertion for delalloc an extent that start at i_size + +From: Zorro Lang + +commit 892d2a5f705723b2cb488bfb38bcbdcf83273184 upstream. + +By running fsstress for long enough on RHEL-7, I found an +assertion failure (harder to reproduce on linux-4.11, but the problem +is still there): + + XFS: Assertion failed: (iflags & BMV_IF_DELALLOC) != 0, file: fs/xfs/xfs_bmap_util.c + +The assertion is in the xfs_getbmap() function: + + if (map[i].br_startblock == DELAYSTARTBLOCK && +--> map[i].br_startoff <= XFS_B_TO_FSB(mp, XFS_ISIZE(ip))) + ASSERT((iflags & BMV_IF_DELALLOC) != 0); + +When map[i].br_startoff == XFS_B_TO_FSB(mp, XFS_ISIZE(ip)), the +startoff is just at EOF. But we only need to check delalloc +extents that are within EOF, not those that start at EOF. 
+ +Signed-off-by: Zorro Lang +Reviewed-by: Brian Foster +Reviewed-by: Darrick J. Wong +Signed-off-by: Darrick J. Wong +Signed-off-by: Greg Kroah-Hartman + +--- + fs/xfs/xfs_bmap_util.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/fs/xfs/xfs_bmap_util.c ++++ b/fs/xfs/xfs_bmap_util.c +@@ -717,7 +717,7 @@ xfs_getbmap( + * extents. + */ + if (map[i].br_startblock == DELAYSTARTBLOCK && +- map[i].br_startoff <= XFS_B_TO_FSB(mp, XFS_ISIZE(ip))) ++ map[i].br_startoff < XFS_B_TO_FSB(mp, XFS_ISIZE(ip))) + ASSERT((iflags & BMV_IF_DELALLOC) != 0); + + if (map[i].br_startblock == HOLESTARTBLOCK && diff --git a/queue-4.11/xfs-bmapx-shouldn-t-barf-on-inline-format-directories.patch b/queue-4.11/xfs-bmapx-shouldn-t-barf-on-inline-format-directories.patch new file mode 100644 index 00000000000..9a747b53af7 --- /dev/null +++ b/queue-4.11/xfs-bmapx-shouldn-t-barf-on-inline-format-directories.patch @@ -0,0 +1,42 @@ +From 6eadbf4c8ba816c10d1c97bed9aa861d9fd17809 Mon Sep 17 00:00:00 2001 +From: "Darrick J. Wong" +Date: Fri, 12 May 2017 10:44:08 -0700 +Subject: xfs: BMAPX shouldn't barf on inline-format directories + +From: Darrick J. Wong + +commit 6eadbf4c8ba816c10d1c97bed9aa861d9fd17809 upstream. + +When we're fulfilling a BMAPX request, jump out early if the data fork +is in local format. This prevents us from hitting a debugging check in +bmapi_read and barfing errors back to userspace. The on-disk extent +count check later isn't sufficient for IF_DELALLOC mode because da +extents are in memory and not on disk. + +Signed-off-by: Darrick J. Wong +Reviewed-by: Brian Foster +Reviewed-by: Christoph Hellwig +Signed-off-by: Greg Kroah-Hartman + +--- + fs/xfs/xfs_bmap_util.c | 8 ++++++-- + 1 file changed, 6 insertions(+), 2 deletions(-) + +--- a/fs/xfs/xfs_bmap_util.c ++++ b/fs/xfs/xfs_bmap_util.c +@@ -583,9 +583,13 @@ xfs_getbmap( + } + break; + default: ++ /* Local format data forks report no extents. 
*/ ++ if (ip->i_d.di_format == XFS_DINODE_FMT_LOCAL) { ++ bmv->bmv_entries = 0; ++ return 0; ++ } + if (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS && +- ip->i_d.di_format != XFS_DINODE_FMT_BTREE && +- ip->i_d.di_format != XFS_DINODE_FMT_LOCAL) ++ ip->i_d.di_format != XFS_DINODE_FMT_BTREE) + return -EINVAL; + + if (xfs_get_extsz_hint(ip) || diff --git a/queue-4.11/xfs-drop-iolock-from-reclaim-context-to-appease-lockdep.patch b/queue-4.11/xfs-drop-iolock-from-reclaim-context-to-appease-lockdep.patch new file mode 100644 index 00000000000..215e552bfb5 --- /dev/null +++ b/queue-4.11/xfs-drop-iolock-from-reclaim-context-to-appease-lockdep.patch @@ -0,0 +1,74 @@ +From 3b4683c294095b5f777c03307ef8c60f47320e12 Mon Sep 17 00:00:00 2001 +From: Brian Foster +Date: Tue, 11 Apr 2017 10:50:05 -0700 +Subject: xfs: drop iolock from reclaim context to appease lockdep + +From: Brian Foster + +commit 3b4683c294095b5f777c03307ef8c60f47320e12 upstream. + +Lockdep complains about use of the iolock in inode reclaim context +because it doesn't understand that reclaim has the last reference to +the inode, and thus an iolock->reclaim->iolock deadlock is not +possible. + +The iolock is technically not necessary in xfs_inactive() and was +only added to appease an assert in xfs_free_eofblocks(), which can +be called from other non-reclaim contexts. Therefore, just kill the +assert and drop the use of the iolock from reclaim context to quiet +lockdep. + +Signed-off-by: Brian Foster +Reviewed-by: Darrick J. Wong +Signed-off-by: Darrick J. 
Wong +Signed-off-by: Greg Kroah-Hartman + +--- + fs/xfs/xfs_bmap_util.c | 8 +++----- + fs/xfs/xfs_inode.c | 9 +++++---- + 2 files changed, 8 insertions(+), 9 deletions(-) + +--- a/fs/xfs/xfs_bmap_util.c ++++ b/fs/xfs/xfs_bmap_util.c +@@ -904,9 +904,9 @@ xfs_can_free_eofblocks(struct xfs_inode + } + + /* +- * This is called by xfs_inactive to free any blocks beyond eof +- * when the link count isn't zero and by xfs_dm_punch_hole() when +- * punching a hole to EOF. ++ * This is called to free any blocks beyond eof. The caller must hold ++ * IOLOCK_EXCL unless we are in the inode reclaim path and have the only ++ * reference to the inode. + */ + int + xfs_free_eofblocks( +@@ -921,8 +921,6 @@ xfs_free_eofblocks( + struct xfs_bmbt_irec imap; + struct xfs_mount *mp = ip->i_mount; + +- ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL)); +- + /* + * Figure out if there are any blocks beyond the end + * of the file. If not, then there is nothing to do. +--- a/fs/xfs/xfs_inode.c ++++ b/fs/xfs/xfs_inode.c +@@ -1906,12 +1906,13 @@ xfs_inactive( + * force is true because we are evicting an inode from the + * cache. Post-eof blocks must be freed, lest we end up with + * broken free space accounting. ++ * ++ * Note: don't bother with iolock here since lockdep complains ++ * about acquiring it in reclaim context. We have the only ++ * reference to the inode at this point anyways. 
+ */ +- if (xfs_can_free_eofblocks(ip, true)) { +- xfs_ilock(ip, XFS_IOLOCK_EXCL); ++ if (xfs_can_free_eofblocks(ip, true)) + xfs_free_eofblocks(ip); +- xfs_iunlock(ip, XFS_IOLOCK_EXCL); +- } + + return; + } diff --git a/queue-4.11/xfs-fix-indlen-accounting-error-on-partial-delalloc-conversion.patch b/queue-4.11/xfs-fix-indlen-accounting-error-on-partial-delalloc-conversion.patch new file mode 100644 index 00000000000..665e639eda1 --- /dev/null +++ b/queue-4.11/xfs-fix-indlen-accounting-error-on-partial-delalloc-conversion.patch @@ -0,0 +1,71 @@ +From 0daaecacb83bc6b656a56393ab77a31c28139bc7 Mon Sep 17 00:00:00 2001 +From: Brian Foster +Date: Fri, 12 May 2017 10:44:08 -0700 +Subject: xfs: fix indlen accounting error on partial delalloc conversion + +From: Brian Foster + +commit 0daaecacb83bc6b656a56393ab77a31c28139bc7 upstream. + +The delalloc -> real block conversion path uses an incorrect +calculation in the case where the middle part of a delalloc extent +is being converted. This is documented as a rare situation because +XFS generally attempts to maximize contiguity by converting as much +of a delalloc extent as possible. + +If this situation does occur, the indlen reservation for the two new +delalloc extents left behind by the conversion of the middle range +is calculated and compared with the original reservation. If more +blocks are required, the delta is allocated from the global block +pool. This delta value can be characterized as the difference +between the new total requirement (temp + temp2) and the currently +available reservation minus those blocks that have already been +allocated (startblockval(PREV.br_startblock) - allocated). + +The problem is that the current code does not account for previously +allocated blocks correctly. It subtracts the current allocation +count from the (new - old) delta rather than the old indlen +reservation. 
This means that more indlen blocks than have been +allocated end up stashed in the remaining extents and free space +accounting is broken as a result. + +Fix up the calculation to subtract the allocated block count from +the original extent indlen and thus correctly allocate the +reservation delta based on the difference between the new total +requirement and the unused blocks from the original reservation. +Also remove a bogus assert that contradicts the fact that the new +indlen reservation can be larger than the original indlen +reservation. + +Signed-off-by: Brian Foster +Reviewed-by: Darrick J. Wong +Signed-off-by: Darrick J. Wong +Signed-off-by: Greg Kroah-Hartman + +--- + fs/xfs/libxfs/xfs_bmap.c | 7 ++++--- + 1 file changed, 4 insertions(+), 3 deletions(-) + +--- a/fs/xfs/libxfs/xfs_bmap.c ++++ b/fs/xfs/libxfs/xfs_bmap.c +@@ -2106,8 +2106,10 @@ xfs_bmap_add_extent_delay_real( + } + temp = xfs_bmap_worst_indlen(bma->ip, temp); + temp2 = xfs_bmap_worst_indlen(bma->ip, temp2); +- diff = (int)(temp + temp2 - startblockval(PREV.br_startblock) - +- (bma->cur ? bma->cur->bc_private.b.allocated : 0)); ++ diff = (int)(temp + temp2 - ++ (startblockval(PREV.br_startblock) - ++ (bma->cur ? 
++ bma->cur->bc_private.b.allocated : 0))); + if (diff > 0) { + error = xfs_mod_fdblocks(bma->ip->i_mount, + -((int64_t)diff), false); +@@ -2164,7 +2166,6 @@ xfs_bmap_add_extent_delay_real( + temp = da_new; + if (bma->cur) + temp += bma->cur->bc_private.b.allocated; +- ASSERT(temp <= da_old); + if (temp < da_old) + xfs_mod_fdblocks(bma->ip->i_mount, + (int64_t)(da_old - temp), false); diff --git a/queue-4.11/xfs-fix-integer-truncation-in-xfs_bmap_remap_alloc.patch b/queue-4.11/xfs-fix-integer-truncation-in-xfs_bmap_remap_alloc.patch new file mode 100644 index 00000000000..bf9b7fea03d --- /dev/null +++ b/queue-4.11/xfs-fix-integer-truncation-in-xfs_bmap_remap_alloc.patch @@ -0,0 +1,32 @@ +From 52813fb13ff90bd9c39a93446cbf1103c290b6e9 Mon Sep 17 00:00:00 2001 +From: Christoph Hellwig +Date: Tue, 11 Apr 2017 16:45:52 -0700 +Subject: xfs: fix integer truncation in xfs_bmap_remap_alloc + +From: Christoph Hellwig + +commit 52813fb13ff90bd9c39a93446cbf1103c290b6e9 upstream. + +bno should be a xfs_fsblock_t, which is 64 bits wide, instead of a +xfs_agblock_t, which truncates the value to 32 bits. + +Signed-off-by: Christoph Hellwig +Reviewed-by: Darrick J. Wong +Signed-off-by: Darrick J. 
Wong +Signed-off-by: Greg Kroah-Hartman + +--- + fs/xfs/libxfs/xfs_bmap.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/fs/xfs/libxfs/xfs_bmap.c ++++ b/fs/xfs/libxfs/xfs_bmap.c +@@ -3863,7 +3863,7 @@ xfs_bmap_remap_alloc( + { + struct xfs_trans *tp = ap->tp; + struct xfs_mount *mp = tp->t_mountp; +- xfs_agblock_t bno; ++ xfs_fsblock_t bno; + struct xfs_alloc_arg args; + int error; + diff --git a/queue-4.11/xfs-fix-off-by-in-in-loop-termination-in-xfs_find_get_desired_pgoff.patch b/queue-4.11/xfs-fix-off-by-in-in-loop-termination-in-xfs_find_get_desired_pgoff.patch new file mode 100644 index 00000000000..c5b5bc54a2c --- /dev/null +++ b/queue-4.11/xfs-fix-off-by-in-in-loop-termination-in-xfs_find_get_desired_pgoff.patch @@ -0,0 +1,34 @@ +From d7fd24257aa60316bf81093f7f909dc9475ae974 Mon Sep 17 00:00:00 2001 +From: Jan Kara +Date: Thu, 18 May 2017 16:36:23 -0700 +Subject: xfs: Fix off-by-in in loop termination in xfs_find_get_desired_pgoff() + +From: Jan Kara + +commit d7fd24257aa60316bf81093f7f909dc9475ae974 upstream. + +There is an off-by-one error in loop termination conditions in +xfs_find_get_desired_pgoff() since 'end' may index a page beyond end of +desired range if 'endoff' is page aligned. It doesn't have any visible +effects but still it is good to fix it. + +Signed-off-by: Jan Kara +Reviewed-by: Darrick J. Wong +Signed-off-by: Darrick J. 
Wong +Signed-off-by: Greg Kroah-Hartman + +--- + fs/xfs/xfs_file.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/fs/xfs/xfs_file.c ++++ b/fs/xfs/xfs_file.c +@@ -1043,7 +1043,7 @@ xfs_find_get_desired_pgoff( + + index = startoff >> PAGE_SHIFT; + endoff = XFS_FSB_TO_B(mp, map->br_startoff + map->br_blockcount); +- end = endoff >> PAGE_SHIFT; ++ end = (endoff - 1) >> PAGE_SHIFT; + do { + int want; + unsigned nr_pages; diff --git a/queue-4.11/xfs-fix-over-copying-of-getbmap-parameters-from-userspace.patch b/queue-4.11/xfs-fix-over-copying-of-getbmap-parameters-from-userspace.patch new file mode 100644 index 00000000000..72706048ffc --- /dev/null +++ b/queue-4.11/xfs-fix-over-copying-of-getbmap-parameters-from-userspace.patch @@ -0,0 +1,38 @@ +From be6324c00c4d1e0e665f03ed1fc18863a88da119 Mon Sep 17 00:00:00 2001 +From: "Darrick J. Wong" +Date: Mon, 3 Apr 2017 15:17:57 -0700 +Subject: xfs: fix over-copying of getbmap parameters from userspace + +From: Darrick J. Wong + +commit be6324c00c4d1e0e665f03ed1fc18863a88da119 upstream. + +In xfs_ioc_getbmap, we should only copy the fields of struct getbmap +from userspace, or else we end up copying random stack contents into the +kernel. struct getbmap is a strict subset of getbmapx, so a partial +structure copy should work fine. + +Signed-off-by: Darrick J. Wong +Reviewed-by: Christoph Hellwig +Signed-off-by: Greg Kroah-Hartman + +--- + fs/xfs/xfs_ioctl.c | 5 +++-- + 1 file changed, 3 insertions(+), 2 deletions(-) + +--- a/fs/xfs/xfs_ioctl.c ++++ b/fs/xfs/xfs_ioctl.c +@@ -1543,10 +1543,11 @@ xfs_ioc_getbmap( + unsigned int cmd, + void __user *arg) + { +- struct getbmapx bmx; ++ struct getbmapx bmx = { 0 }; + int error; + +- if (copy_from_user(&bmx, arg, sizeof(struct getbmapx))) ++ /* struct getbmap is a strict subset of struct getbmapx. 
*/ ++ if (copy_from_user(&bmx, arg, offsetof(struct getbmapx, bmv_iflags))) + return -EFAULT; + + if (bmx.bmv_count < 2) diff --git a/queue-4.11/xfs-fix-unaligned-access-in-xfs_btree_visit_blocks.patch b/queue-4.11/xfs-fix-unaligned-access-in-xfs_btree_visit_blocks.patch new file mode 100644 index 00000000000..74aa17e982e --- /dev/null +++ b/queue-4.11/xfs-fix-unaligned-access-in-xfs_btree_visit_blocks.patch @@ -0,0 +1,35 @@ +From a4d768e702de224cc85e0c8eac9311763403b368 Mon Sep 17 00:00:00 2001 +From: Eric Sandeen +Date: Mon, 22 May 2017 19:54:10 -0700 +Subject: xfs: fix unaligned access in xfs_btree_visit_blocks + +From: Eric Sandeen + +commit a4d768e702de224cc85e0c8eac9311763403b368 upstream. + +This structure copy was throwing unaligned access warnings on sparc64: + +Kernel unaligned access at TPC[1043c088] xfs_btree_visit_blocks+0x88/0xe0 [xfs] + +xfs_btree_copy_ptrs does a memcpy, which avoids it. + +Signed-off-by: Eric Sandeen +Reviewed-by: Darrick J. Wong +Signed-off-by: Darrick J. 
Wong +Signed-off-by: Greg Kroah-Hartman + +--- + fs/xfs/libxfs/xfs_btree.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/fs/xfs/libxfs/xfs_btree.c ++++ b/fs/xfs/libxfs/xfs_btree.c +@@ -4395,7 +4395,7 @@ xfs_btree_visit_blocks( + xfs_btree_readahead_ptr(cur, ptr, 1); + + /* save for the next iteration of the loop */ +- lptr = *ptr; ++ xfs_btree_copy_ptrs(cur, &lptr, ptr, 1); + } + + /* for each buffer in the level */ diff --git a/queue-4.11/xfs-fix-up-quotacheck-buffer-list-error-handling.patch b/queue-4.11/xfs-fix-up-quotacheck-buffer-list-error-handling.patch new file mode 100644 index 00000000000..58932b5a472 --- /dev/null +++ b/queue-4.11/xfs-fix-up-quotacheck-buffer-list-error-handling.patch @@ -0,0 +1,96 @@ +From 20e8a063786050083fe05b4f45be338c60b49126 Mon Sep 17 00:00:00 2001 +From: Brian Foster +Date: Fri, 21 Apr 2017 12:40:44 -0700 +Subject: xfs: fix up quotacheck buffer list error handling + +From: Brian Foster + +commit 20e8a063786050083fe05b4f45be338c60b49126 upstream. + +The quotacheck error handling of the delwri buffer list assumes the +resident buffers are locked and doesn't clear the _XBF_DELWRI_Q flag +on the buffers that are dequeued. This can lead to assert failures +on buffer release and possibly other locking problems. + +Move this code to a delwri queue cancel helper function to +encapsulate the logic required to properly release buffers from a +delwri queue. Update the helper to clear the delwri queue flag and +call it from quotacheck. + +Signed-off-by: Brian Foster +Reviewed-by: Darrick J. Wong +Signed-off-by: Darrick J. 
Wong +Signed-off-by: Greg Kroah-Hartman + +--- + fs/xfs/xfs_buf.c | 24 ++++++++++++++++++++++++ + fs/xfs/xfs_buf.h | 1 + + fs/xfs/xfs_qm.c | 7 +------ + 3 files changed, 26 insertions(+), 6 deletions(-) + +--- a/fs/xfs/xfs_buf.c ++++ b/fs/xfs/xfs_buf.c +@@ -1093,6 +1093,8 @@ void + xfs_buf_unlock( + struct xfs_buf *bp) + { ++ ASSERT(xfs_buf_islocked(bp)); ++ + XB_CLEAR_OWNER(bp); + up(&bp->b_sema); + +@@ -1829,6 +1831,28 @@ error: + } + + /* ++ * Cancel a delayed write list. ++ * ++ * Remove each buffer from the list, clear the delwri queue flag and drop the ++ * associated buffer reference. ++ */ ++void ++xfs_buf_delwri_cancel( ++ struct list_head *list) ++{ ++ struct xfs_buf *bp; ++ ++ while (!list_empty(list)) { ++ bp = list_first_entry(list, struct xfs_buf, b_list); ++ ++ xfs_buf_lock(bp); ++ bp->b_flags &= ~_XBF_DELWRI_Q; ++ list_del_init(&bp->b_list); ++ xfs_buf_relse(bp); ++ } ++} ++ ++/* + * Add a buffer to the delayed write list. + * + * This queues a buffer for writeout if it hasn't already been. 
Note that +--- a/fs/xfs/xfs_buf.h ++++ b/fs/xfs/xfs_buf.h +@@ -329,6 +329,7 @@ extern void *xfs_buf_offset(struct xfs_b + extern void xfs_buf_stale(struct xfs_buf *bp); + + /* Delayed Write Buffer Routines */ ++extern void xfs_buf_delwri_cancel(struct list_head *); + extern bool xfs_buf_delwri_queue(struct xfs_buf *, struct list_head *); + extern int xfs_buf_delwri_submit(struct list_head *); + extern int xfs_buf_delwri_submit_nowait(struct list_head *); +--- a/fs/xfs/xfs_qm.c ++++ b/fs/xfs/xfs_qm.c +@@ -1384,12 +1384,7 @@ xfs_qm_quotacheck( + mp->m_qflags |= flags; + + error_return: +- while (!list_empty(&buffer_list)) { +- struct xfs_buf *bp = +- list_first_entry(&buffer_list, struct xfs_buf, b_list); +- list_del_init(&bp->b_list); +- xfs_buf_relse(bp); +- } ++ xfs_buf_delwri_cancel(&buffer_list); + + if (error) { + xfs_warn(mp, diff --git a/queue-4.11/xfs-fix-use-after-free-in-xfs_finish_page_writeback.patch b/queue-4.11/xfs-fix-use-after-free-in-xfs_finish_page_writeback.patch new file mode 100644 index 00000000000..5f3982751c8 --- /dev/null +++ b/queue-4.11/xfs-fix-use-after-free-in-xfs_finish_page_writeback.patch @@ -0,0 +1,113 @@ +From 161f55efba5ddccc690139fae9373cafc3447a97 Mon Sep 17 00:00:00 2001 +From: Eryu Guan +Date: Tue, 2 May 2017 13:54:47 -0700 +Subject: xfs: fix use-after-free in xfs_finish_page_writeback + +From: Eryu Guan + +commit 161f55efba5ddccc690139fae9373cafc3447a97 upstream. + +Commit 28b783e47ad7 ("xfs: bufferhead chains are invalid after +end_page_writeback") fixed one use-after-free issue by +pre-calculating the loop conditionals before calling bh->b_end_io() +in the end_io processing loop, but it assigned 'next' pointer before +checking end offset boundary & breaking the loop, at which point the +bh might be freed already, and caused use-after-free. + +This is caught by KASAN when running fstests generic/127 on sub-page +block size XFS. 
+ +[ 2517.244502] run fstests generic/127 at 2017-04-27 07:30:50 +[ 2747.868840] ================================================================== +[ 2747.876949] BUG: KASAN: use-after-free in xfs_destroy_ioend+0x3d3/0x4e0 [xfs] at addr ffff8801395ae698 +... +[ 2747.918245] Call Trace: +[ 2747.920975] dump_stack+0x63/0x84 +[ 2747.924673] kasan_object_err+0x21/0x70 +[ 2747.928950] kasan_report+0x271/0x530 +[ 2747.933064] ? xfs_destroy_ioend+0x3d3/0x4e0 [xfs] +[ 2747.938409] ? end_page_writeback+0xce/0x110 +[ 2747.943171] __asan_report_load8_noabort+0x19/0x20 +[ 2747.948545] xfs_destroy_ioend+0x3d3/0x4e0 [xfs] +[ 2747.953724] xfs_end_io+0x1af/0x2b0 [xfs] +[ 2747.958197] process_one_work+0x5ff/0x1000 +[ 2747.962766] worker_thread+0xe4/0x10e0 +[ 2747.966946] kthread+0x2d3/0x3d0 +[ 2747.970546] ? process_one_work+0x1000/0x1000 +[ 2747.975405] ? kthread_create_on_node+0xc0/0xc0 +[ 2747.980457] ? syscall_return_slowpath+0xe6/0x140 +[ 2747.985706] ? do_page_fault+0x30/0x80 +[ 2747.989887] ret_from_fork+0x2c/0x40 +[ 2747.993874] Object at ffff8801395ae690, in cache buffer_head size: 104 +[ 2748.001155] Allocated: +[ 2748.003782] PID = 8327 +[ 2748.006411] save_stack_trace+0x1b/0x20 +[ 2748.010688] save_stack+0x46/0xd0 +[ 2748.014383] kasan_kmalloc+0xad/0xe0 +[ 2748.018370] kasan_slab_alloc+0x12/0x20 +[ 2748.022648] kmem_cache_alloc+0xb8/0x1b0 +[ 2748.027024] alloc_buffer_head+0x22/0xc0 +[ 2748.031399] alloc_page_buffers+0xd1/0x250 +[ 2748.035968] create_empty_buffers+0x30/0x410 +[ 2748.040730] create_page_buffers+0x120/0x1b0 +[ 2748.045493] __block_write_begin_int+0x17a/0x1800 +[ 2748.050740] iomap_write_begin+0x100/0x2f0 +[ 2748.055308] iomap_zero_range_actor+0x253/0x5c0 +[ 2748.060362] iomap_apply+0x157/0x270 +[ 2748.064347] iomap_zero_range+0x5a/0x80 +[ 2748.068624] iomap_truncate_page+0x6b/0xa0 +[ 2748.073227] xfs_setattr_size+0x1f7/0xa10 [xfs] +[ 2748.078312] xfs_vn_setattr_size+0x68/0x140 [xfs] +[ 2748.083589] xfs_file_fallocate+0x4ac/0x820 [xfs] +[ 2748.088838] 
vfs_fallocate+0x2cf/0x780 +[ 2748.093021] SyS_fallocate+0x48/0x80 +[ 2748.097006] do_syscall_64+0x18a/0x430 +[ 2748.101186] return_from_SYSCALL_64+0x0/0x6a +[ 2748.105948] Freed: +[ 2748.108189] PID = 8327 +[ 2748.110816] save_stack_trace+0x1b/0x20 +[ 2748.115093] save_stack+0x46/0xd0 +[ 2748.118788] kasan_slab_free+0x73/0xc0 +[ 2748.122969] kmem_cache_free+0x7a/0x200 +[ 2748.127247] free_buffer_head+0x41/0x80 +[ 2748.131524] try_to_free_buffers+0x178/0x250 +[ 2748.136316] xfs_vm_releasepage+0x2e9/0x3d0 [xfs] +[ 2748.141563] try_to_release_page+0x100/0x180 +[ 2748.146325] invalidate_inode_pages2_range+0x7da/0xcf0 +[ 2748.152087] xfs_shift_file_space+0x37d/0x6e0 [xfs] +[ 2748.157557] xfs_collapse_file_space+0x49/0x120 [xfs] +[ 2748.163223] xfs_file_fallocate+0x2a7/0x820 [xfs] +[ 2748.168462] vfs_fallocate+0x2cf/0x780 +[ 2748.172642] SyS_fallocate+0x48/0x80 +[ 2748.176629] do_syscall_64+0x18a/0x430 +[ 2748.180810] return_from_SYSCALL_64+0x0/0x6a + +Fixed it by checking on offset against end & breaking out first, +dereference bh only if there're still bufferheads to process. + +Signed-off-by: Eryu Guan +Reviewed-by: Christoph Hellwig +Reviewed-by: Darrick J. Wong +Signed-off-by: Darrick J. 
Wong +Signed-off-by: Greg Kroah-Hartman + +--- + fs/xfs/xfs_aops.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +--- a/fs/xfs/xfs_aops.c ++++ b/fs/xfs/xfs_aops.c +@@ -111,11 +111,11 @@ xfs_finish_page_writeback( + + bsize = bh->b_size; + do { ++ if (off > end) ++ break; + next = bh->b_this_page; + if (off < bvec->bv_offset) + goto next_bh; +- if (off > end) +- break; + bh->b_end_io(bh, !error); + next_bh: + off += bsize; diff --git a/queue-4.11/xfs-handle-array-index-overrun-in-xfs_dir2_leaf_readbuf.patch b/queue-4.11/xfs-handle-array-index-overrun-in-xfs_dir2_leaf_readbuf.patch new file mode 100644 index 00000000000..4c2e7b37d80 --- /dev/null +++ b/queue-4.11/xfs-handle-array-index-overrun-in-xfs_dir2_leaf_readbuf.patch @@ -0,0 +1,103 @@ +From 023cc840b40fad95c6fe26fff1d380a8c9d45939 Mon Sep 17 00:00:00 2001 +From: Eric Sandeen +Date: Thu, 13 Apr 2017 15:15:47 -0700 +Subject: xfs: handle array index overrun in xfs_dir2_leaf_readbuf() + +From: Eric Sandeen + +commit 023cc840b40fad95c6fe26fff1d380a8c9d45939 upstream. + +Carlos had a case where "find" seemed to start spinning +forever and never return. + +This was on a filesystem with non-default multi-fsb (8k) +directory blocks, and a fragmented directory with extents +like this: + +0:[0,133646,2,0] +1:[2,195888,1,0] +2:[3,195890,1,0] +3:[4,195892,1,0] +4:[5,195894,1,0] +5:[6,195896,1,0] +6:[7,195898,1,0] +7:[8,195900,1,0] +8:[9,195902,1,0] +9:[10,195908,1,0] +10:[11,195910,1,0] +11:[12,195912,1,0] +12:[13,195914,1,0] +... + +i.e. the first extent is a contiguous 2-fsb dir block, but +after that it is fragmented into 1 block extents. + +At the top of the readdir path, we allocate a mapping array +which (for this filesystem geometry) can hold 10 extents; see +the assignment to map_info->map_size. During readdir, we are +therefore able to map extents 0 through 9 above into the array +for readahead purposes. 
If we count by 2, we see that the last +mapped index (9) is the first block of a 2-fsb directory block. + +At the end of xfs_dir2_leaf_readbuf() we have 2 loops to fill +more readahead; the outer loop assumes one full dir block is +processed each loop iteration, and an inner loop that ensures +that this is so by advancing to the next extent until a full +directory block is mapped. + +The problem is that this inner loop may step past the last +extent in the mapping array as it tries to reach the end of +the directory block. This will read garbage for the extent +length, and as a result the loop control variable 'j' may +become corrupted and never fail the loop conditional. + +The number of valid mappings we have in our array is stored +in map->map_valid, so stop this inner loop based on that limit. + +There is an ASSERT at the top of the outer loop for this +same condition, but we never made it out of the inner loop, +so the ASSERT never fired. + +Huge appreciation for Carlos for debugging and isolating +the problem. + +Debugged-and-analyzed-by: Carlos Maiolino +Signed-off-by: Eric Sandeen +Tested-by: Carlos Maiolino +Reviewed-by: Carlos Maiolino +Reviewed-by: Bill O'Donnell +Reviewed-by: Darrick J. Wong +Signed-off-by: Darrick J. Wong +Signed-off-by: Greg Kroah-Hartman + +--- + fs/xfs/xfs_dir2_readdir.c | 10 ++++++++-- + 1 file changed, 8 insertions(+), 2 deletions(-) + +--- a/fs/xfs/xfs_dir2_readdir.c ++++ b/fs/xfs/xfs_dir2_readdir.c +@@ -394,6 +394,7 @@ xfs_dir2_leaf_readbuf( + + /* + * Do we need more readahead? ++ * Each loop tries to process 1 full dir blk; last may be partial. + */ + blk_start_plug(&plug); + for (mip->ra_index = mip->ra_offset = i = 0; +@@ -425,9 +426,14 @@ xfs_dir2_leaf_readbuf( + } + + /* +- * Advance offset through the mapping table. ++ * Advance offset through the mapping table, processing a full ++ * dir block even if it is fragmented into several extents. 
++ * But stop if we have consumed all valid mappings, even if ++ * it's not yet a full directory block. + */ +- for (j = 0; j < geo->fsbcount; j += length ) { ++ for (j = 0; ++ j < geo->fsbcount && mip->ra_index < mip->map_valid; ++ j += length ) { + /* + * The rest of this extent but not more than a dir + * block. diff --git a/queue-4.11/xfs-prevent-multi-fsb-dir-readahead-from-reading-random-blocks.patch b/queue-4.11/xfs-prevent-multi-fsb-dir-readahead-from-reading-random-blocks.patch new file mode 100644 index 00000000000..4a8cc6910ff --- /dev/null +++ b/queue-4.11/xfs-prevent-multi-fsb-dir-readahead-from-reading-random-blocks.patch @@ -0,0 +1,80 @@ +From cb52ee334a45ae6c78a3999e4b473c43ddc528f4 Mon Sep 17 00:00:00 2001 +From: Brian Foster +Date: Thu, 20 Apr 2017 08:06:47 -0700 +Subject: xfs: prevent multi-fsb dir readahead from reading random blocks + +From: Brian Foster + +commit cb52ee334a45ae6c78a3999e4b473c43ddc528f4 upstream. + +Directory block readahead uses a complex iteration mechanism to map +between high-level directory blocks and underlying physical extents. +This mechanism attempts to traverse the higher-level dir blocks in a +manner that handles multi-fsb directory blocks and simultaneously +maintains a reference to the corresponding physical blocks. + +This logic doesn't handle certain (discontiguous) physical extent +layouts correctly with multi-fsb directory blocks. For example, +consider the case of a 4k FSB filesystem with a 2 FSB (8k) directory +block size and a directory with the following extent layout: + + EXT: FILE-OFFSET BLOCK-RANGE AG AG-OFFSET TOTAL + 0: [0..7]: 88..95 0 (88..95) 8 + 1: [8..15]: 80..87 0 (80..87) 8 + 2: [16..39]: 168..191 0 (168..191) 24 + 3: [40..63]: 5242952..5242975 1 (72..95) 24 + +Directory block 0 spans physical extents 0 and 1, dirblk 1 lies +entirely within extent 2 and dirblk 2 spans extents 2 and 3. 
Because +extent 2 is larger than the directory block size, the readahead code +erroneously assumes the block is contiguous and issues a readahead +based on the physical mapping of the first fsb of the dirblk. This +results in read verifier failure and a spurious corruption or crc +failure, depending on the filesystem format. + +Further, the subsequent readahead code responsible for walking +through the physical table doesn't correctly advance the physical +block reference for dirblk 2. Instead of advancing two physical +filesystem blocks, the first iteration of the loop advances 1 block +(correctly), but the subsequent iteration advances 2 more physical +blocks because the next physical extent (extent 3, above) happens to +cover more than dirblk 2. At this point, the higher-level directory +block walking is completely off the rails of the actual physical +layout of the directory for the respective mapping table. + +Update the contiguous dirblock logic to consider the current offset +in the physical extent to avoid issuing directory readahead to +unrelated blocks. Also, update the mapping table advancing code to +consider the current offset within the current dirblock to avoid +advancing the mapping reference too far beyond the dirblock. + +Signed-off-by: Brian Foster +Reviewed-by: Darrick J. Wong +Signed-off-by: Darrick J. Wong +Signed-off-by: Greg Kroah-Hartman + +--- + fs/xfs/xfs_dir2_readdir.c | 5 +++-- + 1 file changed, 3 insertions(+), 2 deletions(-) + +--- a/fs/xfs/xfs_dir2_readdir.c ++++ b/fs/xfs/xfs_dir2_readdir.c +@@ -405,7 +405,8 @@ xfs_dir2_leaf_readbuf( + * Read-ahead a contiguous directory block. 
+ */ + if (i > mip->ra_current && +- map[mip->ra_index].br_blockcount >= geo->fsbcount) { ++ (map[mip->ra_index].br_blockcount - mip->ra_offset) >= ++ geo->fsbcount) { + xfs_dir3_data_readahead(dp, + map[mip->ra_index].br_startoff + mip->ra_offset, + XFS_FSB_TO_DADDR(dp->i_mount, +@@ -438,7 +439,7 @@ xfs_dir2_leaf_readbuf( + * The rest of this extent but not more than a dir + * block. + */ +- length = min_t(int, geo->fsbcount, ++ length = min_t(int, geo->fsbcount - j, + map[mip->ra_index].br_blockcount - + mip->ra_offset); + mip->ra_offset += length; diff --git a/queue-4.11/xfs-reserve-enough-blocks-to-handle-btree-splits-when-remapping.patch b/queue-4.11/xfs-reserve-enough-blocks-to-handle-btree-splits-when-remapping.patch new file mode 100644 index 00000000000..8b7a029c832 --- /dev/null +++ b/queue-4.11/xfs-reserve-enough-blocks-to-handle-btree-splits-when-remapping.patch @@ -0,0 +1,123 @@ +From fe0be23e68200573de027de9b8cc2b27e7fce35e Mon Sep 17 00:00:00 2001 +From: "Darrick J. Wong" +Date: Wed, 12 Apr 2017 12:26:07 -0700 +Subject: xfs: reserve enough blocks to handle btree splits when remapping + +From: Darrick J. Wong + +commit fe0be23e68200573de027de9b8cc2b27e7fce35e upstream. + +In xfs_reflink_end_cow, we erroneously reserve only enough blocks to +handle adding 1 extent. This is problematic if we fragment free space, +have to do CoW, and then have to perform multiple bmap btree expansions. +Furthermore, the BUI recovery routine doesn't reserve /any/ blocks to +handle btree splits, so log recovery fails after our first error causes +the filesystem to go down. + +Therefore, refactor the transaction block reservation macros until we +have a macro that works for our deferred (re)mapping activities, and fix +both problems by using that macro. + +With 1k blocks we can hit this fairly often in g/187 if the scratch fs +is big enough. + +Signed-off-by: Darrick J. 
Wong +Reviewed-by: Christoph Hellwig +Signed-off-by: Greg Kroah-Hartman + +--- + fs/xfs/libxfs/xfs_trans_space.h | 23 +++++++++++++++++------ + fs/xfs/xfs_bmap_item.c | 5 ++++- + fs/xfs/xfs_reflink.c | 18 ++++++++++++++++-- + 3 files changed, 37 insertions(+), 9 deletions(-) + +--- a/fs/xfs/libxfs/xfs_trans_space.h ++++ b/fs/xfs/libxfs/xfs_trans_space.h +@@ -21,8 +21,20 @@ + /* + * Components of space reservations. + */ ++ ++/* Worst case number of rmaps that can be held in a block. */ + #define XFS_MAX_CONTIG_RMAPS_PER_BLOCK(mp) \ + (((mp)->m_rmap_mxr[0]) - ((mp)->m_rmap_mnr[0])) ++ ++/* Adding one rmap could split every level up to the top of the tree. */ ++#define XFS_RMAPADD_SPACE_RES(mp) ((mp)->m_rmap_maxlevels) ++ ++/* Blocks we might need to add "b" rmaps to a tree. */ ++#define XFS_NRMAPADD_SPACE_RES(mp, b)\ ++ (((b + XFS_MAX_CONTIG_RMAPS_PER_BLOCK(mp) - 1) / \ ++ XFS_MAX_CONTIG_RMAPS_PER_BLOCK(mp)) * \ ++ XFS_RMAPADD_SPACE_RES(mp)) ++ + #define XFS_MAX_CONTIG_EXTENTS_PER_BLOCK(mp) \ + (((mp)->m_alloc_mxr[0]) - ((mp)->m_alloc_mnr[0])) + #define XFS_EXTENTADD_SPACE_RES(mp,w) (XFS_BM_MAXLEVELS(mp,w) - 1) +@@ -30,13 +42,12 @@ + (((b + XFS_MAX_CONTIG_EXTENTS_PER_BLOCK(mp) - 1) / \ + XFS_MAX_CONTIG_EXTENTS_PER_BLOCK(mp)) * \ + XFS_EXTENTADD_SPACE_RES(mp,w)) ++ ++/* Blocks we might need to add "b" mappings & rmappings to a file. */ + #define XFS_SWAP_RMAP_SPACE_RES(mp,b,w)\ +- (((b + XFS_MAX_CONTIG_EXTENTS_PER_BLOCK(mp) - 1) / \ +- XFS_MAX_CONTIG_EXTENTS_PER_BLOCK(mp)) * \ +- XFS_EXTENTADD_SPACE_RES(mp,w) + \ +- ((b + XFS_MAX_CONTIG_RMAPS_PER_BLOCK(mp) - 1) / \ +- XFS_MAX_CONTIG_RMAPS_PER_BLOCK(mp)) * \ +- (mp)->m_rmap_maxlevels) ++ (XFS_NEXTENTADD_SPACE_RES((mp), (b), (w)) + \ ++ XFS_NRMAPADD_SPACE_RES((mp), (b))) ++ + #define XFS_DAENTER_1B(mp,w) \ + ((w) == XFS_DATA_FORK ? 
(mp)->m_dir_geo->fsbcount : 1) + #define XFS_DAENTER_DBS(mp,w) \ +--- a/fs/xfs/xfs_bmap_item.c ++++ b/fs/xfs/xfs_bmap_item.c +@@ -34,6 +34,8 @@ + #include "xfs_bmap.h" + #include "xfs_icache.h" + #include "xfs_trace.h" ++#include "xfs_bmap_btree.h" ++#include "xfs_trans_space.h" + + + kmem_zone_t *xfs_bui_zone; +@@ -446,7 +448,8 @@ xfs_bui_recover( + return -EIO; + } + +- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp); ++ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, ++ XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK), 0, 0, &tp); + if (error) + return error; + budp = xfs_trans_get_bud(tp, buip); +--- a/fs/xfs/xfs_reflink.c ++++ b/fs/xfs/xfs_reflink.c +@@ -709,8 +709,22 @@ xfs_reflink_end_cow( + offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset); + end_fsb = XFS_B_TO_FSB(ip->i_mount, offset + count); + +- /* Start a rolling transaction to switch the mappings */ +- resblks = XFS_EXTENTADD_SPACE_RES(ip->i_mount, XFS_DATA_FORK); ++ /* ++ * Start a rolling transaction to switch the mappings. We're ++ * unlikely ever to have to remap 16T worth of single-block ++ * extents, so just cap the worst case extent count to 2^32-1. ++ * Stick a warning in just in case, and avoid 64-bit division. 
++ */ ++ BUILD_BUG_ON(MAX_RW_COUNT > UINT_MAX); ++ if (end_fsb - offset_fsb > UINT_MAX) { ++ error = -EFSCORRUPTED; ++ xfs_force_shutdown(ip->i_mount, SHUTDOWN_CORRUPT_INCORE); ++ ASSERT(0); ++ goto out; ++ } ++ resblks = XFS_NEXTENTADD_SPACE_RES(ip->i_mount, ++ (unsigned int)(end_fsb - offset_fsb), ++ XFS_DATA_FORK); + error = xfs_trans_alloc(ip->i_mount, &M_RES(ip->i_mount)->tr_write, + resblks, 0, 0, &tp); + if (error) diff --git a/queue-4.11/xfs-support-ability-to-wait-on-new-inodes.patch b/queue-4.11/xfs-support-ability-to-wait-on-new-inodes.patch new file mode 100644 index 00000000000..d5336538d29 --- /dev/null +++ b/queue-4.11/xfs-support-ability-to-wait-on-new-inodes.patch @@ -0,0 +1,71 @@ +From 756baca27fff3ecaeab9dbc7a5ee35a1d7bc0c7f Mon Sep 17 00:00:00 2001 +From: Brian Foster +Date: Wed, 26 Apr 2017 08:30:39 -0700 +Subject: xfs: support ability to wait on new inodes + +From: Brian Foster + +commit 756baca27fff3ecaeab9dbc7a5ee35a1d7bc0c7f upstream. + +Inodes that are inserted into the perag tree but still under +construction are flagged with the XFS_INEW bit. Most contexts either +skip such inodes when they are encountered or have the ability to +handle them. + +The runtime quotaoff sequence introduces a context that must wait +for construction of such inodes to correctly ensure that all dquots +in the fs are released. In anticipation of this, support the ability +to wait on new inodes. Wake the appropriate bit when XFS_INEW is +cleared. + +Signed-off-by: Brian Foster +Reviewed-by: Darrick J. Wong +Signed-off-by: Darrick J. Wong +Signed-off-by: Greg Kroah-Hartman + +--- + fs/xfs/xfs_icache.c | 5 ++++- + fs/xfs/xfs_inode.h | 4 +++- + 2 files changed, 7 insertions(+), 2 deletions(-) + +--- a/fs/xfs/xfs_icache.c ++++ b/fs/xfs/xfs_icache.c +@@ -366,14 +366,17 @@ xfs_iget_cache_hit( + + error = xfs_reinit_inode(mp, inode); + if (error) { ++ bool wake; + /* + * Re-initializing the inode failed, and we are in deep + * trouble. 
Try to re-add it to the reclaim list. + */ + rcu_read_lock(); + spin_lock(&ip->i_flags_lock); +- ++ wake = !!__xfs_iflags_test(ip, XFS_INEW); + ip->i_flags &= ~(XFS_INEW | XFS_IRECLAIM); ++ if (wake) ++ wake_up_bit(&ip->i_flags, __XFS_INEW_BIT); + ASSERT(ip->i_flags & XFS_IRECLAIMABLE); + trace_xfs_iget_reclaim_fail(ip); + goto out_error; +--- a/fs/xfs/xfs_inode.h ++++ b/fs/xfs/xfs_inode.h +@@ -216,7 +216,8 @@ static inline bool xfs_is_reflink_inode( + #define XFS_IRECLAIM (1 << 0) /* started reclaiming this inode */ + #define XFS_ISTALE (1 << 1) /* inode has been staled */ + #define XFS_IRECLAIMABLE (1 << 2) /* inode can be reclaimed */ +-#define XFS_INEW (1 << 3) /* inode has just been allocated */ ++#define __XFS_INEW_BIT 3 /* inode has just been allocated */ ++#define XFS_INEW (1 << __XFS_INEW_BIT) + #define XFS_ITRUNCATED (1 << 5) /* truncated down so flush-on-close */ + #define XFS_IDIRTY_RELEASE (1 << 6) /* dirty release already seen */ + #define __XFS_IFLOCK_BIT 7 /* inode is being flushed right now */ +@@ -464,6 +465,7 @@ static inline void xfs_finish_inode_setu + xfs_iflags_clear(ip, XFS_INEW); + barrier(); + unlock_new_inode(VFS_I(ip)); ++ wake_up_bit(&ip->i_flags, __XFS_INEW_BIT); + } + + static inline void xfs_setup_existing_inode(struct xfs_inode *ip) diff --git a/queue-4.11/xfs-update-ag-iterator-to-support-wait-on-new-inodes.patch b/queue-4.11/xfs-update-ag-iterator-to-support-wait-on-new-inodes.patch new file mode 100644 index 00000000000..b945f9b32e4 --- /dev/null +++ b/queue-4.11/xfs-update-ag-iterator-to-support-wait-on-new-inodes.patch @@ -0,0 +1,188 @@ +From ae2c4ac2dd39b23a87ddb14ceddc3f2872c6aef5 Mon Sep 17 00:00:00 2001 +From: Brian Foster +Date: Wed, 26 Apr 2017 08:30:39 -0700 +Subject: xfs: update ag iterator to support wait on new inodes + +From: Brian Foster + +commit ae2c4ac2dd39b23a87ddb14ceddc3f2872c6aef5 upstream. 
+ +The AG inode iterator currently skips new inodes as such inodes are +inserted into the inode radix tree before they are fully +constructed. Certain contexts require the ability to wait on the +construction of new inodes, however. The fs-wide dquot release from +the quotaoff sequence is an example of this. + +Update the AG inode iterator to support the ability to wait on +inodes flagged with XFS_INEW upon request. Create a new +xfs_inode_ag_iterator_flags() interface and support a set of +iteration flags to modify the iteration behavior. When the +XFS_AGITER_INEW_WAIT flag is set, include XFS_INEW flags in the +radix tree inode lookup and wait on them before the callback is +executed. + +Signed-off-by: Brian Foster +Reviewed-by: Darrick J. Wong +Signed-off-by: Darrick J. Wong +Signed-off-by: Greg Kroah-Hartman + +--- + fs/xfs/xfs_icache.c | 53 ++++++++++++++++++++++++++++++++++++++++++++-------- + fs/xfs/xfs_icache.h | 8 +++++++ + 2 files changed, 53 insertions(+), 8 deletions(-) + +--- a/fs/xfs/xfs_icache.c ++++ b/fs/xfs/xfs_icache.c +@@ -262,6 +262,22 @@ xfs_inode_clear_reclaim_tag( + xfs_perag_clear_reclaim_tag(pag); + } + ++static void ++xfs_inew_wait( ++ struct xfs_inode *ip) ++{ ++ wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_INEW_BIT); ++ DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_INEW_BIT); ++ ++ do { ++ prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE); ++ if (!xfs_iflags_test(ip, XFS_INEW)) ++ break; ++ schedule(); ++ } while (true); ++ finish_wait(wq, &wait.wait); ++} ++ + /* + * When we recycle a reclaimable inode, we need to re-initialise the VFS inode + * part of the structure. 
This is made more complex by the fact we store +@@ -626,9 +642,11 @@ out_error_or_again: + + STATIC int + xfs_inode_ag_walk_grab( +- struct xfs_inode *ip) ++ struct xfs_inode *ip, ++ int flags) + { + struct inode *inode = VFS_I(ip); ++ bool newinos = !!(flags & XFS_AGITER_INEW_WAIT); + + ASSERT(rcu_read_lock_held()); + +@@ -646,7 +664,8 @@ xfs_inode_ag_walk_grab( + goto out_unlock_noent; + + /* avoid new or reclaimable inodes. Leave for reclaim code to flush */ +- if (__xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM)) ++ if ((!newinos && __xfs_iflags_test(ip, XFS_INEW)) || ++ __xfs_iflags_test(ip, XFS_IRECLAIMABLE | XFS_IRECLAIM)) + goto out_unlock_noent; + spin_unlock(&ip->i_flags_lock); + +@@ -674,7 +693,8 @@ xfs_inode_ag_walk( + void *args), + int flags, + void *args, +- int tag) ++ int tag, ++ int iter_flags) + { + uint32_t first_index; + int last_error = 0; +@@ -716,7 +736,7 @@ restart: + for (i = 0; i < nr_found; i++) { + struct xfs_inode *ip = batch[i]; + +- if (done || xfs_inode_ag_walk_grab(ip)) ++ if (done || xfs_inode_ag_walk_grab(ip, iter_flags)) + batch[i] = NULL; + + /* +@@ -744,6 +764,9 @@ restart: + for (i = 0; i < nr_found; i++) { + if (!batch[i]) + continue; ++ if ((iter_flags & XFS_AGITER_INEW_WAIT) && ++ xfs_iflags_test(batch[i], XFS_INEW)) ++ xfs_inew_wait(batch[i]); + error = execute(batch[i], flags, args); + IRELE(batch[i]); + if (error == -EAGAIN) { +@@ -823,12 +846,13 @@ xfs_cowblocks_worker( + } + + int +-xfs_inode_ag_iterator( ++xfs_inode_ag_iterator_flags( + struct xfs_mount *mp, + int (*execute)(struct xfs_inode *ip, int flags, + void *args), + int flags, +- void *args) ++ void *args, ++ int iter_flags) + { + struct xfs_perag *pag; + int error = 0; +@@ -838,7 +862,8 @@ xfs_inode_ag_iterator( + ag = 0; + while ((pag = xfs_perag_get(mp, ag))) { + ag = pag->pag_agno + 1; +- error = xfs_inode_ag_walk(mp, pag, execute, flags, args, -1); ++ error = xfs_inode_ag_walk(mp, pag, execute, flags, args, -1, ++ iter_flags); + 
xfs_perag_put(pag); + if (error) { + last_error = error; +@@ -850,6 +875,17 @@ xfs_inode_ag_iterator( + } + + int ++xfs_inode_ag_iterator( ++ struct xfs_mount *mp, ++ int (*execute)(struct xfs_inode *ip, int flags, ++ void *args), ++ int flags, ++ void *args) ++{ ++ return xfs_inode_ag_iterator_flags(mp, execute, flags, args, 0); ++} ++ ++int + xfs_inode_ag_iterator_tag( + struct xfs_mount *mp, + int (*execute)(struct xfs_inode *ip, int flags, +@@ -866,7 +902,8 @@ xfs_inode_ag_iterator_tag( + ag = 0; + while ((pag = xfs_perag_get_tag(mp, ag, tag))) { + ag = pag->pag_agno + 1; +- error = xfs_inode_ag_walk(mp, pag, execute, flags, args, tag); ++ error = xfs_inode_ag_walk(mp, pag, execute, flags, args, tag, ++ 0); + xfs_perag_put(pag); + if (error) { + last_error = error; +--- a/fs/xfs/xfs_icache.h ++++ b/fs/xfs/xfs_icache.h +@@ -48,6 +48,11 @@ struct xfs_eofblocks { + #define XFS_IGET_UNTRUSTED 0x2 + #define XFS_IGET_DONTCACHE 0x4 + ++/* ++ * flags for AG inode iterator ++ */ ++#define XFS_AGITER_INEW_WAIT 0x1 /* wait on new inodes */ ++ + int xfs_iget(struct xfs_mount *mp, struct xfs_trans *tp, xfs_ino_t ino, + uint flags, uint lock_flags, xfs_inode_t **ipp); + +@@ -79,6 +84,9 @@ void xfs_cowblocks_worker(struct work_st + int xfs_inode_ag_iterator(struct xfs_mount *mp, + int (*execute)(struct xfs_inode *ip, int flags, void *args), + int flags, void *args); ++int xfs_inode_ag_iterator_flags(struct xfs_mount *mp, ++ int (*execute)(struct xfs_inode *ip, int flags, void *args), ++ int flags, void *args, int iter_flags); + int xfs_inode_ag_iterator_tag(struct xfs_mount *mp, + int (*execute)(struct xfs_inode *ip, int flags, void *args), + int flags, void *args, int tag); diff --git a/queue-4.11/xfs-wait-on-new-inodes-during-quotaoff-dquot-release.patch b/queue-4.11/xfs-wait-on-new-inodes-during-quotaoff-dquot-release.patch new file mode 100644 index 00000000000..fd321dccbdd --- /dev/null +++ b/queue-4.11/xfs-wait-on-new-inodes-during-quotaoff-dquot-release.patch @@ -0,0 
+1,54 @@ +From e20c8a517f259cb4d258e10b0cd5d4b30d4167a0 Mon Sep 17 00:00:00 2001 +From: Brian Foster +Date: Wed, 26 Apr 2017 08:30:40 -0700 +Subject: xfs: wait on new inodes during quotaoff dquot release + +From: Brian Foster + +commit e20c8a517f259cb4d258e10b0cd5d4b30d4167a0 upstream. + +The quotaoff operation has a race with inode allocation that results +in a livelock. An inode allocation that occurs before the quota +status flags are updated acquires the appropriate dquots for the +inode via xfs_qm_vop_dqalloc(). It then inserts the XFS_INEW inode +into the perag radix tree, sometime later attaches the dquots to the +inode and finally clears the XFS_INEW flag. Quotaoff expects to +release the dquots from all inodes in the filesystem via +xfs_qm_dqrele_all_inodes(). This invokes the AG inode iterator, +which skips inodes in the XFS_INEW state because they are not fully +constructed. If the scan occurs after dquots have been attached to +an inode, but before XFS_INEW is cleared, the newly allocated inode +will continue to hold a reference to the applicable dquots. When +quotaoff invokes xfs_qm_dqpurge_all(), the reference count of those +dquot(s) remain elevated and the dqpurge scan spins indefinitely. + +To address this problem, update the xfs_qm_dqrele_all_inodes() scan +to wait on inodes marked on the XFS_INEW state. We wait on the +inodes explicitly rather than skip and retry to avoid continuous +retry loops due to a parallel inode allocation workload. Since +quotaoff updates the quota state flags and uses a synchronous +transaction before the dqrele scan, and dquots are attached to +inodes after radix tree insertion iff quota is enabled, one INEW +waiting pass through the AG guarantees that the scan has processed +all inodes that could possibly hold dquot references. + +Reported-by: Eryu Guan +Signed-off-by: Brian Foster +Reviewed-by: Darrick J. Wong +Signed-off-by: Darrick J. 
Wong +Signed-off-by: Greg Kroah-Hartman + +--- + fs/xfs/xfs_qm_syscalls.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +--- a/fs/xfs/xfs_qm_syscalls.c ++++ b/fs/xfs/xfs_qm_syscalls.c +@@ -759,5 +759,6 @@ xfs_qm_dqrele_all_inodes( + uint flags) + { + ASSERT(mp->m_quotainfo); +- xfs_inode_ag_iterator(mp, xfs_dqrele_inode, flags, NULL); ++ xfs_inode_ag_iterator_flags(mp, xfs_dqrele_inode, flags, NULL, ++ XFS_AGITER_INEW_WAIT); + } diff --git a/queue-4.11/xfs-xfs_trans_alloc_empty.patch b/queue-4.11/xfs-xfs_trans_alloc_empty.patch new file mode 100644 index 00000000000..6af2acd7721 --- /dev/null +++ b/queue-4.11/xfs-xfs_trans_alloc_empty.patch @@ -0,0 +1,65 @@ +From hch@lst.de Mon Jun 5 17:05:12 2017 +From: Christoph Hellwig +Date: Sat, 3 Jun 2017 15:18:31 +0200 +Subject: xfs: xfs_trans_alloc_empty +To: stable@vger.kernel.org +Cc: linux-xfs@vger.kernel.org, "Darrick J . Wong" +Message-ID: <20170603131836.26661-21-hch@lst.de> + +From: Christoph Hellwig + +This is a partial cherry-pick of commit e89c041338 +("xfs: implement the GETFSMAP ioctl"), which also adds this helper, and +a great example of why feature patches should be properly split into +their parts. + +Signed-off-by: Darrick J. Wong +[hch: split from the larger patch for -stable] +Signed-off-by: Christoph Hellwig +--- + fs/xfs/xfs_trans.c | 22 ++++++++++++++++++++++ + fs/xfs/xfs_trans.h | 2 ++ + 2 files changed, 24 insertions(+) + +--- a/fs/xfs/xfs_trans.c ++++ b/fs/xfs/xfs_trans.c +@@ -263,6 +263,28 @@ xfs_trans_alloc( + } + + /* ++ * Create an empty transaction with no reservation. This is a defensive ++ * mechanism for routines that query metadata without actually modifying ++ * them -- if the metadata being queried is somehow cross-linked (think a ++ * btree block pointer that points higher in the tree), we risk deadlock. ++ * However, blocks grabbed as part of a transaction can be re-grabbed. 
++ * The verifiers will notice the corrupt block and the operation will fail ++ * back to userspace without deadlocking. ++ * ++ * Note the zero-length reservation; this transaction MUST be cancelled ++ * without any dirty data. ++ */ ++int ++xfs_trans_alloc_empty( ++ struct xfs_mount *mp, ++ struct xfs_trans **tpp) ++{ ++ struct xfs_trans_res resv = {0}; ++ ++ return xfs_trans_alloc(mp, &resv, 0, 0, XFS_TRANS_NO_WRITECOUNT, tpp); ++} ++ ++/* + * Record the indicated change to the given field for application + * to the file system's superblock when the transaction commits. + * For now, just store the change in the transaction structure. +--- a/fs/xfs/xfs_trans.h ++++ b/fs/xfs/xfs_trans.h +@@ -158,6 +158,8 @@ typedef struct xfs_trans { + int xfs_trans_alloc(struct xfs_mount *mp, struct xfs_trans_res *resp, + uint blocks, uint rtextents, uint flags, + struct xfs_trans **tpp); ++int xfs_trans_alloc_empty(struct xfs_mount *mp, ++ struct xfs_trans **tpp); + void xfs_trans_mod_sb(xfs_trans_t *, uint, int64_t); + + struct xfs_buf *xfs_trans_get_buf_map(struct xfs_trans *tp,