From: Greg Kroah-Hartman Date: Tue, 24 Oct 2017 12:54:24 +0000 (+0200) Subject: 4.9-stable patches X-Git-Tag: v3.18.78~4 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=5df2dcbf4b14671192cb3f3bc296f3059a205550;p=thirdparty%2Fkernel%2Fstable-queue.git 4.9-stable patches added patches: fs-xfs-use-ps-printk-format-for-direct-addresses.patch xfs-always-swap-the-cow-forks-when-swapping-extents.patch xfs-cancel-dirty-pages-on-invalidation.patch xfs-capture-state-of-the-right-inode-in-xfs_iflush_done.patch xfs-don-t-change-inode-mode-if-acl-update-fails.patch xfs-don-t-log-uninitialised-fields-in-inode-structures.patch xfs-don-t-unconditionally-clear-the-reflink-flag-on-zero-block-files.patch xfs-evict-cow-fork-extents-when-performing-finsert-fcollapse.patch xfs-handle-error-if-xfs_btree_get_bufs-fails.patch xfs-handle-racy-aio-in-xfs_reflink_end_cow.patch xfs-move-more-rt-specific-code-under-config_xfs_rt.patch xfs-perag-initialization-should-only-touch-m_ag_max_usable-for-ag-0.patch xfs-reinit-btree-pointer-on-attr-tree-inactivation-walk.patch xfs-report-zeroed-or-not-correctly-in-xfs_zero_range.patch xfs-trim-writepage-mapping-to-within-eof.patch xfs-update-i_size-after-unwritten-conversion-in-dio-completion.patch --- diff --git a/queue-4.9/fs-xfs-use-ps-printk-format-for-direct-addresses.patch b/queue-4.9/fs-xfs-use-ps-printk-format-for-direct-addresses.patch new file mode 100644 index 00000000000..55bd9d50533 --- /dev/null +++ b/queue-4.9/fs-xfs-use-ps-printk-format-for-direct-addresses.patch @@ -0,0 +1,34 @@ +From e150dcd459e1b441eaf08f341a986f04e61bf3b8 Mon Sep 17 00:00:00 2001 +From: Helge Deller +Date: Mon, 18 Sep 2017 11:34:16 -0700 +Subject: fs/xfs: Use %pS printk format for direct addresses + +From: Helge Deller + +commit e150dcd459e1b441eaf08f341a986f04e61bf3b8 upstream. + +Use the %pS instead of the %pF printk format specifier for printing symbols +from direct addresses. This is needed for the ia64, ppc64 and parisc64 +architectures. 
+ +Signed-off-by: Helge Deller +Reviewed-by: Christoph Hellwig +Reviewed-by: Darrick J. Wong +Signed-off-by: Darrick J. Wong +Signed-off-by: Greg Kroah-Hartman + +--- + fs/xfs/xfs_error.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/fs/xfs/xfs_error.c ++++ b/fs/xfs/xfs_error.c +@@ -167,7 +167,7 @@ xfs_verifier_error( + { + struct xfs_mount *mp = bp->b_target->bt_mount; + +- xfs_alert(mp, "Metadata %s detected at %pF, %s block 0x%llx", ++ xfs_alert(mp, "Metadata %s detected at %pS, %s block 0x%llx", + bp->b_error == -EFSBADCRC ? "CRC error" : "corruption", + __return_address, bp->b_ops->name, bp->b_bn); + diff --git a/queue-4.9/series b/queue-4.9/series index 302aa6f59d4..367f2797e1d 100644 --- a/queue-4.9/series +++ b/queue-4.9/series @@ -30,3 +30,19 @@ lib-digsig-fix-dereference-of-null-user_key_payload.patch keys-don-t-let-add_key-update-an-uninstantiated-key.patch pkcs7-prevent-null-pointer-dereference-since-sinfo-is-not-always-set.patch vmbus-fix-missing-signaling-in-hv_signal_on_read.patch +xfs-don-t-unconditionally-clear-the-reflink-flag-on-zero-block-files.patch +xfs-evict-cow-fork-extents-when-performing-finsert-fcollapse.patch +fs-xfs-use-ps-printk-format-for-direct-addresses.patch +xfs-report-zeroed-or-not-correctly-in-xfs_zero_range.patch +xfs-update-i_size-after-unwritten-conversion-in-dio-completion.patch +xfs-perag-initialization-should-only-touch-m_ag_max_usable-for-ag-0.patch +xfs-capture-state-of-the-right-inode-in-xfs_iflush_done.patch +xfs-always-swap-the-cow-forks-when-swapping-extents.patch +xfs-handle-racy-aio-in-xfs_reflink_end_cow.patch +xfs-don-t-log-uninitialised-fields-in-inode-structures.patch +xfs-move-more-rt-specific-code-under-config_xfs_rt.patch +xfs-don-t-change-inode-mode-if-acl-update-fails.patch +xfs-reinit-btree-pointer-on-attr-tree-inactivation-walk.patch +xfs-handle-error-if-xfs_btree_get_bufs-fails.patch +xfs-cancel-dirty-pages-on-invalidation.patch +xfs-trim-writepage-mapping-to-within-eof.patch diff 
--git a/queue-4.9/xfs-always-swap-the-cow-forks-when-swapping-extents.patch b/queue-4.9/xfs-always-swap-the-cow-forks-when-swapping-extents.patch new file mode 100644 index 00000000000..8311f3a5e4f --- /dev/null +++ b/queue-4.9/xfs-always-swap-the-cow-forks-when-swapping-extents.patch @@ -0,0 +1,58 @@ +From 52bfcdd7adbc26639bc7b2356ab9a3f5dad68ad6 Mon Sep 17 00:00:00 2001 +From: "Darrick J. Wong" +Date: Mon, 18 Sep 2017 09:41:18 -0700 +Subject: xfs: always swap the cow forks when swapping extents + +From: Darrick J. Wong + +commit 52bfcdd7adbc26639bc7b2356ab9a3f5dad68ad6 upstream. + +Since the CoW fork exists as a secondary data structure to the data +fork, we must always swap cow forks during swapext. We also need to +swap the extent counts and reset the cowblocks tags. + +Reviewed-by: Brian Foster +Reviewed-by: Christoph Hellwig +Signed-off-by: Darrick J. Wong +Signed-off-by: Greg Kroah-Hartman + +--- + fs/xfs/xfs_bmap_util.c | 24 ++++++++++++++++++++++-- + 1 file changed, 22 insertions(+), 2 deletions(-) + +--- a/fs/xfs/xfs_bmap_util.c ++++ b/fs/xfs/xfs_bmap_util.c +@@ -2106,11 +2106,31 @@ xfs_swap_extents( + ip->i_d.di_flags2 |= tip->i_d.di_flags2 & XFS_DIFLAG2_REFLINK; + tip->i_d.di_flags2 &= ~XFS_DIFLAG2_REFLINK; + tip->i_d.di_flags2 |= f & XFS_DIFLAG2_REFLINK; ++ } ++ ++ /* Swap the cow forks. 
*/ ++ if (xfs_sb_version_hasreflink(&mp->m_sb)) { ++ xfs_extnum_t extnum; ++ ++ ASSERT(ip->i_cformat == XFS_DINODE_FMT_EXTENTS); ++ ASSERT(tip->i_cformat == XFS_DINODE_FMT_EXTENTS); ++ ++ extnum = ip->i_cnextents; ++ ip->i_cnextents = tip->i_cnextents; ++ tip->i_cnextents = extnum; ++ + cowfp = ip->i_cowfp; + ip->i_cowfp = tip->i_cowfp; + tip->i_cowfp = cowfp; +- xfs_inode_set_cowblocks_tag(ip); +- xfs_inode_set_cowblocks_tag(tip); ++ ++ if (ip->i_cowfp && ip->i_cnextents) ++ xfs_inode_set_cowblocks_tag(ip); ++ else ++ xfs_inode_clear_cowblocks_tag(ip); ++ if (tip->i_cowfp && tip->i_cnextents) ++ xfs_inode_set_cowblocks_tag(tip); ++ else ++ xfs_inode_clear_cowblocks_tag(tip); + } + + xfs_trans_log_inode(tp, ip, src_log_flags); diff --git a/queue-4.9/xfs-cancel-dirty-pages-on-invalidation.patch b/queue-4.9/xfs-cancel-dirty-pages-on-invalidation.patch new file mode 100644 index 00000000000..db79bb98614 --- /dev/null +++ b/queue-4.9/xfs-cancel-dirty-pages-on-invalidation.patch @@ -0,0 +1,103 @@ +From 793d7dbe6d82a50b9d14bf992b9eaacb70a11ce6 Mon Sep 17 00:00:00 2001 +From: Dave Chinner +Date: Fri, 13 Oct 2017 09:47:45 -0700 +Subject: xfs: cancel dirty pages on invalidation + +From: Dave Chinner + +commit 793d7dbe6d82a50b9d14bf992b9eaacb70a11ce6 upstream. + +Recently we've had warnings arise from the vm handing us pages +without bufferheads attached to them. This should not ever occur +in XFS, but we don't defend against it properly if it does. The only +place where we remove bufferheads from a page is in +xfs_vm_releasepage(), but we can't tell the difference here between +"page is dirty so don't release" and "page is dirty but is being +invalidated so release it". + +In some places that are invalidating pages ask for pages to be +released and follow up afterward calling ->releasepage by checking +whether the page was dirty and then aborting the invalidation. 
This +is a possible vector for releasing buffers from a page but then +leaving it in the mapping, so we really do need to avoid dirty pages +in xfs_vm_releasepage(). + +To differentiate between invalidated pages and normal pages, we need +to clear the page dirty flag when invalidating the pages. This can +be done through xfs_vm_invalidatepage(), and will result in +xfs_vm_releasepage() seeing the page as clean which matches the +bufferhead state on the page after calling block_invalidatepage(). + +Hence we can re-add the page dirty check in xfs_vm_releasepage to +catch the case where we might be releasing a page that is actually +dirty and so should not have the bufferheads on it removed. This +will remove one possible vector of "dirty page with no bufferheads" +and so help narrow down the search for the root cause of that +problem. + +Signed-Off-By: Dave Chinner +Reviewed-by: Darrick J. Wong +Signed-off-by: Darrick J. Wong +Signed-off-by: Greg Kroah-Hartman + +--- + fs/xfs/xfs_aops.c | 34 ++++++++++++++++++++++------------ + 1 file changed, 22 insertions(+), 12 deletions(-) + +--- a/fs/xfs/xfs_aops.c ++++ b/fs/xfs/xfs_aops.c +@@ -726,6 +726,14 @@ xfs_vm_invalidatepage( + { + trace_xfs_invalidatepage(page->mapping->host, page, offset, + length); ++ ++ /* ++ * If we are invalidating the entire page, clear the dirty state from it ++ * so that we can check for attempts to release dirty cached pages in ++ * xfs_vm_releasepage(). ++ */ ++ if (offset == 0 && length >= PAGE_SIZE) ++ cancel_dirty_page(page); + block_invalidatepage(page, offset, length); + } + +@@ -1181,25 +1189,27 @@ xfs_vm_releasepage( + * mm accommodates an old ext3 case where clean pages might not have had + * the dirty bit cleared. Thus, it can send actual dirty pages to + * ->releasepage() via shrink_active_list(). Conversely, +- * block_invalidatepage() can send pages that are still marked dirty +- * but otherwise have invalidated buffers. 
++ * block_invalidatepage() can send pages that are still marked dirty but ++ * otherwise have invalidated buffers. + * + * We want to release the latter to avoid unnecessary buildup of the +- * LRU, skip the former and warn if we've left any lingering +- * delalloc/unwritten buffers on clean pages. Skip pages with delalloc +- * or unwritten buffers and warn if the page is not dirty. Otherwise +- * try to release the buffers. ++ * LRU, so xfs_vm_invalidatepage() clears the page dirty flag on pages ++ * that are entirely invalidated and need to be released. Hence the ++ * only time we should get dirty pages here is through ++ * shrink_active_list() and so we can simply skip those now. ++ * ++ * warn if we've left any lingering delalloc/unwritten buffers on clean ++ * or invalidated pages we are about to release. + */ ++ if (PageDirty(page)) ++ return 0; ++ + xfs_count_page_state(page, &delalloc, &unwritten); + +- if (delalloc) { +- WARN_ON_ONCE(!PageDirty(page)); ++ if (WARN_ON_ONCE(delalloc)) + return 0; +- } +- if (unwritten) { +- WARN_ON_ONCE(!PageDirty(page)); ++ if (WARN_ON_ONCE(unwritten)) + return 0; +- } + + return try_to_free_buffers(page); + } diff --git a/queue-4.9/xfs-capture-state-of-the-right-inode-in-xfs_iflush_done.patch b/queue-4.9/xfs-capture-state-of-the-right-inode-in-xfs_iflush_done.patch new file mode 100644 index 00000000000..ec52d50b172 --- /dev/null +++ b/queue-4.9/xfs-capture-state-of-the-right-inode-in-xfs_iflush_done.patch @@ -0,0 +1,40 @@ +From 842f6e9f786226c58fcbd5ef80eadca72fdfe652 Mon Sep 17 00:00:00 2001 +From: Carlos Maiolino +Date: Fri, 22 Sep 2017 11:47:46 -0700 +Subject: xfs: Capture state of the right inode in xfs_iflush_done + +From: Carlos Maiolino + +commit 842f6e9f786226c58fcbd5ef80eadca72fdfe652 upstream. + +My previous patch, d3a304b6292168b83b45d624784f973fdc1ca674, checks for the +XFS_LI_FAILED flag in xfs_iflush_done, so the failed item can be properly +resubmitted. 
+ +In the loop scanning other inodes being completed, it should check the +current item for the XFS_LI_FAILED, and not the initial one. + +The state of the initial inode is checked after the loop ends + +Kudos to Eric for catching this. + +Signed-off-by: Carlos Maiolino +Reviewed-by: Darrick J. Wong +Signed-off-by: Darrick J. Wong +Signed-off-by: Greg Kroah-Hartman + +--- + fs/xfs/xfs_inode_item.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/fs/xfs/xfs_inode_item.c ++++ b/fs/xfs/xfs_inode_item.c +@@ -745,7 +745,7 @@ xfs_iflush_done( + */ + iip = INODE_ITEM(blip); + if ((iip->ili_logged && blip->li_lsn == iip->ili_flush_lsn) || +- lip->li_flags & XFS_LI_FAILED) ++ (blip->li_flags & XFS_LI_FAILED)) + need_ail++; + + blip = next; diff --git a/queue-4.9/xfs-don-t-change-inode-mode-if-acl-update-fails.patch b/queue-4.9/xfs-don-t-change-inode-mode-if-acl-update-fails.patch new file mode 100644 index 00000000000..8c0ff5024ee --- /dev/null +++ b/queue-4.9/xfs-don-t-change-inode-mode-if-acl-update-fails.patch @@ -0,0 +1,72 @@ +From 67f2ffe31d1a683170c2ba0ecc643e42a5fdd397 Mon Sep 17 00:00:00 2001 +From: Dave Chinner +Date: Mon, 9 Oct 2017 11:37:23 -0700 +Subject: xfs: don't change inode mode if ACL update fails + +From: Dave Chinner + +commit 67f2ffe31d1a683170c2ba0ecc643e42a5fdd397 upstream. + +If we get ENOSPC half way through setting the ACL, the inode mode +can still be changed even though the ACL does not exist. Reorder the +operation to only change the mode of the inode if the ACL is set +correctly. + +Whilst this does not fix the problem with crash consistency (that requires +attribute addition to be a deferred op) it does prevent ENOSPC and other +non-fatal errors setting an xattr to be handled sanely. + +This fixes xfstests generic/449. + +Signed-Off-By: Dave Chinner +Reviewed-by: Brian Foster +Reviewed-by: Darrick J. Wong +Signed-off-by: Darrick J. 
Wong +Signed-off-by: Greg Kroah-Hartman + +--- + fs/xfs/xfs_acl.c | 22 ++++++++++++++++------ + 1 file changed, 16 insertions(+), 6 deletions(-) + +--- a/fs/xfs/xfs_acl.c ++++ b/fs/xfs/xfs_acl.c +@@ -247,6 +247,8 @@ xfs_set_mode(struct inode *inode, umode_ + int + xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type) + { ++ umode_t mode; ++ bool set_mode = false; + int error = 0; + + if (!acl) +@@ -257,16 +259,24 @@ xfs_set_acl(struct inode *inode, struct + return error; + + if (type == ACL_TYPE_ACCESS) { +- umode_t mode; +- + error = posix_acl_update_mode(inode, &mode, &acl); + if (error) + return error; +- error = xfs_set_mode(inode, mode); +- if (error) +- return error; ++ set_mode = true; + } + + set_acl: +- return __xfs_set_acl(inode, acl, type); ++ error = __xfs_set_acl(inode, acl, type); ++ if (error) ++ return error; ++ ++ /* ++ * We set the mode after successfully updating the ACL xattr because the ++ * xattr update can fail at ENOSPC and we don't want to change the mode ++ * if the ACL update hasn't been applied. ++ */ ++ if (set_mode) ++ error = xfs_set_mode(inode, mode); ++ ++ return error; + } diff --git a/queue-4.9/xfs-don-t-log-uninitialised-fields-in-inode-structures.patch b/queue-4.9/xfs-don-t-log-uninitialised-fields-in-inode-structures.patch new file mode 100644 index 00000000000..bb9121e6be3 --- /dev/null +++ b/queue-4.9/xfs-don-t-log-uninitialised-fields-in-inode-structures.patch @@ -0,0 +1,226 @@ +From 20413e37d71befd02b5846acdaf5e2564dd1c38e Mon Sep 17 00:00:00 2001 +From: Dave Chinner +Date: Mon, 9 Oct 2017 11:37:22 -0700 +Subject: xfs: Don't log uninitialised fields in inode structures + +From: Dave Chinner + +commit 20413e37d71befd02b5846acdaf5e2564dd1c38e upstream. + +Prevent kmemcheck from throwing warnings about reading uninitialised +memory when formatting inodes into the incore log buffer. 
There are +several issues here - we don't always log all the fields in the +inode log format item, and we never log the inode's +di_next_unlinked field. + +In the case of the inode log format item, this is exacerbated +by the old xfs_inode_log_format structure padding issue. Hence make +the padded, 64 bit aligned version of the structure the one we always +use for formatting the log and get rid of the 64 bit variant. This +means we'll always log the 64-bit version and so recovery only needs +to convert from the unpadded 32 bit version from older 32 bit +kernels. + +Signed-Off-By: Dave Chinner +Tested-by: Tetsuo Handa +Reviewed-by: Brian Foster +Reviewed-by: Darrick J. Wong +Signed-off-by: Darrick J. Wong +Signed-off-by: Greg Kroah-Hartman + + +--- + fs/xfs/libxfs/xfs_log_format.h | 27 ++++--------- + fs/xfs/xfs_inode_item.c | 82 ++++++++++++++++++++--------------------- + fs/xfs/xfs_ondisk.h | 2 - + 3 files changed, 50 insertions(+), 61 deletions(-) + +--- a/fs/xfs/libxfs/xfs_log_format.h ++++ b/fs/xfs/libxfs/xfs_log_format.h +@@ -270,6 +270,7 @@ typedef struct xfs_inode_log_format { + __uint32_t ilf_fields; /* flags for fields logged */ + __uint16_t ilf_asize; /* size of attr d/ext/root */ + __uint16_t ilf_dsize; /* size of data/ext/root */ ++ __uint32_t ilf_pad; /* pad for 64 bit boundary */ + __uint64_t ilf_ino; /* inode number */ + union { + __uint32_t ilfu_rdev; /* rdev value for dev inode*/ +@@ -280,29 +281,17 @@ typedef struct xfs_inode_log_format { + __int32_t ilf_boffset; /* off of inode in buffer */ + } xfs_inode_log_format_t; + +-typedef struct xfs_inode_log_format_32 { +- __uint16_t ilf_type; /* inode log item type */ +- __uint16_t ilf_size; /* size of this item */ +- __uint32_t ilf_fields; /* flags for fields logged */ +- __uint16_t ilf_asize; /* size of attr d/ext/root */ +- __uint16_t ilf_dsize; /* size of data/ext/root */ +- __uint64_t ilf_ino; /* inode number */ +- union { +- __uint32_t ilfu_rdev; /* rdev value for dev inode*/ +- uuid_t 
ilfu_uuid; /* mount point value */ +- } ilf_u; +- __int64_t ilf_blkno; /* blkno of inode buffer */ +- __int32_t ilf_len; /* len of inode buffer */ +- __int32_t ilf_boffset; /* off of inode in buffer */ +-} __attribute__((packed)) xfs_inode_log_format_32_t; +- +-typedef struct xfs_inode_log_format_64 { ++/* ++ * Old 32 bit systems will log in this format without the 64 bit ++ * alignment padding. Recovery will detect this and convert it to the ++ * correct format. ++ */ ++struct xfs_inode_log_format_32 { + __uint16_t ilf_type; /* inode log item type */ + __uint16_t ilf_size; /* size of this item */ + __uint32_t ilf_fields; /* flags for fields logged */ + __uint16_t ilf_asize; /* size of attr d/ext/root */ + __uint16_t ilf_dsize; /* size of data/ext/root */ +- __uint32_t ilf_pad; /* pad for 64 bit boundary */ + __uint64_t ilf_ino; /* inode number */ + union { + __uint32_t ilfu_rdev; /* rdev value for dev inode*/ +@@ -311,7 +300,7 @@ typedef struct xfs_inode_log_format_64 { + __int64_t ilf_blkno; /* blkno of inode buffer */ + __int32_t ilf_len; /* len of inode buffer */ + __int32_t ilf_boffset; /* off of inode in buffer */ +-} xfs_inode_log_format_64_t; ++} __attribute__((packed)); + + + /* +--- a/fs/xfs/xfs_inode_item.c ++++ b/fs/xfs/xfs_inode_item.c +@@ -364,6 +364,9 @@ xfs_inode_to_log_dinode( + to->di_dmstate = from->di_dmstate; + to->di_flags = from->di_flags; + ++ /* log a dummy value to ensure log structure is fully initialised */ ++ to->di_next_unlinked = NULLAGINO; ++ + if (from->di_version == 3) { + to->di_changecount = inode->i_version; + to->di_crtime.t_sec = from->di_crtime.t_sec; +@@ -404,6 +407,11 @@ xfs_inode_item_format_core( + * the second with the on-disk inode structure, and a possible third and/or + * fourth with the inode data/extents/b-tree root and inode attributes + * data/extents/b-tree root. ++ * ++ * Note: Always use the 64 bit inode log format structure so we don't ++ * leave an uninitialised hole in the format item on 64 bit systems. 
Log ++ * recovery on 32 bit systems handles this just fine, so there's no reason ++ * for not using an initialising the properly padded structure all the time. + */ + STATIC void + xfs_inode_item_format( +@@ -412,8 +420,8 @@ xfs_inode_item_format( + { + struct xfs_inode_log_item *iip = INODE_ITEM(lip); + struct xfs_inode *ip = iip->ili_inode; +- struct xfs_inode_log_format *ilf; + struct xfs_log_iovec *vecp = NULL; ++ struct xfs_inode_log_format *ilf; + + ASSERT(ip->i_d.di_version > 1); + +@@ -425,7 +433,17 @@ xfs_inode_item_format( + ilf->ilf_boffset = ip->i_imap.im_boffset; + ilf->ilf_fields = XFS_ILOG_CORE; + ilf->ilf_size = 2; /* format + core */ +- xlog_finish_iovec(lv, vecp, sizeof(struct xfs_inode_log_format)); ++ ++ /* ++ * make sure we don't leak uninitialised data into the log in the case ++ * when we don't log every field in the inode. ++ */ ++ ilf->ilf_dsize = 0; ++ ilf->ilf_asize = 0; ++ ilf->ilf_pad = 0; ++ memset(&ilf->ilf_u.ilfu_uuid, 0, sizeof(ilf->ilf_u.ilfu_uuid)); ++ ++ xlog_finish_iovec(lv, vecp, sizeof(*ilf)); + + xfs_inode_item_format_core(ip, lv, &vecp); + xfs_inode_item_format_data_fork(iip, ilf, lv, &vecp); +@@ -855,48 +873,30 @@ xfs_istale_done( + } + + /* +- * convert an xfs_inode_log_format struct from either 32 or 64 bit versions +- * (which can have different field alignments) to the native version ++ * convert an xfs_inode_log_format struct from the old 32 bit version ++ * (which can have different field alignments) to the native 64 bit version + */ + int + xfs_inode_item_format_convert( +- xfs_log_iovec_t *buf, +- xfs_inode_log_format_t *in_f) ++ struct xfs_log_iovec *buf, ++ struct xfs_inode_log_format *in_f) + { +- if (buf->i_len == sizeof(xfs_inode_log_format_32_t)) { +- xfs_inode_log_format_32_t *in_f32 = buf->i_addr; ++ struct xfs_inode_log_format_32 *in_f32 = buf->i_addr; + +- in_f->ilf_type = in_f32->ilf_type; +- in_f->ilf_size = in_f32->ilf_size; +- in_f->ilf_fields = in_f32->ilf_fields; +- in_f->ilf_asize = 
in_f32->ilf_asize; +- in_f->ilf_dsize = in_f32->ilf_dsize; +- in_f->ilf_ino = in_f32->ilf_ino; +- /* copy biggest field of ilf_u */ +- memcpy(in_f->ilf_u.ilfu_uuid.__u_bits, +- in_f32->ilf_u.ilfu_uuid.__u_bits, +- sizeof(uuid_t)); +- in_f->ilf_blkno = in_f32->ilf_blkno; +- in_f->ilf_len = in_f32->ilf_len; +- in_f->ilf_boffset = in_f32->ilf_boffset; +- return 0; +- } else if (buf->i_len == sizeof(xfs_inode_log_format_64_t)){ +- xfs_inode_log_format_64_t *in_f64 = buf->i_addr; +- +- in_f->ilf_type = in_f64->ilf_type; +- in_f->ilf_size = in_f64->ilf_size; +- in_f->ilf_fields = in_f64->ilf_fields; +- in_f->ilf_asize = in_f64->ilf_asize; +- in_f->ilf_dsize = in_f64->ilf_dsize; +- in_f->ilf_ino = in_f64->ilf_ino; +- /* copy biggest field of ilf_u */ +- memcpy(in_f->ilf_u.ilfu_uuid.__u_bits, +- in_f64->ilf_u.ilfu_uuid.__u_bits, +- sizeof(uuid_t)); +- in_f->ilf_blkno = in_f64->ilf_blkno; +- in_f->ilf_len = in_f64->ilf_len; +- in_f->ilf_boffset = in_f64->ilf_boffset; +- return 0; +- } +- return -EFSCORRUPTED; ++ if (buf->i_len != sizeof(*in_f32)) ++ return -EFSCORRUPTED; ++ ++ in_f->ilf_type = in_f32->ilf_type; ++ in_f->ilf_size = in_f32->ilf_size; ++ in_f->ilf_fields = in_f32->ilf_fields; ++ in_f->ilf_asize = in_f32->ilf_asize; ++ in_f->ilf_dsize = in_f32->ilf_dsize; ++ in_f->ilf_ino = in_f32->ilf_ino; ++ /* copy biggest field of ilf_u */ ++ memcpy(in_f->ilf_u.ilfu_uuid.__u_bits, ++ in_f32->ilf_u.ilfu_uuid.__u_bits, sizeof(uuid_t)); ++ in_f->ilf_blkno = in_f32->ilf_blkno; ++ in_f->ilf_len = in_f32->ilf_len; ++ in_f->ilf_boffset = in_f32->ilf_boffset; ++ return 0; + } +--- a/fs/xfs/xfs_ondisk.h ++++ b/fs/xfs/xfs_ondisk.h +@@ -134,7 +134,7 @@ xfs_check_ondisk_structs(void) + XFS_CHECK_STRUCT_SIZE(struct xfs_icreate_log, 28); + XFS_CHECK_STRUCT_SIZE(struct xfs_ictimestamp, 8); + XFS_CHECK_STRUCT_SIZE(struct xfs_inode_log_format_32, 52); +- XFS_CHECK_STRUCT_SIZE(struct xfs_inode_log_format_64, 56); ++ XFS_CHECK_STRUCT_SIZE(struct xfs_inode_log_format, 56); + 
XFS_CHECK_STRUCT_SIZE(struct xfs_qoff_logformat, 20); + XFS_CHECK_STRUCT_SIZE(struct xfs_trans_header, 16); + } diff --git a/queue-4.9/xfs-don-t-unconditionally-clear-the-reflink-flag-on-zero-block-files.patch b/queue-4.9/xfs-don-t-unconditionally-clear-the-reflink-flag-on-zero-block-files.patch new file mode 100644 index 00000000000..4af867509db --- /dev/null +++ b/queue-4.9/xfs-don-t-unconditionally-clear-the-reflink-flag-on-zero-block-files.patch @@ -0,0 +1,42 @@ +From cc6f77710a6de6210f9feda7cd53e2f5ee7a7e69 Mon Sep 17 00:00:00 2001 +From: "Darrick J. Wong" +Date: Mon, 18 Sep 2017 09:41:16 -0700 +Subject: xfs: don't unconditionally clear the reflink flag on zero-block files + +From: Darrick J. Wong + +commit cc6f77710a6de6210f9feda7cd53e2f5ee7a7e69 upstream. + +If we have speculative cow preallocations hanging around in the cow +fork, don't let a truncate operation clear the reflink flag because if +we do then there's a chance we'll forget to free those extents when we +destroy the incore inode. + +Reported-by: Amir Goldstein +Reviewed-by: Carlos Maiolino +Reviewed-by: Christoph Hellwig +Signed-off-by: Darrick J. Wong +Signed-off-by: Greg Kroah-Hartman + +--- + fs/xfs/xfs_inode.c | 8 +++++--- + 1 file changed, 5 insertions(+), 3 deletions(-) + +--- a/fs/xfs/xfs_inode.c ++++ b/fs/xfs/xfs_inode.c +@@ -1632,10 +1632,12 @@ xfs_itruncate_extents( + goto out; + + /* +- * Clear the reflink flag if we truncated everything. ++ * Clear the reflink flag if there are no data fork blocks and ++ * there are no extents staged in the cow fork. 
+ */ +- if (ip->i_d.di_nblocks == 0 && xfs_is_reflink_inode(ip)) { +- ip->i_d.di_flags2 &= ~XFS_DIFLAG2_REFLINK; ++ if (xfs_is_reflink_inode(ip) && ip->i_cnextents == 0) { ++ if (ip->i_d.di_nblocks == 0) ++ ip->i_d.di_flags2 &= ~XFS_DIFLAG2_REFLINK; + xfs_inode_clear_cowblocks_tag(ip); + } + diff --git a/queue-4.9/xfs-evict-cow-fork-extents-when-performing-finsert-fcollapse.patch b/queue-4.9/xfs-evict-cow-fork-extents-when-performing-finsert-fcollapse.patch new file mode 100644 index 00000000000..9b6e05f349c --- /dev/null +++ b/queue-4.9/xfs-evict-cow-fork-extents-when-performing-finsert-fcollapse.patch @@ -0,0 +1,46 @@ +From 3af423b03435c81036fa710623d3ae92fbe346a3 Mon Sep 17 00:00:00 2001 +From: "Darrick J. Wong" +Date: Mon, 18 Sep 2017 09:41:17 -0700 +Subject: xfs: evict CoW fork extents when performing finsert/fcollapse + +From: Darrick J. Wong + +commit 3af423b03435c81036fa710623d3ae92fbe346a3 upstream. + +When we perform an finsert/fcollapse operation, cancel all the CoW +extents for the affected file offset range so that they don't end up +pointing to the wrong blocks. + +Reported-by: Amir Goldstein +Reviewed-by: Carlos Maiolino +Reviewed-by: Christoph Hellwig +Signed-off-by: Darrick J. Wong +Signed-off-by: Greg Kroah-Hartman + +--- + fs/xfs/xfs_bmap_util.c | 14 +++++++++++++- + 1 file changed, 13 insertions(+), 1 deletion(-) + +--- a/fs/xfs/xfs_bmap_util.c ++++ b/fs/xfs/xfs_bmap_util.c +@@ -1445,7 +1445,19 @@ xfs_shift_file_space( + return error; + + /* +- * The extent shiting code works on extent granularity. So, if ++ * Clean out anything hanging around in the cow fork now that ++ * we've flushed all the dirty data out to disk to avoid having ++ * CoW extents at the wrong offsets. ++ */ ++ if (xfs_is_reflink_inode(ip)) { ++ error = xfs_reflink_cancel_cow_range(ip, offset, NULLFILEOFF, ++ true); ++ if (error) ++ return error; ++ } ++ ++ /* ++ * The extent shifting code works on extent granularity. 
So, if + * stop_fsb is not the starting block of extent, we need to split + * the extent at stop_fsb. + */ diff --git a/queue-4.9/xfs-handle-error-if-xfs_btree_get_bufs-fails.patch b/queue-4.9/xfs-handle-error-if-xfs_btree_get_bufs-fails.patch new file mode 100644 index 00000000000..b157b28cdb9 --- /dev/null +++ b/queue-4.9/xfs-handle-error-if-xfs_btree_get_bufs-fails.patch @@ -0,0 +1,60 @@ +From 93e8befc17f6d6ea92b0aee3741ceac8bca4590f Mon Sep 17 00:00:00 2001 +From: Eric Sandeen +Date: Mon, 9 Oct 2017 21:08:06 -0700 +Subject: xfs: handle error if xfs_btree_get_bufs fails + +From: Eric Sandeen + +commit 93e8befc17f6d6ea92b0aee3741ceac8bca4590f upstream. + +Jason reported that a corrupted filesystem failed to replay +the log with a metadata block out of bounds warning: + +XFS (dm-2): _xfs_buf_find: Block out of range: block 0x80270fff8, EOFS 0x9c40000 + +_xfs_buf_find() and xfs_btree_get_bufs() return NULL if +that happens, and then when xfs_alloc_fix_freelist() calls +xfs_trans_binval() on that NULL bp, we oops with: + +BUG: unable to handle kernel NULL pointer dereference at 00000000000000f8 + +We don't handle _xfs_buf_find errors very well, every +caller higher up the stack gets to guess at why it failed. +But we should at least handle it somehow, so return +EFSCORRUPTED here. + +Reported-by: Jason L Tibbitts III +Signed-off-by: Eric Sandeen +Reviewed-by: Christoph Hellwig +Reviewed-by: Darrick J. Wong +Signed-off-by: Darrick J. 
Wong +Signed-off-by: Greg Kroah-Hartman + +--- + fs/xfs/libxfs/xfs_alloc.c | 8 ++++++++ + 1 file changed, 8 insertions(+) + +--- a/fs/xfs/libxfs/xfs_alloc.c ++++ b/fs/xfs/libxfs/xfs_alloc.c +@@ -1579,6 +1579,10 @@ xfs_alloc_ag_vextent_small( + + bp = xfs_btree_get_bufs(args->mp, args->tp, + args->agno, fbno, 0); ++ if (!bp) { ++ error = -EFSCORRUPTED; ++ goto error0; ++ } + xfs_trans_binval(args->tp, bp); + } + args->len = 1; +@@ -2136,6 +2140,10 @@ xfs_alloc_fix_freelist( + if (error) + goto out_agbp_relse; + bp = xfs_btree_get_bufs(mp, tp, args->agno, bno, 0); ++ if (!bp) { ++ error = -EFSCORRUPTED; ++ goto out_agbp_relse; ++ } + xfs_trans_binval(tp, bp); + } + diff --git a/queue-4.9/xfs-handle-racy-aio-in-xfs_reflink_end_cow.patch b/queue-4.9/xfs-handle-racy-aio-in-xfs_reflink_end_cow.patch new file mode 100644 index 00000000000..17d555b9ab9 --- /dev/null +++ b/queue-4.9/xfs-handle-racy-aio-in-xfs_reflink_end_cow.patch @@ -0,0 +1,48 @@ +From e12199f85d0ad1b04ce6c425ad93cd847fe930bb Mon Sep 17 00:00:00 2001 +From: Christoph Hellwig +Date: Tue, 3 Oct 2017 08:58:33 -0700 +Subject: xfs: handle racy AIO in xfs_reflink_end_cow + +From: Christoph Hellwig + +commit e12199f85d0ad1b04ce6c425ad93cd847fe930bb upstream. + +If we got two AIO writes into a COW area the second one might not have any +COW extents left to convert. Handle that case gracefully instead of +triggering an assert or accessing beyond the bounds of the extent list. + +Signed-off-by: Christoph Hellwig +Reviewed-by: Darrick J. Wong +Signed-off-by: Darrick J. 
Wong +Signed-off-by: Greg Kroah-Hartman + + +--- + fs/xfs/xfs_reflink.c | 9 ++++++++- + 1 file changed, 8 insertions(+), 1 deletion(-) + +--- a/fs/xfs/xfs_reflink.c ++++ b/fs/xfs/xfs_reflink.c +@@ -767,7 +767,13 @@ xfs_reflink_end_cow( + + /* If there is a hole at end_fsb - 1 go to the previous extent */ + if (eof || got.br_startoff > end_fsb) { +- ASSERT(idx > 0); ++ /* ++ * In case of racing, overlapping AIO writes no COW extents ++ * might be left by the time I/O completes for the loser of ++ * the race. In that case we are done. ++ */ ++ if (idx <= 0) ++ goto out_cancel; + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, --idx), &got); + } + +@@ -841,6 +847,7 @@ next_extent: + + out_defer: + xfs_defer_cancel(&dfops); ++out_cancel: + xfs_trans_cancel(tp); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + out: diff --git a/queue-4.9/xfs-move-more-rt-specific-code-under-config_xfs_rt.patch b/queue-4.9/xfs-move-more-rt-specific-code-under-config_xfs_rt.patch new file mode 100644 index 00000000000..88cbd71c0ba --- /dev/null +++ b/queue-4.9/xfs-move-more-rt-specific-code-under-config_xfs_rt.patch @@ -0,0 +1,69 @@ +From bb9c2e5433250f5b477035dc478314f8e6dd5e36 Mon Sep 17 00:00:00 2001 +From: Dave Chinner +Date: Mon, 9 Oct 2017 11:37:22 -0700 +Subject: xfs: move more RT specific code under CONFIG_XFS_RT + +From: Dave Chinner + +commit bb9c2e5433250f5b477035dc478314f8e6dd5e36 upstream. + +Various utility functions and interfaces that iterate internal +devices try to reference the realtime device even when RT support is +not compiled into the kernel. + +Make sure this code is excluded from the CONFIG_XFS_RT=n build, +and where appropriate stub functions to return fatal errors if +they ever get called when RT support is not present. + +Signed-Off-By: Dave Chinner +Reviewed-by: Brian Foster +Reviewed-by: Darrick J. Wong +Signed-off-by: Darrick J. 
Wong +Signed-off-by: Greg Kroah-Hartman + +--- + fs/xfs/xfs_bmap_util.c | 2 ++ + fs/xfs/xfs_bmap_util.h | 13 +++++++++++++ + 2 files changed, 15 insertions(+) + +--- a/fs/xfs/xfs_bmap_util.c ++++ b/fs/xfs/xfs_bmap_util.c +@@ -84,6 +84,7 @@ xfs_zero_extent( + GFP_NOFS, true); + } + ++#ifdef CONFIG_XFS_RT + int + xfs_bmap_rtalloc( + struct xfs_bmalloca *ap) /* bmap alloc argument struct */ +@@ -195,6 +196,7 @@ xfs_bmap_rtalloc( + } + return 0; + } ++#endif /* CONFIG_XFS_RT */ + + /* + * Check if the endoff is outside the last extent. If so the caller will grow +--- a/fs/xfs/xfs_bmap_util.h ++++ b/fs/xfs/xfs_bmap_util.h +@@ -28,7 +28,20 @@ struct xfs_mount; + struct xfs_trans; + struct xfs_bmalloca; + ++#ifdef CONFIG_XFS_RT + int xfs_bmap_rtalloc(struct xfs_bmalloca *ap); ++#else /* !CONFIG_XFS_RT */ ++/* ++ * Attempts to allocate RT extents when RT is disable indicates corruption and ++ * should trigger a shutdown. ++ */ ++static inline int ++xfs_bmap_rtalloc(struct xfs_bmalloca *ap) ++{ ++ return -EFSCORRUPTED; ++} ++#endif /* CONFIG_XFS_RT */ ++ + int xfs_bmap_eof(struct xfs_inode *ip, xfs_fileoff_t endoff, + int whichfork, int *eof); + int xfs_bmap_punch_delalloc_range(struct xfs_inode *ip, diff --git a/queue-4.9/xfs-perag-initialization-should-only-touch-m_ag_max_usable-for-ag-0.patch b/queue-4.9/xfs-perag-initialization-should-only-touch-m_ag_max_usable-for-ag-0.patch new file mode 100644 index 00000000000..e161a52a942 --- /dev/null +++ b/queue-4.9/xfs-perag-initialization-should-only-touch-m_ag_max_usable-for-ag-0.patch @@ -0,0 +1,50 @@ +From 9789dd9e1d939232e8ff4c50ef8e75aa6781b3fb Mon Sep 17 00:00:00 2001 +From: "Darrick J. Wong" +Date: Mon, 18 Sep 2017 09:42:09 -0700 +Subject: xfs: perag initialization should only touch m_ag_max_usable for AG 0 + +From: Darrick J. Wong + +commit 9789dd9e1d939232e8ff4c50ef8e75aa6781b3fb upstream. + +We call __xfs_ag_resv_init to make a per-AG reservation for each AG. +This makes the reservation per-AG, not per-filesystem. 
Therefore, it +is incorrect to adjust m_ag_max_usable for each AG. Adjust it only +when we're reserving AG 0's blocks so that we only do it once per fs. + +Signed-off-by: Darrick J. Wong +Reviewed-by: Brian Foster +Signed-off-by: Greg Kroah-Hartman + +--- + fs/xfs/libxfs/xfs_ag_resv.c | 12 ++++++++++-- + 1 file changed, 10 insertions(+), 2 deletions(-) + +--- a/fs/xfs/libxfs/xfs_ag_resv.c ++++ b/fs/xfs/libxfs/xfs_ag_resv.c +@@ -157,7 +157,8 @@ __xfs_ag_resv_free( + trace_xfs_ag_resv_free(pag, type, 0); + + resv = xfs_perag_resv(pag, type); +- pag->pag_mount->m_ag_max_usable += resv->ar_asked; ++ if (pag->pag_agno == 0) ++ pag->pag_mount->m_ag_max_usable += resv->ar_asked; + /* + * AGFL blocks are always considered "free", so whatever + * was reserved at mount time must be given back at umount. +@@ -217,7 +218,14 @@ __xfs_ag_resv_init( + return error; + } + +- mp->m_ag_max_usable -= ask; ++ /* ++ * Reduce the maximum per-AG allocation length by however much we're ++ * trying to reserve for an AG. Since this is a filesystem-wide ++ * counter, we only make the adjustment for AG 0. This assumes that ++ * there aren't any AGs hungrier for per-AG reservation than AG 0. ++ */ ++ if (pag->pag_agno == 0) ++ mp->m_ag_max_usable -= ask; + + resv = xfs_perag_resv(pag, type); + resv->ar_asked = ask; diff --git a/queue-4.9/xfs-reinit-btree-pointer-on-attr-tree-inactivation-walk.patch b/queue-4.9/xfs-reinit-btree-pointer-on-attr-tree-inactivation-walk.patch new file mode 100644 index 00000000000..ab2abafef3d --- /dev/null +++ b/queue-4.9/xfs-reinit-btree-pointer-on-attr-tree-inactivation-walk.patch @@ -0,0 +1,54 @@ +From f35c5e10c6ed6ba52a8dd8573924a80b6a02f03f Mon Sep 17 00:00:00 2001 +From: Brian Foster +Date: Mon, 9 Oct 2017 11:38:56 -0700 +Subject: xfs: reinit btree pointer on attr tree inactivation walk + +From: Brian Foster + +commit f35c5e10c6ed6ba52a8dd8573924a80b6a02f03f upstream. + +xfs_attr3_root_inactive() walks the attr fork tree to invalidate the +associated blocks. 
xfs_attr3_node_inactive() recursively descends +from internal blocks to leaf blocks, caching block address values +along the way to revisit parent blocks, locate the next entry and +descend down that branch of the tree. + +The code that attempts to reread the parent block is unsafe because +it assumes that the local xfs_da_node_entry pointer remains valid +after an xfs_trans_brelse() and re-read of the parent buffer. Under +heavy memory pressure, it is possible that the buffer has been +reclaimed and reallocated by the time the parent block is reread. +This means that 'btree' can point to an invalid memory address, lead +to a random/garbage value for child_fsb and cause the subsequent +read of the attr fork to go off the rails and return a NULL buffer +for an attr fork offset that is most likely not allocated. + +Note that this problem can be manufactured by setting +XFS_ATTR_BTREE_REF to 0 to prevent LRU caching of attr buffers, +creating a file with a multi-level attr fork and removing it to +trigger inactivation. + +To address this problem, reinit the node/btree pointers to the +parent buffer after it has been re-read. This ensures btree points +to a valid record and allows the walk to proceed. + +Signed-off-by: Brian Foster +Reviewed-by: Darrick J. Wong +Signed-off-by: Darrick J. 
Wong +Signed-off-by: Greg Kroah-Hartman + +--- + fs/xfs/xfs_attr_inactive.c | 2 ++ + 1 file changed, 2 insertions(+) + +--- a/fs/xfs/xfs_attr_inactive.c ++++ b/fs/xfs/xfs_attr_inactive.c +@@ -302,6 +302,8 @@ xfs_attr3_node_inactive( + &bp, XFS_ATTR_FORK); + if (error) + return error; ++ node = bp->b_addr; ++ btree = dp->d_ops->node_tree_p(node); + child_fsb = be32_to_cpu(btree[i + 1].before); + xfs_trans_brelse(*trans, bp); + } diff --git a/queue-4.9/xfs-report-zeroed-or-not-correctly-in-xfs_zero_range.patch b/queue-4.9/xfs-report-zeroed-or-not-correctly-in-xfs_zero_range.patch new file mode 100644 index 00000000000..e9774532bb0 --- /dev/null +++ b/queue-4.9/xfs-report-zeroed-or-not-correctly-in-xfs_zero_range.patch @@ -0,0 +1,36 @@ +From d20a5e3851969fa685f118a80e4df670255a4e8d Mon Sep 17 00:00:00 2001 +From: Eryu Guan +Date: Mon, 18 Sep 2017 11:39:23 -0700 +Subject: xfs: report zeroed or not correctly in xfs_zero_range() + +From: Eryu Guan + +commit d20a5e3851969fa685f118a80e4df670255a4e8d upstream. + +The 'did_zero' param of xfs_zero_range() was not passed to +iomap_zero_range() correctly. This was introduced by commit +7bb41db3ea16 ("xfs: handle 64-bit length in xfs_iozero"), and found +by code inspection. + +Signed-off-by: Eryu Guan +Reviewed-by: Carlos Maiolino +Reviewed-by: Christoph Hellwig +Reviewed-by: Darrick J. Wong +Signed-off-by: Darrick J. 
Wong +Signed-off-by: Greg Kroah-Hartman + +--- + fs/xfs/xfs_file.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/fs/xfs/xfs_file.c ++++ b/fs/xfs/xfs_file.c +@@ -92,7 +92,7 @@ xfs_zero_range( + xfs_off_t count, + bool *did_zero) + { +- return iomap_zero_range(VFS_I(ip), pos, count, NULL, &xfs_iomap_ops); ++ return iomap_zero_range(VFS_I(ip), pos, count, did_zero, &xfs_iomap_ops); + } + + int diff --git a/queue-4.9/xfs-trim-writepage-mapping-to-within-eof.patch b/queue-4.9/xfs-trim-writepage-mapping-to-within-eof.patch new file mode 100644 index 00000000000..f48e07aa504 --- /dev/null +++ b/queue-4.9/xfs-trim-writepage-mapping-to-within-eof.patch @@ -0,0 +1,119 @@ +From 40214d128e07dd21bb07a8ed6a7fe2f911281ab2 Mon Sep 17 00:00:00 2001 +From: Brian Foster +Date: Fri, 13 Oct 2017 09:47:46 -0700 +Subject: xfs: trim writepage mapping to within eof + +From: Brian Foster + +commit 40214d128e07dd21bb07a8ed6a7fe2f911281ab2 upstream. + +The writeback rework in commit fbcc02561359 ("xfs: Introduce +writeback context for writepages") introduced a subtle change in +behavior with regard to the block mapping used across the +->writepages() sequence. The previous xfs_cluster_write() code would +only flush pages up to EOF at the time of the writepage, thus +ensuring that any pages due to file-extending writes would be +handled on a separate cycle and with a new, updated block mapping. + +The updated code establishes a block mapping in xfs_writepage_map() +that could extend beyond EOF if the file has post-eof preallocation. +Because we now use the generic writeback infrastructure and pass the +cached mapping to each writepage call, there is no implicit EOF +limit in place. If eofblocks trimming occurs during ->writepages(), +any post-eof portion of the cached mapping becomes invalid. The +eofblocks code has no means to serialize against writeback because +there are no pages associated with post-eof blocks. 
Therefore if an +eofblocks trim occurs and is followed by a file-extending buffered +write, not only has the mapping become invalid, but we could end up +writing a page to disk based on the invalid mapping. + +Consider the following sequence of events: + +- A buffered write creates a delalloc extent and post-eof + speculative preallocation. +- Writeback starts and on the first writepage cycle, the delalloc + extent is converted to real blocks (including the post-eof blocks) + and the mapping is cached. +- The file is closed and xfs_release() trims post-eof blocks. The + cached writeback mapping is now invalid. +- Another buffered write appends the file with a delalloc extent. +- The concurrent writeback cycle picks up the just written page + because the writeback range end is LLONG_MAX. xfs_writepage_map() + attributes it to the (now invalid) cached mapping and writes the + data to an incorrect location on disk (and where the file offset is + still backed by a delalloc extent). + +This problem is reproduced by xfstests test generic/464, which +triggers racing writes, appends, open/closes and writeback requests. + +To address this problem, trim the mapping used during writeback to +within EOF when the mapping is validated. This ensures the mapping +is revalidated for any pages encountered beyond EOF as of the time +the current mapping was cached or last validated. + +Reported-by: Eryu Guan +Diagnosed-by: Eryu Guan +Signed-off-by: Brian Foster +Reviewed-by: Dave Chinner +Reviewed-by: Darrick J. Wong +Signed-off-by: Darrick J. 
Wong +Signed-off-by: Greg Kroah-Hartman + +--- + fs/xfs/libxfs/xfs_bmap.c | 11 +++++++++++ + fs/xfs/libxfs/xfs_bmap.h | 1 + + fs/xfs/xfs_aops.c | 13 +++++++++++++ + 3 files changed, 25 insertions(+) + +--- a/fs/xfs/libxfs/xfs_bmap.c ++++ b/fs/xfs/libxfs/xfs_bmap.c +@@ -4057,6 +4057,17 @@ xfs_trim_extent( + } + } + ++/* trim extent to within eof */ ++void ++xfs_trim_extent_eof( ++ struct xfs_bmbt_irec *irec, ++ struct xfs_inode *ip) ++ ++{ ++ xfs_trim_extent(irec, 0, XFS_B_TO_FSB(ip->i_mount, ++ i_size_read(VFS_I(ip)))); ++} ++ + /* + * Trim the returned map to the required bounds + */ +--- a/fs/xfs/libxfs/xfs_bmap.h ++++ b/fs/xfs/libxfs/xfs_bmap.h +@@ -196,6 +196,7 @@ void xfs_bmap_trace_exlist(struct xfs_in + + void xfs_trim_extent(struct xfs_bmbt_irec *irec, xfs_fileoff_t bno, + xfs_filblks_t len); ++void xfs_trim_extent_eof(struct xfs_bmbt_irec *, struct xfs_inode *); + int xfs_bmap_add_attrfork(struct xfs_inode *ip, int size, int rsvd); + void xfs_bmap_local_to_extents_empty(struct xfs_inode *ip, int whichfork); + void xfs_bmap_add_free(struct xfs_mount *mp, struct xfs_defer_ops *dfops, +--- a/fs/xfs/xfs_aops.c ++++ b/fs/xfs/xfs_aops.c +@@ -438,6 +438,19 @@ xfs_imap_valid( + { + offset >>= inode->i_blkbits; + ++ /* ++ * We have to make sure the cached mapping is within EOF to protect ++ * against eofblocks trimming on file release leaving us with a stale ++ * mapping. Otherwise, a page for a subsequent file extending buffered ++ * write could get picked up by this writeback cycle and written to the ++ * wrong blocks. ++ * ++ * Note that what we really want here is a generic mapping invalidation ++ * mechanism to protect us from arbitrary extent modifying contexts, not ++ * just eofblocks. 
++ */ ++ xfs_trim_extent_eof(imap, XFS_I(inode)); ++ + return offset >= imap->br_startoff && + offset < imap->br_startoff + imap->br_blockcount; + } diff --git a/queue-4.9/xfs-update-i_size-after-unwritten-conversion-in-dio-completion.patch b/queue-4.9/xfs-update-i_size-after-unwritten-conversion-in-dio-completion.patch new file mode 100644 index 00000000000..fc7d6e6f2aa --- /dev/null +++ b/queue-4.9/xfs-update-i_size-after-unwritten-conversion-in-dio-completion.patch @@ -0,0 +1,140 @@ +From ee70daaba82d70766d0723b743d9fdeb3b06102a Mon Sep 17 00:00:00 2001 +From: Eryu Guan +Date: Thu, 21 Sep 2017 11:26:18 -0700 +Subject: xfs: update i_size after unwritten conversion in dio completion + +From: Eryu Guan + +commit ee70daaba82d70766d0723b743d9fdeb3b06102a upstream. + +Since commit d531d91d6990 ("xfs: always use unwritten extents for +direct I/O writes"), we start allocating unwritten extents for all +direct writes to allow appending aio in XFS. + +But for dio writes that could extend file size we update the in-core +inode size first, then convert the unwritten extents to real +allocations at dio completion time in xfs_dio_write_end_io(). Thus a +racing direct read could see the new i_size and find the unwritten +extents first and read zeros instead of actual data, if the direct +writer also takes a shared iolock. + +Fix it by updating the in-core inode size after the unwritten extent +conversion. To do this, introduce a new boolean argument to +xfs_iomap_write_unwritten() to tell if we want to update in-core +i_size or not. + +Suggested-by: Brian Foster +Reviewed-by: Brian Foster +Signed-off-by: Eryu Guan +Reviewed-by: Darrick J. Wong +Signed-off-by: Darrick J. 
Wong +[hch: backported to the old direct I/O code before Linux 4.10] +Signed-off-by: Christoph Hellwig +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/xfs_aops.c | 25 +++++++++++++++++-------- + fs/xfs/xfs_iomap.c | 7 +++++-- + fs/xfs/xfs_iomap.h | 2 +- + fs/xfs/xfs_pnfs.c | 2 +- + 4 files changed, 24 insertions(+), 12 deletions(-) + +--- a/fs/xfs/xfs_aops.c ++++ b/fs/xfs/xfs_aops.c +@@ -335,7 +335,8 @@ xfs_end_io( + error = xfs_reflink_end_cow(ip, offset, size); + break; + case XFS_IO_UNWRITTEN: +- error = xfs_iomap_write_unwritten(ip, offset, size); ++ /* writeback should never update isize */ ++ error = xfs_iomap_write_unwritten(ip, offset, size, false); + break; + default: + ASSERT(!xfs_ioend_is_append(ioend) || ioend->io_append_trans); +@@ -1532,6 +1533,21 @@ xfs_end_io_direct_write( + return 0; + } + ++ if (flags & XFS_DIO_FLAG_COW) ++ error = xfs_reflink_end_cow(ip, offset, size); ++ ++ /* ++ * Unwritten conversion updates the in-core isize after extent ++ * conversion but before updating the on-disk size. Updating isize any ++ * earlier allows a racing dio read to find unwritten extents before ++ * they are converted. ++ */ ++ if (flags & XFS_DIO_FLAG_UNWRITTEN) { ++ trace_xfs_end_io_direct_write_unwritten(ip, offset, size); ++ ++ return xfs_iomap_write_unwritten(ip, offset, size, true); ++ } ++ + /* + * We need to update the in-core inode size here so that we don't end up + * with the on-disk inode size being outside the in-core inode size. 
We +@@ -1548,13 +1564,6 @@ xfs_end_io_direct_write( + i_size_write(inode, offset + size); + spin_unlock(&ip->i_flags_lock); + +- if (flags & XFS_DIO_FLAG_COW) +- error = xfs_reflink_end_cow(ip, offset, size); +- if (flags & XFS_DIO_FLAG_UNWRITTEN) { +- trace_xfs_end_io_direct_write_unwritten(ip, offset, size); +- +- error = xfs_iomap_write_unwritten(ip, offset, size); +- } + if (flags & XFS_DIO_FLAG_APPEND) { + trace_xfs_end_io_direct_write_append(ip, offset, size); + +--- a/fs/xfs/xfs_iomap.c ++++ b/fs/xfs/xfs_iomap.c +@@ -836,7 +836,8 @@ int + xfs_iomap_write_unwritten( + xfs_inode_t *ip, + xfs_off_t offset, +- xfs_off_t count) ++ xfs_off_t count, ++ bool update_isize) + { + xfs_mount_t *mp = ip->i_mount; + xfs_fileoff_t offset_fsb; +@@ -847,6 +848,7 @@ xfs_iomap_write_unwritten( + xfs_trans_t *tp; + xfs_bmbt_irec_t imap; + struct xfs_defer_ops dfops; ++ struct inode *inode = VFS_I(ip); + xfs_fsize_t i_size; + uint resblks; + int error; +@@ -906,7 +908,8 @@ xfs_iomap_write_unwritten( + i_size = XFS_FSB_TO_B(mp, offset_fsb + count_fsb); + if (i_size > offset + count) + i_size = offset + count; +- ++ if (update_isize && i_size > i_size_read(inode)) ++ i_size_write(inode, i_size); + i_size = xfs_new_eof(ip, i_size); + if (i_size) { + ip->i_d.di_size = i_size; +--- a/fs/xfs/xfs_iomap.h ++++ b/fs/xfs/xfs_iomap.h +@@ -27,7 +27,7 @@ int xfs_iomap_write_direct(struct xfs_in + struct xfs_bmbt_irec *, int); + int xfs_iomap_write_allocate(struct xfs_inode *, int, xfs_off_t, + struct xfs_bmbt_irec *); +-int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, xfs_off_t); ++int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, xfs_off_t, bool); + + void xfs_bmbt_to_iomap(struct xfs_inode *, struct iomap *, + struct xfs_bmbt_irec *); +--- a/fs/xfs/xfs_pnfs.c ++++ b/fs/xfs/xfs_pnfs.c +@@ -279,7 +279,7 @@ xfs_fs_commit_blocks( + (end - 1) >> PAGE_SHIFT); + WARN_ON_ONCE(error); + +- error = xfs_iomap_write_unwritten(ip, start, length); ++ error = 
xfs_iomap_write_unwritten(ip, start, length, false); + if (error) + goto out_drop_iolock; + }