From: Sasha Levin Date: Wed, 24 Jul 2019 00:35:28 +0000 (-0400) Subject: fixes for 4.19 X-Git-Tag: v5.2.3~20 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=13aeb4c34be0f1326a839b7ff8117b91a04fe59a;p=thirdparty%2Fkernel%2Fstable-queue.git fixes for 4.19 Signed-off-by: Sasha Levin --- diff --git a/queue-4.19/series b/queue-4.19/series index 4506eb49e0b..e6d18c151ff 100644 --- a/queue-4.19/series +++ b/queue-4.19/series @@ -240,3 +240,12 @@ libnvdimm-pfn-fix-fsdax-mode-namespace-info-block-zero-fields.patch coda-pass-the-host-file-in-vma-vm_file-on-mmap.patch include-asm-generic-bug.h-fix-cut-here-for-warn_on-for-__warn_taint-architectures.patch btrfs-correctly-validate-compression-type.patch +xfs-fix-pagecache-truncation-prior-to-reflink.patch +xfs-flush-removing-page-cache-in-xfs_reflink_remap_p.patch +xfs-don-t-overflow-xattr-listent-buffer.patch +xfs-rename-m_inotbt_nores-to-m_finobt_nores.patch +xfs-don-t-ever-put-nlink-0-inodes-on-the-unlinked-li.patch +xfs-reserve-blocks-for-ifree-transaction-during-log-.patch +xfs-fix-reporting-supported-extra-file-attributes-fo.patch +xfs-serialize-unaligned-dio-writes-against-all-other.patch +xfs-abort-unaligned-nowait-directio-early.patch diff --git a/queue-4.19/xfs-abort-unaligned-nowait-directio-early.patch b/queue-4.19/xfs-abort-unaligned-nowait-directio-early.patch new file mode 100644 index 00000000000..82aacfd71ef --- /dev/null +++ b/queue-4.19/xfs-abort-unaligned-nowait-directio-early.patch @@ -0,0 +1,52 @@ +From 3526c599d528f6918381c8b0a3e64a7736d036c5 Mon Sep 17 00:00:00 2001 +From: "Darrick J. Wong" +Date: Thu, 18 Jul 2019 23:06:17 +0000 +Subject: xfs: abort unaligned nowait directio early + +commit 1fdeaea4d92c69fb9f871a787af6ad00f32eeea7 upstream. + +Dave Chinner noticed that xfs_file_dio_aio_write returns EAGAIN without +dropping the IOLOCK when it's deciding not to wait, which means that we +leak the IOLOCK there. 
Since we now make unaligned directio always +wait, we have the opportunity to bail out before trying to take the +lock, which should reduce the overhead of this never-gonna-work case +considerably while also solving the dropped lock problem. + +Reported-by: Dave Chinner +Signed-off-by: Darrick J. Wong +Reviewed-by: Brian Foster +Reviewed-by: Dave Chinner +Reviewed-by: Christoph Hellwig +Signed-off-by: Luis Chamberlain +Signed-off-by: Sasha Levin +--- + fs/xfs/xfs_file.c | 6 +++--- + 1 file changed, 3 insertions(+), 3 deletions(-) + +diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c +index 10f75965243c..259549698ba7 100644 +--- a/fs/xfs/xfs_file.c ++++ b/fs/xfs/xfs_file.c +@@ -517,6 +517,9 @@ xfs_file_dio_aio_write( + } + + if (iocb->ki_flags & IOCB_NOWAIT) { ++ /* unaligned dio always waits, bail */ ++ if (unaligned_io) ++ return -EAGAIN; + if (!xfs_ilock_nowait(ip, iolock)) + return -EAGAIN; + } else { +@@ -536,9 +539,6 @@ xfs_file_dio_aio_write( + * xfs_file_aio_write_checks() for other reasons. + */ + if (unaligned_io) { +- /* unaligned dio always waits, bail */ +- if (iocb->ki_flags & IOCB_NOWAIT) +- return -EAGAIN; + inode_dio_wait(inode); + } else if (iolock == XFS_IOLOCK_EXCL) { + xfs_ilock_demote(ip, XFS_IOLOCK_EXCL); +-- +2.20.1 + diff --git a/queue-4.19/xfs-don-t-ever-put-nlink-0-inodes-on-the-unlinked-li.patch b/queue-4.19/xfs-don-t-ever-put-nlink-0-inodes-on-the-unlinked-li.patch new file mode 100644 index 00000000000..e582762876b --- /dev/null +++ b/queue-4.19/xfs-don-t-ever-put-nlink-0-inodes-on-the-unlinked-li.patch @@ -0,0 +1,112 @@ +From 9f96f052297fac5e16c781105f30bcc9bd1f0c9c Mon Sep 17 00:00:00 2001 +From: "Darrick J. Wong" +Date: Thu, 18 Jul 2019 23:06:13 +0000 +Subject: xfs: don't ever put nlink > 0 inodes on the unlinked list + +commit c4a6bf7f6cc7eb4cce120fb7eb1e1fb8b2d65e09 upstream. + +When XFS creates an O_TMPFILE file, the inode is created with nlink = 1, +put on the unlinked list, and then the VFS sets nlink = 0 in d_tmpfile. 
+If we crash before anything logs the inode (it's dirty incore but the +vfs doesn't tell us it's dirty so we never log that change), the iunlink +processing part of recovery will then explode with a pile of: + +XFS: Assertion failed: VFS_I(ip)->i_nlink == 0, file: +fs/xfs/xfs_log_recover.c, line: 5072 + +Worse yet, since nlink is nonzero, the inodes also don't get cleaned up +and they just leak until the next xfs_repair run. + +Therefore, change xfs_iunlink to require that inodes being put on the +unlinked list have nlink == 0, change the tmpfile callers to instantiate +nodes that way, and set the nlink to 1 just prior to calling d_tmpfile. +Fix the comment for xfs_iunlink while we're at it. + +Signed-off-by: Darrick J. Wong +Reviewed-by: Christoph Hellwig +Suggested-by: Amir Goldstein +Reviewed-by: Amir Goldstein +Signed-off-by: Luis Chamberlain +Signed-off-by: Sasha Levin +--- + fs/xfs/xfs_inode.c | 16 ++++++---------- + fs/xfs/xfs_iops.c | 13 +++++++++++-- + 2 files changed, 17 insertions(+), 12 deletions(-) + +diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c +index ae07baa7bdbf..5ed84d6c7059 100644 +--- a/fs/xfs/xfs_inode.c ++++ b/fs/xfs/xfs_inode.c +@@ -1332,7 +1332,7 @@ xfs_create_tmpfile( + if (error) + goto out_trans_cancel; + +- error = xfs_dir_ialloc(&tp, dp, mode, 1, 0, prid, &ip); ++ error = xfs_dir_ialloc(&tp, dp, mode, 0, 0, prid, &ip); + if (error) + goto out_trans_cancel; + +@@ -1907,11 +1907,8 @@ xfs_inactive( + } + + /* +- * This is called when the inode's link count goes to 0 or we are creating a +- * tmpfile via O_TMPFILE. In the case of a tmpfile, @ignore_linkcount will be +- * set to true as the link count is dropped to zero by the VFS after we've +- * created the file successfully, so we have to add it to the unlinked list +- * while the link count is non-zero. ++ * This is called when the inode's link count has gone to 0 or we are creating ++ * a tmpfile via O_TMPFILE. The inode @ip must have nlink == 0. 
+ * + * We place the on-disk inode on a list in the AGI. It will be pulled from this + * list when the inode is freed. +@@ -1931,6 +1928,7 @@ xfs_iunlink( + int offset; + int error; + ++ ASSERT(VFS_I(ip)->i_nlink == 0); + ASSERT(VFS_I(ip)->i_mode != 0); + + /* +@@ -2837,11 +2835,9 @@ xfs_rename_alloc_whiteout( + + /* + * Prepare the tmpfile inode as if it were created through the VFS. +- * Otherwise, the link increment paths will complain about nlink 0->1. +- * Drop the link count as done by d_tmpfile(), complete the inode setup +- * and flag it as linkable. ++ * Complete the inode setup and flag it as linkable. nlink is already ++ * zero, so we can skip the drop_nlink. + */ +- drop_nlink(VFS_I(tmpfile)); + xfs_setup_iops(tmpfile); + xfs_finish_inode_setup(tmpfile); + VFS_I(tmpfile)->i_state |= I_LINKABLE; +diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c +index f48ffd7a8d3e..1efef69a7f1c 100644 +--- a/fs/xfs/xfs_iops.c ++++ b/fs/xfs/xfs_iops.c +@@ -191,9 +191,18 @@ xfs_generic_create( + + xfs_setup_iops(ip); + +- if (tmpfile) ++ if (tmpfile) { ++ /* ++ * The VFS requires that any inode fed to d_tmpfile must have ++ * nlink == 1 so that it can decrement the nlink in d_tmpfile. ++ * However, we created the temp file with nlink == 0 because ++ * we're not allowed to put an inode with nlink > 0 on the ++ * unlinked list. Therefore we have to set nlink to 1 so that ++ * d_tmpfile can immediately set it back to zero. ++ */ ++ set_nlink(inode, 1); + d_tmpfile(dentry, inode); +- else ++ } else + d_instantiate(dentry, inode); + + xfs_finish_inode_setup(ip); +-- +2.20.1 + diff --git a/queue-4.19/xfs-don-t-overflow-xattr-listent-buffer.patch b/queue-4.19/xfs-don-t-overflow-xattr-listent-buffer.patch new file mode 100644 index 00000000000..4de5c9a5b8b --- /dev/null +++ b/queue-4.19/xfs-don-t-overflow-xattr-listent-buffer.patch @@ -0,0 +1,78 @@ +From d033080f9b71c2a30a9e4c5d6777bc7d18ad9d6f Mon Sep 17 00:00:00 2001 +From: "Darrick J. 
Wong" +Date: Thu, 18 Jul 2019 23:06:11 +0000 +Subject: xfs: don't overflow xattr listent buffer + +commit 3b50086f0c0d78c144d9483fa292c1509c931b70 upstream. + +For VFS listxattr calls, xfs_xattr_put_listent calls +__xfs_xattr_put_listent twice if it sees an attribute +"trusted.SGI_ACL_FILE": once for that name, and again for +"system.posix_acl_access". Unfortunately, if we happen to run out of +buffer space while emitting the first name, we set count to -1 (so that +we can feed ERANGE to the caller). The second invocation doesn't check that +the context parameters make sense and overwrites the byte before the +buffer, triggering a KASAN report: + +================================================================== +BUG: KASAN: slab-out-of-bounds in strncpy+0xb3/0xd0 +Write of size 1 at addr ffff88807fbd317f by task syz/1113 + +CPU: 3 PID: 1113 Comm: syz Not tainted 5.0.0-rc6-xfsx #rc6 +Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.10.2-1ubuntu1 04/01/2014 +Call Trace: + dump_stack+0xcc/0x180 + print_address_description+0x6c/0x23c + kasan_report.cold.3+0x1c/0x35 + strncpy+0xb3/0xd0 + __xfs_xattr_put_listent+0x1a9/0x2c0 [xfs] + xfs_attr_list_int_ilocked+0x11af/0x1800 [xfs] + xfs_attr_list_int+0x20c/0x2e0 [xfs] + xfs_vn_listxattr+0x225/0x320 [xfs] + listxattr+0x11f/0x1b0 + path_listxattr+0xbd/0x130 + do_syscall_64+0x139/0x560 + +While we're at it we add an assert to the other put_listent to avoid +this sort of thing ever happening to the attrlist_by_handle code. + +Signed-off-by: Darrick J. 
Wong +Reviewed-by: Christoph Hellwig +Suggested-by: Amir Goldstein +Reviewed-by: Amir Goldstein +Signed-off-by: Luis Chamberlain +Signed-off-by: Sasha Levin +--- + fs/xfs/xfs_attr_list.c | 1 + + fs/xfs/xfs_xattr.c | 3 +++ + 2 files changed, 4 insertions(+) + +diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c +index a58034049995..3d213a7394c5 100644 +--- a/fs/xfs/xfs_attr_list.c ++++ b/fs/xfs/xfs_attr_list.c +@@ -555,6 +555,7 @@ xfs_attr_put_listent( + attrlist_ent_t *aep; + int arraytop; + ++ ASSERT(!context->seen_enough); + ASSERT(!(context->flags & ATTR_KERNOVAL)); + ASSERT(context->count >= 0); + ASSERT(context->count < (ATTR_MAX_VALUELEN/8)); +diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c +index 63ee1d5bf1d7..9a63016009a1 100644 +--- a/fs/xfs/xfs_xattr.c ++++ b/fs/xfs/xfs_xattr.c +@@ -129,6 +129,9 @@ __xfs_xattr_put_listent( + char *offset; + int arraytop; + ++ if (context->count < 0 || context->seen_enough) ++ return; ++ + if (!context->alist) + goto compute_size; + +-- +2.20.1 + diff --git a/queue-4.19/xfs-fix-pagecache-truncation-prior-to-reflink.patch b/queue-4.19/xfs-fix-pagecache-truncation-prior-to-reflink.patch new file mode 100644 index 00000000000..d24fc802097 --- /dev/null +++ b/queue-4.19/xfs-fix-pagecache-truncation-prior-to-reflink.patch @@ -0,0 +1,43 @@ +From 8cde848ea596034b2b9f72227fcfc3b70b793567 Mon Sep 17 00:00:00 2001 +From: "Darrick J. Wong" +Date: Thu, 18 Jul 2019 23:06:09 +0000 +Subject: xfs: fix pagecache truncation prior to reflink + +commit 4918ef4ea008cd2ff47eb852894e3f9b9047f4f3 upstream. + +Prior to remapping blocks, it is necessary to remove pages from the +destination file's page cache. Unfortunately, the truncation is not +aggressive enough -- if page size > block size, we'll end up zeroing +subpage blocks instead of removing them. So, round the start offset +down and the end offset up to page boundaries. We already wrote all +the dirty data so the larger range shouldn't be a problem. 
+ +Signed-off-by: Darrick J. Wong +Reviewed-by: Dave Chinner +Reviewed-by: Christoph Hellwig +Signed-off-by: Dave Chinner +Signed-off-by: Luis Chamberlain +Signed-off-by: Sasha Levin +--- + fs/xfs/xfs_reflink.c | 5 +++-- + 1 file changed, 3 insertions(+), 2 deletions(-) + +diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c +index 7088f44c0c59..38ea08a3dd1d 100644 +--- a/fs/xfs/xfs_reflink.c ++++ b/fs/xfs/xfs_reflink.c +@@ -1369,8 +1369,9 @@ xfs_reflink_remap_prep( + goto out_unlock; + + /* Zap any page cache for the destination file's range. */ +- truncate_inode_pages_range(&inode_out->i_data, pos_out, +- PAGE_ALIGN(pos_out + *len) - 1); ++ truncate_inode_pages_range(&inode_out->i_data, ++ round_down(pos_out, PAGE_SIZE), ++ round_up(pos_out + *len, PAGE_SIZE) - 1); + + /* If we're altering the file contents... */ + if (!is_dedupe) { +-- +2.20.1 + diff --git a/queue-4.19/xfs-fix-reporting-supported-extra-file-attributes-fo.patch b/queue-4.19/xfs-fix-reporting-supported-extra-file-attributes-fo.patch new file mode 100644 index 00000000000..ce93305a05b --- /dev/null +++ b/queue-4.19/xfs-fix-reporting-supported-extra-file-attributes-fo.patch @@ -0,0 +1,56 @@ +From 5f1b38140a9c4b7260f0e720509cba82b2c90bf2 Mon Sep 17 00:00:00 2001 +From: "Luis R. Rodriguez" +Date: Thu, 18 Jul 2019 23:06:15 +0000 +Subject: xfs: fix reporting supported extra file attributes for statx() + +commit 1b9598c8fb9965fff901c4caa21fed9644c34df3 upstream. + +statx(2) notes that any attribute that is not indicated as supported by +stx_attributes_mask has no usable value. Commit 5f955f26f3d42d ("xfs: report +crtime and attribute flags to statx") added support for informing userspace +of extra file attributes but forgot to list these flags as supported +making reporting them rather useless for the pedantic userspace author. 
+ +$ git describe --contains 5f955f26f3d42d04aba65590a32eb70eedb7f37d +v4.11-rc6~5^2^2~2 + +Fixes: 5f955f26f3d42d ("xfs: report crtime and attribute flags to statx") +Signed-off-by: Luis R. Rodriguez +Reviewed-by: Darrick J. Wong +[darrick: add a comment reminding people to keep attributes_mask up to date] +Signed-off-by: Darrick J. Wong +Signed-off-by: Luis Chamberlain +Signed-off-by: Sasha Levin +--- + fs/xfs/xfs_iops.c | 8 ++++++++ + 1 file changed, 8 insertions(+) + +diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c +index 1efef69a7f1c..74047bd0c1ae 100644 +--- a/fs/xfs/xfs_iops.c ++++ b/fs/xfs/xfs_iops.c +@@ -531,6 +531,10 @@ xfs_vn_getattr( + } + } + ++ /* ++ * Note: If you add another clause to set an attribute flag, please ++ * update attributes_mask below. ++ */ + if (ip->i_d.di_flags & XFS_DIFLAG_IMMUTABLE) + stat->attributes |= STATX_ATTR_IMMUTABLE; + if (ip->i_d.di_flags & XFS_DIFLAG_APPEND) +@@ -538,6 +542,10 @@ xfs_vn_getattr( + if (ip->i_d.di_flags & XFS_DIFLAG_NODUMP) + stat->attributes |= STATX_ATTR_NODUMP; + ++ stat->attributes_mask |= (STATX_ATTR_IMMUTABLE | ++ STATX_ATTR_APPEND | ++ STATX_ATTR_NODUMP); ++ + switch (inode->i_mode & S_IFMT) { + case S_IFBLK: + case S_IFCHR: +-- +2.20.1 + diff --git a/queue-4.19/xfs-flush-removing-page-cache-in-xfs_reflink_remap_p.patch b/queue-4.19/xfs-flush-removing-page-cache-in-xfs_reflink_remap_p.patch new file mode 100644 index 00000000000..5070ae8d75a --- /dev/null +++ b/queue-4.19/xfs-flush-removing-page-cache-in-xfs_reflink_remap_p.patch @@ -0,0 +1,102 @@ +From be409aea9de66a543d0ce43418dc0a8164cd0141 Mon Sep 17 00:00:00 2001 +From: Dave Chinner +Date: Thu, 18 Jul 2019 23:06:10 +0000 +Subject: xfs: flush removing page cache in xfs_reflink_remap_prep + +commit 2c307174ab77e34645e75e12827646e044d273c3 upstream. 
+ +On a sub-page block size filesystem, fsx is failing with a data +corruption after a series of operations involving copying a file +with the destination offset beyond EOF of the destination of the file: + +8093(157 mod 256): TRUNCATE DOWN from 0x7a120 to 0x50000 ******WWWW +8094(158 mod 256): INSERT 0x25000 thru 0x25fff (0x1000 bytes) +8095(159 mod 256): COPY 0x18000 thru 0x1afff (0x3000 bytes) to 0x2f400 +8096(160 mod 256): WRITE 0x5da00 thru 0x651ff (0x7800 bytes) HOLE +8097(161 mod 256): COPY 0x2000 thru 0x5fff (0x4000 bytes) to 0x6fc00 + +The second copy here is beyond EOF, and it is to sub-page (4k) but +block aligned (1k) offset. The clone runs the EOF zeroing, landing +in a pre-existing post-eof delalloc extent. This zeroes the post-eof +extents in the page cache just fine, dirtying the pages correctly. + +The problem is that xfs_reflink_remap_prep() now truncates the page +cache over the range that it is copying it to, and rounds that down +to cover the entire start page. This removes the dirty page over the +delalloc extent from the page cache without having written it back. +Hence later, when the page cache is flushed, the page at offset +0x6f000 has not been written back and hence exposes stale data, +which fsx trips over less than 10 operations later. + +Fix this by changing xfs_reflink_remap_prep() to use +xfs_flush_unmap_range(). + +Signed-off-by: Dave Chinner +Reviewed-by: Christoph Hellwig +Reviewed-by: Darrick J. Wong +Signed-off-by: Darrick J. 
Wong +Signed-off-by: Luis Chamberlain +Signed-off-by: Sasha Levin +--- + fs/xfs/xfs_bmap_util.c | 2 +- + fs/xfs/xfs_bmap_util.h | 2 ++ + fs/xfs/xfs_reflink.c | 17 +++++++++++++---- + 3 files changed, 16 insertions(+), 5 deletions(-) + +diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c +index 211b06e4702e..41ad9eaab6ce 100644 +--- a/fs/xfs/xfs_bmap_util.c ++++ b/fs/xfs/xfs_bmap_util.c +@@ -1080,7 +1080,7 @@ xfs_adjust_extent_unmap_boundaries( + return 0; + } + +-static int ++int + xfs_flush_unmap_range( + struct xfs_inode *ip, + xfs_off_t offset, +diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h +index 87363d136bb6..9c73d012f56a 100644 +--- a/fs/xfs/xfs_bmap_util.h ++++ b/fs/xfs/xfs_bmap_util.h +@@ -76,6 +76,8 @@ int xfs_swap_extents(struct xfs_inode *ip, struct xfs_inode *tip, + xfs_daddr_t xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb); + + xfs_extnum_t xfs_bmap_count_leaves(struct xfs_ifork *ifp, xfs_filblks_t *count); ++int xfs_flush_unmap_range(struct xfs_inode *ip, xfs_off_t offset, ++ xfs_off_t len); + int xfs_bmap_count_blocks(struct xfs_trans *tp, struct xfs_inode *ip, + int whichfork, xfs_extnum_t *nextents, + xfs_filblks_t *count); +diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c +index 38ea08a3dd1d..f3c393f309e1 100644 +--- a/fs/xfs/xfs_reflink.c ++++ b/fs/xfs/xfs_reflink.c +@@ -1368,10 +1368,19 @@ xfs_reflink_remap_prep( + if (ret) + goto out_unlock; + +- /* Zap any page cache for the destination file's range. */ +- truncate_inode_pages_range(&inode_out->i_data, +- round_down(pos_out, PAGE_SIZE), +- round_up(pos_out + *len, PAGE_SIZE) - 1); ++ /* ++ * If pos_out > EOF, we may have dirtied blocks between EOF and ++ * pos_out. In that case, we need to extend the flush and unmap to cover ++ * from EOF to the end of the copy length. 
++ */ ++ if (pos_out > XFS_ISIZE(dest)) { ++ loff_t flen = *len + (pos_out - XFS_ISIZE(dest)); ++ ret = xfs_flush_unmap_range(dest, XFS_ISIZE(dest), flen); ++ } else { ++ ret = xfs_flush_unmap_range(dest, pos_out, *len); ++ } ++ if (ret) ++ goto out_unlock; + + /* If we're altering the file contents... */ + if (!is_dedupe) { +-- +2.20.1 + diff --git a/queue-4.19/xfs-rename-m_inotbt_nores-to-m_finobt_nores.patch b/queue-4.19/xfs-rename-m_inotbt_nores-to-m_finobt_nores.patch new file mode 100644 index 00000000000..2d42d70f8d4 --- /dev/null +++ b/queue-4.19/xfs-rename-m_inotbt_nores-to-m_finobt_nores.patch @@ -0,0 +1,88 @@ +From 734afb4260f4207ee1dae99812eb66bf43d3d375 Mon Sep 17 00:00:00 2001 +From: "Darrick J. Wong" +Date: Thu, 18 Jul 2019 23:06:12 +0000 +Subject: xfs: rename m_inotbt_nores to m_finobt_nores + +commit e1f6ca11381588e3ef138c10de60eeb34cb8466a upstream. + +Rename this flag variable to imply more strongly that it's related to +the free inode btree (finobt) operation. No functional changes. + +Signed-off-by: Darrick J. 
Wong +Reviewed-by: Christoph Hellwig +Reviewed-by: Dave Chinner +Suggested-by: Amir Goldstein +Reviewed-by: Amir Goldstein +Signed-off-by: Luis Chamberlain +Signed-off-by: Sasha Levin +--- + fs/xfs/libxfs/xfs_ag_resv.c | 2 +- + fs/xfs/libxfs/xfs_ialloc_btree.c | 4 ++-- + fs/xfs/xfs_inode.c | 2 +- + fs/xfs/xfs_mount.h | 2 +- + 4 files changed, 5 insertions(+), 5 deletions(-) + +diff --git a/fs/xfs/libxfs/xfs_ag_resv.c b/fs/xfs/libxfs/xfs_ag_resv.c +index e701ebc36c06..e2ba2a3b63b2 100644 +--- a/fs/xfs/libxfs/xfs_ag_resv.c ++++ b/fs/xfs/libxfs/xfs_ag_resv.c +@@ -281,7 +281,7 @@ xfs_ag_resv_init( + */ + ask = used = 0; + +- mp->m_inotbt_nores = true; ++ mp->m_finobt_nores = true; + + error = xfs_refcountbt_calc_reserves(mp, tp, agno, &ask, + &used); +diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c +index 86c50208a143..adb2f6df5a11 100644 +--- a/fs/xfs/libxfs/xfs_ialloc_btree.c ++++ b/fs/xfs/libxfs/xfs_ialloc_btree.c +@@ -124,7 +124,7 @@ xfs_finobt_alloc_block( + union xfs_btree_ptr *new, + int *stat) + { +- if (cur->bc_mp->m_inotbt_nores) ++ if (cur->bc_mp->m_finobt_nores) + return xfs_inobt_alloc_block(cur, start, new, stat); + return __xfs_inobt_alloc_block(cur, start, new, stat, + XFS_AG_RESV_METADATA); +@@ -157,7 +157,7 @@ xfs_finobt_free_block( + struct xfs_btree_cur *cur, + struct xfs_buf *bp) + { +- if (cur->bc_mp->m_inotbt_nores) ++ if (cur->bc_mp->m_finobt_nores) + return xfs_inobt_free_block(cur, bp); + return __xfs_inobt_free_block(cur, bp, XFS_AG_RESV_METADATA); + } +diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c +index 05db9540e459..ae07baa7bdbf 100644 +--- a/fs/xfs/xfs_inode.c ++++ b/fs/xfs/xfs_inode.c +@@ -1754,7 +1754,7 @@ xfs_inactive_ifree( + * now remains allocated and sits on the unlinked list until the fs is + * repaired. 
+ */ +- if (unlikely(mp->m_inotbt_nores)) { ++ if (unlikely(mp->m_finobt_nores)) { + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ifree, + XFS_IFREE_SPACE_RES(mp), 0, XFS_TRANS_RESERVE, + &tp); +diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h +index 7964513c3128..7e0bf952e087 100644 +--- a/fs/xfs/xfs_mount.h ++++ b/fs/xfs/xfs_mount.h +@@ -127,7 +127,7 @@ typedef struct xfs_mount { + struct mutex m_growlock; /* growfs mutex */ + int m_fixedfsid[2]; /* unchanged for life of FS */ + uint64_t m_flags; /* global mount flags */ +- bool m_inotbt_nores; /* no per-AG finobt resv. */ ++ bool m_finobt_nores; /* no per-AG finobt resv. */ + int m_ialloc_inos; /* inodes in inode allocation */ + int m_ialloc_blks; /* blocks in inode allocation */ + int m_ialloc_min_blks;/* min blocks in sparse inode +-- +2.20.1 + diff --git a/queue-4.19/xfs-reserve-blocks-for-ifree-transaction-during-log-.patch b/queue-4.19/xfs-reserve-blocks-for-ifree-transaction-during-log-.patch new file mode 100644 index 00000000000..2cf67073c21 --- /dev/null +++ b/queue-4.19/xfs-reserve-blocks-for-ifree-transaction-during-log-.patch @@ -0,0 +1,63 @@ +From a47a5308bd902ccef6a601c439c4529f3743d11e Mon Sep 17 00:00:00 2001 +From: "Darrick J. Wong" +Date: Thu, 18 Jul 2019 23:06:14 +0000 +Subject: xfs: reserve blocks for ifree transaction during log recovery + +commit 15a268d9f263ed3a0601a1296568241a5a3da7aa upstream. + +Log recovery frees all the inodes stored in the unlinked list, which can +cause expansion of the free inode btree. The ifree code skips block +reservations if it thinks there's a per-AG space reservation, but we +don't set up the reservation until after log recovery, which means that +a finobt expansion blows up in xfs_trans_mod_sb when we exceed the +transaction's block reservation. + +To fix this, we set the "no finobt reservation" flag to true when we +create the xfs_mount and only set it to false if we confirm that every +AG had enough free space to put aside for the finobt. 
+ +Signed-off-by: Darrick J. Wong +Reviewed-by: Christoph Hellwig +Reviewed-by: Dave Chinner +Suggested-by: Amir Goldstein +Reviewed-by: Amir Goldstein +Signed-off-by: Luis Chamberlain +Signed-off-by: Sasha Levin +--- + fs/xfs/xfs_fsops.c | 1 + + fs/xfs/xfs_super.c | 7 +++++++ + 2 files changed, 8 insertions(+) + +diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c +index 7c00b8bedfe3..09fd602507ef 100644 +--- a/fs/xfs/xfs_fsops.c ++++ b/fs/xfs/xfs_fsops.c +@@ -534,6 +534,7 @@ xfs_fs_reserve_ag_blocks( + int error = 0; + int err2; + ++ mp->m_finobt_nores = false; + for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) { + pag = xfs_perag_get(mp, agno); + err2 = xfs_ag_resv_init(pag, NULL); +diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c +index 207ee302b1bb..dce8114e3198 100644 +--- a/fs/xfs/xfs_super.c ++++ b/fs/xfs/xfs_super.c +@@ -1561,6 +1561,13 @@ xfs_mount_alloc( + INIT_DELAYED_WORK(&mp->m_eofblocks_work, xfs_eofblocks_worker); + INIT_DELAYED_WORK(&mp->m_cowblocks_work, xfs_cowblocks_worker); + mp->m_kobj.kobject.kset = xfs_kset; ++ /* ++ * We don't create the finobt per-ag space reservation until after log ++ * recovery, so we must set this to true so that an ifree transaction ++ * started during log recovery will not depend on space reservations ++ * for finobt expansion. ++ */ ++ mp->m_finobt_nores = true; + return mp; + } + +-- +2.20.1 + diff --git a/queue-4.19/xfs-serialize-unaligned-dio-writes-against-all-other.patch b/queue-4.19/xfs-serialize-unaligned-dio-writes-against-all-other.patch new file mode 100644 index 00000000000..d5cbf952f97 --- /dev/null +++ b/queue-4.19/xfs-serialize-unaligned-dio-writes-against-all-other.patch @@ -0,0 +1,92 @@ +From 877391011e228577a3c413111983ae1286e4106b Mon Sep 17 00:00:00 2001 +From: Brian Foster +Date: Thu, 18 Jul 2019 23:06:16 +0000 +Subject: xfs: serialize unaligned dio writes against all other dio writes + +commit 2032a8a27b5cc0f578d37fa16fa2494b80a0d00a upstream. 
+ +XFS applies more strict serialization constraints to unaligned +direct writes to accommodate things like direct I/O layer zeroing, +unwritten extent conversion, etc. Unaligned submissions acquire the +exclusive iolock and wait for in-flight dio to complete to ensure +multiple submissions do not race on the same block and cause data +corruption. + +This generally works in the case of an aligned dio followed by an +unaligned dio, but the serialization is lost if I/Os occur in the +opposite order. If an unaligned write is submitted first and +immediately followed by an overlapping, aligned write, the latter +submits without the typical unaligned serialization barriers because +there is no indication of an unaligned dio still in-flight. This can +lead to unpredictable results. + +To provide proper unaligned dio serialization, require that such +direct writes are always the only dio allowed in-flight at one time +for a particular inode. We already acquire the exclusive iolock and +drain pending dio before submitting the unaligned dio. Wait once +more after the dio submission to hold the iolock across the I/O and +prevent further submissions until the unaligned I/O completes. This +is heavy handed, but consistent with the current pre-submission +serialization for unaligned direct writes. + +Signed-off-by: Brian Foster +Reviewed-by: Allison Henderson +Reviewed-by: Dave Chinner +Reviewed-by: Darrick J. Wong +Signed-off-by: Darrick J. 
Wong +Signed-off-by: Luis Chamberlain +Signed-off-by: Sasha Levin +--- + fs/xfs/xfs_file.c | 27 +++++++++++++++++---------- + 1 file changed, 17 insertions(+), 10 deletions(-) + +diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c +index 61a5ad2600e8..10f75965243c 100644 +--- a/fs/xfs/xfs_file.c ++++ b/fs/xfs/xfs_file.c +@@ -529,18 +529,17 @@ xfs_file_dio_aio_write( + count = iov_iter_count(from); + + /* +- * If we are doing unaligned IO, wait for all other IO to drain, +- * otherwise demote the lock if we had to take the exclusive lock +- * for other reasons in xfs_file_aio_write_checks. ++ * If we are doing unaligned IO, we can't allow any other overlapping IO ++ * in-flight at the same time or we risk data corruption. Wait for all ++ * other IO to drain before we submit. If the IO is aligned, demote the ++ * iolock if we had to take the exclusive lock in ++ * xfs_file_aio_write_checks() for other reasons. + */ + if (unaligned_io) { +- /* If we are going to wait for other DIO to finish, bail */ +- if (iocb->ki_flags & IOCB_NOWAIT) { +- if (atomic_read(&inode->i_dio_count)) +- return -EAGAIN; +- } else { +- inode_dio_wait(inode); +- } ++ /* unaligned dio always waits, bail */ ++ if (iocb->ki_flags & IOCB_NOWAIT) ++ return -EAGAIN; ++ inode_dio_wait(inode); + } else if (iolock == XFS_IOLOCK_EXCL) { + xfs_ilock_demote(ip, XFS_IOLOCK_EXCL); + iolock = XFS_IOLOCK_SHARED; +@@ -548,6 +547,14 @@ xfs_file_dio_aio_write( + + trace_xfs_file_direct_write(ip, count, iocb->ki_pos); + ret = iomap_dio_rw(iocb, from, &xfs_iomap_ops, xfs_dio_write_end_io); ++ ++ /* ++ * If unaligned, this is the only IO in-flight. If it has not yet ++ * completed, wait on it before we release the iolock to prevent ++ * subsequent overlapping IO. ++ */ ++ if (ret == -EIOCBQUEUED && unaligned_io) ++ inode_dio_wait(inode); + out: + xfs_iunlock(ip, iolock); + +-- +2.20.1 +