git.ipfire.org Git - thirdparty/kernel/stable-queue.git/commitdiff
fixes for 4.19
author Sasha Levin <sashal@kernel.org>
Wed, 24 Jul 2019 00:35:28 +0000 (20:35 -0400)
committer Sasha Levin <sashal@kernel.org>
Wed, 24 Jul 2019 00:35:28 +0000 (20:35 -0400)
Signed-off-by: Sasha Levin <sashal@kernel.org>
queue-4.19/series
queue-4.19/xfs-abort-unaligned-nowait-directio-early.patch [new file with mode: 0644]
queue-4.19/xfs-don-t-ever-put-nlink-0-inodes-on-the-unlinked-li.patch [new file with mode: 0644]
queue-4.19/xfs-don-t-overflow-xattr-listent-buffer.patch [new file with mode: 0644]
queue-4.19/xfs-fix-pagecache-truncation-prior-to-reflink.patch [new file with mode: 0644]
queue-4.19/xfs-fix-reporting-supported-extra-file-attributes-fo.patch [new file with mode: 0644]
queue-4.19/xfs-flush-removing-page-cache-in-xfs_reflink_remap_p.patch [new file with mode: 0644]
queue-4.19/xfs-rename-m_inotbt_nores-to-m_finobt_nores.patch [new file with mode: 0644]
queue-4.19/xfs-reserve-blocks-for-ifree-transaction-during-log-.patch [new file with mode: 0644]
queue-4.19/xfs-serialize-unaligned-dio-writes-against-all-other.patch [new file with mode: 0644]

index 4506eb49e0b3d593e6473d4c1ee3a1a5d1a0f033..e6d18c151ffbbeb4e1496293e9809d9be93948dc 100644 (file)
@@ -240,3 +240,12 @@ libnvdimm-pfn-fix-fsdax-mode-namespace-info-block-zero-fields.patch
 coda-pass-the-host-file-in-vma-vm_file-on-mmap.patch
 include-asm-generic-bug.h-fix-cut-here-for-warn_on-for-__warn_taint-architectures.patch
 btrfs-correctly-validate-compression-type.patch
+xfs-fix-pagecache-truncation-prior-to-reflink.patch
+xfs-flush-removing-page-cache-in-xfs_reflink_remap_p.patch
+xfs-don-t-overflow-xattr-listent-buffer.patch
+xfs-rename-m_inotbt_nores-to-m_finobt_nores.patch
+xfs-don-t-ever-put-nlink-0-inodes-on-the-unlinked-li.patch
+xfs-reserve-blocks-for-ifree-transaction-during-log-.patch
+xfs-fix-reporting-supported-extra-file-attributes-fo.patch
+xfs-serialize-unaligned-dio-writes-against-all-other.patch
+xfs-abort-unaligned-nowait-directio-early.patch
diff --git a/queue-4.19/xfs-abort-unaligned-nowait-directio-early.patch b/queue-4.19/xfs-abort-unaligned-nowait-directio-early.patch
new file mode 100644 (file)
index 0000000..82aacfd
--- /dev/null
@@ -0,0 +1,52 @@
+From 3526c599d528f6918381c8b0a3e64a7736d036c5 Mon Sep 17 00:00:00 2001
+From: "Darrick J. Wong" <darrick.wong@oracle.com>
+Date: Thu, 18 Jul 2019 23:06:17 +0000
+Subject: xfs: abort unaligned nowait directio early
+
+commit 1fdeaea4d92c69fb9f871a787af6ad00f32eeea7 upstream.
+
+Dave Chinner noticed that xfs_file_dio_aio_write returns EAGAIN without
+dropping the IOLOCK when it's deciding not to wait, which means that we
+leak the IOLOCK there.  Since we now make unaligned directio always
+wait, we have the opportunity to bail out before trying to take the
+lock, which should reduce the overhead of this never-gonna-work case
+considerably while also solving the dropped lock problem.
+
+Reported-by: Dave Chinner <david@fromorbit.com>
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Reviewed-by: Brian Foster <bfoster@redhat.com>
+Reviewed-by: Dave Chinner <dchinner@redhat.com>
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/xfs/xfs_file.c | 6 +++---
+ 1 file changed, 3 insertions(+), 3 deletions(-)
+
+diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
+index 10f75965243c..259549698ba7 100644
+--- a/fs/xfs/xfs_file.c
++++ b/fs/xfs/xfs_file.c
+@@ -517,6 +517,9 @@ xfs_file_dio_aio_write(
+       }
+       if (iocb->ki_flags & IOCB_NOWAIT) {
++              /* unaligned dio always waits, bail */
++              if (unaligned_io)
++                      return -EAGAIN;
+               if (!xfs_ilock_nowait(ip, iolock))
+                       return -EAGAIN;
+       } else {
+@@ -536,9 +539,6 @@ xfs_file_dio_aio_write(
+        * xfs_file_aio_write_checks() for other reasons.
+        */
+       if (unaligned_io) {
+-              /* unaligned dio always waits, bail */
+-              if (iocb->ki_flags & IOCB_NOWAIT)
+-                      return -EAGAIN;
+               inode_dio_wait(inode);
+       } else if (iolock == XFS_IOLOCK_EXCL) {
+               xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
+-- 
+2.20.1
+
diff --git a/queue-4.19/xfs-don-t-ever-put-nlink-0-inodes-on-the-unlinked-li.patch b/queue-4.19/xfs-don-t-ever-put-nlink-0-inodes-on-the-unlinked-li.patch
new file mode 100644 (file)
index 0000000..e582762
--- /dev/null
@@ -0,0 +1,112 @@
+From 9f96f052297fac5e16c781105f30bcc9bd1f0c9c Mon Sep 17 00:00:00 2001
+From: "Darrick J. Wong" <darrick.wong@oracle.com>
+Date: Thu, 18 Jul 2019 23:06:13 +0000
+Subject: xfs: don't ever put nlink > 0 inodes on the unlinked list
+
+commit c4a6bf7f6cc7eb4cce120fb7eb1e1fb8b2d65e09 upstream.
+
+When XFS creates an O_TMPFILE file, the inode is created with nlink = 1,
+put on the unlinked list, and then the VFS sets nlink = 0 in d_tmpfile.
+If we crash before anything logs the inode (it's dirty incore but the
+vfs doesn't tell us it's dirty so we never log that change), the iunlink
+processing part of recovery will then explode with a pile of:
+
+XFS: Assertion failed: VFS_I(ip)->i_nlink == 0, file:
+fs/xfs/xfs_log_recover.c, line: 5072
+
+Worse yet, since nlink is nonzero, the inodes also don't get cleaned up
+and they just leak until the next xfs_repair run.
+
+Therefore, change xfs_iunlink to require that inodes being put on the
+unlinked list have nlink == 0, change the tmpfile callers to instantiate
+inodes that way, and set the nlink to 1 just prior to calling d_tmpfile.
+Fix the comment for xfs_iunlink while we're at it.
+
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Suggested-by: Amir Goldstein <amir73il@gmail.com>
+Reviewed-by: Amir Goldstein <amir73il@gmail.com>
+Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/xfs/xfs_inode.c | 16 ++++++----------
+ fs/xfs/xfs_iops.c  | 13 +++++++++++--
+ 2 files changed, 17 insertions(+), 12 deletions(-)
+
+diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
+index ae07baa7bdbf..5ed84d6c7059 100644
+--- a/fs/xfs/xfs_inode.c
++++ b/fs/xfs/xfs_inode.c
+@@ -1332,7 +1332,7 @@ xfs_create_tmpfile(
+       if (error)
+               goto out_trans_cancel;
+-      error = xfs_dir_ialloc(&tp, dp, mode, 1, 0, prid, &ip);
++      error = xfs_dir_ialloc(&tp, dp, mode, 0, 0, prid, &ip);
+       if (error)
+               goto out_trans_cancel;
+@@ -1907,11 +1907,8 @@ xfs_inactive(
+ }
+ /*
+- * This is called when the inode's link count goes to 0 or we are creating a
+- * tmpfile via O_TMPFILE. In the case of a tmpfile, @ignore_linkcount will be
+- * set to true as the link count is dropped to zero by the VFS after we've
+- * created the file successfully, so we have to add it to the unlinked list
+- * while the link count is non-zero.
++ * This is called when the inode's link count has gone to 0 or we are creating
++ * a tmpfile via O_TMPFILE.  The inode @ip must have nlink == 0.
+  *
+  * We place the on-disk inode on a list in the AGI.  It will be pulled from this
+  * list when the inode is freed.
+@@ -1931,6 +1928,7 @@ xfs_iunlink(
+       int             offset;
+       int             error;
++      ASSERT(VFS_I(ip)->i_nlink == 0);
+       ASSERT(VFS_I(ip)->i_mode != 0);
+       /*
+@@ -2837,11 +2835,9 @@ xfs_rename_alloc_whiteout(
+       /*
+        * Prepare the tmpfile inode as if it were created through the VFS.
+-       * Otherwise, the link increment paths will complain about nlink 0->1.
+-       * Drop the link count as done by d_tmpfile(), complete the inode setup
+-       * and flag it as linkable.
++       * Complete the inode setup and flag it as linkable.  nlink is already
++       * zero, so we can skip the drop_nlink.
+        */
+-      drop_nlink(VFS_I(tmpfile));
+       xfs_setup_iops(tmpfile);
+       xfs_finish_inode_setup(tmpfile);
+       VFS_I(tmpfile)->i_state |= I_LINKABLE;
+diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
+index f48ffd7a8d3e..1efef69a7f1c 100644
+--- a/fs/xfs/xfs_iops.c
++++ b/fs/xfs/xfs_iops.c
+@@ -191,9 +191,18 @@ xfs_generic_create(
+       xfs_setup_iops(ip);
+-      if (tmpfile)
++      if (tmpfile) {
++              /*
++               * The VFS requires that any inode fed to d_tmpfile must have
++               * nlink == 1 so that it can decrement the nlink in d_tmpfile.
++               * However, we created the temp file with nlink == 0 because
++               * we're not allowed to put an inode with nlink > 0 on the
++               * unlinked list.  Therefore we have to set nlink to 1 so that
++               * d_tmpfile can immediately set it back to zero.
++               */
++              set_nlink(inode, 1);
+               d_tmpfile(dentry, inode);
+-      else
++      } else
+               d_instantiate(dentry, inode);
+       xfs_finish_inode_setup(ip);
+-- 
+2.20.1
+
diff --git a/queue-4.19/xfs-don-t-overflow-xattr-listent-buffer.patch b/queue-4.19/xfs-don-t-overflow-xattr-listent-buffer.patch
new file mode 100644 (file)
index 0000000..4de5c9a
--- /dev/null
@@ -0,0 +1,78 @@
+From d033080f9b71c2a30a9e4c5d6777bc7d18ad9d6f Mon Sep 17 00:00:00 2001
+From: "Darrick J. Wong" <darrick.wong@oracle.com>
+Date: Thu, 18 Jul 2019 23:06:11 +0000
+Subject: xfs: don't overflow xattr listent buffer
+
+commit 3b50086f0c0d78c144d9483fa292c1509c931b70 upstream.
+
+For VFS listxattr calls, xfs_xattr_put_listent calls
+__xfs_xattr_put_listent twice if it sees an attribute
+"trusted.SGI_ACL_FILE": once for that name, and again for
+"system.posix_acl_access".  Unfortunately, if we happen to run out of
+buffer space while emitting the first name, we set count to -1 (so that
+we can feed ERANGE to the caller).  The second invocation doesn't check that
+the context parameters make sense and overwrites the byte before the
+buffer, triggering a KASAN report:
+
+==================================================================
+BUG: KASAN: slab-out-of-bounds in strncpy+0xb3/0xd0
+Write of size 1 at addr ffff88807fbd317f by task syz/1113
+
+CPU: 3 PID: 1113 Comm: syz Not tainted 5.0.0-rc6-xfsx #rc6
+Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.10.2-1ubuntu1 04/01/2014
+Call Trace:
+ dump_stack+0xcc/0x180
+ print_address_description+0x6c/0x23c
+ kasan_report.cold.3+0x1c/0x35
+ strncpy+0xb3/0xd0
+ __xfs_xattr_put_listent+0x1a9/0x2c0 [xfs]
+ xfs_attr_list_int_ilocked+0x11af/0x1800 [xfs]
+ xfs_attr_list_int+0x20c/0x2e0 [xfs]
+ xfs_vn_listxattr+0x225/0x320 [xfs]
+ listxattr+0x11f/0x1b0
+ path_listxattr+0xbd/0x130
+ do_syscall_64+0x139/0x560
+
+While we're at it we add an assert to the other put_listent to avoid
+this sort of thing ever happening to the attrlist_by_handle code.
+
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Suggested-by: Amir Goldstein <amir73il@gmail.com>
+Reviewed-by: Amir Goldstein <amir73il@gmail.com>
+Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/xfs/xfs_attr_list.c | 1 +
+ fs/xfs/xfs_xattr.c     | 3 +++
+ 2 files changed, 4 insertions(+)
+
+diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c
+index a58034049995..3d213a7394c5 100644
+--- a/fs/xfs/xfs_attr_list.c
++++ b/fs/xfs/xfs_attr_list.c
+@@ -555,6 +555,7 @@ xfs_attr_put_listent(
+       attrlist_ent_t *aep;
+       int arraytop;
++      ASSERT(!context->seen_enough);
+       ASSERT(!(context->flags & ATTR_KERNOVAL));
+       ASSERT(context->count >= 0);
+       ASSERT(context->count < (ATTR_MAX_VALUELEN/8));
+diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c
+index 63ee1d5bf1d7..9a63016009a1 100644
+--- a/fs/xfs/xfs_xattr.c
++++ b/fs/xfs/xfs_xattr.c
+@@ -129,6 +129,9 @@ __xfs_xattr_put_listent(
+       char *offset;
+       int arraytop;
++      if (context->count < 0 || context->seen_enough)
++              return;
++
+       if (!context->alist)
+               goto compute_size;
+-- 
+2.20.1
+
diff --git a/queue-4.19/xfs-fix-pagecache-truncation-prior-to-reflink.patch b/queue-4.19/xfs-fix-pagecache-truncation-prior-to-reflink.patch
new file mode 100644 (file)
index 0000000..d24fc80
--- /dev/null
@@ -0,0 +1,43 @@
+From 8cde848ea596034b2b9f72227fcfc3b70b793567 Mon Sep 17 00:00:00 2001
+From: "Darrick J. Wong" <darrick.wong@oracle.com>
+Date: Thu, 18 Jul 2019 23:06:09 +0000
+Subject: xfs: fix pagecache truncation prior to reflink
+
+commit 4918ef4ea008cd2ff47eb852894e3f9b9047f4f3 upstream.
+
+Prior to remapping blocks, it is necessary to remove pages from the
+destination file's page cache.  Unfortunately, the truncation is not
+aggressive enough -- if page size > block size, we'll end up zeroing
+subpage blocks instead of removing them.  So, round the start offset
+down and the end offset up to page boundaries.  We already wrote all
+the dirty data so the larger range shouldn't be a problem.
+
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Reviewed-by: Dave Chinner <dchinner@redhat.com>
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Signed-off-by: Dave Chinner <david@fromorbit.com>
+Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/xfs/xfs_reflink.c | 5 +++--
+ 1 file changed, 3 insertions(+), 2 deletions(-)
+
+diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
+index 7088f44c0c59..38ea08a3dd1d 100644
+--- a/fs/xfs/xfs_reflink.c
++++ b/fs/xfs/xfs_reflink.c
+@@ -1369,8 +1369,9 @@ xfs_reflink_remap_prep(
+               goto out_unlock;
+       /* Zap any page cache for the destination file's range. */
+-      truncate_inode_pages_range(&inode_out->i_data, pos_out,
+-                                 PAGE_ALIGN(pos_out + *len) - 1);
++      truncate_inode_pages_range(&inode_out->i_data,
++                      round_down(pos_out, PAGE_SIZE),
++                      round_up(pos_out + *len, PAGE_SIZE) - 1);
+       /* If we're altering the file contents... */
+       if (!is_dedupe) {
+-- 
+2.20.1
+
diff --git a/queue-4.19/xfs-fix-reporting-supported-extra-file-attributes-fo.patch b/queue-4.19/xfs-fix-reporting-supported-extra-file-attributes-fo.patch
new file mode 100644 (file)
index 0000000..ce93305
--- /dev/null
@@ -0,0 +1,56 @@
+From 5f1b38140a9c4b7260f0e720509cba82b2c90bf2 Mon Sep 17 00:00:00 2001
+From: "Luis R. Rodriguez" <mcgrof@kernel.org>
+Date: Thu, 18 Jul 2019 23:06:15 +0000
+Subject: xfs: fix reporting supported extra file attributes for statx()
+
+commit 1b9598c8fb9965fff901c4caa21fed9644c34df3 upstream.
+
+statx(2) notes that any attribute that is not indicated as supported by
+stx_attributes_mask has no usable value. Commit 5f955f26f3d42d ("xfs: report
+crtime and attribute flags to statx") added support for informing userspace
+of extra file attributes but forgot to list these flags as supported
+making reporting them rather useless for the pedantic userspace author.
+
+$ git describe --contains 5f955f26f3d42d04aba65590a32eb70eedb7f37d
+v4.11-rc6~5^2^2~2
+
+Fixes: 5f955f26f3d42d ("xfs: report crtime and attribute flags to statx")
+Signed-off-by: Luis R. Rodriguez <mcgrof@kernel.org>
+Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
+[darrick: add a comment reminding people to keep attributes_mask up to date]
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/xfs/xfs_iops.c | 8 ++++++++
+ 1 file changed, 8 insertions(+)
+
+diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
+index 1efef69a7f1c..74047bd0c1ae 100644
+--- a/fs/xfs/xfs_iops.c
++++ b/fs/xfs/xfs_iops.c
+@@ -531,6 +531,10 @@ xfs_vn_getattr(
+               }
+       }
++      /*
++       * Note: If you add another clause to set an attribute flag, please
++       * update attributes_mask below.
++       */
+       if (ip->i_d.di_flags & XFS_DIFLAG_IMMUTABLE)
+               stat->attributes |= STATX_ATTR_IMMUTABLE;
+       if (ip->i_d.di_flags & XFS_DIFLAG_APPEND)
+@@ -538,6 +542,10 @@ xfs_vn_getattr(
+       if (ip->i_d.di_flags & XFS_DIFLAG_NODUMP)
+               stat->attributes |= STATX_ATTR_NODUMP;
++      stat->attributes_mask |= (STATX_ATTR_IMMUTABLE |
++                                STATX_ATTR_APPEND |
++                                STATX_ATTR_NODUMP);
++
+       switch (inode->i_mode & S_IFMT) {
+       case S_IFBLK:
+       case S_IFCHR:
+-- 
+2.20.1
+
diff --git a/queue-4.19/xfs-flush-removing-page-cache-in-xfs_reflink_remap_p.patch b/queue-4.19/xfs-flush-removing-page-cache-in-xfs_reflink_remap_p.patch
new file mode 100644 (file)
index 0000000..5070ae8
--- /dev/null
@@ -0,0 +1,102 @@
+From be409aea9de66a543d0ce43418dc0a8164cd0141 Mon Sep 17 00:00:00 2001
+From: Dave Chinner <dchinner@redhat.com>
+Date: Thu, 18 Jul 2019 23:06:10 +0000
+Subject: xfs: flush removing page cache in xfs_reflink_remap_prep
+
+commit 2c307174ab77e34645e75e12827646e044d273c3 upstream.
+
+On a sub-page block size filesystem, fsx is failing with a data
+corruption after a series of operations involving copying a file
+with the destination offset beyond EOF of the destination file:
+
+8093(157 mod 256): TRUNCATE DOWN        from 0x7a120 to 0x50000 ******WWWW
+8094(158 mod 256): INSERT 0x25000 thru 0x25fff  (0x1000 bytes)
+8095(159 mod 256): COPY 0x18000 thru 0x1afff    (0x3000 bytes) to 0x2f400
+8096(160 mod 256): WRITE    0x5da00 thru 0x651ff        (0x7800 bytes) HOLE
+8097(161 mod 256): COPY 0x2000 thru 0x5fff      (0x4000 bytes) to 0x6fc00
+
+The second copy here is beyond EOF, and it is to a sub-page (4k) but
+block aligned (1k) offset. The clone runs the EOF zeroing, landing
+in a pre-existing post-eof delalloc extent. This zeroes the post-eof
+extents in the page cache just fine, dirtying the pages correctly.
+
+The problem is that xfs_reflink_remap_prep() now truncates the page
+cache over the range that it is copying it to, and rounds that down
+to cover the entire start page. This removes the dirty page over the
+delalloc extent from the page cache without having written it back.
+Hence later, when the page cache is flushed, the page at offset
+0x6f000 has not been written back and hence exposes stale data,
+which fsx trips over less than 10 operations later.
+
+Fix this by changing xfs_reflink_remap_prep() to use
+xfs_flush_unmap_range().
+
+Signed-off-by: Dave Chinner <dchinner@redhat.com>
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/xfs/xfs_bmap_util.c |  2 +-
+ fs/xfs/xfs_bmap_util.h |  2 ++
+ fs/xfs/xfs_reflink.c   | 17 +++++++++++++----
+ 3 files changed, 16 insertions(+), 5 deletions(-)
+
+diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
+index 211b06e4702e..41ad9eaab6ce 100644
+--- a/fs/xfs/xfs_bmap_util.c
++++ b/fs/xfs/xfs_bmap_util.c
+@@ -1080,7 +1080,7 @@ xfs_adjust_extent_unmap_boundaries(
+       return 0;
+ }
+-static int
++int
+ xfs_flush_unmap_range(
+       struct xfs_inode        *ip,
+       xfs_off_t               offset,
+diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h
+index 87363d136bb6..9c73d012f56a 100644
+--- a/fs/xfs/xfs_bmap_util.h
++++ b/fs/xfs/xfs_bmap_util.h
+@@ -76,6 +76,8 @@ int  xfs_swap_extents(struct xfs_inode *ip, struct xfs_inode *tip,
+ xfs_daddr_t xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb);
+ xfs_extnum_t xfs_bmap_count_leaves(struct xfs_ifork *ifp, xfs_filblks_t *count);
++int   xfs_flush_unmap_range(struct xfs_inode *ip, xfs_off_t offset,
++                          xfs_off_t len);
+ int xfs_bmap_count_blocks(struct xfs_trans *tp, struct xfs_inode *ip,
+                         int whichfork, xfs_extnum_t *nextents,
+                         xfs_filblks_t *count);
+diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
+index 38ea08a3dd1d..f3c393f309e1 100644
+--- a/fs/xfs/xfs_reflink.c
++++ b/fs/xfs/xfs_reflink.c
+@@ -1368,10 +1368,19 @@ xfs_reflink_remap_prep(
+       if (ret)
+               goto out_unlock;
+-      /* Zap any page cache for the destination file's range. */
+-      truncate_inode_pages_range(&inode_out->i_data,
+-                      round_down(pos_out, PAGE_SIZE),
+-                      round_up(pos_out + *len, PAGE_SIZE) - 1);
++      /*
++       * If pos_out > EOF, we may have dirtied blocks between EOF and
++       * pos_out. In that case, we need to extend the flush and unmap to cover
++       * from EOF to the end of the copy length.
++       */
++      if (pos_out > XFS_ISIZE(dest)) {
++              loff_t  flen = *len + (pos_out - XFS_ISIZE(dest));
++              ret = xfs_flush_unmap_range(dest, XFS_ISIZE(dest), flen);
++      } else {
++              ret = xfs_flush_unmap_range(dest, pos_out, *len);
++      }
++      if (ret)
++              goto out_unlock;
+       /* If we're altering the file contents... */
+       if (!is_dedupe) {
+-- 
+2.20.1
+
diff --git a/queue-4.19/xfs-rename-m_inotbt_nores-to-m_finobt_nores.patch b/queue-4.19/xfs-rename-m_inotbt_nores-to-m_finobt_nores.patch
new file mode 100644 (file)
index 0000000..2d42d70
--- /dev/null
@@ -0,0 +1,88 @@
+From 734afb4260f4207ee1dae99812eb66bf43d3d375 Mon Sep 17 00:00:00 2001
+From: "Darrick J. Wong" <darrick.wong@oracle.com>
+Date: Thu, 18 Jul 2019 23:06:12 +0000
+Subject: xfs: rename m_inotbt_nores to m_finobt_nores
+
+commit e1f6ca11381588e3ef138c10de60eeb34cb8466a upstream.
+
+Rename this flag variable to imply more strongly that it's related to
+the free inode btree (finobt) operation.  No functional changes.
+
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Reviewed-by: Dave Chinner <dchinner@redhat.com>
+Suggested-by: Amir Goldstein <amir73il@gmail.com>
+Reviewed-by: Amir Goldstein <amir73il@gmail.com>
+Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/xfs/libxfs/xfs_ag_resv.c      | 2 +-
+ fs/xfs/libxfs/xfs_ialloc_btree.c | 4 ++--
+ fs/xfs/xfs_inode.c               | 2 +-
+ fs/xfs/xfs_mount.h               | 2 +-
+ 4 files changed, 5 insertions(+), 5 deletions(-)
+
+diff --git a/fs/xfs/libxfs/xfs_ag_resv.c b/fs/xfs/libxfs/xfs_ag_resv.c
+index e701ebc36c06..e2ba2a3b63b2 100644
+--- a/fs/xfs/libxfs/xfs_ag_resv.c
++++ b/fs/xfs/libxfs/xfs_ag_resv.c
+@@ -281,7 +281,7 @@ xfs_ag_resv_init(
+                        */
+                       ask = used = 0;
+-                      mp->m_inotbt_nores = true;
++                      mp->m_finobt_nores = true;
+                       error = xfs_refcountbt_calc_reserves(mp, tp, agno, &ask,
+                                       &used);
+diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c
+index 86c50208a143..adb2f6df5a11 100644
+--- a/fs/xfs/libxfs/xfs_ialloc_btree.c
++++ b/fs/xfs/libxfs/xfs_ialloc_btree.c
+@@ -124,7 +124,7 @@ xfs_finobt_alloc_block(
+       union xfs_btree_ptr     *new,
+       int                     *stat)
+ {
+-      if (cur->bc_mp->m_inotbt_nores)
++      if (cur->bc_mp->m_finobt_nores)
+               return xfs_inobt_alloc_block(cur, start, new, stat);
+       return __xfs_inobt_alloc_block(cur, start, new, stat,
+                       XFS_AG_RESV_METADATA);
+@@ -157,7 +157,7 @@ xfs_finobt_free_block(
+       struct xfs_btree_cur    *cur,
+       struct xfs_buf          *bp)
+ {
+-      if (cur->bc_mp->m_inotbt_nores)
++      if (cur->bc_mp->m_finobt_nores)
+               return xfs_inobt_free_block(cur, bp);
+       return __xfs_inobt_free_block(cur, bp, XFS_AG_RESV_METADATA);
+ }
+diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
+index 05db9540e459..ae07baa7bdbf 100644
+--- a/fs/xfs/xfs_inode.c
++++ b/fs/xfs/xfs_inode.c
+@@ -1754,7 +1754,7 @@ xfs_inactive_ifree(
+        * now remains allocated and sits on the unlinked list until the fs is
+        * repaired.
+        */
+-      if (unlikely(mp->m_inotbt_nores)) {
++      if (unlikely(mp->m_finobt_nores)) {
+               error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ifree,
+                               XFS_IFREE_SPACE_RES(mp), 0, XFS_TRANS_RESERVE,
+                               &tp);
+diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
+index 7964513c3128..7e0bf952e087 100644
+--- a/fs/xfs/xfs_mount.h
++++ b/fs/xfs/xfs_mount.h
+@@ -127,7 +127,7 @@ typedef struct xfs_mount {
+       struct mutex            m_growlock;     /* growfs mutex */
+       int                     m_fixedfsid[2]; /* unchanged for life of FS */
+       uint64_t                m_flags;        /* global mount flags */
+-      bool                    m_inotbt_nores; /* no per-AG finobt resv. */
++      bool                    m_finobt_nores; /* no per-AG finobt resv. */
+       int                     m_ialloc_inos;  /* inodes in inode allocation */
+       int                     m_ialloc_blks;  /* blocks in inode allocation */
+       int                     m_ialloc_min_blks;/* min blocks in sparse inode
+-- 
+2.20.1
+
diff --git a/queue-4.19/xfs-reserve-blocks-for-ifree-transaction-during-log-.patch b/queue-4.19/xfs-reserve-blocks-for-ifree-transaction-during-log-.patch
new file mode 100644 (file)
index 0000000..2cf6707
--- /dev/null
@@ -0,0 +1,63 @@
+From a47a5308bd902ccef6a601c439c4529f3743d11e Mon Sep 17 00:00:00 2001
+From: "Darrick J. Wong" <darrick.wong@oracle.com>
+Date: Thu, 18 Jul 2019 23:06:14 +0000
+Subject: xfs: reserve blocks for ifree transaction during log recovery
+
+commit 15a268d9f263ed3a0601a1296568241a5a3da7aa upstream.
+
+Log recovery frees all the inodes stored in the unlinked list, which can
+cause expansion of the free inode btree.  The ifree code skips block
+reservations if it thinks there's a per-AG space reservation, but we
+don't set up the reservation until after log recovery, which means that
+a finobt expansion blows up in xfs_trans_mod_sb when we exceed the
+transaction's block reservation.
+
+To fix this, we set the "no finobt reservation" flag to true when we
+create the xfs_mount and only set it to false if we confirm that every
+AG had enough free space to put aside for the finobt.
+
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Reviewed-by: Dave Chinner <dchinner@redhat.com>
+Suggested-by: Amir Goldstein <amir73il@gmail.com>
+Reviewed-by: Amir Goldstein <amir73il@gmail.com>
+Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/xfs/xfs_fsops.c | 1 +
+ fs/xfs/xfs_super.c | 7 +++++++
+ 2 files changed, 8 insertions(+)
+
+diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
+index 7c00b8bedfe3..09fd602507ef 100644
+--- a/fs/xfs/xfs_fsops.c
++++ b/fs/xfs/xfs_fsops.c
+@@ -534,6 +534,7 @@ xfs_fs_reserve_ag_blocks(
+       int                     error = 0;
+       int                     err2;
++      mp->m_finobt_nores = false;
+       for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
+               pag = xfs_perag_get(mp, agno);
+               err2 = xfs_ag_resv_init(pag, NULL);
+diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
+index 207ee302b1bb..dce8114e3198 100644
+--- a/fs/xfs/xfs_super.c
++++ b/fs/xfs/xfs_super.c
+@@ -1561,6 +1561,13 @@ xfs_mount_alloc(
+       INIT_DELAYED_WORK(&mp->m_eofblocks_work, xfs_eofblocks_worker);
+       INIT_DELAYED_WORK(&mp->m_cowblocks_work, xfs_cowblocks_worker);
+       mp->m_kobj.kobject.kset = xfs_kset;
++      /*
++       * We don't create the finobt per-ag space reservation until after log
++       * recovery, so we must set this to true so that an ifree transaction
++       * started during log recovery will not depend on space reservations
++       * for finobt expansion.
++       */
++      mp->m_finobt_nores = true;
+       return mp;
+ }
+-- 
+2.20.1
+
diff --git a/queue-4.19/xfs-serialize-unaligned-dio-writes-against-all-other.patch b/queue-4.19/xfs-serialize-unaligned-dio-writes-against-all-other.patch
new file mode 100644 (file)
index 0000000..d5cbf95
--- /dev/null
@@ -0,0 +1,92 @@
+From 877391011e228577a3c413111983ae1286e4106b Mon Sep 17 00:00:00 2001
+From: Brian Foster <bfoster@redhat.com>
+Date: Thu, 18 Jul 2019 23:06:16 +0000
+Subject: xfs: serialize unaligned dio writes against all other dio writes
+
+commit 2032a8a27b5cc0f578d37fa16fa2494b80a0d00a upstream.
+
+XFS applies more strict serialization constraints to unaligned
+direct writes to accommodate things like direct I/O layer zeroing,
+unwritten extent conversion, etc. Unaligned submissions acquire the
+exclusive iolock and wait for in-flight dio to complete to ensure
+multiple submissions do not race on the same block and cause data
+corruption.
+
+This generally works in the case of an aligned dio followed by an
+unaligned dio, but the serialization is lost if I/Os occur in the
+opposite order. If an unaligned write is submitted first and
+immediately followed by an overlapping, aligned write, the latter
+submits without the typical unaligned serialization barriers because
+there is no indication of an unaligned dio still in-flight. This can
+lead to unpredictable results.
+
+To provide proper unaligned dio serialization, require that such
+direct writes are always the only dio allowed in-flight at one time
+for a particular inode. We already acquire the exclusive iolock and
+drain pending dio before submitting the unaligned dio. Wait once
+more after the dio submission to hold the iolock across the I/O and
+prevent further submissions until the unaligned I/O completes. This
+is heavy handed, but consistent with the current pre-submission
+serialization for unaligned direct writes.
+
+Signed-off-by: Brian Foster <bfoster@redhat.com>
+Reviewed-by: Allison Henderson <allison.henderson@oracle.com>
+Reviewed-by: Dave Chinner <dchinner@redhat.com>
+Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/xfs/xfs_file.c | 27 +++++++++++++++++----------
+ 1 file changed, 17 insertions(+), 10 deletions(-)
+
+diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
+index 61a5ad2600e8..10f75965243c 100644
+--- a/fs/xfs/xfs_file.c
++++ b/fs/xfs/xfs_file.c
+@@ -529,18 +529,17 @@ xfs_file_dio_aio_write(
+       count = iov_iter_count(from);
+       /*
+-       * If we are doing unaligned IO, wait for all other IO to drain,
+-       * otherwise demote the lock if we had to take the exclusive lock
+-       * for other reasons in xfs_file_aio_write_checks.
++       * If we are doing unaligned IO, we can't allow any other overlapping IO
++       * in-flight at the same time or we risk data corruption. Wait for all
++       * other IO to drain before we submit. If the IO is aligned, demote the
++       * iolock if we had to take the exclusive lock in
++       * xfs_file_aio_write_checks() for other reasons.
+        */
+       if (unaligned_io) {
+-              /* If we are going to wait for other DIO to finish, bail */
+-              if (iocb->ki_flags & IOCB_NOWAIT) {
+-                      if (atomic_read(&inode->i_dio_count))
+-                              return -EAGAIN;
+-              } else {
+-                      inode_dio_wait(inode);
+-              }
++              /* unaligned dio always waits, bail */
++              if (iocb->ki_flags & IOCB_NOWAIT)
++                      return -EAGAIN;
++              inode_dio_wait(inode);
+       } else if (iolock == XFS_IOLOCK_EXCL) {
+               xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
+               iolock = XFS_IOLOCK_SHARED;
+@@ -548,6 +547,14 @@ xfs_file_dio_aio_write(
+       trace_xfs_file_direct_write(ip, count, iocb->ki_pos);
+       ret = iomap_dio_rw(iocb, from, &xfs_iomap_ops, xfs_dio_write_end_io);
++
++      /*
++       * If unaligned, this is the only IO in-flight. If it has not yet
++       * completed, wait on it before we release the iolock to prevent
++       * subsequent overlapping IO.
++       */
++      if (ret == -EIOCBQUEUED && unaligned_io)
++              inode_dio_wait(inode);
+ out:
+       xfs_iunlock(ip, iolock);
+-- 
+2.20.1
+