coda-pass-the-host-file-in-vma-vm_file-on-mmap.patch
include-asm-generic-bug.h-fix-cut-here-for-warn_on-for-__warn_taint-architectures.patch
btrfs-correctly-validate-compression-type.patch
+xfs-fix-pagecache-truncation-prior-to-reflink.patch
+xfs-flush-removing-page-cache-in-xfs_reflink_remap_p.patch
+xfs-don-t-overflow-xattr-listent-buffer.patch
+xfs-rename-m_inotbt_nores-to-m_finobt_nores.patch
+xfs-don-t-ever-put-nlink-0-inodes-on-the-unlinked-li.patch
+xfs-reserve-blocks-for-ifree-transaction-during-log-.patch
+xfs-fix-reporting-supported-extra-file-attributes-fo.patch
+xfs-serialize-unaligned-dio-writes-against-all-other.patch
+xfs-abort-unaligned-nowait-directio-early.patch
--- /dev/null
+From 3526c599d528f6918381c8b0a3e64a7736d036c5 Mon Sep 17 00:00:00 2001
+From: "Darrick J. Wong" <darrick.wong@oracle.com>
+Date: Thu, 18 Jul 2019 23:06:17 +0000
+Subject: xfs: abort unaligned nowait directio early
+
+commit 1fdeaea4d92c69fb9f871a787af6ad00f32eeea7 upstream.
+
+Dave Chinner noticed that xfs_file_dio_aio_write returns EAGAIN without
+dropping the IOLOCK when it's deciding not to wait, which means that we
+leak the IOLOCK there. Since we now make unaligned directio always
+wait, we have the opportunity to bail out before trying to take the
+lock, which should reduce the overhead of this never-gonna-work case
+considerably while also solving the dropped lock problem.
+
+Reported-by: Dave Chinner <david@fromorbit.com>
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Reviewed-by: Brian Foster <bfoster@redhat.com>
+Reviewed-by: Dave Chinner <dchinner@redhat.com>
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/xfs/xfs_file.c | 6 +++---
+ 1 file changed, 3 insertions(+), 3 deletions(-)
+
+diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
+index 10f75965243c..259549698ba7 100644
+--- a/fs/xfs/xfs_file.c
++++ b/fs/xfs/xfs_file.c
+@@ -517,6 +517,9 @@ xfs_file_dio_aio_write(
+ }
+
+ if (iocb->ki_flags & IOCB_NOWAIT) {
++ /* unaligned dio always waits, bail */
++ if (unaligned_io)
++ return -EAGAIN;
+ if (!xfs_ilock_nowait(ip, iolock))
+ return -EAGAIN;
+ } else {
+@@ -536,9 +539,6 @@ xfs_file_dio_aio_write(
+ * xfs_file_aio_write_checks() for other reasons.
+ */
+ if (unaligned_io) {
+- /* unaligned dio always waits, bail */
+- if (iocb->ki_flags & IOCB_NOWAIT)
+- return -EAGAIN;
+ inode_dio_wait(inode);
+ } else if (iolock == XFS_IOLOCK_EXCL) {
+ xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
+--
+2.20.1
+
--- /dev/null
+From 9f96f052297fac5e16c781105f30bcc9bd1f0c9c Mon Sep 17 00:00:00 2001
+From: "Darrick J. Wong" <darrick.wong@oracle.com>
+Date: Thu, 18 Jul 2019 23:06:13 +0000
+Subject: xfs: don't ever put nlink > 0 inodes on the unlinked list
+
+commit c4a6bf7f6cc7eb4cce120fb7eb1e1fb8b2d65e09 upstream.
+
+When XFS creates an O_TMPFILE file, the inode is created with nlink = 1,
+put on the unlinked list, and then the VFS sets nlink = 0 in d_tmpfile.
+If we crash before anything logs the inode (it's dirty incore but the
+vfs doesn't tell us it's dirty so we never log that change), the iunlink
+processing part of recovery will then explode with a pile of:
+
+XFS: Assertion failed: VFS_I(ip)->i_nlink == 0, file:
+fs/xfs/xfs_log_recover.c, line: 5072
+
+Worse yet, since nlink is nonzero, the inodes also don't get cleaned up
+and they just leak until the next xfs_repair run.
+
+Therefore, change xfs_iunlink to require that inodes being put on the
+unlinked list have nlink == 0, change the tmpfile callers to instantiate
+inodes that way, and set the nlink to 1 just prior to calling d_tmpfile.
+Fix the comment for xfs_iunlink while we're at it.
+
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Suggested-by: Amir Goldstein <amir73il@gmail.com>
+Reviewed-by: Amir Goldstein <amir73il@gmail.com>
+Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/xfs/xfs_inode.c | 16 ++++++----------
+ fs/xfs/xfs_iops.c | 13 +++++++++++--
+ 2 files changed, 17 insertions(+), 12 deletions(-)
+
+diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
+index ae07baa7bdbf..5ed84d6c7059 100644
+--- a/fs/xfs/xfs_inode.c
++++ b/fs/xfs/xfs_inode.c
+@@ -1332,7 +1332,7 @@ xfs_create_tmpfile(
+ if (error)
+ goto out_trans_cancel;
+
+- error = xfs_dir_ialloc(&tp, dp, mode, 1, 0, prid, &ip);
++ error = xfs_dir_ialloc(&tp, dp, mode, 0, 0, prid, &ip);
+ if (error)
+ goto out_trans_cancel;
+
+@@ -1907,11 +1907,8 @@ xfs_inactive(
+ }
+
+ /*
+- * This is called when the inode's link count goes to 0 or we are creating a
+- * tmpfile via O_TMPFILE. In the case of a tmpfile, @ignore_linkcount will be
+- * set to true as the link count is dropped to zero by the VFS after we've
+- * created the file successfully, so we have to add it to the unlinked list
+- * while the link count is non-zero.
++ * This is called when the inode's link count has gone to 0 or we are creating
++ * a tmpfile via O_TMPFILE. The inode @ip must have nlink == 0.
+ *
+ * We place the on-disk inode on a list in the AGI. It will be pulled from this
+ * list when the inode is freed.
+@@ -1931,6 +1928,7 @@ xfs_iunlink(
+ int offset;
+ int error;
+
++ ASSERT(VFS_I(ip)->i_nlink == 0);
+ ASSERT(VFS_I(ip)->i_mode != 0);
+
+ /*
+@@ -2837,11 +2835,9 @@ xfs_rename_alloc_whiteout(
+
+ /*
+ * Prepare the tmpfile inode as if it were created through the VFS.
+- * Otherwise, the link increment paths will complain about nlink 0->1.
+- * Drop the link count as done by d_tmpfile(), complete the inode setup
+- * and flag it as linkable.
++ * Complete the inode setup and flag it as linkable. nlink is already
++ * zero, so we can skip the drop_nlink.
+ */
+- drop_nlink(VFS_I(tmpfile));
+ xfs_setup_iops(tmpfile);
+ xfs_finish_inode_setup(tmpfile);
+ VFS_I(tmpfile)->i_state |= I_LINKABLE;
+diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
+index f48ffd7a8d3e..1efef69a7f1c 100644
+--- a/fs/xfs/xfs_iops.c
++++ b/fs/xfs/xfs_iops.c
+@@ -191,9 +191,18 @@ xfs_generic_create(
+
+ xfs_setup_iops(ip);
+
+- if (tmpfile)
++ if (tmpfile) {
++ /*
++ * The VFS requires that any inode fed to d_tmpfile must have
++ * nlink == 1 so that it can decrement the nlink in d_tmpfile.
++ * However, we created the temp file with nlink == 0 because
++ * we're not allowed to put an inode with nlink > 0 on the
++ * unlinked list. Therefore we have to set nlink to 1 so that
++ * d_tmpfile can immediately set it back to zero.
++ */
++ set_nlink(inode, 1);
+ d_tmpfile(dentry, inode);
+- else
++ } else
+ d_instantiate(dentry, inode);
+
+ xfs_finish_inode_setup(ip);
+--
+2.20.1
+
--- /dev/null
+From d033080f9b71c2a30a9e4c5d6777bc7d18ad9d6f Mon Sep 17 00:00:00 2001
+From: "Darrick J. Wong" <darrick.wong@oracle.com>
+Date: Thu, 18 Jul 2019 23:06:11 +0000
+Subject: xfs: don't overflow xattr listent buffer
+
+commit 3b50086f0c0d78c144d9483fa292c1509c931b70 upstream.
+
+For VFS listxattr calls, xfs_xattr_put_listent calls
+__xfs_xattr_put_listent twice if it sees an attribute
+"trusted.SGI_ACL_FILE": once for that name, and again for
+"system.posix_acl_access". Unfortunately, if we happen to run out of
+buffer space while emitting the first name, we set count to -1 (so that
+we can feed ERANGE to the caller). The second invocation doesn't check that
+the context parameters make sense and overwrites the byte before the
+buffer, triggering a KASAN report:
+
+==================================================================
+BUG: KASAN: slab-out-of-bounds in strncpy+0xb3/0xd0
+Write of size 1 at addr ffff88807fbd317f by task syz/1113
+
+CPU: 3 PID: 1113 Comm: syz Not tainted 5.0.0-rc6-xfsx #rc6
+Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.10.2-1ubuntu1 04/01/2014
+Call Trace:
+ dump_stack+0xcc/0x180
+ print_address_description+0x6c/0x23c
+ kasan_report.cold.3+0x1c/0x35
+ strncpy+0xb3/0xd0
+ __xfs_xattr_put_listent+0x1a9/0x2c0 [xfs]
+ xfs_attr_list_int_ilocked+0x11af/0x1800 [xfs]
+ xfs_attr_list_int+0x20c/0x2e0 [xfs]
+ xfs_vn_listxattr+0x225/0x320 [xfs]
+ listxattr+0x11f/0x1b0
+ path_listxattr+0xbd/0x130
+ do_syscall_64+0x139/0x560
+
+While we're at it we add an assert to the other put_listent to avoid
+this sort of thing ever happening to the attrlist_by_handle code.
+
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Suggested-by: Amir Goldstein <amir73il@gmail.com>
+Reviewed-by: Amir Goldstein <amir73il@gmail.com>
+Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/xfs/xfs_attr_list.c | 1 +
+ fs/xfs/xfs_xattr.c | 3 +++
+ 2 files changed, 4 insertions(+)
+
+diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c
+index a58034049995..3d213a7394c5 100644
+--- a/fs/xfs/xfs_attr_list.c
++++ b/fs/xfs/xfs_attr_list.c
+@@ -555,6 +555,7 @@ xfs_attr_put_listent(
+ attrlist_ent_t *aep;
+ int arraytop;
+
++ ASSERT(!context->seen_enough);
+ ASSERT(!(context->flags & ATTR_KERNOVAL));
+ ASSERT(context->count >= 0);
+ ASSERT(context->count < (ATTR_MAX_VALUELEN/8));
+diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c
+index 63ee1d5bf1d7..9a63016009a1 100644
+--- a/fs/xfs/xfs_xattr.c
++++ b/fs/xfs/xfs_xattr.c
+@@ -129,6 +129,9 @@ __xfs_xattr_put_listent(
+ char *offset;
+ int arraytop;
+
++ if (context->count < 0 || context->seen_enough)
++ return;
++
+ if (!context->alist)
+ goto compute_size;
+
+--
+2.20.1
+
--- /dev/null
+From 8cde848ea596034b2b9f72227fcfc3b70b793567 Mon Sep 17 00:00:00 2001
+From: "Darrick J. Wong" <darrick.wong@oracle.com>
+Date: Thu, 18 Jul 2019 23:06:09 +0000
+Subject: xfs: fix pagecache truncation prior to reflink
+
+commit 4918ef4ea008cd2ff47eb852894e3f9b9047f4f3 upstream.
+
+Prior to remapping blocks, it is necessary to remove pages from the
+destination file's page cache. Unfortunately, the truncation is not
+aggressive enough -- if page size > block size, we'll end up zeroing
+subpage blocks instead of removing them. So, round the start offset
+down and the end offset up to page boundaries. We already wrote all
+the dirty data so the larger range shouldn't be a problem.
+
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Reviewed-by: Dave Chinner <dchinner@redhat.com>
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Signed-off-by: Dave Chinner <david@fromorbit.com>
+Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/xfs/xfs_reflink.c | 5 +++--
+ 1 file changed, 3 insertions(+), 2 deletions(-)
+
+diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
+index 7088f44c0c59..38ea08a3dd1d 100644
+--- a/fs/xfs/xfs_reflink.c
++++ b/fs/xfs/xfs_reflink.c
+@@ -1369,8 +1369,9 @@ xfs_reflink_remap_prep(
+ goto out_unlock;
+
+ /* Zap any page cache for the destination file's range. */
+- truncate_inode_pages_range(&inode_out->i_data, pos_out,
+- PAGE_ALIGN(pos_out + *len) - 1);
++ truncate_inode_pages_range(&inode_out->i_data,
++ round_down(pos_out, PAGE_SIZE),
++ round_up(pos_out + *len, PAGE_SIZE) - 1);
+
+ /* If we're altering the file contents... */
+ if (!is_dedupe) {
+--
+2.20.1
+
--- /dev/null
+From 5f1b38140a9c4b7260f0e720509cba82b2c90bf2 Mon Sep 17 00:00:00 2001
+From: "Luis R. Rodriguez" <mcgrof@kernel.org>
+Date: Thu, 18 Jul 2019 23:06:15 +0000
+Subject: xfs: fix reporting supported extra file attributes for statx()
+
+commit 1b9598c8fb9965fff901c4caa21fed9644c34df3 upstream.
+
+statx(2) notes that any attribute that is not indicated as supported by
+stx_attributes_mask has no usable value. Commit 5f955f26f3d42d ("xfs: report
+crtime and attribute flags to statx") added support for informing userspace
+of extra file attributes but forgot to list these flags as supported,
+making reporting them rather useless for the pedantic userspace author.
+
+$ git describe --contains 5f955f26f3d42d04aba65590a32eb70eedb7f37d
+v4.11-rc6~5^2^2~2
+
+Fixes: 5f955f26f3d42d ("xfs: report crtime and attribute flags to statx")
+Signed-off-by: Luis R. Rodriguez <mcgrof@kernel.org>
+Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
+[darrick: add a comment reminding people to keep attributes_mask up to date]
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/xfs/xfs_iops.c | 8 ++++++++
+ 1 file changed, 8 insertions(+)
+
+diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
+index 1efef69a7f1c..74047bd0c1ae 100644
+--- a/fs/xfs/xfs_iops.c
++++ b/fs/xfs/xfs_iops.c
+@@ -531,6 +531,10 @@ xfs_vn_getattr(
+ }
+ }
+
++ /*
++ * Note: If you add another clause to set an attribute flag, please
++ * update attributes_mask below.
++ */
+ if (ip->i_d.di_flags & XFS_DIFLAG_IMMUTABLE)
+ stat->attributes |= STATX_ATTR_IMMUTABLE;
+ if (ip->i_d.di_flags & XFS_DIFLAG_APPEND)
+@@ -538,6 +542,10 @@ xfs_vn_getattr(
+ if (ip->i_d.di_flags & XFS_DIFLAG_NODUMP)
+ stat->attributes |= STATX_ATTR_NODUMP;
+
++ stat->attributes_mask |= (STATX_ATTR_IMMUTABLE |
++ STATX_ATTR_APPEND |
++ STATX_ATTR_NODUMP);
++
+ switch (inode->i_mode & S_IFMT) {
+ case S_IFBLK:
+ case S_IFCHR:
+--
+2.20.1
+
--- /dev/null
+From be409aea9de66a543d0ce43418dc0a8164cd0141 Mon Sep 17 00:00:00 2001
+From: Dave Chinner <dchinner@redhat.com>
+Date: Thu, 18 Jul 2019 23:06:10 +0000
+Subject: xfs: flush removing page cache in xfs_reflink_remap_prep
+
+commit 2c307174ab77e34645e75e12827646e044d273c3 upstream.
+
+On a sub-page block size filesystem, fsx is failing with a data
+corruption after a series of operations involving copying a file
+with the destination offset beyond EOF of the destination file:
+
+8093(157 mod 256): TRUNCATE DOWN from 0x7a120 to 0x50000 ******WWWW
+8094(158 mod 256): INSERT 0x25000 thru 0x25fff (0x1000 bytes)
+8095(159 mod 256): COPY 0x18000 thru 0x1afff (0x3000 bytes) to 0x2f400
+8096(160 mod 256): WRITE 0x5da00 thru 0x651ff (0x7800 bytes) HOLE
+8097(161 mod 256): COPY 0x2000 thru 0x5fff (0x4000 bytes) to 0x6fc00
+
+The second copy here is beyond EOF, and it is to a sub-page (4k) but
+block aligned (1k) offset. The clone runs the EOF zeroing, landing
+in a pre-existing post-eof delalloc extent. This zeroes the post-eof
+extents in the page cache just fine, dirtying the pages correctly.
+
+The problem is that xfs_reflink_remap_prep() now truncates the page
+cache over the range that it is copying it to, and rounds that down
+to cover the entire start page. This removes the dirty page over the
+delalloc extent from the page cache without having written it back.
+Hence later, when the page cache is flushed, the page at offset
+0x6f000 has not been written back and hence exposes stale data,
+which fsx trips over less than 10 operations later.
+
+Fix this by changing xfs_reflink_remap_prep() to use
+xfs_flush_unmap_range().
+
+Signed-off-by: Dave Chinner <dchinner@redhat.com>
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/xfs/xfs_bmap_util.c | 2 +-
+ fs/xfs/xfs_bmap_util.h | 2 ++
+ fs/xfs/xfs_reflink.c | 17 +++++++++++++----
+ 3 files changed, 16 insertions(+), 5 deletions(-)
+
+diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
+index 211b06e4702e..41ad9eaab6ce 100644
+--- a/fs/xfs/xfs_bmap_util.c
++++ b/fs/xfs/xfs_bmap_util.c
+@@ -1080,7 +1080,7 @@ xfs_adjust_extent_unmap_boundaries(
+ return 0;
+ }
+
+-static int
++int
+ xfs_flush_unmap_range(
+ struct xfs_inode *ip,
+ xfs_off_t offset,
+diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h
+index 87363d136bb6..9c73d012f56a 100644
+--- a/fs/xfs/xfs_bmap_util.h
++++ b/fs/xfs/xfs_bmap_util.h
+@@ -76,6 +76,8 @@ int xfs_swap_extents(struct xfs_inode *ip, struct xfs_inode *tip,
+ xfs_daddr_t xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb);
+
+ xfs_extnum_t xfs_bmap_count_leaves(struct xfs_ifork *ifp, xfs_filblks_t *count);
++int xfs_flush_unmap_range(struct xfs_inode *ip, xfs_off_t offset,
++ xfs_off_t len);
+ int xfs_bmap_count_blocks(struct xfs_trans *tp, struct xfs_inode *ip,
+ int whichfork, xfs_extnum_t *nextents,
+ xfs_filblks_t *count);
+diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
+index 38ea08a3dd1d..f3c393f309e1 100644
+--- a/fs/xfs/xfs_reflink.c
++++ b/fs/xfs/xfs_reflink.c
+@@ -1368,10 +1368,19 @@ xfs_reflink_remap_prep(
+ if (ret)
+ goto out_unlock;
+
+- /* Zap any page cache for the destination file's range. */
+- truncate_inode_pages_range(&inode_out->i_data,
+- round_down(pos_out, PAGE_SIZE),
+- round_up(pos_out + *len, PAGE_SIZE) - 1);
++ /*
++ * If pos_out > EOF, we may have dirtied blocks between EOF and
++ * pos_out. In that case, we need to extend the flush and unmap to cover
++ * from EOF to the end of the copy length.
++ */
++ if (pos_out > XFS_ISIZE(dest)) {
++ loff_t flen = *len + (pos_out - XFS_ISIZE(dest));
++ ret = xfs_flush_unmap_range(dest, XFS_ISIZE(dest), flen);
++ } else {
++ ret = xfs_flush_unmap_range(dest, pos_out, *len);
++ }
++ if (ret)
++ goto out_unlock;
+
+ /* If we're altering the file contents... */
+ if (!is_dedupe) {
+--
+2.20.1
+
--- /dev/null
+From 734afb4260f4207ee1dae99812eb66bf43d3d375 Mon Sep 17 00:00:00 2001
+From: "Darrick J. Wong" <darrick.wong@oracle.com>
+Date: Thu, 18 Jul 2019 23:06:12 +0000
+Subject: xfs: rename m_inotbt_nores to m_finobt_nores
+
+commit e1f6ca11381588e3ef138c10de60eeb34cb8466a upstream.
+
+Rename this flag variable to imply more strongly that it's related to
+the free inode btree (finobt) operation. No functional changes.
+
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Reviewed-by: Dave Chinner <dchinner@redhat.com>
+Suggested-by: Amir Goldstein <amir73il@gmail.com>
+Reviewed-by: Amir Goldstein <amir73il@gmail.com>
+Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/xfs/libxfs/xfs_ag_resv.c | 2 +-
+ fs/xfs/libxfs/xfs_ialloc_btree.c | 4 ++--
+ fs/xfs/xfs_inode.c | 2 +-
+ fs/xfs/xfs_mount.h | 2 +-
+ 4 files changed, 5 insertions(+), 5 deletions(-)
+
+diff --git a/fs/xfs/libxfs/xfs_ag_resv.c b/fs/xfs/libxfs/xfs_ag_resv.c
+index e701ebc36c06..e2ba2a3b63b2 100644
+--- a/fs/xfs/libxfs/xfs_ag_resv.c
++++ b/fs/xfs/libxfs/xfs_ag_resv.c
+@@ -281,7 +281,7 @@ xfs_ag_resv_init(
+ */
+ ask = used = 0;
+
+- mp->m_inotbt_nores = true;
++ mp->m_finobt_nores = true;
+
+ error = xfs_refcountbt_calc_reserves(mp, tp, agno, &ask,
+ &used);
+diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c
+index 86c50208a143..adb2f6df5a11 100644
+--- a/fs/xfs/libxfs/xfs_ialloc_btree.c
++++ b/fs/xfs/libxfs/xfs_ialloc_btree.c
+@@ -124,7 +124,7 @@ xfs_finobt_alloc_block(
+ union xfs_btree_ptr *new,
+ int *stat)
+ {
+- if (cur->bc_mp->m_inotbt_nores)
++ if (cur->bc_mp->m_finobt_nores)
+ return xfs_inobt_alloc_block(cur, start, new, stat);
+ return __xfs_inobt_alloc_block(cur, start, new, stat,
+ XFS_AG_RESV_METADATA);
+@@ -157,7 +157,7 @@ xfs_finobt_free_block(
+ struct xfs_btree_cur *cur,
+ struct xfs_buf *bp)
+ {
+- if (cur->bc_mp->m_inotbt_nores)
++ if (cur->bc_mp->m_finobt_nores)
+ return xfs_inobt_free_block(cur, bp);
+ return __xfs_inobt_free_block(cur, bp, XFS_AG_RESV_METADATA);
+ }
+diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
+index 05db9540e459..ae07baa7bdbf 100644
+--- a/fs/xfs/xfs_inode.c
++++ b/fs/xfs/xfs_inode.c
+@@ -1754,7 +1754,7 @@ xfs_inactive_ifree(
+ * now remains allocated and sits on the unlinked list until the fs is
+ * repaired.
+ */
+- if (unlikely(mp->m_inotbt_nores)) {
++ if (unlikely(mp->m_finobt_nores)) {
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ifree,
+ XFS_IFREE_SPACE_RES(mp), 0, XFS_TRANS_RESERVE,
+ &tp);
+diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
+index 7964513c3128..7e0bf952e087 100644
+--- a/fs/xfs/xfs_mount.h
++++ b/fs/xfs/xfs_mount.h
+@@ -127,7 +127,7 @@ typedef struct xfs_mount {
+ struct mutex m_growlock; /* growfs mutex */
+ int m_fixedfsid[2]; /* unchanged for life of FS */
+ uint64_t m_flags; /* global mount flags */
+- bool m_inotbt_nores; /* no per-AG finobt resv. */
++ bool m_finobt_nores; /* no per-AG finobt resv. */
+ int m_ialloc_inos; /* inodes in inode allocation */
+ int m_ialloc_blks; /* blocks in inode allocation */
+ int m_ialloc_min_blks;/* min blocks in sparse inode
+--
+2.20.1
+
--- /dev/null
+From a47a5308bd902ccef6a601c439c4529f3743d11e Mon Sep 17 00:00:00 2001
+From: "Darrick J. Wong" <darrick.wong@oracle.com>
+Date: Thu, 18 Jul 2019 23:06:14 +0000
+Subject: xfs: reserve blocks for ifree transaction during log recovery
+
+commit 15a268d9f263ed3a0601a1296568241a5a3da7aa upstream.
+
+Log recovery frees all the inodes stored in the unlinked list, which can
+cause expansion of the free inode btree. The ifree code skips block
+reservations if it thinks there's a per-AG space reservation, but we
+don't set up the reservation until after log recovery, which means that
+a finobt expansion blows up in xfs_trans_mod_sb when we exceed the
+transaction's block reservation.
+
+To fix this, we set the "no finobt reservation" flag to true when we
+create the xfs_mount and only set it to false if we confirm that every
+AG had enough free space to put aside for the finobt.
+
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Reviewed-by: Dave Chinner <dchinner@redhat.com>
+Suggested-by: Amir Goldstein <amir73il@gmail.com>
+Reviewed-by: Amir Goldstein <amir73il@gmail.com>
+Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/xfs/xfs_fsops.c | 1 +
+ fs/xfs/xfs_super.c | 7 +++++++
+ 2 files changed, 8 insertions(+)
+
+diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
+index 7c00b8bedfe3..09fd602507ef 100644
+--- a/fs/xfs/xfs_fsops.c
++++ b/fs/xfs/xfs_fsops.c
+@@ -534,6 +534,7 @@ xfs_fs_reserve_ag_blocks(
+ int error = 0;
+ int err2;
+
++ mp->m_finobt_nores = false;
+ for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
+ pag = xfs_perag_get(mp, agno);
+ err2 = xfs_ag_resv_init(pag, NULL);
+diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
+index 207ee302b1bb..dce8114e3198 100644
+--- a/fs/xfs/xfs_super.c
++++ b/fs/xfs/xfs_super.c
+@@ -1561,6 +1561,13 @@ xfs_mount_alloc(
+ INIT_DELAYED_WORK(&mp->m_eofblocks_work, xfs_eofblocks_worker);
+ INIT_DELAYED_WORK(&mp->m_cowblocks_work, xfs_cowblocks_worker);
+ mp->m_kobj.kobject.kset = xfs_kset;
++ /*
++ * We don't create the finobt per-ag space reservation until after log
++ * recovery, so we must set this to true so that an ifree transaction
++ * started during log recovery will not depend on space reservations
++ * for finobt expansion.
++ */
++ mp->m_finobt_nores = true;
+ return mp;
+ }
+
+--
+2.20.1
+
--- /dev/null
+From 877391011e228577a3c413111983ae1286e4106b Mon Sep 17 00:00:00 2001
+From: Brian Foster <bfoster@redhat.com>
+Date: Thu, 18 Jul 2019 23:06:16 +0000
+Subject: xfs: serialize unaligned dio writes against all other dio writes
+
+commit 2032a8a27b5cc0f578d37fa16fa2494b80a0d00a upstream.
+
+XFS applies more strict serialization constraints to unaligned
+direct writes to accommodate things like direct I/O layer zeroing,
+unwritten extent conversion, etc. Unaligned submissions acquire the
+exclusive iolock and wait for in-flight dio to complete to ensure
+multiple submissions do not race on the same block and cause data
+corruption.
+
+This generally works in the case of an aligned dio followed by an
+unaligned dio, but the serialization is lost if I/Os occur in the
+opposite order. If an unaligned write is submitted first and
+immediately followed by an overlapping, aligned write, the latter
+submits without the typical unaligned serialization barriers because
+there is no indication of an unaligned dio still in-flight. This can
+lead to unpredictable results.
+
+To provide proper unaligned dio serialization, require that such
+direct writes are always the only dio allowed in-flight at one time
+for a particular inode. We already acquire the exclusive iolock and
+drain pending dio before submitting the unaligned dio. Wait once
+more after the dio submission to hold the iolock across the I/O and
+prevent further submissions until the unaligned I/O completes. This
+is heavy handed, but consistent with the current pre-submission
+serialization for unaligned direct writes.
+
+Signed-off-by: Brian Foster <bfoster@redhat.com>
+Reviewed-by: Allison Henderson <allison.henderson@oracle.com>
+Reviewed-by: Dave Chinner <dchinner@redhat.com>
+Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/xfs/xfs_file.c | 27 +++++++++++++++++----------
+ 1 file changed, 17 insertions(+), 10 deletions(-)
+
+diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
+index 61a5ad2600e8..10f75965243c 100644
+--- a/fs/xfs/xfs_file.c
++++ b/fs/xfs/xfs_file.c
+@@ -529,18 +529,17 @@ xfs_file_dio_aio_write(
+ count = iov_iter_count(from);
+
+ /*
+- * If we are doing unaligned IO, wait for all other IO to drain,
+- * otherwise demote the lock if we had to take the exclusive lock
+- * for other reasons in xfs_file_aio_write_checks.
++ * If we are doing unaligned IO, we can't allow any other overlapping IO
++ * in-flight at the same time or we risk data corruption. Wait for all
++ * other IO to drain before we submit. If the IO is aligned, demote the
++ * iolock if we had to take the exclusive lock in
++ * xfs_file_aio_write_checks() for other reasons.
+ */
+ if (unaligned_io) {
+- /* If we are going to wait for other DIO to finish, bail */
+- if (iocb->ki_flags & IOCB_NOWAIT) {
+- if (atomic_read(&inode->i_dio_count))
+- return -EAGAIN;
+- } else {
+- inode_dio_wait(inode);
+- }
++ /* unaligned dio always waits, bail */
++ if (iocb->ki_flags & IOCB_NOWAIT)
++ return -EAGAIN;
++ inode_dio_wait(inode);
+ } else if (iolock == XFS_IOLOCK_EXCL) {
+ xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
+ iolock = XFS_IOLOCK_SHARED;
+@@ -548,6 +547,14 @@ xfs_file_dio_aio_write(
+
+ trace_xfs_file_direct_write(ip, count, iocb->ki_pos);
+ ret = iomap_dio_rw(iocb, from, &xfs_iomap_ops, xfs_dio_write_end_io);
++
++ /*
++ * If unaligned, this is the only IO in-flight. If it has not yet
++ * completed, wait on it before we release the iolock to prevent
++ * subsequent overlapping IO.
++ */
++ if (ret == -EIOCBQUEUED && unaligned_io)
++ inode_dio_wait(inode);
+ out:
+ xfs_iunlock(ip, iolock);
+
+--
+2.20.1
+