4.9-stable patches

author Greg Kroah-Hartman <gregkh@linuxfoundation.org>

Tue, 24 Oct 2017 12:54:24 +0000 (14:54 +0200)

committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>

Tue, 24 Oct 2017 12:54:24 +0000 (14:54 +0200)
author Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Tue, 24 Oct 2017 12:54:24 +0000 (14:54 +0200)
committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Tue, 24 Oct 2017 12:54:24 +0000 (14:54 +0200)
diff --git a/queue-4.9/fs-xfs-use-ps-printk-format-for-direct-addresses.patch b/queue-4.9/fs-xfs-use-ps-printk-format-for-direct-addresses.patch

new file mode 100644 (file)

index 0000000..55bd9d5
--- /dev/null
+++ b/queue-4.9/fs-xfs-use-ps-printk-format-for-direct-addresses.patch
@@ -0,0 +1,34 @@
+From e150dcd459e1b441eaf08f341a986f04e61bf3b8 Mon Sep 17 00:00:00 2001
+From: Helge Deller <deller@gmx.de>
+Date: Mon, 18 Sep 2017 11:34:16 -0700
+Subject: fs/xfs: Use %pS printk format for direct addresses
+
+From: Helge Deller <deller@gmx.de>
+
+commit e150dcd459e1b441eaf08f341a986f04e61bf3b8 upstream.
+
+Use the %pS instead of the %pF printk format specifier for printing symbols
+from direct addresses. This is needed for the ia64, ppc64 and parisc64
+architectures.
+
+Signed-off-by: Helge Deller <deller@gmx.de>
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/xfs/xfs_error.c |    2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/fs/xfs/xfs_error.c
++++ b/fs/xfs/xfs_error.c
+@@ -167,7 +167,7 @@ xfs_verifier_error(
+ {
+       struct xfs_mount *mp = bp->b_target->bt_mount;
+ 
+-      xfs_alert(mp, "Metadata %s detected at %pF, %s block 0x%llx",
++      xfs_alert(mp, "Metadata %s detected at %pS, %s block 0x%llx",
+                 bp->b_error == -EFSBADCRC ? "CRC error" : "corruption",
+                 __return_address, bp->b_ops->name, bp->b_bn);
+ 
diff --git a/queue-4.9/series b/queue-4.9/series

index 302aa6f59d462951c27585d75f56f1d53e8de9ba..367f2797e1d5277426b012f0c4140d32803166bd 100644 (file)
--- a/queue-4.9/series
+++ b/queue-4.9/series
@@ -30,3 +30,19 @@ lib-digsig-fix-dereference-of-null-user_key_payload.patch
  keys-don-t-let-add_key-update-an-uninstantiated-key.patch
  pkcs7-prevent-null-pointer-dereference-since-sinfo-is-not-always-set.patch
  vmbus-fix-missing-signaling-in-hv_signal_on_read.patch
+xfs-don-t-unconditionally-clear-the-reflink-flag-on-zero-block-files.patch
+xfs-evict-cow-fork-extents-when-performing-finsert-fcollapse.patch
+fs-xfs-use-ps-printk-format-for-direct-addresses.patch
+xfs-report-zeroed-or-not-correctly-in-xfs_zero_range.patch
+xfs-update-i_size-after-unwritten-conversion-in-dio-completion.patch
+xfs-perag-initialization-should-only-touch-m_ag_max_usable-for-ag-0.patch
+xfs-capture-state-of-the-right-inode-in-xfs_iflush_done.patch
+xfs-always-swap-the-cow-forks-when-swapping-extents.patch
+xfs-handle-racy-aio-in-xfs_reflink_end_cow.patch
+xfs-don-t-log-uninitialised-fields-in-inode-structures.patch
+xfs-move-more-rt-specific-code-under-config_xfs_rt.patch
+xfs-don-t-change-inode-mode-if-acl-update-fails.patch
+xfs-reinit-btree-pointer-on-attr-tree-inactivation-walk.patch
+xfs-handle-error-if-xfs_btree_get_bufs-fails.patch
+xfs-cancel-dirty-pages-on-invalidation.patch
+xfs-trim-writepage-mapping-to-within-eof.patch
diff --git a/queue-4.9/xfs-always-swap-the-cow-forks-when-swapping-extents.patch b/queue-4.9/xfs-always-swap-the-cow-forks-when-swapping-extents.patch

new file mode 100644 (file)

index 0000000..8311f3a
--- /dev/null
+++ b/queue-4.9/xfs-always-swap-the-cow-forks-when-swapping-extents.patch
@@ -0,0 +1,58 @@
+From 52bfcdd7adbc26639bc7b2356ab9a3f5dad68ad6 Mon Sep 17 00:00:00 2001
+From: "Darrick J. Wong" <darrick.wong@oracle.com>
+Date: Mon, 18 Sep 2017 09:41:18 -0700
+Subject: xfs: always swap the cow forks when swapping extents
+
+From: Darrick J. Wong <darrick.wong@oracle.com>
+
+commit 52bfcdd7adbc26639bc7b2356ab9a3f5dad68ad6 upstream.
+
+Since the CoW fork exists as a secondary data structure to the data
+fork, we must always swap cow forks during swapext.  We also need to
+swap the extent counts and reset the cowblocks tags.
+
+Reviewed-by: Brian Foster <bfoster@redhat.com>
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/xfs/xfs_bmap_util.c |   24 ++++++++++++++++++++++--
+ 1 file changed, 22 insertions(+), 2 deletions(-)
+
+--- a/fs/xfs/xfs_bmap_util.c
++++ b/fs/xfs/xfs_bmap_util.c
+@@ -2106,11 +2106,31 @@ xfs_swap_extents(
+               ip->i_d.di_flags2 |= tip->i_d.di_flags2 & XFS_DIFLAG2_REFLINK;
+               tip->i_d.di_flags2 &= ~XFS_DIFLAG2_REFLINK;
+               tip->i_d.di_flags2 |= f & XFS_DIFLAG2_REFLINK;
++      }
++
++      /* Swap the cow forks. */
++      if (xfs_sb_version_hasreflink(&mp->m_sb)) {
++              xfs_extnum_t    extnum;
++
++              ASSERT(ip->i_cformat == XFS_DINODE_FMT_EXTENTS);
++              ASSERT(tip->i_cformat == XFS_DINODE_FMT_EXTENTS);
++
++              extnum = ip->i_cnextents;
++              ip->i_cnextents = tip->i_cnextents;
++              tip->i_cnextents = extnum;
++
+               cowfp = ip->i_cowfp;
+               ip->i_cowfp = tip->i_cowfp;
+               tip->i_cowfp = cowfp;
+-              xfs_inode_set_cowblocks_tag(ip);
+-              xfs_inode_set_cowblocks_tag(tip);
++
++              if (ip->i_cowfp && ip->i_cnextents)
++                      xfs_inode_set_cowblocks_tag(ip);
++              else
++                      xfs_inode_clear_cowblocks_tag(ip);
++              if (tip->i_cowfp && tip->i_cnextents)
++                      xfs_inode_set_cowblocks_tag(tip);
++              else
++                      xfs_inode_clear_cowblocks_tag(tip);
+       }
+ 
+       xfs_trans_log_inode(tp, ip,  src_log_flags);
diff --git a/queue-4.9/xfs-cancel-dirty-pages-on-invalidation.patch b/queue-4.9/xfs-cancel-dirty-pages-on-invalidation.patch

new file mode 100644 (file)

index 0000000..db79bb9
--- /dev/null
+++ b/queue-4.9/xfs-cancel-dirty-pages-on-invalidation.patch
@@ -0,0 +1,103 @@
+From 793d7dbe6d82a50b9d14bf992b9eaacb70a11ce6 Mon Sep 17 00:00:00 2001
+From: Dave Chinner <dchinner@redhat.com>
+Date: Fri, 13 Oct 2017 09:47:45 -0700
+Subject: xfs: cancel dirty pages on invalidation
+
+From: Dave Chinner <dchinner@redhat.com>
+
+commit 793d7dbe6d82a50b9d14bf992b9eaacb70a11ce6 upstream.
+
+Recently we've had warnings arise from the vm handing us pages
+without bufferheads attached to them. This should not ever occur
+in XFS, but we don't defend against it properly if it does. The only
+place where we remove bufferheads from a page is in
+xfs_vm_releasepage(), but we can't tell the difference here between
+"page is dirty so don't release" and "page is dirty but is being
+invalidated so release it".
+
+Some places that are invalidating pages ask for pages to be
+released and follow up afterward calling ->releasepage by checking
+whether the page was dirty and then aborting the invalidation. This
+is a possible vector for releasing buffers from a page but then
+leaving it in the mapping, so we really do need to avoid dirty pages
+in xfs_vm_releasepage().
+
+To differentiate between invalidated pages and normal pages, we need
+to clear the page dirty flag when invalidating the pages. This can
+be done through xfs_vm_invalidatepage(), and will result in
+xfs_vm_releasepage() seeing the page as clean which matches the
+bufferhead state on the page after calling block_invalidatepage().
+
+Hence we can re-add the page dirty check in xfs_vm_releasepage to
+catch the case where we might be releasing a page that is actually
+dirty and so should not have the bufferheads on it removed. This
+will remove one possible vector of "dirty page with no bufferheads"
+and so help narrow down the search for the root cause of that
+problem.
+
+Signed-Off-By: Dave Chinner <dchinner@redhat.com>
+Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/xfs/xfs_aops.c |   34 ++++++++++++++++++++++------------
+ 1 file changed, 22 insertions(+), 12 deletions(-)
+
+--- a/fs/xfs/xfs_aops.c
++++ b/fs/xfs/xfs_aops.c
+@@ -726,6 +726,14 @@ xfs_vm_invalidatepage(
+ {
+       trace_xfs_invalidatepage(page->mapping->host, page, offset,
+                                length);
++
++      /*
++       * If we are invalidating the entire page, clear the dirty state from it
++       * so that we can check for attempts to release dirty cached pages in
++       * xfs_vm_releasepage().
++       */
++      if (offset == 0 && length >= PAGE_SIZE)
++              cancel_dirty_page(page);
+       block_invalidatepage(page, offset, length);
+ }
+ 
+@@ -1181,25 +1189,27 @@ xfs_vm_releasepage(
+        * mm accommodates an old ext3 case where clean pages might not have had
+        * the dirty bit cleared. Thus, it can send actual dirty pages to
+        * ->releasepage() via shrink_active_list(). Conversely,
+-       * block_invalidatepage() can send pages that are still marked dirty
+-       * but otherwise have invalidated buffers.
++       * block_invalidatepage() can send pages that are still marked dirty but
++       * otherwise have invalidated buffers.
+        *
+        * We want to release the latter to avoid unnecessary buildup of the
+-       * LRU, skip the former and warn if we've left any lingering
+-       * delalloc/unwritten buffers on clean pages. Skip pages with delalloc
+-       * or unwritten buffers and warn if the page is not dirty. Otherwise
+-       * try to release the buffers.
++       * LRU, so xfs_vm_invalidatepage() clears the page dirty flag on pages
++       * that are entirely invalidated and need to be released.  Hence the
++       * only time we should get dirty pages here is through
++       * shrink_active_list() and so we can simply skip those now.
++       *
++       * warn if we've left any lingering delalloc/unwritten buffers on clean
++       * or invalidated pages we are about to release.
+        */
++      if (PageDirty(page))
++              return 0;
++
+       xfs_count_page_state(page, &delalloc, &unwritten);
+ 
+-      if (delalloc) {
+-              WARN_ON_ONCE(!PageDirty(page));
++      if (WARN_ON_ONCE(delalloc))
+               return 0;
+-      }
+-      if (unwritten) {
+-              WARN_ON_ONCE(!PageDirty(page));
++      if (WARN_ON_ONCE(unwritten))
+               return 0;
+-      }
+ 
+       return try_to_free_buffers(page);
+ }
diff --git a/queue-4.9/xfs-capture-state-of-the-right-inode-in-xfs_iflush_done.patch b/queue-4.9/xfs-capture-state-of-the-right-inode-in-xfs_iflush_done.patch

new file mode 100644 (file)

index 0000000..ec52d50
--- /dev/null
+++ b/queue-4.9/xfs-capture-state-of-the-right-inode-in-xfs_iflush_done.patch
@@ -0,0 +1,40 @@
+From 842f6e9f786226c58fcbd5ef80eadca72fdfe652 Mon Sep 17 00:00:00 2001
+From: Carlos Maiolino <cmaiolino@redhat.com>
+Date: Fri, 22 Sep 2017 11:47:46 -0700
+Subject: xfs: Capture state of the right inode in xfs_iflush_done
+
+From: Carlos Maiolino <cmaiolino@redhat.com>
+
+commit 842f6e9f786226c58fcbd5ef80eadca72fdfe652 upstream.
+
+My previous patch: d3a304b6292168b83b45d624784f973fdc1ca674 checks for the
+XFS_LI_FAILED flag in xfs_iflush_done, so the failed item can be properly
+resubmitted.
+
+In the loop scanning other inodes being completed, it should check the
+current item for the XFS_LI_FAILED, and not the initial one.
+
+The state of the initial inode is checked after the loop ends.
+
+Kudos to Eric for catching this.
+
+Signed-off-by: Carlos Maiolino <cmaiolino@redhat.com>
+Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/xfs/xfs_inode_item.c |    2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/fs/xfs/xfs_inode_item.c
++++ b/fs/xfs/xfs_inode_item.c
+@@ -745,7 +745,7 @@ xfs_iflush_done(
+                */
+               iip = INODE_ITEM(blip);
+               if ((iip->ili_logged && blip->li_lsn == iip->ili_flush_lsn) ||
+-                  lip->li_flags & XFS_LI_FAILED)
++                  (blip->li_flags & XFS_LI_FAILED))
+                       need_ail++;
+ 
+               blip = next;
diff --git a/queue-4.9/xfs-don-t-change-inode-mode-if-acl-update-fails.patch b/queue-4.9/xfs-don-t-change-inode-mode-if-acl-update-fails.patch

new file mode 100644 (file)

index 0000000..8c0ff50
--- /dev/null
+++ b/queue-4.9/xfs-don-t-change-inode-mode-if-acl-update-fails.patch
@@ -0,0 +1,72 @@
+From 67f2ffe31d1a683170c2ba0ecc643e42a5fdd397 Mon Sep 17 00:00:00 2001
+From: Dave Chinner <dchinner@redhat.com>
+Date: Mon, 9 Oct 2017 11:37:23 -0700
+Subject: xfs: don't change inode mode if ACL update fails
+
+From: Dave Chinner <dchinner@redhat.com>
+
+commit 67f2ffe31d1a683170c2ba0ecc643e42a5fdd397 upstream.
+
+If we get ENOSPC half way through setting the ACL, the inode mode
+can still be changed even though the ACL does not exist. Reorder the
+operation to only change the mode of the inode if the ACL is set
+correctly.
+
+Whilst this does not fix the problem with crash consistency (that requires
+attribute addition to be a deferred op) it does allow ENOSPC and other
+non-fatal errors when setting an xattr to be handled sanely.
+
+This fixes xfstests generic/449.
+
+Signed-Off-By: Dave Chinner <dchinner@redhat.com>
+Reviewed-by: Brian Foster <bfoster@redhat.com>
+Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/xfs/xfs_acl.c |   22 ++++++++++++++++------
+ 1 file changed, 16 insertions(+), 6 deletions(-)
+
+--- a/fs/xfs/xfs_acl.c
++++ b/fs/xfs/xfs_acl.c
+@@ -247,6 +247,8 @@ xfs_set_mode(struct inode *inode, umode_
+ int
+ xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
+ {
++      umode_t mode;
++      bool set_mode = false;
+       int error = 0;
+ 
+       if (!acl)
+@@ -257,16 +259,24 @@ xfs_set_acl(struct inode *inode, struct
+               return error;
+ 
+       if (type == ACL_TYPE_ACCESS) {
+-              umode_t mode;
+-
+               error = posix_acl_update_mode(inode, &mode, &acl);
+               if (error)
+                       return error;
+-              error = xfs_set_mode(inode, mode);
+-              if (error)
+-                      return error;
++              set_mode = true;
+       }
+ 
+  set_acl:
+-      return __xfs_set_acl(inode, acl, type);
++      error =  __xfs_set_acl(inode, acl, type);
++      if (error)
++              return error;
++
++      /*
++       * We set the mode after successfully updating the ACL xattr because the
++       * xattr update can fail at ENOSPC and we don't want to change the mode
++       * if the ACL update hasn't been applied.
++       */
++      if (set_mode)
++              error = xfs_set_mode(inode, mode);
++
++      return error;
+ }
diff --git a/queue-4.9/xfs-don-t-log-uninitialised-fields-in-inode-structures.patch b/queue-4.9/xfs-don-t-log-uninitialised-fields-in-inode-structures.patch

new file mode 100644 (file)

index 0000000..bb9121e
--- /dev/null
+++ b/queue-4.9/xfs-don-t-log-uninitialised-fields-in-inode-structures.patch
@@ -0,0 +1,226 @@
+From 20413e37d71befd02b5846acdaf5e2564dd1c38e Mon Sep 17 00:00:00 2001
+From: Dave Chinner <dchinner@redhat.com>
+Date: Mon, 9 Oct 2017 11:37:22 -0700
+Subject: xfs: Don't log uninitialised fields in inode structures
+
+From: Dave Chinner <dchinner@redhat.com>
+
+commit 20413e37d71befd02b5846acdaf5e2564dd1c38e upstream.
+
+Prevent kmemcheck from throwing warnings about reading uninitialised
+memory when formatting inodes into the incore log buffer. There are
+several issues here - we don't always log all the fields in the
+inode log format item, and we never log the inode's
+di_next_unlinked field.
+
+In the case of the inode log format item, this is exacerbated
+by the old xfs_inode_log_format structure padding issue. Hence make
+the padded, 64 bit aligned version of the structure the one we always
+use for formatting the log and get rid of the 64 bit variant. This
+means we'll always log the 64-bit version and so recovery only needs
+to convert from the unpadded 32 bit version from older 32 bit
+kernels.
+
+Signed-Off-By: Dave Chinner <dchinner@redhat.com>
+Tested-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
+Reviewed-by: Brian Foster <bfoster@redhat.com>
+Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+
+---
+ fs/xfs/libxfs/xfs_log_format.h |   27 ++++---------
+ fs/xfs/xfs_inode_item.c        |   82 ++++++++++++++++++++---------------------
+ fs/xfs/xfs_ondisk.h            |    2 -
+ 3 files changed, 50 insertions(+), 61 deletions(-)
+
+--- a/fs/xfs/libxfs/xfs_log_format.h
++++ b/fs/xfs/libxfs/xfs_log_format.h
+@@ -270,6 +270,7 @@ typedef struct xfs_inode_log_format {
+       __uint32_t              ilf_fields;     /* flags for fields logged */
+       __uint16_t              ilf_asize;      /* size of attr d/ext/root */
+       __uint16_t              ilf_dsize;      /* size of data/ext/root */
++      __uint32_t              ilf_pad;        /* pad for 64 bit boundary */
+       __uint64_t              ilf_ino;        /* inode number */
+       union {
+               __uint32_t      ilfu_rdev;      /* rdev value for dev inode*/
+@@ -280,29 +281,17 @@ typedef struct xfs_inode_log_format {
+       __int32_t               ilf_boffset;    /* off of inode in buffer */
+ } xfs_inode_log_format_t;
+ 
+-typedef struct xfs_inode_log_format_32 {
+-      __uint16_t              ilf_type;       /* inode log item type */
+-      __uint16_t              ilf_size;       /* size of this item */
+-      __uint32_t              ilf_fields;     /* flags for fields logged */
+-      __uint16_t              ilf_asize;      /* size of attr d/ext/root */
+-      __uint16_t              ilf_dsize;      /* size of data/ext/root */
+-      __uint64_t              ilf_ino;        /* inode number */
+-      union {
+-              __uint32_t      ilfu_rdev;      /* rdev value for dev inode*/
+-              uuid_t          ilfu_uuid;      /* mount point value */
+-      } ilf_u;
+-      __int64_t               ilf_blkno;      /* blkno of inode buffer */
+-      __int32_t               ilf_len;        /* len of inode buffer */
+-      __int32_t               ilf_boffset;    /* off of inode in buffer */
+-} __attribute__((packed)) xfs_inode_log_format_32_t;
+-
+-typedef struct xfs_inode_log_format_64 {
++/*
++ * Old 32 bit systems will log in this format without the 64 bit
++ * alignment padding. Recovery will detect this and convert it to the
++ * correct format.
++ */
++struct xfs_inode_log_format_32 {
+       __uint16_t              ilf_type;       /* inode log item type */
+       __uint16_t              ilf_size;       /* size of this item */
+       __uint32_t              ilf_fields;     /* flags for fields logged */
+       __uint16_t              ilf_asize;      /* size of attr d/ext/root */
+       __uint16_t              ilf_dsize;      /* size of data/ext/root */
+-      __uint32_t              ilf_pad;        /* pad for 64 bit boundary */
+       __uint64_t              ilf_ino;        /* inode number */
+       union {
+               __uint32_t      ilfu_rdev;      /* rdev value for dev inode*/
+@@ -311,7 +300,7 @@ typedef struct xfs_inode_log_format_64 {
+       __int64_t               ilf_blkno;      /* blkno of inode buffer */
+       __int32_t               ilf_len;        /* len of inode buffer */
+       __int32_t               ilf_boffset;    /* off of inode in buffer */
+-} xfs_inode_log_format_64_t;
++} __attribute__((packed));
+ 
+ 
+ /*
+--- a/fs/xfs/xfs_inode_item.c
++++ b/fs/xfs/xfs_inode_item.c
+@@ -364,6 +364,9 @@ xfs_inode_to_log_dinode(
+       to->di_dmstate = from->di_dmstate;
+       to->di_flags = from->di_flags;
+ 
++      /* log a dummy value to ensure log structure is fully initialised */
++      to->di_next_unlinked = NULLAGINO;
++
+       if (from->di_version == 3) {
+               to->di_changecount = inode->i_version;
+               to->di_crtime.t_sec = from->di_crtime.t_sec;
+@@ -404,6 +407,11 @@ xfs_inode_item_format_core(
+  * the second with the on-disk inode structure, and a possible third and/or
+  * fourth with the inode data/extents/b-tree root and inode attributes
+  * data/extents/b-tree root.
++ *
++ * Note: Always use the 64 bit inode log format structure so we don't
++ * leave an uninitialised hole in the format item on 64 bit systems. Log
++ * recovery on 32 bit systems handles this just fine, so there's no reason
+ * for not using and initialising the properly padded structure all the time.
+  */
+ STATIC void
+ xfs_inode_item_format(
+@@ -412,8 +420,8 @@ xfs_inode_item_format(
+ {
+       struct xfs_inode_log_item *iip = INODE_ITEM(lip);
+       struct xfs_inode        *ip = iip->ili_inode;
+-      struct xfs_inode_log_format *ilf;
+       struct xfs_log_iovec    *vecp = NULL;
++      struct xfs_inode_log_format *ilf;
+ 
+       ASSERT(ip->i_d.di_version > 1);
+ 
+@@ -425,7 +433,17 @@ xfs_inode_item_format(
+       ilf->ilf_boffset = ip->i_imap.im_boffset;
+       ilf->ilf_fields = XFS_ILOG_CORE;
+       ilf->ilf_size = 2; /* format + core */
+-      xlog_finish_iovec(lv, vecp, sizeof(struct xfs_inode_log_format));
++
++      /*
++       * make sure we don't leak uninitialised data into the log in the case
++       * when we don't log every field in the inode.
++       */
++      ilf->ilf_dsize = 0;
++      ilf->ilf_asize = 0;
++      ilf->ilf_pad = 0;
++      memset(&ilf->ilf_u.ilfu_uuid, 0, sizeof(ilf->ilf_u.ilfu_uuid));
++
++      xlog_finish_iovec(lv, vecp, sizeof(*ilf));
+ 
+       xfs_inode_item_format_core(ip, lv, &vecp);
+       xfs_inode_item_format_data_fork(iip, ilf, lv, &vecp);
+@@ -855,48 +873,30 @@ xfs_istale_done(
+ }
+ 
+ /*
+- * convert an xfs_inode_log_format struct from either 32 or 64 bit versions
+- * (which can have different field alignments) to the native version
++ * convert an xfs_inode_log_format struct from the old 32 bit version
++ * (which can have different field alignments) to the native 64 bit version
+  */
+ int
+ xfs_inode_item_format_convert(
+-      xfs_log_iovec_t         *buf,
+-      xfs_inode_log_format_t  *in_f)
++      struct xfs_log_iovec            *buf,
++      struct xfs_inode_log_format     *in_f)
+ {
+-      if (buf->i_len == sizeof(xfs_inode_log_format_32_t)) {
+-              xfs_inode_log_format_32_t *in_f32 = buf->i_addr;
++      struct xfs_inode_log_format_32  *in_f32 = buf->i_addr;
+ 
+-              in_f->ilf_type = in_f32->ilf_type;
+-              in_f->ilf_size = in_f32->ilf_size;
+-              in_f->ilf_fields = in_f32->ilf_fields;
+-              in_f->ilf_asize = in_f32->ilf_asize;
+-              in_f->ilf_dsize = in_f32->ilf_dsize;
+-              in_f->ilf_ino = in_f32->ilf_ino;
+-              /* copy biggest field of ilf_u */
+-              memcpy(in_f->ilf_u.ilfu_uuid.__u_bits,
+-                     in_f32->ilf_u.ilfu_uuid.__u_bits,
+-                     sizeof(uuid_t));
+-              in_f->ilf_blkno = in_f32->ilf_blkno;
+-              in_f->ilf_len = in_f32->ilf_len;
+-              in_f->ilf_boffset = in_f32->ilf_boffset;
+-              return 0;
+-      } else if (buf->i_len == sizeof(xfs_inode_log_format_64_t)){
+-              xfs_inode_log_format_64_t *in_f64 = buf->i_addr;
+-
+-              in_f->ilf_type = in_f64->ilf_type;
+-              in_f->ilf_size = in_f64->ilf_size;
+-              in_f->ilf_fields = in_f64->ilf_fields;
+-              in_f->ilf_asize = in_f64->ilf_asize;
+-              in_f->ilf_dsize = in_f64->ilf_dsize;
+-              in_f->ilf_ino = in_f64->ilf_ino;
+-              /* copy biggest field of ilf_u */
+-              memcpy(in_f->ilf_u.ilfu_uuid.__u_bits,
+-                     in_f64->ilf_u.ilfu_uuid.__u_bits,
+-                     sizeof(uuid_t));
+-              in_f->ilf_blkno = in_f64->ilf_blkno;
+-              in_f->ilf_len = in_f64->ilf_len;
+-              in_f->ilf_boffset = in_f64->ilf_boffset;
+-              return 0;
+-      }
+-      return -EFSCORRUPTED;
++      if (buf->i_len != sizeof(*in_f32))
++              return -EFSCORRUPTED;
++
++      in_f->ilf_type = in_f32->ilf_type;
++      in_f->ilf_size = in_f32->ilf_size;
++      in_f->ilf_fields = in_f32->ilf_fields;
++      in_f->ilf_asize = in_f32->ilf_asize;
++      in_f->ilf_dsize = in_f32->ilf_dsize;
++      in_f->ilf_ino = in_f32->ilf_ino;
++      /* copy biggest field of ilf_u */
++      memcpy(in_f->ilf_u.ilfu_uuid.__u_bits,
++             in_f32->ilf_u.ilfu_uuid.__u_bits, sizeof(uuid_t));
++      in_f->ilf_blkno = in_f32->ilf_blkno;
++      in_f->ilf_len = in_f32->ilf_len;
++      in_f->ilf_boffset = in_f32->ilf_boffset;
++      return 0;
+ }
+--- a/fs/xfs/xfs_ondisk.h
++++ b/fs/xfs/xfs_ondisk.h
+@@ -134,7 +134,7 @@ xfs_check_ondisk_structs(void)
+       XFS_CHECK_STRUCT_SIZE(struct xfs_icreate_log,           28);
+       XFS_CHECK_STRUCT_SIZE(struct xfs_ictimestamp,           8);
+       XFS_CHECK_STRUCT_SIZE(struct xfs_inode_log_format_32,   52);
+-      XFS_CHECK_STRUCT_SIZE(struct xfs_inode_log_format_64,   56);
++      XFS_CHECK_STRUCT_SIZE(struct xfs_inode_log_format,      56);
+       XFS_CHECK_STRUCT_SIZE(struct xfs_qoff_logformat,        20);
+       XFS_CHECK_STRUCT_SIZE(struct xfs_trans_header,          16);
+ }
diff --git a/queue-4.9/xfs-don-t-unconditionally-clear-the-reflink-flag-on-zero-block-files.patch b/queue-4.9/xfs-don-t-unconditionally-clear-the-reflink-flag-on-zero-block-files.patch

new file mode 100644 (file)

index 0000000..4af8675
--- /dev/null
+++ b/queue-4.9/xfs-don-t-unconditionally-clear-the-reflink-flag-on-zero-block-files.patch
@@ -0,0 +1,42 @@
+From cc6f77710a6de6210f9feda7cd53e2f5ee7a7e69 Mon Sep 17 00:00:00 2001
+From: "Darrick J. Wong" <darrick.wong@oracle.com>
+Date: Mon, 18 Sep 2017 09:41:16 -0700
+Subject: xfs: don't unconditionally clear the reflink flag on zero-block files
+
+From: Darrick J. Wong <darrick.wong@oracle.com>
+
+commit cc6f77710a6de6210f9feda7cd53e2f5ee7a7e69 upstream.
+
+If we have speculative cow preallocations hanging around in the cow
+fork, don't let a truncate operation clear the reflink flag because if
+we do then there's a chance we'll forget to free those extents when we
+destroy the incore inode.
+
+Reported-by: Amir Goldstein <amir73il@gmail.com>
+Reviewed-by: Carlos Maiolino <cmaiolino@redhat.com>
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/xfs/xfs_inode.c |    8 +++++---
+ 1 file changed, 5 insertions(+), 3 deletions(-)
+
+--- a/fs/xfs/xfs_inode.c
++++ b/fs/xfs/xfs_inode.c
+@@ -1632,10 +1632,12 @@ xfs_itruncate_extents(
+               goto out;
+ 
+       /*
+-       * Clear the reflink flag if we truncated everything.
++       * Clear the reflink flag if there are no data fork blocks and
++       * there are no extents staged in the cow fork.
+        */
+-      if (ip->i_d.di_nblocks == 0 && xfs_is_reflink_inode(ip)) {
+-              ip->i_d.di_flags2 &= ~XFS_DIFLAG2_REFLINK;
++      if (xfs_is_reflink_inode(ip) && ip->i_cnextents == 0) {
++              if (ip->i_d.di_nblocks == 0)
++                      ip->i_d.di_flags2 &= ~XFS_DIFLAG2_REFLINK;
+               xfs_inode_clear_cowblocks_tag(ip);
+       }
+ 
diff --git a/queue-4.9/xfs-evict-cow-fork-extents-when-performing-finsert-fcollapse.patch b/queue-4.9/xfs-evict-cow-fork-extents-when-performing-finsert-fcollapse.patch

new file mode 100644 (file)

index 0000000..9b6e05f
--- /dev/null
+++ b/queue-4.9/xfs-evict-cow-fork-extents-when-performing-finsert-fcollapse.patch
@@ -0,0 +1,46 @@
+From 3af423b03435c81036fa710623d3ae92fbe346a3 Mon Sep 17 00:00:00 2001
+From: "Darrick J. Wong" <darrick.wong@oracle.com>
+Date: Mon, 18 Sep 2017 09:41:17 -0700
+Subject: xfs: evict CoW fork extents when performing finsert/fcollapse
+
+From: Darrick J. Wong <darrick.wong@oracle.com>
+
+commit 3af423b03435c81036fa710623d3ae92fbe346a3 upstream.
+
+When we perform an finsert/fcollapse operation, cancel all the CoW
+extents for the affected file offset range so that they don't end up
+pointing to the wrong blocks.
+
+Reported-by: Amir Goldstein <amir73il@gmail.com>
+Reviewed-by: Carlos Maiolino <cmaiolino@redhat.com>
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/xfs/xfs_bmap_util.c |   14 +++++++++++++-
+ 1 file changed, 13 insertions(+), 1 deletion(-)
+
+--- a/fs/xfs/xfs_bmap_util.c
++++ b/fs/xfs/xfs_bmap_util.c
+@@ -1445,7 +1445,19 @@ xfs_shift_file_space(
+               return error;
+ 
+       /*
+-       * The extent shiting code works on extent granularity. So, if
++       * Clean out anything hanging around in the cow fork now that
++       * we've flushed all the dirty data out to disk to avoid having
++       * CoW extents at the wrong offsets.
++       */
++      if (xfs_is_reflink_inode(ip)) {
++              error = xfs_reflink_cancel_cow_range(ip, offset, NULLFILEOFF,
++                              true);
++              if (error)
++                      return error;
++      }
++
++      /*
++       * The extent shifting code works on extent granularity. So, if
+        * stop_fsb is not the starting block of extent, we need to split
+        * the extent at stop_fsb.
+        */
diff --git a/queue-4.9/xfs-handle-error-if-xfs_btree_get_bufs-fails.patch b/queue-4.9/xfs-handle-error-if-xfs_btree_get_bufs-fails.patch

new file mode 100644 (file)

index 0000000..b157b28
--- /dev/null
+++ b/queue-4.9/xfs-handle-error-if-xfs_btree_get_bufs-fails.patch
@@ -0,0 +1,60 @@
+From 93e8befc17f6d6ea92b0aee3741ceac8bca4590f Mon Sep 17 00:00:00 2001
+From: Eric Sandeen <sandeen@sandeen.net>
+Date: Mon, 9 Oct 2017 21:08:06 -0700
+Subject: xfs: handle error if xfs_btree_get_bufs fails
+
+From: Eric Sandeen <sandeen@sandeen.net>
+
+commit 93e8befc17f6d6ea92b0aee3741ceac8bca4590f upstream.
+
+Jason reported that a corrupted filesystem failed to replay
+the log with a metadata block out of bounds warning:
+
+XFS (dm-2): _xfs_buf_find: Block out of range: block 0x80270fff8, EOFS 0x9c40000
+
+_xfs_buf_find() and xfs_btree_get_bufs() return NULL if
+that happens, and then when xfs_alloc_fix_freelist() calls
+xfs_trans_binval() on that NULL bp, we oops with:
+
+BUG: unable to handle kernel NULL pointer dereference at 00000000000000f8
+
+We don't handle _xfs_buf_find errors very well, every
+caller higher up the stack gets to guess at why it failed.
+But we should at least handle it somehow, so return
+EFSCORRUPTED here.
+
+Reported-by: Jason L Tibbitts III <tibbs@math.uh.edu>
+Signed-off-by: Eric Sandeen <sandeen@redhat.com>
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/xfs/libxfs/xfs_alloc.c |    8 ++++++++
+ 1 file changed, 8 insertions(+)
+
+--- a/fs/xfs/libxfs/xfs_alloc.c
++++ b/fs/xfs/libxfs/xfs_alloc.c
+@@ -1579,6 +1579,10 @@ xfs_alloc_ag_vextent_small(
+ 
+                               bp = xfs_btree_get_bufs(args->mp, args->tp,
+                                       args->agno, fbno, 0);
++                              if (!bp) {
++                                      error = -EFSCORRUPTED;
++                                      goto error0;
++                              }
+                               xfs_trans_binval(args->tp, bp);
+                       }
+                       args->len = 1;
+@@ -2136,6 +2140,10 @@ xfs_alloc_fix_freelist(
+               if (error)
+                       goto out_agbp_relse;
+               bp = xfs_btree_get_bufs(mp, tp, args->agno, bno, 0);
++              if (!bp) {
++                      error = -EFSCORRUPTED;
++                      goto out_agbp_relse;
++              }
+               xfs_trans_binval(tp, bp);
+       }
+ 
diff --git a/queue-4.9/xfs-handle-racy-aio-in-xfs_reflink_end_cow.patch b/queue-4.9/xfs-handle-racy-aio-in-xfs_reflink_end_cow.patch

new file mode 100644 (file)

index 0000000..17d555b
--- /dev/null
+++ b/queue-4.9/xfs-handle-racy-aio-in-xfs_reflink_end_cow.patch
@@ -0,0 +1,48 @@
+From e12199f85d0ad1b04ce6c425ad93cd847fe930bb Mon Sep 17 00:00:00 2001
+From: Christoph Hellwig <hch@lst.de>
+Date: Tue, 3 Oct 2017 08:58:33 -0700
+Subject:  xfs: handle racy AIO in xfs_reflink_end_cow
+
+From: Christoph Hellwig <hch@lst.de>
+
+commit e12199f85d0ad1b04ce6c425ad93cd847fe930bb upstream.
+
+If we got two AIO writes into a COW area the second one might not have any
+COW extents left to convert.  Handle that case gracefully instead of
+triggering an assert or accessing beyond the bounds of the extent list.
+
+Signed-off-by: Christoph Hellwig <hch@lst.de>
+Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+
+---
+ fs/xfs/xfs_reflink.c |    9 ++++++++-
+ 1 file changed, 8 insertions(+), 1 deletion(-)
+
+--- a/fs/xfs/xfs_reflink.c
++++ b/fs/xfs/xfs_reflink.c
+@@ -767,7 +767,13 @@ xfs_reflink_end_cow(
+ 
+       /* If there is a hole at end_fsb - 1 go to the previous extent */
+       if (eof || got.br_startoff > end_fsb) {
+-              ASSERT(idx > 0);
++              /*
++               * In case of racing, overlapping AIO writes, no COW extents
++               * might be left by the time I/O completes for the loser of
++               * the race.  In that case we are done.
++               */
++              if (idx <= 0)
++                      goto out_cancel;
+               xfs_bmbt_get_all(xfs_iext_get_ext(ifp, --idx), &got);
+       }
+ 
+@@ -841,6 +847,7 @@ next_extent:
+ 
+ out_defer:
+       xfs_defer_cancel(&dfops);
++out_cancel:
+       xfs_trans_cancel(tp);
+       xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ out:
diff --git a/queue-4.9/xfs-move-more-rt-specific-code-under-config_xfs_rt.patch b/queue-4.9/xfs-move-more-rt-specific-code-under-config_xfs_rt.patch

new file mode 100644 (file)

index 0000000..88cbd71
--- /dev/null
+++ b/queue-4.9/xfs-move-more-rt-specific-code-under-config_xfs_rt.patch
@@ -0,0 +1,69 @@
+From bb9c2e5433250f5b477035dc478314f8e6dd5e36 Mon Sep 17 00:00:00 2001
+From: Dave Chinner <dchinner@redhat.com>
+Date: Mon, 9 Oct 2017 11:37:22 -0700
+Subject: xfs: move more RT specific code under CONFIG_XFS_RT
+
+From: Dave Chinner <dchinner@redhat.com>
+
+commit bb9c2e5433250f5b477035dc478314f8e6dd5e36 upstream.
+
+Various utility functions and interfaces that iterate internal
+devices try to reference the realtime device even when RT support is
+not compiled into the kernel.
+
+Make sure this code is excluded from the CONFIG_XFS_RT=n build,
+and where appropriate stub functions to return fatal errors if
+they ever get called when RT support is not present.
+
+Signed-Off-By: Dave Chinner <dchinner@redhat.com>
+Reviewed-by: Brian Foster <bfoster@redhat.com>
+Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/xfs/xfs_bmap_util.c |    2 ++
+ fs/xfs/xfs_bmap_util.h |   13 +++++++++++++
+ 2 files changed, 15 insertions(+)
+
+--- a/fs/xfs/xfs_bmap_util.c
++++ b/fs/xfs/xfs_bmap_util.c
+@@ -84,6 +84,7 @@ xfs_zero_extent(
+               GFP_NOFS, true);
+ }
+ 
++#ifdef CONFIG_XFS_RT
+ int
+ xfs_bmap_rtalloc(
+       struct xfs_bmalloca     *ap)    /* bmap alloc argument struct */
+@@ -195,6 +196,7 @@ xfs_bmap_rtalloc(
+       }
+       return 0;
+ }
++#endif /* CONFIG_XFS_RT */
+ 
+ /*
+  * Check if the endoff is outside the last extent. If so the caller will grow
+--- a/fs/xfs/xfs_bmap_util.h
++++ b/fs/xfs/xfs_bmap_util.h
+@@ -28,7 +28,20 @@ struct xfs_mount;
+ struct xfs_trans;
+ struct xfs_bmalloca;
+ 
++#ifdef CONFIG_XFS_RT
+ int   xfs_bmap_rtalloc(struct xfs_bmalloca *ap);
++#else /* !CONFIG_XFS_RT */
++/*
++ * Attempts to allocate RT extents when RT is disabled indicate corruption and
++ * should trigger a shutdown.
++ */
++static inline int
++xfs_bmap_rtalloc(struct xfs_bmalloca *ap)
++{
++      return -EFSCORRUPTED;
++}
++#endif /* CONFIG_XFS_RT */
++
+ int   xfs_bmap_eof(struct xfs_inode *ip, xfs_fileoff_t endoff,
+                    int whichfork, int *eof);
+ int   xfs_bmap_punch_delalloc_range(struct xfs_inode *ip,
diff --git a/queue-4.9/xfs-perag-initialization-should-only-touch-m_ag_max_usable-for-ag-0.patch b/queue-4.9/xfs-perag-initialization-should-only-touch-m_ag_max_usable-for-ag-0.patch

new file mode 100644 (file)

index 0000000..e161a52
--- /dev/null
+++ b/queue-4.9/xfs-perag-initialization-should-only-touch-m_ag_max_usable-for-ag-0.patch
@@ -0,0 +1,50 @@
+From 9789dd9e1d939232e8ff4c50ef8e75aa6781b3fb Mon Sep 17 00:00:00 2001
+From: "Darrick J. Wong" <darrick.wong@oracle.com>
+Date: Mon, 18 Sep 2017 09:42:09 -0700
+Subject: xfs: perag initialization should only touch m_ag_max_usable for AG 0
+
+From: Darrick J. Wong <darrick.wong@oracle.com>
+
+commit 9789dd9e1d939232e8ff4c50ef8e75aa6781b3fb upstream.
+
+We call __xfs_ag_resv_init to make a per-AG reservation for each AG.
+This makes the reservation per-AG, not per-filesystem.  Therefore, it
+is incorrect to adjust m_ag_max_usable for each AG.  Adjust it only
+when we're reserving AG 0's blocks so that we only do it once per fs.
+
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Reviewed-by: Brian Foster <bfoster@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/xfs/libxfs/xfs_ag_resv.c |   12 ++++++++++--
+ 1 file changed, 10 insertions(+), 2 deletions(-)
+
+--- a/fs/xfs/libxfs/xfs_ag_resv.c
++++ b/fs/xfs/libxfs/xfs_ag_resv.c
+@@ -157,7 +157,8 @@ __xfs_ag_resv_free(
+       trace_xfs_ag_resv_free(pag, type, 0);
+ 
+       resv = xfs_perag_resv(pag, type);
+-      pag->pag_mount->m_ag_max_usable += resv->ar_asked;
++      if (pag->pag_agno == 0)
++              pag->pag_mount->m_ag_max_usable += resv->ar_asked;
+       /*
+        * AGFL blocks are always considered "free", so whatever
+        * was reserved at mount time must be given back at umount.
+@@ -217,7 +218,14 @@ __xfs_ag_resv_init(
+               return error;
+       }
+ 
+-      mp->m_ag_max_usable -= ask;
++      /*
++       * Reduce the maximum per-AG allocation length by however much we're
++       * trying to reserve for an AG.  Since this is a filesystem-wide
++       * counter, we only make the adjustment for AG 0.  This assumes that
++       * there aren't any AGs hungrier for per-AG reservation than AG 0.
++       */
++      if (pag->pag_agno == 0)
++              mp->m_ag_max_usable -= ask;
+ 
+       resv = xfs_perag_resv(pag, type);
+       resv->ar_asked = ask;
diff --git a/queue-4.9/xfs-reinit-btree-pointer-on-attr-tree-inactivation-walk.patch b/queue-4.9/xfs-reinit-btree-pointer-on-attr-tree-inactivation-walk.patch

new file mode 100644 (file)

index 0000000..ab2abaf
--- /dev/null
+++ b/queue-4.9/xfs-reinit-btree-pointer-on-attr-tree-inactivation-walk.patch
@@ -0,0 +1,54 @@
+From f35c5e10c6ed6ba52a8dd8573924a80b6a02f03f Mon Sep 17 00:00:00 2001
+From: Brian Foster <bfoster@redhat.com>
+Date: Mon, 9 Oct 2017 11:38:56 -0700
+Subject: xfs: reinit btree pointer on attr tree inactivation walk
+
+From: Brian Foster <bfoster@redhat.com>
+
+commit f35c5e10c6ed6ba52a8dd8573924a80b6a02f03f upstream.
+
+xfs_attr3_root_inactive() walks the attr fork tree to invalidate the
+associated blocks. xfs_attr3_node_inactive() recursively descends
+from internal blocks to leaf blocks, caching block address values
+along the way to revisit parent blocks, locate the next entry and
+descend down that branch of the tree.
+
+The code that attempts to reread the parent block is unsafe because
+it assumes that the local xfs_da_node_entry pointer remains valid
+after an xfs_trans_brelse() and re-read of the parent buffer. Under
+heavy memory pressure, it is possible that the buffer has been
+reclaimed and reallocated by the time the parent block is reread.
+This means that 'btree' can point to an invalid memory address, lead
+to a random/garbage value for child_fsb and cause the subsequent
+read of the attr fork to go off the rails and return a NULL buffer
+for an attr fork offset that is most likely not allocated.
+
+Note that this problem can be manufactured by setting
+XFS_ATTR_BTREE_REF to 0 to prevent LRU caching of attr buffers,
+creating a file with a multi-level attr fork and removing it to
+trigger inactivation.
+
+To address this problem, reinit the node/btree pointers to the
+parent buffer after it has been re-read. This ensures btree points
+to a valid record and allows the walk to proceed.
+
+Signed-off-by: Brian Foster <bfoster@redhat.com>
+Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/xfs/xfs_attr_inactive.c |    2 ++
+ 1 file changed, 2 insertions(+)
+
+--- a/fs/xfs/xfs_attr_inactive.c
++++ b/fs/xfs/xfs_attr_inactive.c
+@@ -302,6 +302,8 @@ xfs_attr3_node_inactive(
+                                                &bp, XFS_ATTR_FORK);
+                       if (error)
+                               return error;
++                      node = bp->b_addr;
++                      btree = dp->d_ops->node_tree_p(node);
+                       child_fsb = be32_to_cpu(btree[i + 1].before);
+                       xfs_trans_brelse(*trans, bp);
+               }
diff --git a/queue-4.9/xfs-report-zeroed-or-not-correctly-in-xfs_zero_range.patch b/queue-4.9/xfs-report-zeroed-or-not-correctly-in-xfs_zero_range.patch

new file mode 100644 (file)

index 0000000..e977453
--- /dev/null
+++ b/queue-4.9/xfs-report-zeroed-or-not-correctly-in-xfs_zero_range.patch
@@ -0,0 +1,36 @@
+From d20a5e3851969fa685f118a80e4df670255a4e8d Mon Sep 17 00:00:00 2001
+From: Eryu Guan <eguan@redhat.com>
+Date: Mon, 18 Sep 2017 11:39:23 -0700
+Subject: xfs: report zeroed or not correctly in xfs_zero_range()
+
+From: Eryu Guan <eguan@redhat.com>
+
+commit d20a5e3851969fa685f118a80e4df670255a4e8d upstream.
+
+The 'did_zero' param of xfs_zero_range() was not passed to
+iomap_zero_range() correctly. This was introduced by commit
+7bb41db3ea16 ("xfs: handle 64-bit length in xfs_iozero"), and found
+by code inspection.
+
+Signed-off-by: Eryu Guan <eguan@redhat.com>
+Reviewed-by: Carlos Maiolino <cmaiolino@redhat.com>
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/xfs/xfs_file.c |    2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/fs/xfs/xfs_file.c
++++ b/fs/xfs/xfs_file.c
+@@ -92,7 +92,7 @@ xfs_zero_range(
+       xfs_off_t               count,
+       bool                    *did_zero)
+ {
+-      return iomap_zero_range(VFS_I(ip), pos, count, NULL, &xfs_iomap_ops);
++      return iomap_zero_range(VFS_I(ip), pos, count, did_zero, &xfs_iomap_ops);
+ }
+ 
+ int
diff --git a/queue-4.9/xfs-trim-writepage-mapping-to-within-eof.patch b/queue-4.9/xfs-trim-writepage-mapping-to-within-eof.patch

new file mode 100644 (file)

index 0000000..f48e07a
--- /dev/null
+++ b/queue-4.9/xfs-trim-writepage-mapping-to-within-eof.patch
@@ -0,0 +1,119 @@
+From 40214d128e07dd21bb07a8ed6a7fe2f911281ab2 Mon Sep 17 00:00:00 2001
+From: Brian Foster <bfoster@redhat.com>
+Date: Fri, 13 Oct 2017 09:47:46 -0700
+Subject: xfs: trim writepage mapping to within eof
+
+From: Brian Foster <bfoster@redhat.com>
+
+commit 40214d128e07dd21bb07a8ed6a7fe2f911281ab2 upstream.
+
+The writeback rework in commit fbcc02561359 ("xfs: Introduce
+writeback context for writepages") introduced a subtle change in
+behavior with regard to the block mapping used across the
+->writepages() sequence. The previous xfs_cluster_write() code would
+only flush pages up to EOF at the time of the writepage, thus
+ensuring that any pages due to file-extending writes would be
+handled on a separate cycle and with a new, updated block mapping.
+
+The updated code establishes a block mapping in xfs_writepage_map()
+that could extend beyond EOF if the file has post-eof preallocation.
+Because we now use the generic writeback infrastructure and pass the
+cached mapping to each writepage call, there is no implicit EOF
+limit in place. If eofblocks trimming occurs during ->writepages(),
+any post-eof portion of the cached mapping becomes invalid. The
+eofblocks code has no means to serialize against writeback because
+there are no pages associated with post-eof blocks. Therefore if an
+eofblocks trim occurs and is followed by a file-extending buffered
+write, not only has the mapping become invalid, but we could end up
+writing a page to disk based on the invalid mapping.
+
+Consider the following sequence of events:
+
+- A buffered write creates a delalloc extent and post-eof
+  speculative preallocation.
+- Writeback starts and on the first writepage cycle, the delalloc
+  extent is converted to real blocks (including the post-eof blocks)
+  and the mapping is cached.
+- The file is closed and xfs_release() trims post-eof blocks. The
+  cached writeback mapping is now invalid.
+- Another buffered write appends to the file with a delalloc extent.
+- The concurrent writeback cycle picks up the just written page
+  because the writeback range end is LLONG_MAX. xfs_writepage_map()
+  attributes it to the (now invalid) cached mapping and writes the
+  data to an incorrect location on disk (and where the file offset is
+  still backed by a delalloc extent).
+
+This problem is reproduced by xfstests test generic/464, which
+triggers racing writes, appends, open/closes and writeback requests.
+
+To address this problem, trim the mapping used during writeback to
+within EOF when the mapping is validated. This ensures the mapping
+is revalidated for any pages encountered beyond EOF as of the time
+the current mapping was cached or last validated.
+
+Reported-by: Eryu Guan <eguan@redhat.com>
+Diagnosed-by: Eryu Guan <eguan@redhat.com>
+Signed-off-by: Brian Foster <bfoster@redhat.com>
+Reviewed-by: Dave Chinner <dchinner@redhat.com>
+Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/xfs/libxfs/xfs_bmap.c |   11 +++++++++++
+ fs/xfs/libxfs/xfs_bmap.h |    1 +
+ fs/xfs/xfs_aops.c        |   13 +++++++++++++
+ 3 files changed, 25 insertions(+)
+
+--- a/fs/xfs/libxfs/xfs_bmap.c
++++ b/fs/xfs/libxfs/xfs_bmap.c
+@@ -4057,6 +4057,17 @@ xfs_trim_extent(
+       }
+ }
+ 
++/* trim extent to within eof */
++void
++xfs_trim_extent_eof(
++      struct xfs_bmbt_irec    *irec,
++      struct xfs_inode        *ip)
++
++{
++      xfs_trim_extent(irec, 0, XFS_B_TO_FSB(ip->i_mount,
++                                            i_size_read(VFS_I(ip))));
++}
++
+ /*
+  * Trim the returned map to the required bounds
+  */
+--- a/fs/xfs/libxfs/xfs_bmap.h
++++ b/fs/xfs/libxfs/xfs_bmap.h
+@@ -196,6 +196,7 @@ void       xfs_bmap_trace_exlist(struct xfs_in
+ 
+ void  xfs_trim_extent(struct xfs_bmbt_irec *irec, xfs_fileoff_t bno,
+               xfs_filblks_t len);
++void  xfs_trim_extent_eof(struct xfs_bmbt_irec *, struct xfs_inode *);
+ int   xfs_bmap_add_attrfork(struct xfs_inode *ip, int size, int rsvd);
+ void  xfs_bmap_local_to_extents_empty(struct xfs_inode *ip, int whichfork);
+ void  xfs_bmap_add_free(struct xfs_mount *mp, struct xfs_defer_ops *dfops,
+--- a/fs/xfs/xfs_aops.c
++++ b/fs/xfs/xfs_aops.c
+@@ -438,6 +438,19 @@ xfs_imap_valid(
+ {
+       offset >>= inode->i_blkbits;
+ 
++      /*
++       * We have to make sure the cached mapping is within EOF to protect
++       * against eofblocks trimming on file release leaving us with a stale
++       * mapping. Otherwise, a page for a subsequent file extending buffered
++       * write could get picked up by this writeback cycle and written to the
++       * wrong blocks.
++       *
++       * Note that what we really want here is a generic mapping invalidation
++       * mechanism to protect us from arbitrary extent modifying contexts, not
++       * just eofblocks.
++       */
++      xfs_trim_extent_eof(imap, XFS_I(inode));
++
+       return offset >= imap->br_startoff &&
+               offset < imap->br_startoff + imap->br_blockcount;
+ }
diff --git a/queue-4.9/xfs-update-i_size-after-unwritten-conversion-in-dio-completion.patch b/queue-4.9/xfs-update-i_size-after-unwritten-conversion-in-dio-completion.patch

new file mode 100644 (file)

index 0000000..fc7d6e6
--- /dev/null
+++ b/queue-4.9/xfs-update-i_size-after-unwritten-conversion-in-dio-completion.patch
@@ -0,0 +1,140 @@
+From ee70daaba82d70766d0723b743d9fdeb3b06102a Mon Sep 17 00:00:00 2001
+From: Eryu Guan <eguan@redhat.com>
+Date: Thu, 21 Sep 2017 11:26:18 -0700
+Subject: xfs: update i_size after unwritten conversion in dio completion
+
+From: Eryu Guan <eguan@redhat.com>
+
+commit ee70daaba82d70766d0723b743d9fdeb3b06102a upstream.
+
+Since commit d531d91d6990 ("xfs: always use unwritten extents for
+direct I/O writes"), we start allocating unwritten extents for all
+direct writes to allow appending aio in XFS.
+
+But for dio writes that could extend file size we update the in-core
+inode size first, then convert the unwritten extents to real
+allocations at dio completion time in xfs_dio_write_end_io(). Thus a
+racing direct read could see the new i_size and find the unwritten
+extents first and read zeros instead of actual data, if the direct
+writer also takes a shared iolock.
+
+Fix it by updating the in-core inode size after the unwritten extent
+conversion. To do this, introduce a new boolean argument to
+xfs_iomap_write_unwritten() to tell if we want to update in-core
+i_size or not.
+
+Suggested-by: Brian Foster <bfoster@redhat.com>
+Reviewed-by: Brian Foster <bfoster@redhat.com>
+Signed-off-by: Eryu Guan <eguan@redhat.com>
+Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+[hch: backported to the old direct I/O code before Linux 4.10]
+Signed-off-by: Christoph Hellwig <hch@lst.de>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/xfs_aops.c  |   25 +++++++++++++++++--------
+ fs/xfs/xfs_iomap.c |    7 +++++--
+ fs/xfs/xfs_iomap.h |    2 +-
+ fs/xfs/xfs_pnfs.c  |    2 +-
+ 4 files changed, 24 insertions(+), 12 deletions(-)
+
+--- a/fs/xfs/xfs_aops.c
++++ b/fs/xfs/xfs_aops.c
+@@ -335,7 +335,8 @@ xfs_end_io(
+               error = xfs_reflink_end_cow(ip, offset, size);
+               break;
+       case XFS_IO_UNWRITTEN:
+-              error = xfs_iomap_write_unwritten(ip, offset, size);
++              /* writeback should never update isize */
++              error = xfs_iomap_write_unwritten(ip, offset, size, false);
+               break;
+       default:
+               ASSERT(!xfs_ioend_is_append(ioend) || ioend->io_append_trans);
+@@ -1532,6 +1533,21 @@ xfs_end_io_direct_write(
+               return 0;
+       }
+ 
++      if (flags & XFS_DIO_FLAG_COW)
++              error = xfs_reflink_end_cow(ip, offset, size);
++
++      /*
++       * Unwritten conversion updates the in-core isize after extent
++       * conversion but before updating the on-disk size. Updating isize any
++       * earlier allows a racing dio read to find unwritten extents before
++       * they are converted.
++       */
++      if (flags & XFS_DIO_FLAG_UNWRITTEN) {
++              trace_xfs_end_io_direct_write_unwritten(ip, offset, size);
++
++              return xfs_iomap_write_unwritten(ip, offset, size, true);
++      }
++
+       /*
+        * We need to update the in-core inode size here so that we don't end up
+        * with the on-disk inode size being outside the in-core inode size. We
+@@ -1548,13 +1564,6 @@ xfs_end_io_direct_write(
+               i_size_write(inode, offset + size);
+       spin_unlock(&ip->i_flags_lock);
+ 
+-      if (flags & XFS_DIO_FLAG_COW)
+-              error = xfs_reflink_end_cow(ip, offset, size);
+-      if (flags & XFS_DIO_FLAG_UNWRITTEN) {
+-              trace_xfs_end_io_direct_write_unwritten(ip, offset, size);
+-
+-              error = xfs_iomap_write_unwritten(ip, offset, size);
+-      }
+       if (flags & XFS_DIO_FLAG_APPEND) {
+               trace_xfs_end_io_direct_write_append(ip, offset, size);
+ 
+--- a/fs/xfs/xfs_iomap.c
++++ b/fs/xfs/xfs_iomap.c
+@@ -836,7 +836,8 @@ int
+ xfs_iomap_write_unwritten(
+       xfs_inode_t     *ip,
+       xfs_off_t       offset,
+-      xfs_off_t       count)
++      xfs_off_t       count,
++      bool            update_isize)
+ {
+       xfs_mount_t     *mp = ip->i_mount;
+       xfs_fileoff_t   offset_fsb;
+@@ -847,6 +848,7 @@ xfs_iomap_write_unwritten(
+       xfs_trans_t     *tp;
+       xfs_bmbt_irec_t imap;
+       struct xfs_defer_ops dfops;
++      struct inode    *inode = VFS_I(ip);
+       xfs_fsize_t     i_size;
+       uint            resblks;
+       int             error;
+@@ -906,7 +908,8 @@ xfs_iomap_write_unwritten(
+               i_size = XFS_FSB_TO_B(mp, offset_fsb + count_fsb);
+               if (i_size > offset + count)
+                       i_size = offset + count;
+-
++              if (update_isize && i_size > i_size_read(inode))
++                      i_size_write(inode, i_size);
+               i_size = xfs_new_eof(ip, i_size);
+               if (i_size) {
+                       ip->i_d.di_size = i_size;
+--- a/fs/xfs/xfs_iomap.h
++++ b/fs/xfs/xfs_iomap.h
+@@ -27,7 +27,7 @@ int xfs_iomap_write_direct(struct xfs_in
+                       struct xfs_bmbt_irec *, int);
+ int xfs_iomap_write_allocate(struct xfs_inode *, int, xfs_off_t,
+                       struct xfs_bmbt_irec *);
+-int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, xfs_off_t);
++int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, xfs_off_t, bool);
+ 
+ void xfs_bmbt_to_iomap(struct xfs_inode *, struct iomap *,
+               struct xfs_bmbt_irec *);
+--- a/fs/xfs/xfs_pnfs.c
++++ b/fs/xfs/xfs_pnfs.c
+@@ -279,7 +279,7 @@ xfs_fs_commit_blocks(
+                                       (end - 1) >> PAGE_SHIFT);
+               WARN_ON_ONCE(error);
+ 
+-              error = xfs_iomap_write_unwritten(ip, start, length);
++              error = xfs_iomap_write_unwritten(ip, start, length, false);
+               if (error)
+                       goto out_drop_iolock;
+       }
author	Greg Kroah-Hartman <gregkh@linuxfoundation.org>
	Tue, 24 Oct 2017 12:54:24 +0000 (14:54 +0200)
committer	Greg Kroah-Hartman <gregkh@linuxfoundation.org>
	Tue, 24 Oct 2017 12:54:24 +0000 (14:54 +0200)
queue-4.9/fs-xfs-use-ps-printk-format-for-direct-addresses.patch	[new file with mode: 0644]	patch \| blob
queue-4.9/series		patch \| blob \| blame \| history
queue-4.9/xfs-always-swap-the-cow-forks-when-swapping-extents.patch	[new file with mode: 0644]	patch \| blob
queue-4.9/xfs-cancel-dirty-pages-on-invalidation.patch	[new file with mode: 0644]	patch \| blob
queue-4.9/xfs-capture-state-of-the-right-inode-in-xfs_iflush_done.patch	[new file with mode: 0644]	patch \| blob
queue-4.9/xfs-don-t-change-inode-mode-if-acl-update-fails.patch	[new file with mode: 0644]	patch \| blob
queue-4.9/xfs-don-t-log-uninitialised-fields-in-inode-structures.patch	[new file with mode: 0644]	patch \| blob
queue-4.9/xfs-don-t-unconditionally-clear-the-reflink-flag-on-zero-block-files.patch	[new file with mode: 0644]	patch \| blob
queue-4.9/xfs-evict-cow-fork-extents-when-performing-finsert-fcollapse.patch	[new file with mode: 0644]	patch \| blob
queue-4.9/xfs-handle-error-if-xfs_btree_get_bufs-fails.patch	[new file with mode: 0644]	patch \| blob
queue-4.9/xfs-handle-racy-aio-in-xfs_reflink_end_cow.patch	[new file with mode: 0644]	patch \| blob
queue-4.9/xfs-move-more-rt-specific-code-under-config_xfs_rt.patch	[new file with mode: 0644]	patch \| blob
queue-4.9/xfs-perag-initialization-should-only-touch-m_ag_max_usable-for-ag-0.patch	[new file with mode: 0644]	patch \| blob
queue-4.9/xfs-reinit-btree-pointer-on-attr-tree-inactivation-walk.patch	[new file with mode: 0644]	patch \| blob
queue-4.9/xfs-report-zeroed-or-not-correctly-in-xfs_zero_range.patch	[new file with mode: 0644]	patch \| blob
queue-4.9/xfs-trim-writepage-mapping-to-within-eof.patch	[new file with mode: 0644]	patch \| blob
queue-4.9/xfs-update-i_size-after-unwritten-conversion-in-dio-completion.patch	[new file with mode: 0644]	patch \| blob