4.10-stable patches
author Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Sat, 1 Apr 2017 16:40:40 +0000 (18:40 +0200)
committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Sat, 1 Apr 2017 16:40:40 +0000 (18:40 +0200)
added patches:
xfs-pull-up-iolock-from-xfs_free_eofblocks.patch
xfs-sync-eofblocks-scans-under-iolock-are-livelock-prone.patch

queue-4.10/series
queue-4.10/xfs-pull-up-iolock-from-xfs_free_eofblocks.patch [new file with mode: 0644]
queue-4.10/xfs-sync-eofblocks-scans-under-iolock-are-livelock-prone.patch [new file with mode: 0644]

diff --git a/queue-4.10/series b/queue-4.10/series
index 68ee3f217d81469fad3b210e78463d3e5ce07876..2d2ac804b394e3af98af5983c9f5b38a8a675650 100644
--- a/queue-4.10/series
+++ b/queue-4.10/series
@@ -1,2 +1,4 @@
 libceph-force-gfp_noio-for-socket-allocations.patch
 kvm-nvmx-fix-nested-ept-detection.patch
+xfs-pull-up-iolock-from-xfs_free_eofblocks.patch
+xfs-sync-eofblocks-scans-under-iolock-are-livelock-prone.patch
diff --git a/queue-4.10/xfs-pull-up-iolock-from-xfs_free_eofblocks.patch b/queue-4.10/xfs-pull-up-iolock-from-xfs_free_eofblocks.patch
new file mode 100644
index 0000000..3a06e7b
--- /dev/null
+++ b/queue-4.10/xfs-pull-up-iolock-from-xfs_free_eofblocks.patch
@@ -0,0 +1,246 @@
+From a36b926180cda375ac2ec89e1748b47137cfc51c Mon Sep 17 00:00:00 2001
+From: Brian Foster <bfoster@redhat.com>
+Date: Fri, 27 Jan 2017 23:22:55 -0800
+Subject: xfs: pull up iolock from xfs_free_eofblocks()
+
+From: Brian Foster <bfoster@redhat.com>
+
+commit a36b926180cda375ac2ec89e1748b47137cfc51c upstream.
+
+xfs_free_eofblocks() requires the IOLOCK_EXCL lock, but is called from
+different contexts where the lock may or may not be held. The
+need_iolock parameter exists for this reason, to indicate whether
+xfs_free_eofblocks() must acquire the iolock itself before it can
+proceed.
+
+This is ugly and confusing. Simplify the semantics of
+xfs_free_eofblocks() to require the caller to acquire the iolock
+appropriately and kill the need_iolock parameter. While here, the mp
+param can be removed as well, since the xfs_mount is accessible from
+the xfs_inode structure. This patch does not change behavior.
+
+Signed-off-by: Brian Foster <bfoster@redhat.com>
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/xfs/xfs_bmap_util.c |   41 +++++++++++++++------------------------
+ fs/xfs/xfs_bmap_util.h |    3 --
+ fs/xfs/xfs_icache.c    |   24 ++++++++++++++---------
+ fs/xfs/xfs_inode.c     |   51 ++++++++++++++++++++++++++-----------------------
+ 4 files changed, 60 insertions(+), 59 deletions(-)
+
+--- a/fs/xfs/xfs_bmap_util.c
++++ b/fs/xfs/xfs_bmap_util.c
+@@ -917,17 +917,18 @@ xfs_can_free_eofblocks(struct xfs_inode
+  */
+ int
+ xfs_free_eofblocks(
+-      xfs_mount_t     *mp,
+-      xfs_inode_t     *ip,
+-      bool            need_iolock)
++      struct xfs_inode        *ip)
+ {
+-      xfs_trans_t     *tp;
+-      int             error;
+-      xfs_fileoff_t   end_fsb;
+-      xfs_fileoff_t   last_fsb;
+-      xfs_filblks_t   map_len;
+-      int             nimaps;
+-      xfs_bmbt_irec_t imap;
++      struct xfs_trans        *tp;
++      int                     error;
++      xfs_fileoff_t           end_fsb;
++      xfs_fileoff_t           last_fsb;
++      xfs_filblks_t           map_len;
++      int                     nimaps;
++      struct xfs_bmbt_irec    imap;
++      struct xfs_mount        *mp = ip->i_mount;
++
++      ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
+       /*
+        * Figure out if there are any blocks beyond the end
+@@ -944,6 +945,10 @@ xfs_free_eofblocks(
+       error = xfs_bmapi_read(ip, end_fsb, map_len, &imap, &nimaps, 0);
+       xfs_iunlock(ip, XFS_ILOCK_SHARED);
++      /*
++       * If there are blocks after the end of file, truncate the file to its
++       * current size to free them up.
++       */
+       if (!error && (nimaps != 0) &&
+           (imap.br_startblock != HOLESTARTBLOCK ||
+            ip->i_delayed_blks)) {
+@@ -954,22 +959,10 @@ xfs_free_eofblocks(
+               if (error)
+                       return error;
+-              /*
+-               * There are blocks after the end of file.
+-               * Free them up now by truncating the file to
+-               * its current size.
+-               */
+-              if (need_iolock) {
+-                      if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL))
+-                              return -EAGAIN;
+-              }
+-
+               error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0,
+                               &tp);
+               if (error) {
+                       ASSERT(XFS_FORCED_SHUTDOWN(mp));
+-                      if (need_iolock)
+-                              xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+                       return error;
+               }
+@@ -997,8 +990,6 @@ xfs_free_eofblocks(
+               }
+               xfs_iunlock(ip, XFS_ILOCK_EXCL);
+-              if (need_iolock)
+-                      xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+       }
+       return error;
+ }
+@@ -1415,7 +1406,7 @@ xfs_shift_file_space(
+        * into the accessible region of the file.
+        */
+       if (xfs_can_free_eofblocks(ip, true)) {
+-              error = xfs_free_eofblocks(mp, ip, false);
++              error = xfs_free_eofblocks(ip);
+               if (error)
+                       return error;
+       }
+--- a/fs/xfs/xfs_bmap_util.h
++++ b/fs/xfs/xfs_bmap_util.h
+@@ -63,8 +63,7 @@ int  xfs_insert_file_space(struct xfs_ino
+ /* EOF block manipulation functions */
+ bool  xfs_can_free_eofblocks(struct xfs_inode *ip, bool force);
+-int   xfs_free_eofblocks(struct xfs_mount *mp, struct xfs_inode *ip,
+-                         bool need_iolock);
++int   xfs_free_eofblocks(struct xfs_inode *ip);
+ int   xfs_swap_extents(struct xfs_inode *ip, struct xfs_inode *tip,
+                        struct xfs_swapext *sx);
+--- a/fs/xfs/xfs_icache.c
++++ b/fs/xfs/xfs_icache.c
+@@ -1322,7 +1322,7 @@ xfs_inode_free_eofblocks(
+       int                     flags,
+       void                    *args)
+ {
+-      int ret;
++      int ret = 0;
+       struct xfs_eofblocks *eofb = args;
+       bool need_iolock = true;
+       int match;
+@@ -1358,19 +1358,25 @@ xfs_inode_free_eofblocks(
+                       return 0;
+               /*
+-               * A scan owner implies we already hold the iolock. Skip it in
+-               * xfs_free_eofblocks() to avoid deadlock. This also eliminates
+-               * the possibility of EAGAIN being returned.
++               * A scan owner implies we already hold the iolock. Skip it here
++               * to avoid deadlock.
+                */
+               if (eofb->eof_scan_owner == ip->i_ino)
+                       need_iolock = false;
+       }
+-      ret = xfs_free_eofblocks(ip->i_mount, ip, need_iolock);
+-
+-      /* don't revisit the inode if we're not waiting */
+-      if (ret == -EAGAIN && !(flags & SYNC_WAIT))
+-              ret = 0;
++      /*
++       * If the caller is waiting, return -EAGAIN to keep the background
++       * scanner moving and revisit the inode in a subsequent pass.
++       */
++      if (need_iolock && !xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) {
++              if (flags & SYNC_WAIT)
++                      ret = -EAGAIN;
++              return ret;
++      }
++      ret = xfs_free_eofblocks(ip);
++      if (need_iolock)
++              xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+       return ret;
+ }
+--- a/fs/xfs/xfs_inode.c
++++ b/fs/xfs/xfs_inode.c
+@@ -1692,32 +1692,34 @@ xfs_release(
+       if (xfs_can_free_eofblocks(ip, false)) {
+               /*
++               * Check if the inode is being opened, written and closed
++               * frequently and we have delayed allocation blocks outstanding
++               * (e.g. streaming writes from the NFS server), truncating the
++               * blocks past EOF will cause fragmentation to occur.
++               *
++               * In this case don't do the truncation, but we have to be
++               * careful how we detect this case. Blocks beyond EOF show up as
++               * i_delayed_blks even when the inode is clean, so we need to
++               * truncate them away first before checking for a dirty release.
++               * Hence on the first dirty close we will still remove the
++               * speculative allocation, but after that we will leave it in
++               * place.
++               */
++              if (xfs_iflags_test(ip, XFS_IDIRTY_RELEASE))
++                      return 0;
++              /*
+                * If we can't get the iolock just skip truncating the blocks
+                * past EOF because we could deadlock with the mmap_sem
+-               * otherwise.  We'll get another chance to drop them once the
++               * otherwise. We'll get another chance to drop them once the
+                * last reference to the inode is dropped, so we'll never leak
+                * blocks permanently.
+-               *
+-               * Further, check if the inode is being opened, written and
+-               * closed frequently and we have delayed allocation blocks
+-               * outstanding (e.g. streaming writes from the NFS server),
+-               * truncating the blocks past EOF will cause fragmentation to
+-               * occur.
+-               *
+-               * In this case don't do the truncation, either, but we have to
+-               * be careful how we detect this case. Blocks beyond EOF show
+-               * up as i_delayed_blks even when the inode is clean, so we
+-               * need to truncate them away first before checking for a dirty
+-               * release. Hence on the first dirty close we will still remove
+-               * the speculative allocation, but after that we will leave it
+-               * in place.
+                */
+-              if (xfs_iflags_test(ip, XFS_IDIRTY_RELEASE))
+-                      return 0;
+-
+-              error = xfs_free_eofblocks(mp, ip, true);
+-              if (error && error != -EAGAIN)
+-                      return error;
++              if (xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) {
++                      error = xfs_free_eofblocks(ip);
++                      xfs_iunlock(ip, XFS_IOLOCK_EXCL);
++                      if (error)
++                              return error;
++              }
+               /* delalloc blocks after truncation means it really is dirty */
+               if (ip->i_delayed_blks)
+@@ -1904,8 +1906,11 @@ xfs_inactive(
+                * cache. Post-eof blocks must be freed, lest we end up with
+                * broken free space accounting.
+                */
+-              if (xfs_can_free_eofblocks(ip, true))
+-                      xfs_free_eofblocks(mp, ip, false);
++              if (xfs_can_free_eofblocks(ip, true)) {
++                      xfs_ilock(ip, XFS_IOLOCK_EXCL);
++                      xfs_free_eofblocks(ip);
++                      xfs_iunlock(ip, XFS_IOLOCK_EXCL);
++              }
+               return;
+       }
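
The calling convention established by this patch can be summarized with a short sketch. This is not part of the patch; it is a hypothetical helper (example_free_eofblocks) distilled from the xfs_release()/xfs_inactive() hunks above, showing that callers now take and drop XFS_IOLOCK_EXCL themselves instead of passing need_iolock:

	/* Sketch only: the caller-owns-iolock convention after this patch. */
	static int example_free_eofblocks(struct xfs_inode *ip, bool can_block)
	{
		int error = 0;

		if (!xfs_can_free_eofblocks(ip, false))
			return 0;

		if (can_block) {
			/* e.g. xfs_inactive(): safe to block for the lock */
			xfs_ilock(ip, XFS_IOLOCK_EXCL);
		} else if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) {
			/* e.g. xfs_release(): skip now, retry on final iput */
			return 0;
		}

		/* xfs_free_eofblocks() now asserts XFS_IOLOCK_EXCL is held. */
		error = xfs_free_eofblocks(ip);
		xfs_iunlock(ip, XFS_IOLOCK_EXCL);

		return error;
	}
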
diff --git a/queue-4.10/xfs-sync-eofblocks-scans-under-iolock-are-livelock-prone.patch b/queue-4.10/xfs-sync-eofblocks-scans-under-iolock-are-livelock-prone.patch
new file mode 100644
index 0000000..048a92c
--- /dev/null
+++ b/queue-4.10/xfs-sync-eofblocks-scans-under-iolock-are-livelock-prone.patch
@@ -0,0 +1,228 @@
+From c3155097ad89a956579bc305856a1f2878494e52 Mon Sep 17 00:00:00 2001
+From: Brian Foster <bfoster@redhat.com>
+Date: Fri, 27 Jan 2017 23:22:56 -0800
+Subject: xfs: sync eofblocks scans under iolock are livelock prone
+
+From: Brian Foster <bfoster@redhat.com>
+
+commit c3155097ad89a956579bc305856a1f2878494e52 upstream.
+
+The xfs_eofblocks.eof_scan_owner field is an internal field to
+facilitate invoking eofb scans from the kernel while under the iolock.
+This is necessary because the eofb scan acquires the iolock of each
+inode. Synchronous scans are invoked on certain buffered write failures
+while under iolock. In such cases, the scan owner indicates that the
+context for the scan already owns the particular iolock and prevents a
+double lock deadlock.
+
+eofblocks scans while under iolock are still livelock prone in the event
+of multiple parallel scans, however. If multiple buffered writes to
+different inodes fail and invoke eofblocks scans at the same time, each
+scan avoids a deadlock with its own inode by virtue of the
+eof_scan_owner field, but will never be able to acquire the iolock of
+the inode from the parallel scan. Because the low free space scans are
+invoked with SYNC_WAIT, the scan will not return until it has processed
+every tagged inode and thus both scans will spin indefinitely on the
+iolock being held across the opposite scan. This problem can be
+reproduced reliably by generic/224 on systems with higher cpu counts
+(x16).
+
+To avoid this problem, simplify the semantics of eofblocks scans to
+never invoke a scan while under iolock. This means that the buffered
+write context must drop the iolock before the scan. It must reacquire
+the lock before the write retry and also repeat the initial write
+checks, as the original state might no longer be valid once the iolock
+was dropped.
+
+Signed-off-by: Brian Foster <bfoster@redhat.com>
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/xfs/xfs_file.c   |   13 +++++++++----
+ fs/xfs/xfs_icache.c |   45 +++++++--------------------------------------
+ fs/xfs/xfs_icache.h |    2 --
+ 3 files changed, 16 insertions(+), 44 deletions(-)
+
+--- a/fs/xfs/xfs_file.c
++++ b/fs/xfs/xfs_file.c
+@@ -614,8 +614,10 @@ xfs_file_buffered_aio_write(
+       struct xfs_inode        *ip = XFS_I(inode);
+       ssize_t                 ret;
+       int                     enospc = 0;
+-      int                     iolock = XFS_IOLOCK_EXCL;
++      int                     iolock;
++write_retry:
++      iolock = XFS_IOLOCK_EXCL;
+       xfs_ilock(ip, iolock);
+       ret = xfs_file_aio_write_checks(iocb, from, &iolock);
+@@ -625,7 +627,6 @@ xfs_file_buffered_aio_write(
+       /* We can write back this queue in page reclaim */
+       current->backing_dev_info = inode_to_bdi(inode);
+-write_retry:
+       trace_xfs_file_buffered_write(ip, iov_iter_count(from), iocb->ki_pos);
+       ret = iomap_file_buffered_write(iocb, from, &xfs_iomap_ops);
+       if (likely(ret >= 0))
+@@ -641,18 +642,21 @@ write_retry:
+        * running at the same time.
+        */
+       if (ret == -EDQUOT && !enospc) {
++              xfs_iunlock(ip, iolock);
+               enospc = xfs_inode_free_quota_eofblocks(ip);
+               if (enospc)
+                       goto write_retry;
+               enospc = xfs_inode_free_quota_cowblocks(ip);
+               if (enospc)
+                       goto write_retry;
++              iolock = 0;
+       } else if (ret == -ENOSPC && !enospc) {
+               struct xfs_eofblocks eofb = {0};
+               enospc = 1;
+               xfs_flush_inodes(ip->i_mount);
+-              eofb.eof_scan_owner = ip->i_ino; /* for locking */
++
++              xfs_iunlock(ip, iolock);
+               eofb.eof_flags = XFS_EOF_FLAGS_SYNC;
+               xfs_icache_free_eofblocks(ip->i_mount, &eofb);
+               goto write_retry;
+@@ -660,7 +664,8 @@ write_retry:
+       current->backing_dev_info = NULL;
+ out:
+-      xfs_iunlock(ip, iolock);
++      if (iolock)
++              xfs_iunlock(ip, iolock);
+       return ret;
+ }
+--- a/fs/xfs/xfs_icache.c
++++ b/fs/xfs/xfs_icache.c
+@@ -1324,11 +1324,8 @@ xfs_inode_free_eofblocks(
+ {
+       int ret = 0;
+       struct xfs_eofblocks *eofb = args;
+-      bool need_iolock = true;
+       int match;
+-      ASSERT(!eofb || (eofb && eofb->eof_scan_owner != 0));
+-
+       if (!xfs_can_free_eofblocks(ip, false)) {
+               /* inode could be preallocated or append-only */
+               trace_xfs_inode_free_eofblocks_invalid(ip);
+@@ -1356,27 +1353,19 @@ xfs_inode_free_eofblocks(
+               if (eofb->eof_flags & XFS_EOF_FLAGS_MINFILESIZE &&
+                   XFS_ISIZE(ip) < eofb->eof_min_file_size)
+                       return 0;
+-
+-              /*
+-               * A scan owner implies we already hold the iolock. Skip it here
+-               * to avoid deadlock.
+-               */
+-              if (eofb->eof_scan_owner == ip->i_ino)
+-                      need_iolock = false;
+       }
+       /*
+        * If the caller is waiting, return -EAGAIN to keep the background
+        * scanner moving and revisit the inode in a subsequent pass.
+        */
+-      if (need_iolock && !xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) {
++      if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) {
+               if (flags & SYNC_WAIT)
+                       ret = -EAGAIN;
+               return ret;
+       }
+       ret = xfs_free_eofblocks(ip);
+-      if (need_iolock)
+-              xfs_iunlock(ip, XFS_IOLOCK_EXCL);
++      xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+       return ret;
+ }
+@@ -1423,15 +1412,10 @@ __xfs_inode_free_quota_eofblocks(
+       struct xfs_eofblocks eofb = {0};
+       struct xfs_dquot *dq;
+-      ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
+-
+       /*
+-       * Set the scan owner to avoid a potential livelock. Otherwise, the scan
+-       * can repeatedly trylock on the inode we're currently processing. We
+-       * run a sync scan to increase effectiveness and use the union filter to
++       * Run a sync scan to increase effectiveness and use the union filter to
+        * cover all applicable quotas in a single scan.
+        */
+-      eofb.eof_scan_owner = ip->i_ino;
+       eofb.eof_flags = XFS_EOF_FLAGS_UNION|XFS_EOF_FLAGS_SYNC;
+       if (XFS_IS_UQUOTA_ENFORCED(ip->i_mount)) {
+@@ -1583,12 +1567,9 @@ xfs_inode_free_cowblocks(
+ {
+       int ret;
+       struct xfs_eofblocks *eofb = args;
+-      bool need_iolock = true;
+       int match;
+       struct xfs_ifork        *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
+-      ASSERT(!eofb || (eofb && eofb->eof_scan_owner != 0));
+-
+       /*
+        * Just clear the tag if we have an empty cow fork or none at all. It's
+        * possible the inode was fully unshared since it was originally tagged.
+@@ -1621,28 +1602,16 @@ xfs_inode_free_cowblocks(
+               if (eofb->eof_flags & XFS_EOF_FLAGS_MINFILESIZE &&
+                   XFS_ISIZE(ip) < eofb->eof_min_file_size)
+                       return 0;
+-
+-              /*
+-               * A scan owner implies we already hold the iolock. Skip it in
+-               * xfs_free_eofblocks() to avoid deadlock. This also eliminates
+-               * the possibility of EAGAIN being returned.
+-               */
+-              if (eofb->eof_scan_owner == ip->i_ino)
+-                      need_iolock = false;
+       }
+       /* Free the CoW blocks */
+-      if (need_iolock) {
+-              xfs_ilock(ip, XFS_IOLOCK_EXCL);
+-              xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
+-      }
++      xfs_ilock(ip, XFS_IOLOCK_EXCL);
++      xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
+       ret = xfs_reflink_cancel_cow_range(ip, 0, NULLFILEOFF);
+-      if (need_iolock) {
+-              xfs_iunlock(ip, XFS_MMAPLOCK_EXCL);
+-              xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+-      }
++      xfs_iunlock(ip, XFS_MMAPLOCK_EXCL);
++      xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+       return ret;
+ }
+--- a/fs/xfs/xfs_icache.h
++++ b/fs/xfs/xfs_icache.h
+@@ -27,7 +27,6 @@ struct xfs_eofblocks {
+       kgid_t          eof_gid;
+       prid_t          eof_prid;
+       __u64           eof_min_file_size;
+-      xfs_ino_t       eof_scan_owner;
+ };
+ #define SYNC_WAIT             0x0001  /* wait for i/o to complete */
+@@ -102,7 +101,6 @@ xfs_fs_eofblocks_from_user(
+       dst->eof_flags = src->eof_flags;
+       dst->eof_prid = src->eof_prid;
+       dst->eof_min_file_size = src->eof_min_file_size;
+-      dst->eof_scan_owner = NULLFSINO;
+       dst->eof_uid = INVALID_UID;
+       if (src->eof_flags & XFS_EOF_FLAGS_UID) {
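
The buffered write retry logic introduced by the xfs_file.c hunks above can be summarized with a condensed sketch. This is not part of the patch; it is a simplified, hypothetical rendering of xfs_file_buffered_aio_write() after this change (the -EDQUOT/quota path and backing_dev_info handling are omitted), showing that the iolock is never held across the synchronous eofblocks scan and that the retry re-runs the write checks because the inode may have changed while unlocked:

	/* Sketch only: drop the iolock before the scan, re-lock and re-check on retry. */
	static ssize_t sketch_buffered_write(struct kiocb *iocb, struct iov_iter *from)
	{
		struct xfs_inode	*ip = XFS_I(file_inode(iocb->ki_filp));
		int			iolock;
		int			enospc = 0;
		ssize_t			ret;

	write_retry:
		iolock = XFS_IOLOCK_EXCL;
		xfs_ilock(ip, iolock);
		ret = xfs_file_aio_write_checks(iocb, from, &iolock);
		if (ret)
			goto out;

		ret = iomap_file_buffered_write(iocb, from, &xfs_iomap_ops);

		if (ret == -ENOSPC && !enospc) {
			struct xfs_eofblocks eofb = {0};

			enospc = 1;
			xfs_flush_inodes(ip->i_mount);

			xfs_iunlock(ip, iolock);	/* never scan with the iolock held */
			eofb.eof_flags = XFS_EOF_FLAGS_SYNC;
			xfs_icache_free_eofblocks(ip->i_mount, &eofb);
			goto write_retry;		/* re-lock and repeat the checks */
		}
	out:
		xfs_iunlock(ip, iolock);
		return ret;
	}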