git.ipfire.org Git - thirdparty/kernel/stable-queue.git/commitdiff
2.6.32 xfs patches
author Greg Kroah-Hartman <gregkh@suse.de>
Fri, 2 Apr 2010 18:13:51 +0000 (11:13 -0700)
committer Greg Kroah-Hartman <gregkh@suse.de>
Fri, 2 Apr 2010 18:13:51 +0000 (11:13 -0700)
20 files changed:
queue-2.6.32/series
queue-2.6.32/xfs-avoid-inodes-in-reclaim-when-flushing-from-inode-cache.patch [new file with mode: 0644]
queue-2.6.32/xfs-check-for-not-fully-initialized-inodes-in-xfs_ireclaim.patch [new file with mode: 0644]
queue-2.6.32/xfs-don-t-flush-stale-inodes.patch [new file with mode: 0644]
queue-2.6.32/xfs-don-t-hold-onto-reserved-blocks-on-remount-ro.patch [new file with mode: 0644]
queue-2.6.32/xfs-ensure-we-force-all-busy-extents-in-range-to-disk.patch [new file with mode: 0644]
queue-2.6.32/xfs-fix-error-return-for-fallocate-on-xfs.patch [new file with mode: 0644]
queue-2.6.32/xfs-fix-locking-for-inode-cache-radix-tree-tag-updates.patch [new file with mode: 0644]
queue-2.6.32/xfs-fix-mmap_sem-iolock-inversion-in-xfs_free_eofblocks.patch [new file with mode: 0644]
queue-2.6.32/xfs-fix-stale-inode-flush-avoidance.patch [new file with mode: 0644]
queue-2.6.32/xfs-fix-timestamp-handling-in-xfs_setattr.patch [new file with mode: 0644]
queue-2.6.32/xfs-i-o-completion-handlers-must-use-nofs-allocations.patch [new file with mode: 0644]
queue-2.6.32/xfs-non-blocking-inode-locking-in-io-completion.patch [new file with mode: 0644]
queue-2.6.32/xfs-quota-limit-statvfs-available-blocks.patch [new file with mode: 0644]
queue-2.6.32/xfs-reclaim-all-inodes-by-background-tree-walks.patch [new file with mode: 0644]
queue-2.6.32/xfs-reclaim-inodes-under-a-write-lock.patch [new file with mode: 0644]
queue-2.6.32/xfs-remove-invalid-barrier-optimization-from-xfs_fsync.patch [new file with mode: 0644]
queue-2.6.32/xfs-simplify-inode-teardown.patch [new file with mode: 0644]
queue-2.6.32/xfs-wrapped-journal-record-corruption-on-read-at-recovery.patch [new file with mode: 0644]
queue-2.6.32/xfs-xfs_swap_extents-needs-to-handle-dynamic-fork-offsets.patch [new file with mode: 0644]

index 8e4a7266c65602e6aaf22eb567c0ba9ba12b5809..f15d7868dc1950e4f95e803f90a07083b23d505d 100644 (file)
@@ -6,3 +6,22 @@ oom-fix-the-unsafe-usage-of-badness-in-proc_oom_score.patch
 drm-radeon-kms-don-t-print-error-on-erestartsys.patch
 drm-radeon-kms-fix-pal-tv-out-support-on-legacy-igp-chips.patch
 drm-return-enodev-if-the-inode-mapping-changes.patch
+xfs-simplify-inode-teardown.patch
+xfs-fix-mmap_sem-iolock-inversion-in-xfs_free_eofblocks.patch
+xfs-i-o-completion-handlers-must-use-nofs-allocations.patch
+xfs-wrapped-journal-record-corruption-on-read-at-recovery.patch
+xfs-fix-error-return-for-fallocate-on-xfs.patch
+xfs-check-for-not-fully-initialized-inodes-in-xfs_ireclaim.patch
+xfs-fix-timestamp-handling-in-xfs_setattr.patch
+xfs-don-t-flush-stale-inodes.patch
+xfs-ensure-we-force-all-busy-extents-in-range-to-disk.patch
+xfs-reclaim-inodes-under-a-write-lock.patch
+xfs-avoid-inodes-in-reclaim-when-flushing-from-inode-cache.patch
+xfs-reclaim-all-inodes-by-background-tree-walks.patch
+xfs-fix-stale-inode-flush-avoidance.patch
+xfs-xfs_swap_extents-needs-to-handle-dynamic-fork-offsets.patch
+xfs-quota-limit-statvfs-available-blocks.patch
+xfs-don-t-hold-onto-reserved-blocks-on-remount-ro.patch
+xfs-remove-invalid-barrier-optimization-from-xfs_fsync.patch
+xfs-non-blocking-inode-locking-in-io-completion.patch
+xfs-fix-locking-for-inode-cache-radix-tree-tag-updates.patch
diff --git a/queue-2.6.32/xfs-avoid-inodes-in-reclaim-when-flushing-from-inode-cache.patch b/queue-2.6.32/xfs-avoid-inodes-in-reclaim-when-flushing-from-inode-cache.patch
new file mode 100644 (file)
index 0000000..09eea86
--- /dev/null
@@ -0,0 +1,73 @@
+From david@fromorbit.com  Fri Apr  2 11:09:28 2010
+From: Dave Chinner <david@fromorbit.com>
+Date: Fri, 12 Mar 2010 09:42:09 +1100
+Subject: xfs: Avoid inodes in reclaim when flushing from inode cache
+To: stable@kernel.org
+Cc: xfs@oss.sgi.com
+Message-ID: <1268347337-7160-12-git-send-email-david@fromorbit.com>
+
+From: Dave Chinner <david@fromorbit.com>
+
+commit 018027be90a6946e8cf3f9b17b5582384f7ed117 upstream
+
+The reclaim code will handle flushing of dirty inodes before reclaim
+occurs, so avoid them when determining whether an inode is a
+candidate for flushing to disk when walking the radix trees.  This
+is based on a test patch from Christoph Hellwig.
+
+Signed-off-by: Dave Chinner <david@fromorbit.com>
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Signed-off-by: Alex Elder <aelder@sgi.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ fs/xfs/linux-2.6/xfs_sync.c |   31 ++++++++++++++++++-------------
+ 1 file changed, 18 insertions(+), 13 deletions(-)
+
+--- a/fs/xfs/linux-2.6/xfs_sync.c
++++ b/fs/xfs/linux-2.6/xfs_sync.c
+@@ -179,26 +179,31 @@ xfs_sync_inode_valid(
+       struct xfs_perag        *pag)
+ {
+       struct inode            *inode = VFS_I(ip);
++      int                     error = EFSCORRUPTED;
+       /* nothing to sync during shutdown */
+-      if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
+-              read_unlock(&pag->pag_ici_lock);
+-              return EFSCORRUPTED;
+-      }
++      if (XFS_FORCED_SHUTDOWN(ip->i_mount))
++              goto out_unlock;
+-      /* If we can't get a reference on the inode, it must be in reclaim. */
+-      if (!igrab(inode)) {
+-              read_unlock(&pag->pag_ici_lock);
+-              return ENOENT;
+-      }
+-      read_unlock(&pag->pag_ici_lock);
++      /* avoid new or reclaimable inodes. Leave for reclaim code to flush */
++      error = ENOENT;
++      if (xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM))
++              goto out_unlock;
+-      if (is_bad_inode(inode) || xfs_iflags_test(ip, XFS_INEW)) {
++      /* If we can't grab the inode, it must on it's way to reclaim. */
++      if (!igrab(inode))
++              goto out_unlock;
++
++      if (is_bad_inode(inode)) {
+               IRELE(ip);
+-              return ENOENT;
++              goto out_unlock;
+       }
+-      return 0;
++      /* inode is valid */
++      error = 0;
++out_unlock:
++      read_unlock(&pag->pag_ici_lock);
++      return error;
+ }
+ STATIC int
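
The hunk above mainly restructures xfs_sync_inode_valid so the shared lock is dropped in exactly one place, with the error code chosen before jumping to the common exit label. Below is a minimal, standalone sketch of that single-exit idiom; all names are hypothetical and a pthreads rwlock stands in for pag_ici_lock.

#include <errno.h>
#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t cache_lock = PTHREAD_RWLOCK_INITIALIZER;

struct item {
        int shutdown;
        int reclaimable;
        int bad;
};

/* Validate an item found under the shared lock; every outcome leaves
 * through the single out_unlock label, so the lock is dropped exactly once. */
static int item_valid(const struct item *it)
{
        int error = EIO;                        /* pessimistic default */

        pthread_rwlock_rdlock(&cache_lock);

        if (it->shutdown)                       /* nothing to do during shutdown */
                goto out_unlock;

        error = ENOENT;
        if (it->reclaimable)                    /* leave it for the reclaim code */
                goto out_unlock;

        if (it->bad)
                goto out_unlock;

        error = 0;                              /* item is valid */
out_unlock:
        pthread_rwlock_unlock(&cache_lock);
        return error;
}

int main(void)
{
        struct item good = { 0, 0, 0 };
        struct item reclaiming = { 0, 1, 0 };

        printf("good=%d reclaiming=%d\n",
               item_valid(&good), item_valid(&reclaiming));
        return 0;
}
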
diff --git a/queue-2.6.32/xfs-check-for-not-fully-initialized-inodes-in-xfs_ireclaim.patch b/queue-2.6.32/xfs-check-for-not-fully-initialized-inodes-in-xfs_ireclaim.patch
new file mode 100644 (file)
index 0000000..5ff112e
--- /dev/null
@@ -0,0 +1,52 @@
+From david@fromorbit.com  Fri Apr  2 11:07:09 2010
+From: Christoph Hellwig <hch@infradead.org>
+Date: Fri, 12 Mar 2010 09:42:04 +1100
+Subject: xfs: check for not fully initialized inodes in xfs_ireclaim
+To: stable@kernel.org
+Cc: xfs@oss.sgi.com
+Message-ID: <1268347337-7160-7-git-send-email-david@fromorbit.com>
+
+
+From: Christoph Hellwig <hch@infradead.org>
+
+commit b44b1126279b60597f96bbe77507b1650f88a969 upstream
+
+Add an assert for inodes not added to the inode cache in xfs_ireclaim,
+to make sure we're not going to introduce something like the
+famous nfsd inode cache bug again.
+
+Signed-off-by: Christoph Hellwig <hch@lst.de>
+Signed-off-by: Alex Elder <aelder@sgi.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/xfs/xfs_iget.c |   12 ++++++++----
+ 1 file changed, 8 insertions(+), 4 deletions(-)
+
+--- a/fs/xfs/xfs_iget.c
++++ b/fs/xfs/xfs_iget.c
+@@ -511,17 +511,21 @@ xfs_ireclaim(
+ {
+       struct xfs_mount        *mp = ip->i_mount;
+       struct xfs_perag        *pag;
++      xfs_agino_t             agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
+       XFS_STATS_INC(xs_ig_reclaims);
+       /*
+-       * Remove the inode from the per-AG radix tree.  It doesn't matter
+-       * if it was never added to it because radix_tree_delete can deal
+-       * with that case just fine.
++       * Remove the inode from the per-AG radix tree.
++       *
++       * Because radix_tree_delete won't complain even if the item was never
++       * added to the tree assert that it's been there before to catch
++       * problems with the inode life time early on.
+        */
+       pag = xfs_get_perag(mp, ip->i_ino);
+       write_lock(&pag->pag_ici_lock);
+-      radix_tree_delete(&pag->pag_ici_root, XFS_INO_TO_AGINO(mp, ip->i_ino));
++      if (!radix_tree_delete(&pag->pag_ici_root, agino))
++              ASSERT(0);
+       write_unlock(&pag->pag_ici_lock);
+       xfs_put_perag(mp, pag);
diff --git a/queue-2.6.32/xfs-don-t-flush-stale-inodes.patch b/queue-2.6.32/xfs-don-t-flush-stale-inodes.patch
new file mode 100644 (file)
index 0000000..0960321
--- /dev/null
@@ -0,0 +1,46 @@
+From david@fromorbit.com  Fri Apr  2 11:08:07 2010
+From: Dave Chinner <david@fromorbit.com>
+Date: Fri, 12 Mar 2010 09:42:06 +1100
+Subject: xfs: Don't flush stale inodes
+To: stable@kernel.org
+Cc: xfs@oss.sgi.com
+Message-ID: <1268347337-7160-9-git-send-email-david@fromorbit.com>
+
+From: Dave Chinner <david@fromorbit.com>
+
+commit 44e08c45cc14e6190a424be8d450070c8e508fad upstream
+
+Because inodes remain in cache much longer than inode buffers do
+under memory pressure, we can get the situation where we have
+stale, dirty inodes being reclaimed but the backing storage has
+been freed.  Hence we should never, ever flush XFS_ISTALE inodes
+to disk as there is no guarantee that the backing buffer is in
+cache and still marked stale when the flush occurs.
+
+Signed-off-by: Dave Chinner <david@fromorbit.com>
+Signed-off-by: Alex Elder <aelder@sgi.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/xfs/xfs_inode.c |   10 +++++++---
+ 1 file changed, 7 insertions(+), 3 deletions(-)
+
+--- a/fs/xfs/xfs_inode.c
++++ b/fs/xfs/xfs_inode.c
+@@ -2877,10 +2877,14 @@ xfs_iflush(
+       mp = ip->i_mount;
+       /*
+-       * If the inode isn't dirty, then just release the inode
+-       * flush lock and do nothing.
++       * If the inode isn't dirty, then just release the inode flush lock and
++       * do nothing. Treat stale inodes the same; we cannot rely on the
++       * backing buffer remaining stale in cache for the remaining life of
++       * the stale inode and so xfs_itobp() below may give us a buffer that
++       * no longer contains inodes below. Doing this stale check here also
++       * avoids forcing the log on pinned, stale inodes.
+        */
+-      if (xfs_inode_clean(ip)) {
++      if (xfs_inode_clean(ip) || xfs_iflags_test(ip, XFS_ISTALE)) {
+               xfs_ifunlock(ip);
+               return 0;
+       }
diff --git a/queue-2.6.32/xfs-don-t-hold-onto-reserved-blocks-on-remount-ro.patch b/queue-2.6.32/xfs-don-t-hold-onto-reserved-blocks-on-remount-ro.patch
new file mode 100644 (file)
index 0000000..d71b3ac
--- /dev/null
@@ -0,0 +1,94 @@
+From david@fromorbit.com  Fri Apr  2 11:11:43 2010
+From: Dave Chinner <david@fromorbit.com>
+Date: Fri, 12 Mar 2010 09:42:14 +1100
+Subject: xfs: don't hold onto reserved blocks on remount, ro
+To: stable@kernel.org
+Cc: xfs@oss.sgi.com
+Message-ID: <1268347337-7160-17-git-send-email-david@fromorbit.com>
+
+From: Dave Chinner <david@fromorbit.com>
+
+commit cbe132a8bdcff0f9afd9060948fb50597c7400b8 upstream
+
+If we hold onto reserved blocks when doing a remount,ro we end
+up writing the blocks used count to disk that includes the reserved
+blocks. Reserved blocks are not actually used, so this results in
+the values in the superblock being incorrect.
+
+Hence if we run xfs_check or xfs_repair -n while the filesystem is
+mounted remount,ro we end up with an inconsistent filesystem being
+reported. Also, running xfs_copy on the remount,ro filesystem will
+result in an inconsistent image being generated.
+
+To fix this, unreserve the blocks when doing the remount,ro, and
+reserve them again on remount,rw. This way a remount,ro filesystem
+will appear consistent on disk to all utilities.
+
+Signed-off-by: Dave Chinner <david@fromorbit.com>
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ fs/xfs/linux-2.6/xfs_super.c |   28 ++++++++++++++++++++++++++++
+ fs/xfs/xfs_mount.h           |    1 +
+ 2 files changed, 29 insertions(+)
+
+--- a/fs/xfs/linux-2.6/xfs_super.c
++++ b/fs/xfs/linux-2.6/xfs_super.c
+@@ -1323,6 +1323,8 @@ xfs_fs_remount(
+       /* ro -> rw */
+       if ((mp->m_flags & XFS_MOUNT_RDONLY) && !(*flags & MS_RDONLY)) {
++              __uint64_t resblks;
++
+               mp->m_flags &= ~XFS_MOUNT_RDONLY;
+               if (mp->m_flags & XFS_MOUNT_BARRIER)
+                       xfs_mountfs_check_barriers(mp);
+@@ -1340,11 +1342,37 @@ xfs_fs_remount(
+                       }
+                       mp->m_update_flags = 0;
+               }
++
++              /*
++               * Fill out the reserve pool if it is empty. Use the stashed
++               * value if it is non-zero, otherwise go with the default.
++               */
++              if (mp->m_resblks_save) {
++                      resblks = mp->m_resblks_save;
++                      mp->m_resblks_save = 0;
++              } else {
++                      resblks = mp->m_sb.sb_dblocks;
++                      do_div(resblks, 20);
++                      resblks = min_t(__uint64_t, resblks, 1024);
++              }
++              xfs_reserve_blocks(mp, &resblks, NULL);
+       }
+       /* rw -> ro */
+       if (!(mp->m_flags & XFS_MOUNT_RDONLY) && (*flags & MS_RDONLY)) {
++              /*
++               * After we have synced the data but before we sync the
++               * metadata, we need to free up the reserve block pool so that
++               * the used block count in the superblock on disk is correct at
++               * the end of the remount. Stash the current reserve pool size
++               * so that if we get remounted rw, we can return it to the same
++               * size.
++               */
++              __uint64_t resblks = 0;
++
+               xfs_quiesce_data(mp);
++              mp->m_resblks_save = mp->m_resblks;
++              xfs_reserve_blocks(mp, &resblks, NULL);
+               xfs_quiesce_attr(mp);
+               mp->m_flags |= XFS_MOUNT_RDONLY;
+       }
+--- a/fs/xfs/xfs_mount.h
++++ b/fs/xfs/xfs_mount.h
+@@ -209,6 +209,7 @@ typedef struct xfs_mount {
+       __uint64_t              m_maxioffset;   /* maximum inode offset */
+       __uint64_t              m_resblks;      /* total reserved blocks */
+       __uint64_t              m_resblks_avail;/* available reserved blocks */
++      __uint64_t              m_resblks_save; /* reserved blks @ remount,ro */
+       int                     m_dalign;       /* stripe unit */
+       int                     m_swidth;       /* stripe width */
+       int                     m_sinoalign;    /* stripe unit inode alignment */
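
The ro->rw branch above sizes the reserve pool from the stashed value when one exists, otherwise falling back to 5% of the data blocks capped at 1024. A small standalone sketch of just that sizing decision, with the XFS types replaced by stock integers and hypothetical names:

#include <inttypes.h>
#include <stdio.h>

/* Pick the reserve pool size on remount,rw: reuse the value stashed at
 * remount,ro if there is one, otherwise default to min(dblocks/20, 1024). */
static uint64_t pick_resblks(uint64_t *resblks_save, uint64_t dblocks)
{
        uint64_t resblks;

        if (*resblks_save) {
                resblks = *resblks_save;
                *resblks_save = 0;
        } else {
                resblks = dblocks / 20;         /* 5% of the data blocks ... */
                if (resblks > 1024)
                        resblks = 1024;         /* ... capped at 1024 blocks */
        }
        return resblks;
}

int main(void)
{
        uint64_t saved = 0;

        printf("fresh mount:  %" PRIu64 "\n", pick_resblks(&saved, 1000000));
        saved = 327;
        printf("after ro->rw: %" PRIu64 "\n", pick_resblks(&saved, 1000000));
        return 0;
}
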
diff --git a/queue-2.6.32/xfs-ensure-we-force-all-busy-extents-in-range-to-disk.patch b/queue-2.6.32/xfs-ensure-we-force-all-busy-extents-in-range-to-disk.patch
new file mode 100644 (file)
index 0000000..78ea50a
--- /dev/null
@@ -0,0 +1,115 @@
+From david@fromorbit.com  Fri Apr  2 11:08:31 2010
+From: Dave Chinner <david@fromorbit.com>
+Date: Fri, 12 Mar 2010 09:42:07 +1100
+Subject: xfs: Ensure we force all busy extents in range to disk
+To: stable@kernel.org
+Cc: xfs@oss.sgi.com
+Message-ID: <1268347337-7160-10-git-send-email-david@fromorbit.com>
+
+From: Dave Chinner <david@fromorbit.com>
+
+commit fd45e4784164d1017521086524e3442318c67370 upstream
+
+When we search for and find a busy extent during allocation we
+force the log out to ensure the extent free transaction is on
+disk before the allocation transaction. The current implementation
+has a subtle bug in it--it does not handle multiple overlapping
+ranges.
+
+That is, if we free lots of little extents into a single
+contiguous extent, then allocate the contiguous extent, the busy
+search code stops searching at the first extent it finds that
+overlaps the allocated range. It then uses the commit LSN of the
+transaction to force the log out to.
+
+Unfortunately, the other busy ranges might have more recent
+commit LSNs than the first busy extent that is found, and this
+results in xfs_alloc_search_busy() returning before all the
+extent free transactions are on disk for the range being
+allocated. This can lead to potential metadata corruption or
+stale data exposure after a crash because log replay won't replay
+all the extent free transactions that cover the allocation range.
+
+Modified-by: Alex Elder <aelder@sgi.com>
+
+(Dropped the "found" argument from the xfs_alloc_busysearch trace
+event.)
+
+Signed-off-by: Dave Chinner <david@fromorbit.com>
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Signed-off-by: Alex Elder <aelder@sgi.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ fs/xfs/xfs_alloc.c |   52 +++++++++++++++++++++-------------------------------
+ 1 file changed, 21 insertions(+), 31 deletions(-)
+
+--- a/fs/xfs/xfs_alloc.c
++++ b/fs/xfs/xfs_alloc.c
+@@ -2703,45 +2703,35 @@ xfs_alloc_search_busy(xfs_trans_t *tp,
+       xfs_mount_t             *mp;
+       xfs_perag_busy_t        *bsy;
+       xfs_agblock_t           uend, bend;
+-      xfs_lsn_t               lsn;
++      xfs_lsn_t               lsn = 0;
+       int                     cnt;
+       mp = tp->t_mountp;
+       spin_lock(&mp->m_perag[agno].pagb_lock);
+-      cnt = mp->m_perag[agno].pagb_count;
+-
+       uend = bno + len - 1;
+-      /* search pagb_list for this slot, skipping open slots */
+-      for (bsy = mp->m_perag[agno].pagb_list; cnt; bsy++) {
+-
+-              /*
+-               * (start1,length1) within (start2, length2)
+-               */
+-              if (bsy->busy_tp != NULL) {
+-                      bend = bsy->busy_start + bsy->busy_length - 1;
+-                      if ((bno > bend) || (uend < bsy->busy_start)) {
+-                              cnt--;
+-                      } else {
+-                              TRACE_BUSYSEARCH("xfs_alloc_search_busy",
+-                                       "found1", agno, bno, len, tp);
+-                              break;
+-                      }
+-              }
+-      }
+-
+       /*
+-       * If a block was found, force the log through the LSN of the
+-       * transaction that freed the block
++       * search pagb_list for this slot, skipping open slots. We have to
++       * search the entire array as there may be multiple overlaps and
++       * we have to get the most recent LSN for the log force to push out
++       * all the transactions that span the range.
+        */
+-      if (cnt) {
+-              TRACE_BUSYSEARCH("xfs_alloc_search_busy", "found", agno, bno, len, tp);
+-              lsn = bsy->busy_tp->t_commit_lsn;
+-              spin_unlock(&mp->m_perag[agno].pagb_lock);
+-              xfs_log_force(mp, lsn, XFS_LOG_FORCE|XFS_LOG_SYNC);
+-      } else {
+-              TRACE_BUSYSEARCH("xfs_alloc_search_busy", "not-found", agno, bno, len, tp);
+-              spin_unlock(&mp->m_perag[agno].pagb_lock);
++      for (cnt = 0; cnt < mp->m_perag[agno].pagb_count; cnt++) {
++              bsy = &mp->m_perag[agno].pagb_list[cnt];
++              if (!bsy->busy_tp)
++                      continue;
++              bend = bsy->busy_start + bsy->busy_length - 1;
++              if (bno > bend || uend < bsy->busy_start)
++                      continue;
++
++              /* (start1,length1) within (start2, length2) */
++              if (XFS_LSN_CMP(bsy->busy_tp->t_commit_lsn, lsn) > 0)
++                      lsn = bsy->busy_tp->t_commit_lsn;
+       }
++      spin_unlock(&mp->m_perag[agno].pagb_lock);
++      TRACE_BUSYSEARCH("xfs_alloc_search_busy", lsn ? "found" : "not-found",
++                                              agno, bno, len, tp);
++      if (lsn)
++              xfs_log_force(mp, lsn, XFS_LOG_FORCE|XFS_LOG_SYNC);
+ }
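
The essential change above is that the search no longer stops at the first overlapping busy extent; it walks the whole list and remembers the most recent commit LSN before forcing the log. A standalone sketch of that scan follows; the types and names are hypothetical stand-ins for the kernel structures.

#include <stdint.h>
#include <stdio.h>

struct busy_extent {
        uint64_t start;
        uint64_t len;
        int64_t  commit_lsn;    /* LSN of the transaction that freed it */
        int      active;        /* slot in use? */
};

/* Scan the whole busy list and return the highest commit LSN of any
 * extent overlapping [bno, bno + len - 1]; 0 means no overlap found. */
static int64_t busy_search(const struct busy_extent *list, int count,
                           uint64_t bno, uint64_t len)
{
        uint64_t uend = bno + len - 1;
        int64_t lsn = 0;
        int i;

        for (i = 0; i < count; i++) {
                const struct busy_extent *bsy = &list[i];
                uint64_t bend;

                if (!bsy->active)
                        continue;
                bend = bsy->start + bsy->len - 1;
                if (bno > bend || uend < bsy->start)
                        continue;               /* no overlap */
                if (bsy->commit_lsn > lsn)
                        lsn = bsy->commit_lsn;  /* keep the most recent LSN */
        }
        return lsn;
}

int main(void)
{
        struct busy_extent busy[] = {
                { 0,  8, 100, 1 },      /* older free overlapping the range */
                { 8,  8, 250, 1 },      /* newer free, also overlapping */
                { 64, 8, 999, 1 },      /* outside the range */
        };
        int64_t lsn = busy_search(busy, 3, 4, 10);

        if (lsn)        /* the kernel would call xfs_log_force() here */
                printf("force log to LSN %lld\n", (long long)lsn);
        return 0;
}
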
diff --git a/queue-2.6.32/xfs-fix-error-return-for-fallocate-on-xfs.patch b/queue-2.6.32/xfs-fix-error-return-for-fallocate-on-xfs.patch
new file mode 100644 (file)
index 0000000..24c6571
--- /dev/null
@@ -0,0 +1,49 @@
+From david@fromorbit.com  Fri Apr  2 11:06:34 2010
+From: Jason Gunthorpe <jgunthorpe@obsidianresearch.com>
+Date: Fri, 12 Mar 2010 09:42:03 +1100
+Subject: xfs: Fix error return for fallocate() on XFS
+To: stable@kernel.org
+Cc: xfs@oss.sgi.com
+Message-ID: <1268347337-7160-6-git-send-email-david@fromorbit.com>
+
+
+From: Jason Gunthorpe <jgunthorpe@obsidianresearch.com>
+
+commit 44a743f68705c681439f264deb05f8f38e9048d3 upstream
+
+Noticed that through glibc fallocate would return 28 rather than -1
+and errno = 28 for ENOSPC. The xfs routines use XFS_ERROR format
+positive return error codes while the syscalls use negative return
+codes.  Fix up the two cases in the xfs_vn_fallocate syscall to convert to
+negative.
+
+Signed-off-by: Jason Gunthorpe <jgunthorpe@obsidianresearch.com>
+Reviewed-by: Eric Sandeen <sandeen@sandeen.net>
+Signed-off-by: Alex Elder <aelder@sgi.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/xfs/linux-2.6/xfs_iops.c |    6 +++---
+ 1 file changed, 3 insertions(+), 3 deletions(-)
+
+--- a/fs/xfs/linux-2.6/xfs_iops.c
++++ b/fs/xfs/linux-2.6/xfs_iops.c
+@@ -573,8 +573,8 @@ xfs_vn_fallocate(
+       bf.l_len = len;
+       xfs_ilock(ip, XFS_IOLOCK_EXCL);
+-      error = xfs_change_file_space(ip, XFS_IOC_RESVSP, &bf,
+-                                    0, XFS_ATTR_NOLOCK);
++      error = -xfs_change_file_space(ip, XFS_IOC_RESVSP, &bf,
++                                     0, XFS_ATTR_NOLOCK);
+       if (!error && !(mode & FALLOC_FL_KEEP_SIZE) &&
+           offset + len > i_size_read(inode))
+               new_size = offset + len;
+@@ -585,7 +585,7 @@ xfs_vn_fallocate(
+               iattr.ia_valid = ATTR_SIZE;
+               iattr.ia_size = new_size;
+-              error = xfs_setattr(ip, &iattr, XFS_ATTR_NOLOCK);
++              error = -xfs_setattr(ip, &iattr, XFS_ATTR_NOLOCK);
+       }
+       xfs_iunlock(ip, XFS_IOLOCK_EXCL);
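
The fix above is purely a sign-convention conversion: XFS internals return positive errno values, while the VFS/syscall layer expects negative ones. A trivial userspace illustration of the same boundary, with hypothetical helper names:

#include <errno.h>
#include <stdio.h>

/* Internal helper in the XFS style: returns a positive errno on failure. */
static int reserve_space(long long avail, long long want)
{
        return want > avail ? ENOSPC : 0;
}

/* Syscall-facing wrapper: negate, so the caller sees -ENOSPC (and glibc
 * turns that into -1 with errno = 28) instead of a bogus positive 28. */
static long vfs_fallocate_like(long long avail, long long want)
{
        return -reserve_space(avail, want);
}

int main(void)
{
        printf("ok:   %ld\n", vfs_fallocate_like(100, 50));   /* 0 */
        printf("fail: %ld\n", vfs_fallocate_like(100, 500));  /* -28 */
        return 0;
}
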
diff --git a/queue-2.6.32/xfs-fix-locking-for-inode-cache-radix-tree-tag-updates.patch b/queue-2.6.32/xfs-fix-locking-for-inode-cache-radix-tree-tag-updates.patch
new file mode 100644 (file)
index 0000000..92f6042
--- /dev/null
@@ -0,0 +1,84 @@
+From david@fromorbit.com  Fri Apr  2 11:12:53 2010
+From: Christoph Hellwig <hch@infradead.org>
+Date: Fri, 12 Mar 2010 09:42:17 +1100
+Subject: xfs: fix locking for inode cache radix tree tag updates
+To: stable@kernel.org
+Cc: xfs@oss.sgi.com
+Message-ID: <1268347337-7160-20-git-send-email-david@fromorbit.com>
+
+
+From: Christoph Hellwig <hch@infradead.org>
+
+commit f1f724e4b523d444c5a598d74505aefa3d6844d2 upstream
+
+The radix-tree code requires its users to serialize tag updates
+against other updates to the tree.  While XFS protects tag updates
+against each other it does not serialize them against updates of the
+tree contents, which can lead to tag corruption.  Fix the inode
+cache to always take pag_ici_lock in exclusive mode when updating
+radix tree tags.
+
+Signed-off-by: Christoph Hellwig <hch@lst.de>
+Reported-by: Patrick Schreurs <patrick@news-service.com>
+Tested-by: Patrick Schreurs <patrick@news-service.com>
+Signed-off-by: Alex Elder <aelder@sgi.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/xfs/linux-2.6/xfs_sync.c |    4 ++--
+ fs/xfs/xfs_iget.c           |   19 +++++++++++++------
+ 2 files changed, 15 insertions(+), 8 deletions(-)
+
+--- a/fs/xfs/linux-2.6/xfs_sync.c
++++ b/fs/xfs/linux-2.6/xfs_sync.c
+@@ -692,12 +692,12 @@ xfs_inode_set_reclaim_tag(
+       xfs_mount_t     *mp = ip->i_mount;
+       xfs_perag_t     *pag = xfs_get_perag(mp, ip->i_ino);
+-      read_lock(&pag->pag_ici_lock);
++      write_lock(&pag->pag_ici_lock);
+       spin_lock(&ip->i_flags_lock);
+       __xfs_inode_set_reclaim_tag(pag, ip);
+       __xfs_iflags_set(ip, XFS_IRECLAIMABLE);
+       spin_unlock(&ip->i_flags_lock);
+-      read_unlock(&pag->pag_ici_lock);
++      write_unlock(&pag->pag_ici_lock);
+       xfs_put_perag(mp, pag);
+ }
+--- a/fs/xfs/xfs_iget.c
++++ b/fs/xfs/xfs_iget.c
+@@ -228,13 +228,12 @@ xfs_iget_cache_hit(
+               xfs_itrace_exit_tag(ip, "xfs_iget.alloc");
+               /*
+-               * We need to set XFS_INEW atomically with clearing the
+-               * reclaimable tag so that we do have an indicator of the
+-               * inode still being initialized.
++               * We need to set XFS_IRECLAIM to prevent xfs_reclaim_inode
++               * from stomping over us while we recycle the inode.  We can't
++               * clear the radix tree reclaimable tag yet as it requires
++               * pag_ici_lock to be held exclusive.
+                */
+-              ip->i_flags |= XFS_INEW;
+-              ip->i_flags &= ~XFS_IRECLAIMABLE;
+-              __xfs_inode_clear_reclaim_tag(mp, pag, ip);
++              ip->i_flags |= XFS_IRECLAIM;
+               spin_unlock(&ip->i_flags_lock);
+               read_unlock(&pag->pag_ici_lock);
+@@ -253,7 +252,15 @@ xfs_iget_cache_hit(
+                       __xfs_inode_set_reclaim_tag(pag, ip);
+                       goto out_error;
+               }
++
++              write_lock(&pag->pag_ici_lock);
++              spin_lock(&ip->i_flags_lock);
++              ip->i_flags &= ~(XFS_IRECLAIMABLE | XFS_IRECLAIM);
++              ip->i_flags |= XFS_INEW;
++              __xfs_inode_clear_reclaim_tag(mp, pag, ip);
+               inode->i_state = I_LOCK|I_NEW;
++              spin_unlock(&ip->i_flags_lock);
++              write_unlock(&pag->pag_ici_lock);
+       } else {
+               /* If the VFS inode is being torn down, pause and try again. */
+               if (!igrab(inode)) {
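
The discipline the patch enforces is that tag updates modify tree state and therefore need pag_ici_lock held exclusive, while plain lookups may keep using the shared lock. A minimal pthreads model of that rule, with an array of flags standing in for the radix tree tags and all names hypothetical:

#include <pthread.h>
#include <stdio.h>

#define NSLOTS 16

static pthread_rwlock_t tree_lock = PTHREAD_RWLOCK_INITIALIZER;
static int reclaim_tag[NSLOTS];         /* stand-in for the radix tree tag bits */

/* Tag updates modify shared tree state, so take the lock exclusive,
 * not shared as the old code did. */
static void set_reclaim_tag(int slot)
{
        pthread_rwlock_wrlock(&tree_lock);
        reclaim_tag[slot] = 1;
        pthread_rwlock_unlock(&tree_lock);
}

/* Lookups only read, so the shared lock is still enough. */
static int test_reclaim_tag(int slot)
{
        int tagged;

        pthread_rwlock_rdlock(&tree_lock);
        tagged = reclaim_tag[slot];
        pthread_rwlock_unlock(&tree_lock);
        return tagged;
}

int main(void)
{
        set_reclaim_tag(3);
        printf("slot 3 tagged: %d\n", test_reclaim_tag(3));
        return 0;
}
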
diff --git a/queue-2.6.32/xfs-fix-mmap_sem-iolock-inversion-in-xfs_free_eofblocks.patch b/queue-2.6.32/xfs-fix-mmap_sem-iolock-inversion-in-xfs_free_eofblocks.patch
new file mode 100644 (file)
index 0000000..aef2a50
--- /dev/null
@@ -0,0 +1,129 @@
+From david@fromorbit.com  Fri Apr  2 11:05:07 2010
+From: Christoph Hellwig <hch@infradead.org>
+Date: Fri, 12 Mar 2010 09:42:00 +1100
+Subject: xfs: fix mmap_sem/iolock inversion in xfs_free_eofblocks
+To: stable@kernel.org
+Cc: xfs@oss.sgi.com
+Message-ID: <1268347337-7160-3-git-send-email-david@fromorbit.com>
+
+
+From: Christoph Hellwig <hch@infradead.org>
+
+commit c56c9631cbe88f08854a56ff9776c1f310916830 upstream
+
+When xfs_free_eofblocks is called from ->release the VM might already
+hold the mmap_sem, but in the write path we take the iolock before
+taking the mmap_sem in the generic write code.
+
+Switch xfs_free_eofblocks to only trylock the iolock if called from
+->release and skip trimming the preallocated blocks in that case.
+We'll still free them later on the final iput.
+
+Signed-off-by: Christoph Hellwig <hch@lst.de>
+Reviewed-by: Alex Elder <aelder@sgi.com>
+Signed-off-by: Alex Elder <aelder@sgi.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/xfs/xfs_rw.h       |    7 -------
+ fs/xfs/xfs_vnodeops.c |   34 ++++++++++++++++++++++++++--------
+ 2 files changed, 26 insertions(+), 15 deletions(-)
+
+--- a/fs/xfs/xfs_rw.h
++++ b/fs/xfs/xfs_rw.h
+@@ -37,13 +37,6 @@ xfs_fsb_to_db(struct xfs_inode *ip, xfs_
+ }
+ /*
+- * Flags for xfs_free_eofblocks
+- */
+-#define XFS_FREE_EOF_LOCK     (1<<0)
+-#define XFS_FREE_EOF_NOLOCK   (1<<1)
+-
+-
+-/*
+  * helper function to extract extent size hint from inode
+  */
+ STATIC_INLINE xfs_extlen_t
+--- a/fs/xfs/xfs_vnodeops.c
++++ b/fs/xfs/xfs_vnodeops.c
+@@ -709,6 +709,11 @@ xfs_fsync(
+ }
+ /*
++ * Flags for xfs_free_eofblocks
++ */
++#define XFS_FREE_EOF_TRYLOCK  (1<<0)
++
++/*
+  * This is called by xfs_inactive to free any blocks beyond eof
+  * when the link count isn't zero and by xfs_dm_punch_hole() when
+  * punching a hole to EOF.
+@@ -726,7 +731,6 @@ xfs_free_eofblocks(
+       xfs_filblks_t   map_len;
+       int             nimaps;
+       xfs_bmbt_irec_t imap;
+-      int             use_iolock = (flags & XFS_FREE_EOF_LOCK);
+       /*
+        * Figure out if there are any blocks beyond the end
+@@ -768,14 +772,19 @@ xfs_free_eofblocks(
+                * cache and we can't
+                * do that within a transaction.
+                */
+-              if (use_iolock)
++              if (flags & XFS_FREE_EOF_TRYLOCK) {
++                      if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) {
++                              xfs_trans_cancel(tp, 0);
++                              return 0;
++                      }
++              } else {
+                       xfs_ilock(ip, XFS_IOLOCK_EXCL);
++              }
+               error = xfs_itruncate_start(ip, XFS_ITRUNC_DEFINITE,
+                                   ip->i_size);
+               if (error) {
+                       xfs_trans_cancel(tp, 0);
+-                      if (use_iolock)
+-                              xfs_iunlock(ip, XFS_IOLOCK_EXCL);
++                      xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+                       return error;
+               }
+@@ -812,8 +821,7 @@ xfs_free_eofblocks(
+                       error = xfs_trans_commit(tp,
+                                               XFS_TRANS_RELEASE_LOG_RES);
+               }
+-              xfs_iunlock(ip, (use_iolock ? (XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL)
+-                                          : XFS_ILOCK_EXCL));
++              xfs_iunlock(ip, XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL);
+       }
+       return error;
+ }
+@@ -1113,7 +1121,17 @@ xfs_release(
+                    (ip->i_df.if_flags & XFS_IFEXTENTS))  &&
+                   (!(ip->i_d.di_flags &
+                               (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)))) {
+-                      error = xfs_free_eofblocks(mp, ip, XFS_FREE_EOF_LOCK);
++
++                      /*
++                       * If we can't get the iolock just skip truncating
++                       * the blocks past EOF because we could deadlock
++                       * with the mmap_sem otherwise.  We'll get another
++                       * chance to drop them once the last reference to
++                       * the inode is dropped, so we'll never leak blocks
++                       * permanently.
++                       */
++                      error = xfs_free_eofblocks(mp, ip,
++                                                 XFS_FREE_EOF_TRYLOCK);
+                       if (error)
+                               return error;
+               }
+@@ -1184,7 +1202,7 @@ xfs_inactive(
+                    (!(ip->i_d.di_flags &
+                               (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)) ||
+                     (ip->i_delayed_blks != 0)))) {
+-                      error = xfs_free_eofblocks(mp, ip, XFS_FREE_EOF_LOCK);
++                      error = xfs_free_eofblocks(mp, ip, 0);
+                       if (error)
+                               return VN_INACTIVE_CACHE;
+               }
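
Stripped of the XFS details, the fix above is the usual lock-inversion escape hatch: if the optional path cannot take the lock immediately, it skips the work instead of blocking, relying on a later, unconstrained path to do it. A small userspace sketch of that pattern, with a pthread mutex standing in for the iolock and hypothetical names:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t iolock = PTHREAD_MUTEX_INITIALIZER;

/* The optional cleanup: trim blocks past EOF.  From the ->release-like
 * path we only trylock; if somebody else holds the lock we skip the trim
 * now and let a later, unconstrained path (final teardown) do it. */
static int free_eofblocks(int trylock)
{
        if (trylock) {
                if (pthread_mutex_trylock(&iolock) != 0)
                        return 0;       /* cannot lock safely here: do nothing */
        } else {
                pthread_mutex_lock(&iolock);
        }

        printf("trimming blocks past EOF\n");
        pthread_mutex_unlock(&iolock);
        return 0;
}

int main(void)
{
        pthread_mutex_lock(&iolock);    /* simulate a writer holding the lock */
        free_eofblocks(1);              /* release path: silently skips */
        pthread_mutex_unlock(&iolock);

        free_eofblocks(0);              /* teardown path: blocks and trims */
        return 0;
}
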
diff --git a/queue-2.6.32/xfs-fix-stale-inode-flush-avoidance.patch b/queue-2.6.32/xfs-fix-stale-inode-flush-avoidance.patch
new file mode 100644 (file)
index 0000000..6cc19af
--- /dev/null
@@ -0,0 +1,64 @@
+From david@fromorbit.com  Fri Apr  2 11:10:21 2010
+From: Dave Chinner <david@fromorbit.com>
+Date: Fri, 12 Mar 2010 09:42:11 +1100
+Subject: xfs: fix stale inode flush avoidance
+To: stable@kernel.org
+Cc: xfs@oss.sgi.com
+Message-ID: <1268347337-7160-14-git-send-email-david@fromorbit.com>
+
+From: Dave Chinner <david@fromorbit.com>
+
+commit 4b6a46882cca8349e8942e2650c33b11bc571c92 upstream
+
+When reclaiming stale inodes, we need to guarantee that inodes are
+unpinned before returning with a "clean" status. If we don't we can
+reclaim inodes that are pinned, leading to use after free in the
+transaction subsystem as transactions complete.
+
+Signed-off-by: Dave Chinner <david@fromorbit.com>
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Signed-off-by: Alex Elder <aelder@sgi.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ fs/xfs/xfs_inode.c |   21 +++++++++++++++------
+ 1 file changed, 15 insertions(+), 6 deletions(-)
+
+--- a/fs/xfs/xfs_inode.c
++++ b/fs/xfs/xfs_inode.c
+@@ -2878,13 +2878,9 @@ xfs_iflush(
+       /*
+        * If the inode isn't dirty, then just release the inode flush lock and
+-       * do nothing. Treat stale inodes the same; we cannot rely on the
+-       * backing buffer remaining stale in cache for the remaining life of
+-       * the stale inode and so xfs_itobp() below may give us a buffer that
+-       * no longer contains inodes below. Doing this stale check here also
+-       * avoids forcing the log on pinned, stale inodes.
++       * do nothing.
+        */
+-      if (xfs_inode_clean(ip) || xfs_iflags_test(ip, XFS_ISTALE)) {
++      if (xfs_inode_clean(ip)) {
+               xfs_ifunlock(ip);
+               return 0;
+       }
+@@ -2908,6 +2904,19 @@ xfs_iflush(
+       xfs_iunpin_wait(ip);
+       /*
++       * For stale inodes we cannot rely on the backing buffer remaining
++       * stale in cache for the remaining life of the stale inode and so
++       * xfs_itobp() below may give us a buffer that no longer contains
++       * inodes below. We have to check this after ensuring the inode is
++       * unpinned so that it is safe to reclaim the stale inode after the
++       * flush call.
++       */
++      if (xfs_iflags_test(ip, XFS_ISTALE)) {
++              xfs_ifunlock(ip);
++              return 0;
++      }
++
++      /*
+        * This may have been unpinned because the filesystem is shutting
+        * down forcibly. If that's the case we must not write this inode
+        * to disk, because the log record didn't make it to disk!
diff --git a/queue-2.6.32/xfs-fix-timestamp-handling-in-xfs_setattr.patch b/queue-2.6.32/xfs-fix-timestamp-handling-in-xfs_setattr.patch
new file mode 100644 (file)
index 0000000..025a4a1
--- /dev/null
@@ -0,0 +1,215 @@
+From david@fromorbit.com  Fri Apr  2 11:07:34 2010
+From: Christoph Hellwig <hch@infradead.org>
+Date: Fri, 12 Mar 2010 09:42:05 +1100
+Subject: xfs: fix timestamp handling in xfs_setattr
+To: stable@kernel.org
+Cc: xfs@oss.sgi.com
+Message-ID: <1268347337-7160-8-git-send-email-david@fromorbit.com>
+
+
+From: Christoph Hellwig <hch@infradead.org>
+
+commit d6d59bada372bcf8bd36c3bbc71c485c29dd2a4b upstream
+
+We currently have some rather odd code in xfs_setattr for
+updating the a/c/mtime timestamps:
+
+ - first we do a non-transaction update if all three are updated
+   together
+ - second we implicitly update the ctime for various changes
+   instead of relying on the ATTR_CTIME flag
+ - third we set the timestamps to the current time instead of the
+   arguments in the iattr structure in many cases.
+
+This patch makes sure we update it in a consistent way:
+
+ - always transactional
+ - ctime is only updated if ATTR_CTIME is set or we do a size
+   update, which is a special case
+ - always to the times passed in from the caller instead of the
+   current time
+
+The only non-size caller of xfs_setattr that doesn't come from
+the VFS is updated to set ATTR_CTIME and pass in a valid ctime
+value.
+
+Reported-by: Eric Blake <ebb9@byu.net>
+Signed-off-by: Christoph Hellwig <hch@lst.de>
+Signed-off-by: Alex Elder <aelder@sgi.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ fs/xfs/linux-2.6/xfs_acl.c |    3 -
+ fs/xfs/xfs_vnodeops.c      |   93 ++++++++++++++++++---------------------------
+ 2 files changed, 41 insertions(+), 55 deletions(-)
+
+--- a/fs/xfs/linux-2.6/xfs_acl.c
++++ b/fs/xfs/linux-2.6/xfs_acl.c
+@@ -250,8 +250,9 @@ xfs_set_mode(struct inode *inode, mode_t
+       if (mode != inode->i_mode) {
+               struct iattr iattr;
+-              iattr.ia_valid = ATTR_MODE;
++              iattr.ia_valid = ATTR_MODE | ATTR_CTIME;
+               iattr.ia_mode = mode;
++              iattr.ia_ctime = current_fs_time(inode->i_sb);
+               error = -xfs_setattr(XFS_I(inode), &iattr, XFS_ATTR_NOACL);
+       }
+--- a/fs/xfs/xfs_vnodeops.c
++++ b/fs/xfs/xfs_vnodeops.c
+@@ -69,7 +69,6 @@ xfs_setattr(
+       uint                    commit_flags=0;
+       uid_t                   uid=0, iuid=0;
+       gid_t                   gid=0, igid=0;
+-      int                     timeflags = 0;
+       struct xfs_dquot        *udqp, *gdqp, *olddquot1, *olddquot2;
+       int                     need_iolock = 1;
+@@ -134,16 +133,13 @@ xfs_setattr(
+       if (flags & XFS_ATTR_NOLOCK)
+               need_iolock = 0;
+       if (!(mask & ATTR_SIZE)) {
+-              if ((mask != (ATTR_CTIME|ATTR_ATIME|ATTR_MTIME)) ||
+-                  (mp->m_flags & XFS_MOUNT_WSYNC)) {
+-                      tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE);
+-                      commit_flags = 0;
+-                      if ((code = xfs_trans_reserve(tp, 0,
+-                                                   XFS_ICHANGE_LOG_RES(mp), 0,
+-                                                   0, 0))) {
+-                              lock_flags = 0;
+-                              goto error_return;
+-                      }
++              tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE);
++              commit_flags = 0;
++              code = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp),
++                                       0, 0, 0);
++              if (code) {
++                      lock_flags = 0;
++                      goto error_return;
+               }
+       } else {
+               if (DM_EVENT_ENABLED(ip, DM_EVENT_TRUNCATE) &&
+@@ -294,15 +290,23 @@ xfs_setattr(
+                * or we are explicitly asked to change it. This handles
+                * the semantic difference between truncate() and ftruncate()
+                * as implemented in the VFS.
++               *
++               * The regular truncate() case without ATTR_CTIME and ATTR_MTIME
++               * is a special case where we need to update the times despite
++               * not having these flags set.  For all other operations the
++               * VFS set these flags explicitly if it wants a timestamp
++               * update.
+                */
+-              if (iattr->ia_size != ip->i_size || (mask & ATTR_CTIME))
+-                      timeflags |= XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG;
++              if (iattr->ia_size != ip->i_size &&
++                  (!(mask & (ATTR_CTIME | ATTR_MTIME)))) {
++                      iattr->ia_ctime = iattr->ia_mtime =
++                              current_fs_time(inode->i_sb);
++                      mask |= ATTR_CTIME | ATTR_MTIME;
++              }
+               if (iattr->ia_size > ip->i_size) {
+                       ip->i_d.di_size = iattr->ia_size;
+                       ip->i_size = iattr->ia_size;
+-                      if (!(flags & XFS_ATTR_DMI))
+-                              xfs_ichgtime(ip, XFS_ICHGTIME_CHG);
+                       xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+               } else if (iattr->ia_size <= ip->i_size ||
+                          (iattr->ia_size == 0 && ip->i_d.di_nextents)) {
+@@ -373,9 +377,6 @@ xfs_setattr(
+                       ip->i_d.di_gid = gid;
+                       inode->i_gid = gid;
+               }
+-
+-              xfs_trans_log_inode (tp, ip, XFS_ILOG_CORE);
+-              timeflags |= XFS_ICHGTIME_CHG;
+       }
+       /*
+@@ -392,51 +393,37 @@ xfs_setattr(
+               inode->i_mode &= S_IFMT;
+               inode->i_mode |= mode & ~S_IFMT;
+-
+-              xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+-              timeflags |= XFS_ICHGTIME_CHG;
+       }
+       /*
+        * Change file access or modified times.
+        */
+-      if (mask & (ATTR_ATIME|ATTR_MTIME)) {
+-              if (mask & ATTR_ATIME) {
+-                      inode->i_atime = iattr->ia_atime;
+-                      ip->i_d.di_atime.t_sec = iattr->ia_atime.tv_sec;
+-                      ip->i_d.di_atime.t_nsec = iattr->ia_atime.tv_nsec;
+-                      ip->i_update_core = 1;
+-              }
+-              if (mask & ATTR_MTIME) {
+-                      inode->i_mtime = iattr->ia_mtime;
+-                      ip->i_d.di_mtime.t_sec = iattr->ia_mtime.tv_sec;
+-                      ip->i_d.di_mtime.t_nsec = iattr->ia_mtime.tv_nsec;
+-                      timeflags &= ~XFS_ICHGTIME_MOD;
+-                      timeflags |= XFS_ICHGTIME_CHG;
+-              }
+-              if (tp && (mask & (ATTR_MTIME_SET|ATTR_ATIME_SET)))
+-                      xfs_trans_log_inode (tp, ip, XFS_ILOG_CORE);
++      if (mask & ATTR_ATIME) {
++              inode->i_atime = iattr->ia_atime;
++              ip->i_d.di_atime.t_sec = iattr->ia_atime.tv_sec;
++              ip->i_d.di_atime.t_nsec = iattr->ia_atime.tv_nsec;
++              ip->i_update_core = 1;
+       }
+-
+-      /*
+-       * Change file inode change time only if ATTR_CTIME set
+-       * AND we have been called by a DMI function.
+-       */
+-
+-      if ((flags & XFS_ATTR_DMI) && (mask & ATTR_CTIME)) {
++      if (mask & ATTR_CTIME) {
+               inode->i_ctime = iattr->ia_ctime;
+               ip->i_d.di_ctime.t_sec = iattr->ia_ctime.tv_sec;
+               ip->i_d.di_ctime.t_nsec = iattr->ia_ctime.tv_nsec;
+               ip->i_update_core = 1;
+-              timeflags &= ~XFS_ICHGTIME_CHG;
++      }
++      if (mask & ATTR_MTIME) {
++              inode->i_mtime = iattr->ia_mtime;
++              ip->i_d.di_mtime.t_sec = iattr->ia_mtime.tv_sec;
++              ip->i_d.di_mtime.t_nsec = iattr->ia_mtime.tv_nsec;
++              ip->i_update_core = 1;
+       }
+       /*
+-       * Send out timestamp changes that need to be set to the
+-       * current time.  Not done when called by a DMI function.
++       * And finally, log the inode core if any attribute in it
++       * has been changed.
+        */
+-      if (timeflags && !(flags & XFS_ATTR_DMI))
+-              xfs_ichgtime(ip, timeflags);
++      if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE|
++                  ATTR_ATIME|ATTR_CTIME|ATTR_MTIME))
++              xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+       XFS_STATS_INC(xs_ig_attrchg);
+@@ -451,12 +438,10 @@ xfs_setattr(
+        * mix so this probably isn't worth the trouble to optimize.
+        */
+       code = 0;
+-      if (tp) {
+-              if (mp->m_flags & XFS_MOUNT_WSYNC)
+-                      xfs_trans_set_sync(tp);
++      if (mp->m_flags & XFS_MOUNT_WSYNC)
++              xfs_trans_set_sync(tp);
+-              code = xfs_trans_commit(tp, commit_flags);
+-      }
++      code = xfs_trans_commit(tp, commit_flags);
+       xfs_iunlock(ip, lock_flags);
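
After this patch the timestamp updates are purely mask-driven, with one special case: a size change requested without ATTR_CTIME or ATTR_MTIME (a plain truncate()) gets the current time stamped into both. A compact sketch of that decision, using plain time_t values and hypothetical names:

#include <stdio.h>
#include <time.h>

#define ATTR_SIZE  (1 << 0)
#define ATTR_ATIME (1 << 1)
#define ATTR_MTIME (1 << 2)
#define ATTR_CTIME (1 << 3)

struct times { time_t atime, mtime, ctime; };

/* Apply caller-supplied timestamps according to the mask.  A size change
 * with neither ATTR_CTIME nor ATTR_MTIME set (plain truncate()) is the
 * one case where the current time is substituted for both. */
static void apply_times(struct times *on_disk, struct times *req,
                        unsigned int mask, int size_changed)
{
        if ((mask & ATTR_SIZE) && size_changed &&
            !(mask & (ATTR_CTIME | ATTR_MTIME))) {
                req->ctime = req->mtime = time(NULL);
                mask |= ATTR_CTIME | ATTR_MTIME;
        }

        if (mask & ATTR_ATIME)
                on_disk->atime = req->atime;
        if (mask & ATTR_MTIME)
                on_disk->mtime = req->mtime;
        if (mask & ATTR_CTIME)
                on_disk->ctime = req->ctime;
}

int main(void)
{
        struct times disk = { 0, 0, 0 };
        struct times req  = { 0, 0, 0 };

        apply_times(&disk, &req, ATTR_SIZE, 1); /* truncate(): stamps "now" */
        printf("ctime after truncate: %ld\n", (long)disk.ctime);
        return 0;
}
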
diff --git a/queue-2.6.32/xfs-i-o-completion-handlers-must-use-nofs-allocations.patch b/queue-2.6.32/xfs-i-o-completion-handlers-must-use-nofs-allocations.patch
new file mode 100644 (file)
index 0000000..f215dec
--- /dev/null
@@ -0,0 +1,112 @@
+From david@fromorbit.com  Fri Apr  2 11:05:39 2010
+From: Christoph Hellwig <hch@infradead.org>
+Date: Fri, 12 Mar 2010 09:42:01 +1100
+Subject: xfs: I/O completion handlers must use NOFS allocations
+To: stable@kernel.org
+Cc: xfs@oss.sgi.com
+Message-ID: <1268347337-7160-4-git-send-email-david@fromorbit.com>
+
+
+From: Christoph Hellwig <hch@infradead.org>
+
+commit 80641dc66a2d6dfb22af4413227a92b8ab84c7bb upstream
+
+When completing I/O requests we must not allow the memory allocator to
+recurse into the filesystem, as we might deadlock on waiting for the
+I/O completion otherwise.  The only thing currently allocating normal
+GFP_KERNEL memory is the allocation of the transaction structure for
+the unwritten extent conversion.  Add a memflags argument to
+_xfs_trans_alloc to allow controlling the allocator behaviour.
+
+Signed-off-by: Christoph Hellwig <hch@lst.de>
+Reported-by: Thomas Neumann <tneumann@users.sourceforge.net>
+Tested-by: Thomas Neumann <tneumann@users.sourceforge.net>
+Reviewed-by: Alex Elder <aelder@sgi.com>
+Signed-off-by: Alex Elder <aelder@sgi.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/xfs/xfs_fsops.c |    2 +-
+ fs/xfs/xfs_iomap.c |    9 ++++++++-
+ fs/xfs/xfs_mount.c |    2 +-
+ fs/xfs/xfs_trans.c |    7 ++++---
+ fs/xfs/xfs_trans.h |    2 +-
+ 5 files changed, 15 insertions(+), 7 deletions(-)
+
+--- a/fs/xfs/xfs_fsops.c
++++ b/fs/xfs/xfs_fsops.c
+@@ -611,7 +611,7 @@ xfs_fs_log_dummy(
+       xfs_inode_t     *ip;
+       int             error;
+-      tp = _xfs_trans_alloc(mp, XFS_TRANS_DUMMY1);
++      tp = _xfs_trans_alloc(mp, XFS_TRANS_DUMMY1, KM_SLEEP);
+       error = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp), 0, 0, 0);
+       if (error) {
+               xfs_trans_cancel(tp, 0);
+--- a/fs/xfs/xfs_iomap.c
++++ b/fs/xfs/xfs_iomap.c
+@@ -860,8 +860,15 @@ xfs_iomap_write_unwritten(
+                * set up a transaction to convert the range of extents
+                * from unwritten to real. Do allocations in a loop until
+                * we have covered the range passed in.
++               *
++               * Note that we open code the transaction allocation here
++               * to pass KM_NOFS--we can't risk to recursing back into
++               * the filesystem here as we might be asked to write out
++               * the same inode that we complete here and might deadlock
++               * on the iolock.
+                */
+-              tp = xfs_trans_alloc(mp, XFS_TRANS_STRAT_WRITE);
++              xfs_wait_for_freeze(mp, SB_FREEZE_TRANS);
++              tp = _xfs_trans_alloc(mp, XFS_TRANS_STRAT_WRITE, KM_NOFS);
+               tp->t_flags |= XFS_TRANS_RESERVE;
+               error = xfs_trans_reserve(tp, resblks,
+                               XFS_WRITE_LOG_RES(mp), 0,
+--- a/fs/xfs/xfs_mount.c
++++ b/fs/xfs/xfs_mount.c
+@@ -1471,7 +1471,7 @@ xfs_log_sbcount(
+       if (!xfs_sb_version_haslazysbcount(&mp->m_sb))
+               return 0;
+-      tp = _xfs_trans_alloc(mp, XFS_TRANS_SB_COUNT);
++      tp = _xfs_trans_alloc(mp, XFS_TRANS_SB_COUNT, KM_SLEEP);
+       error = xfs_trans_reserve(tp, 0, mp->m_sb.sb_sectsize + 128, 0, 0,
+                                       XFS_DEFAULT_LOG_COUNT);
+       if (error) {
+--- a/fs/xfs/xfs_trans.c
++++ b/fs/xfs/xfs_trans.c
+@@ -236,19 +236,20 @@ xfs_trans_alloc(
+       uint            type)
+ {
+       xfs_wait_for_freeze(mp, SB_FREEZE_TRANS);
+-      return _xfs_trans_alloc(mp, type);
++      return _xfs_trans_alloc(mp, type, KM_SLEEP);
+ }
+ xfs_trans_t *
+ _xfs_trans_alloc(
+       xfs_mount_t     *mp,
+-      uint            type)
++      uint            type,
++      uint            memflags)
+ {
+       xfs_trans_t     *tp;
+       atomic_inc(&mp->m_active_trans);
+-      tp = kmem_zone_zalloc(xfs_trans_zone, KM_SLEEP);
++      tp = kmem_zone_zalloc(xfs_trans_zone, memflags);
+       tp->t_magic = XFS_TRANS_MAGIC;
+       tp->t_type = type;
+       tp->t_mountp = mp;
+--- a/fs/xfs/xfs_trans.h
++++ b/fs/xfs/xfs_trans.h
+@@ -924,7 +924,7 @@ typedef struct xfs_trans {
+  * XFS transaction mechanism exported interfaces.
+  */
+ xfs_trans_t   *xfs_trans_alloc(struct xfs_mount *, uint);
+-xfs_trans_t   *_xfs_trans_alloc(struct xfs_mount *, uint);
++xfs_trans_t   *_xfs_trans_alloc(struct xfs_mount *, uint, uint);
+ xfs_trans_t   *xfs_trans_dup(xfs_trans_t *);
+ int           xfs_trans_reserve(xfs_trans_t *, uint, uint, uint,
+                                 uint, uint);
diff --git a/queue-2.6.32/xfs-non-blocking-inode-locking-in-io-completion.patch b/queue-2.6.32/xfs-non-blocking-inode-locking-in-io-completion.patch
new file mode 100644 (file)
index 0000000..f445c70
--- /dev/null
@@ -0,0 +1,222 @@
+From david@fromorbit.com  Fri Apr  2 11:12:28 2010
+From: Dave Chinner <david@fromorbit.com>
+Date: Fri, 12 Mar 2010 09:42:16 +1100
+Subject: xfs: Non-blocking inode locking in IO completion
+To: stable@kernel.org
+Cc: xfs@oss.sgi.com
+Message-ID: <1268347337-7160-19-git-send-email-david@fromorbit.com>
+
+From: Dave Chinner <david@fromorbit.com>
+
+commit 77d7a0c2eeb285c9069e15396703d0cb9690ac50 upstream
+
+The introduction of barriers to loop devices has created a new IO
+order completion dependency that XFS does not handle. The loop
+device implements barriers using fsync and so turns a log IO in the
+XFS filesystem on the loop device into a data IO in the backing
+filesystem. That is, the completion of log IOs in the loop
+filesystem are now dependent on completion of data IO in the backing
+filesystem.
+
+This can cause deadlocks when a flush daemon issues a log force with
+an inode locked because the IO completion of IO on the inode is
+blocked by the inode lock. This in turn prevents further data IO
+completion from occurring on all XFS filesystems on that CPU (due to
+the shared nature of the completion queues). This then prevents the
+log IO from completing because the log is waiting for data IO
+completion as well.
+
+The fix for this new completion order dependency issue is to make
+the IO completion inode locking non-blocking. If the inode lock
+can't be grabbed, simply requeue the IO completion back to the work
+queue so that it can be processed later. This prevents the
+completion queue from being blocked and allows data IO completion on
+other inodes to proceed, hence avoiding completion order dependent
+deadlocks.
+
+Signed-off-by: Dave Chinner <david@fromorbit.com>
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Signed-off-by: Alex Elder <aelder@sgi.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/xfs/linux-2.6/xfs_aops.c |  118 ++++++++++++++++++++++++++++++--------------
+ 1 file changed, 82 insertions(+), 36 deletions(-)
+
+--- a/fs/xfs/linux-2.6/xfs_aops.c
++++ b/fs/xfs/linux-2.6/xfs_aops.c
+@@ -204,14 +204,17 @@ xfs_ioend_new_eof(
+ }
+ /*
+- * Update on-disk file size now that data has been written to disk.
+- * The current in-memory file size is i_size.  If a write is beyond
+- * eof i_new_size will be the intended file size until i_size is
+- * updated.  If this write does not extend all the way to the valid
+- * file size then restrict this update to the end of the write.
++ * Update on-disk file size now that data has been written to disk.  The
++ * current in-memory file size is i_size.  If a write is beyond eof i_new_size
++ * will be the intended file size until i_size is updated.  If this write does
++ * not extend all the way to the valid file size then restrict this update to
++ * the end of the write.
++ *
++ * This function does not block as blocking on the inode lock in IO completion
++ * can lead to IO completion order dependency deadlocks.. If it can't get the
++ * inode ilock it will return EAGAIN. Callers must handle this.
+  */
+-
+-STATIC void
++STATIC int
+ xfs_setfilesize(
+       xfs_ioend_t             *ioend)
+ {
+@@ -222,9 +225,11 @@ xfs_setfilesize(
+       ASSERT(ioend->io_type != IOMAP_READ);
+       if (unlikely(ioend->io_error))
+-              return;
++              return 0;
++
++      if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL))
++              return EAGAIN;
+-      xfs_ilock(ip, XFS_ILOCK_EXCL);
+       isize = xfs_ioend_new_eof(ioend);
+       if (isize) {
+               ip->i_d.di_size = isize;
+@@ -232,6 +237,28 @@ xfs_setfilesize(
+       }
+       xfs_iunlock(ip, XFS_ILOCK_EXCL);
++      return 0;
++}
++
++/*
++ * Schedule IO completion handling on a xfsdatad if this was
++ * the final hold on this ioend. If we are asked to wait,
++ * flush the workqueue.
++ */
++STATIC void
++xfs_finish_ioend(
++      xfs_ioend_t     *ioend,
++      int             wait)
++{
++      if (atomic_dec_and_test(&ioend->io_remaining)) {
++              struct workqueue_struct *wq;
++
++              wq = (ioend->io_type == IOMAP_UNWRITTEN) ?
++                      xfsconvertd_workqueue : xfsdatad_workqueue;
++              queue_work(wq, &ioend->io_work);
++              if (wait)
++                      flush_workqueue(wq);
++      }
+ }
+ /*
+@@ -243,9 +270,23 @@ xfs_end_bio_delalloc(
+ {
+       xfs_ioend_t             *ioend =
+               container_of(work, xfs_ioend_t, io_work);
++      int                     error;
+-      xfs_setfilesize(ioend);
+-      xfs_destroy_ioend(ioend);
++      /*
++       * If we didn't complete processing of the ioend, requeue it to the
++       * tail of the workqueue for another attempt later. Otherwise destroy
++       * it.
++       */
++      error = xfs_setfilesize(ioend);
++      if (error == EAGAIN) {
++              atomic_inc(&ioend->io_remaining);
++              xfs_finish_ioend(ioend, 0);
++              /* ensure we don't spin on blocked ioends */
++              delay(1);
++      } else {
++              ASSERT(!error);
++              xfs_destroy_ioend(ioend);
++      }
+ }
+ /*
+@@ -257,9 +298,23 @@ xfs_end_bio_written(
+ {
+       xfs_ioend_t             *ioend =
+               container_of(work, xfs_ioend_t, io_work);
++      int                     error;
+-      xfs_setfilesize(ioend);
+-      xfs_destroy_ioend(ioend);
++      /*
++       * If we didn't complete processing of the ioend, requeue it to the
++       * tail of the workqueue for another attempt later. Otherwise destroy
++       * it.
++       */
++      error = xfs_setfilesize(ioend);
++      if (error == EAGAIN) {
++              atomic_inc(&ioend->io_remaining);
++              xfs_finish_ioend(ioend, 0);
++              /* ensure we don't spin on blocked ioends */
++              delay(1);
++      } else {
++              ASSERT(!error);
++              xfs_destroy_ioend(ioend);
++      }
+ }
+ /*
+@@ -279,13 +334,25 @@ xfs_end_bio_unwritten(
+       size_t                  size = ioend->io_size;
+       if (likely(!ioend->io_error)) {
++              int     error;
+               if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
+-                      int error;
+                       error = xfs_iomap_write_unwritten(ip, offset, size);
+                       if (error)
+                               ioend->io_error = error;
+               }
+-              xfs_setfilesize(ioend);
++              /*
++               * If we didn't complete processing of the ioend, requeue it to the
++               * tail of the workqueue for another attempt later. Otherwise destroy
++               * it.
++               */
++              error = xfs_setfilesize(ioend);
++              if (error == EAGAIN) {
++                      atomic_inc(&ioend->io_remaining);
++                      xfs_finish_ioend(ioend, 0);
++                      /* ensure we don't spin on blocked ioends */
++                      delay(1);
++                      return;
++              }
+       }
+       xfs_destroy_ioend(ioend);
+ }
+@@ -304,27 +371,6 @@ xfs_end_bio_read(
+ }
+ /*
+- * Schedule IO completion handling on a xfsdatad if this was
+- * the final hold on this ioend. If we are asked to wait,
+- * flush the workqueue.
+- */
+-STATIC void
+-xfs_finish_ioend(
+-      xfs_ioend_t     *ioend,
+-      int             wait)
+-{
+-      if (atomic_dec_and_test(&ioend->io_remaining)) {
+-              struct workqueue_struct *wq = xfsdatad_workqueue;
+-              if (ioend->io_work.func == xfs_end_bio_unwritten)
+-                      wq = xfsconvertd_workqueue;
+-
+-              queue_work(wq, &ioend->io_work);
+-              if (wait)
+-                      flush_workqueue(wq);
+-      }
+-}
+-
+-/*
+  * Allocate and initialise an IO completion structure.
+  * We need to track unwritten extent write completion here initially.
+  * We'll need to extend this for updating the ondisk inode size later
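
Reduced to its core, the change above is a non-blocking completion pattern: try the inode lock, and on failure hand the work item back to the queue rather than blocking the completion thread. The single-threaded model below illustrates the retry loop; names are hypothetical, and the real code requeues onto a workqueue with a small delay instead of spinning in place.

#include <errno.h>
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t ilock = PTHREAD_MUTEX_INITIALIZER;

/* Try to update the on-disk size.  Never blocks: returns EAGAIN when the
 * inode lock cannot be taken right now. */
static int setfilesize_nowait(long long *disk_size, long long new_size)
{
        if (pthread_mutex_trylock(&ilock) != 0)
                return EAGAIN;
        if (new_size > *disk_size)
                *disk_size = new_size;
        pthread_mutex_unlock(&ilock);
        return 0;
}

int main(void)
{
        long long disk_size = 0;
        int pending = 1;                /* one completion queued */
        int attempts = 0;

        pthread_mutex_lock(&ilock);     /* someone else holds the inode lock */
        while (pending) {
                if (setfilesize_nowait(&disk_size, 4096) == EAGAIN) {
                        attempts++;     /* requeue instead of blocking */
                        if (attempts == 3)
                                pthread_mutex_unlock(&ilock); /* holder drops it */
                        continue;
                }
                pending = 0;            /* completed: destroy the ioend */
        }
        printf("disk size %lld after %d retries\n", disk_size, attempts);
        return 0;
}
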
diff --git a/queue-2.6.32/xfs-quota-limit-statvfs-available-blocks.patch b/queue-2.6.32/xfs-quota-limit-statvfs-available-blocks.patch
new file mode 100644 (file)
index 0000000..055cfcc
--- /dev/null
@@ -0,0 +1,38 @@
+From david@fromorbit.com  Fri Apr  2 11:11:19 2010
+From: Christoph Hellwig <hch@infradead.org>
+Date: Fri, 12 Mar 2010 09:42:13 +1100
+Subject: xfs: quota limit statvfs available blocks
+To: stable@kernel.org
+Cc: xfs@oss.sgi.com
+Message-ID: <1268347337-7160-16-git-send-email-david@fromorbit.com>
+
+
+From: Christoph Hellwig <hch@infradead.org>
+
+commit 9b00f30762fe9f914eb6e03057a616ed63a4e8ca upstream
+
+A "df" run on an NFS client of an exported XFS file system reports
+the wrong information for "available" blocks.  When a block quota is
+enforced, the amount reported as free is limited by the quota, but
+the amount reported available is not (and should be).
+
+Reported-by: Guk-Bong, Kwon <gbkwon@gmail.com>
+Signed-off-by: Christoph Hellwig <hch@lst.de>
+Signed-off-by: Alex Elder <aelder@sgi.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ fs/xfs/quota/xfs_qm_bhv.c |    2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/fs/xfs/quota/xfs_qm_bhv.c
++++ b/fs/xfs/quota/xfs_qm_bhv.c
+@@ -59,7 +59,7 @@ xfs_fill_statvfs_from_dquot(
+               be64_to_cpu(dp->d_blk_hardlimit);
+       if (limit && statp->f_blocks > limit) {
+               statp->f_blocks = limit;
+-              statp->f_bfree =
++              statp->f_bfree = statp->f_bavail =
+                       (statp->f_blocks > be64_to_cpu(dp->d_bcount)) ?
+                        (statp->f_blocks - be64_to_cpu(dp->d_bcount)) : 0;
+       }
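A rough userspace illustration of the rule this one-line change enforces:
once a block quota limit caps f_blocks, both f_bfree and f_bavail have to be
derived from the quota, otherwise df on an NFS client keeps reporting the
filesystem-wide "available" value. The struct, helper and numbers below are
made up for the example; only the clamping arithmetic mirrors the hunk.

#include <stdint.h>
#include <stdio.h>

struct toy_statfs {
        uint64_t f_blocks;      /* size of fs, in blocks */
        uint64_t f_bfree;       /* free blocks */
        uint64_t f_bavail;      /* free blocks available to unprivileged users */
};

static void fill_statfs_from_dquot(struct toy_statfs *st,
                                   uint64_t blk_limit, uint64_t bcount)
{
        if (blk_limit && st->f_blocks > blk_limit) {
                st->f_blocks = blk_limit;
                /* the fix: clamp f_bavail together with f_bfree */
                st->f_bfree = st->f_bavail =
                        (st->f_blocks > bcount) ?
                        (st->f_blocks - bcount) : 0;
        }
}

int main(void)
{
        struct toy_statfs st = {
                .f_blocks = 1000000, .f_bfree = 800000, .f_bavail = 800000,
        };

        fill_statfs_from_dquot(&st, 5000 /* quota limit */, 1200 /* used */);
        printf("blocks=%llu free=%llu avail=%llu\n",
               (unsigned long long)st.f_blocks,
               (unsigned long long)st.f_bfree,
               (unsigned long long)st.f_bavail);
        return 0;
}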
diff --git a/queue-2.6.32/xfs-reclaim-all-inodes-by-background-tree-walks.patch b/queue-2.6.32/xfs-reclaim-all-inodes-by-background-tree-walks.patch
new file mode 100644 (file)
index 0000000..7610deb
--- /dev/null
@@ -0,0 +1,64 @@
+From david@fromorbit.com  Fri Apr  2 11:09:55 2010
+From: Dave Chinner <david@fromorbit.com>
+Date: Fri, 12 Mar 2010 09:42:10 +1100
+Subject: xfs: reclaim all inodes by background tree walks
+To: stable@kernel.org
+Cc: xfs@oss.sgi.com
+Message-ID: <1268347337-7160-13-git-send-email-david@fromorbit.com>
+
+From: Dave Chinner <david@fromorbit.com>
+
+commit 57817c68229984818fea9e614d6f95249c3fb098 upstream
+
+We cannot do direct inode reclaim without taking the flush lock to
+ensure that we do not reclaim an inode under IO. We check the inode
+is clean before doing direct reclaim, but this is not good enough
+because the inode flush code marks the inode clean once it has
+copied the in-core dirty state to the backing buffer.
+
+It is the flush lock that determines whether the inode is still
+under IO, even though it is marked clean, and the inode is still
+required at IO completion so we can't reclaim it even though it is
+clean in core. Hence the requirement that we need to take the flush
+lock even on clean inodes because this guarantees that the inode
+writeback IO has completed and it is safe to reclaim the inode.
+
+With delayed write inode flushing, we could end up waiting a long
+time on the flush lock even for a clean inode. The background
+reclaim already handles this efficiently, so avoid all the problems
+by killing the direct reclaim path altogether.
+
+Signed-off-by: Dave Chinner <david@fromorbit.com>
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Signed-off-by: Alex Elder <aelder@sgi.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ fs/xfs/linux-2.6/xfs_super.c |   14 ++++++--------
+ 1 file changed, 6 insertions(+), 8 deletions(-)
+
+--- a/fs/xfs/linux-2.6/xfs_super.c
++++ b/fs/xfs/linux-2.6/xfs_super.c
+@@ -953,16 +953,14 @@ xfs_fs_destroy_inode(
+       ASSERT_ALWAYS(!xfs_iflags_test(ip, XFS_IRECLAIM));
+       /*
+-       * If we have nothing to flush with this inode then complete the
+-       * teardown now, otherwise delay the flush operation.
++       * We always use background reclaim here because even if the
++       * inode is clean, it still may be under IO and hence we have
++       * to take the flush lock. The background reclaim path handles
++       * this more efficiently than we can here, so simply let background
++       * reclaim tear down all inodes.
+        */
+-      if (!xfs_inode_clean(ip)) {
+-              xfs_inode_set_reclaim_tag(ip);
+-              return;
+-      }
+-
+ out_reclaim:
+-      xfs_ireclaim(ip);
++      xfs_inode_set_reclaim_tag(ip);
+ }
+ /*
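The argument above hinges on one detail: the flush code marks the inode
clean as soon as the dirty state has been copied into the backing buffer,
so "clean" does not mean "safe to free". A toy model of that distinction,
with invented helpers standing in for the flush lock:

#include <stdbool.h>
#include <stdio.h>

struct toy_inode {
        bool dirty;             /* in-core state not yet copied out */
        bool flush_locked;      /* writeback I/O still in flight */
};

static void start_flush(struct toy_inode *ip)
{
        ip->flush_locked = true;        /* I/O submitted, not yet complete */
        ip->dirty = false;              /* state copied to backing buffer */
}

static bool safe_to_reclaim(const struct toy_inode *ip)
{
        /* reclaim has to wait on the flush lock, not just check "clean" */
        return !ip->dirty && !ip->flush_locked;
}

int main(void)
{
        struct toy_inode ip = { .dirty = true };

        start_flush(&ip);
        printf("clean=%d safe=%d\n", !ip.dirty, safe_to_reclaim(&ip));

        ip.flush_locked = false;        /* I/O completion drops the flush lock */
        printf("clean=%d safe=%d\n", !ip.dirty, safe_to_reclaim(&ip));
        return 0;
}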
diff --git a/queue-2.6.32/xfs-reclaim-inodes-under-a-write-lock.patch b/queue-2.6.32/xfs-reclaim-inodes-under-a-write-lock.patch
new file mode 100644 (file)
index 0000000..dbe7c01
--- /dev/null
@@ -0,0 +1,309 @@
+From david@fromorbit.com  Fri Apr  2 11:09:00 2010
+From: Dave Chinner <david@fromorbit.com>
+Date: Fri, 12 Mar 2010 09:42:08 +1100
+Subject: xfs: reclaim inodes under a write lock
+To: stable@kernel.org
+Cc: xfs@oss.sgi.com
+Message-ID: <1268347337-7160-11-git-send-email-david@fromorbit.com>
+
+From: Dave Chinner <david@fromorbit.com>
+
+commit c8e20be020f234c8d492927a424a7d8bbefd5b5d upstream
+
+Make the inode tree reclaim walk exclusive to avoid races with
+concurrent sync walkers and lookups. This is a version of a patch
+posted by Christoph Hellwig that avoids all the code duplication.
+
+Signed-off-by: Dave Chinner <david@fromorbit.com>
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Signed-off-by: Alex Elder <aelder@sgi.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/xfs/linux-2.6/xfs_sync.c    |  154 ++++++++++++++++++-----------------------
+ fs/xfs/linux-2.6/xfs_sync.h    |    2 
+ fs/xfs/quota/xfs_qm_syscalls.c |    2 
+ 3 files changed, 71 insertions(+), 87 deletions(-)
+
+--- a/fs/xfs/linux-2.6/xfs_sync.c
++++ b/fs/xfs/linux-2.6/xfs_sync.c
+@@ -64,7 +64,6 @@ xfs_inode_ag_lookup(
+        * as the tree is sparse and a gang lookup walks to find
+        * the number of objects requested.
+        */
+-      read_lock(&pag->pag_ici_lock);
+       if (tag == XFS_ICI_NO_TAG) {
+               nr_found = radix_tree_gang_lookup(&pag->pag_ici_root,
+                               (void **)&ip, *first_index, 1);
+@@ -73,7 +72,7 @@ xfs_inode_ag_lookup(
+                               (void **)&ip, *first_index, 1, tag);
+       }
+       if (!nr_found)
+-              goto unlock;
++              return NULL;
+       /*
+        * Update the index for the next lookup. Catch overflows
+@@ -83,13 +82,8 @@ xfs_inode_ag_lookup(
+        */
+       *first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
+       if (*first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
+-              goto unlock;
+-
++              return NULL;
+       return ip;
+-
+-unlock:
+-      read_unlock(&pag->pag_ici_lock);
+-      return NULL;
+ }
+ STATIC int
+@@ -99,7 +93,8 @@ xfs_inode_ag_walk(
+       int                     (*execute)(struct xfs_inode *ip,
+                                          struct xfs_perag *pag, int flags),
+       int                     flags,
+-      int                     tag)
++      int                     tag,
++      int                     exclusive)
+ {
+       struct xfs_perag        *pag = &mp->m_perag[ag];
+       uint32_t                first_index;
+@@ -113,10 +108,20 @@ restart:
+               int             error = 0;
+               xfs_inode_t     *ip;
++              if (exclusive)
++                      write_lock(&pag->pag_ici_lock);
++              else
++                      read_lock(&pag->pag_ici_lock);
+               ip = xfs_inode_ag_lookup(mp, pag, &first_index, tag);
+-              if (!ip)
++              if (!ip) {
++                      if (exclusive)
++                              write_unlock(&pag->pag_ici_lock);
++                      else
++                              read_unlock(&pag->pag_ici_lock);
+                       break;
++              }
++              /* execute releases pag->pag_ici_lock */
+               error = execute(ip, pag, flags);
+               if (error == EAGAIN) {
+                       skipped++;
+@@ -124,9 +129,8 @@ restart:
+               }
+               if (error)
+                       last_error = error;
+-              /*
+-               * bail out if the filesystem is corrupted.
+-               */
++
++              /* bail out if the filesystem is corrupted.  */
+               if (error == EFSCORRUPTED)
+                       break;
+@@ -147,7 +151,8 @@ xfs_inode_ag_iterator(
+       int                     (*execute)(struct xfs_inode *ip,
+                                          struct xfs_perag *pag, int flags),
+       int                     flags,
+-      int                     tag)
++      int                     tag,
++      int                     exclusive)
+ {
+       int                     error = 0;
+       int                     last_error = 0;
+@@ -156,7 +161,8 @@ xfs_inode_ag_iterator(
+       for (ag = 0; ag < mp->m_sb.sb_agcount; ag++) {
+               if (!mp->m_perag[ag].pag_ici_init)
+                       continue;
+-              error = xfs_inode_ag_walk(mp, ag, execute, flags, tag);
++              error = xfs_inode_ag_walk(mp, ag, execute, flags, tag,
++                                              exclusive);
+               if (error) {
+                       last_error = error;
+                       if (error == EFSCORRUPTED)
+@@ -180,11 +186,7 @@ xfs_sync_inode_valid(
+               return EFSCORRUPTED;
+       }
+-      /*
+-       * If we can't get a reference on the inode, it must be in reclaim.
+-       * Leave it for the reclaim code to flush. Also avoid inodes that
+-       * haven't been fully initialised.
+-       */
++      /* If we can't get a reference on the inode, it must be in reclaim. */
+       if (!igrab(inode)) {
+               read_unlock(&pag->pag_ici_lock);
+               return ENOENT;
+@@ -281,7 +283,7 @@ xfs_sync_data(
+       ASSERT((flags & ~(SYNC_TRYLOCK|SYNC_WAIT)) == 0);
+       error = xfs_inode_ag_iterator(mp, xfs_sync_inode_data, flags,
+-                                    XFS_ICI_NO_TAG);
++                                    XFS_ICI_NO_TAG, 0);
+       if (error)
+               return XFS_ERROR(error);
+@@ -303,7 +305,7 @@ xfs_sync_attr(
+       ASSERT((flags & ~SYNC_WAIT) == 0);
+       return xfs_inode_ag_iterator(mp, xfs_sync_inode_attr, flags,
+-                                   XFS_ICI_NO_TAG);
++                                   XFS_ICI_NO_TAG, 0);
+ }
+ STATIC int
+@@ -663,60 +665,6 @@ xfs_syncd_stop(
+       kthread_stop(mp->m_sync_task);
+ }
+-STATIC int
+-xfs_reclaim_inode(
+-      xfs_inode_t     *ip,
+-      int             sync_mode)
+-{
+-      xfs_perag_t     *pag = xfs_get_perag(ip->i_mount, ip->i_ino);
+-
+-      /* The hash lock here protects a thread in xfs_iget_core from
+-       * racing with us on linking the inode back with a vnode.
+-       * Once we have the XFS_IRECLAIM flag set it will not touch
+-       * us.
+-       */
+-      write_lock(&pag->pag_ici_lock);
+-      spin_lock(&ip->i_flags_lock);
+-      if (__xfs_iflags_test(ip, XFS_IRECLAIM) ||
+-          !__xfs_iflags_test(ip, XFS_IRECLAIMABLE)) {
+-              spin_unlock(&ip->i_flags_lock);
+-              write_unlock(&pag->pag_ici_lock);
+-              return -EAGAIN;
+-      }
+-      __xfs_iflags_set(ip, XFS_IRECLAIM);
+-      spin_unlock(&ip->i_flags_lock);
+-      write_unlock(&pag->pag_ici_lock);
+-      xfs_put_perag(ip->i_mount, pag);
+-
+-      /*
+-       * If the inode is still dirty, then flush it out.  If the inode
+-       * is not in the AIL, then it will be OK to flush it delwri as
+-       * long as xfs_iflush() does not keep any references to the inode.
+-       * We leave that decision up to xfs_iflush() since it has the
+-       * knowledge of whether it's OK to simply do a delwri flush of
+-       * the inode or whether we need to wait until the inode is
+-       * pulled from the AIL.
+-       * We get the flush lock regardless, though, just to make sure
+-       * we don't free it while it is being flushed.
+-       */
+-      xfs_ilock(ip, XFS_ILOCK_EXCL);
+-      xfs_iflock(ip);
+-
+-      /*
+-       * In the case of a forced shutdown we rely on xfs_iflush() to
+-       * wait for the inode to be unpinned before returning an error.
+-       */
+-      if (!is_bad_inode(VFS_I(ip)) && xfs_iflush(ip, sync_mode) == 0) {
+-              /* synchronize with xfs_iflush_done */
+-              xfs_iflock(ip);
+-              xfs_ifunlock(ip);
+-      }
+-
+-      xfs_iunlock(ip, XFS_ILOCK_EXCL);
+-      xfs_ireclaim(ip);
+-      return 0;
+-}
+-
+ void
+ __xfs_inode_set_reclaim_tag(
+       struct xfs_perag        *pag,
+@@ -759,19 +707,55 @@ __xfs_inode_clear_reclaim_tag(
+ }
+ STATIC int
+-xfs_reclaim_inode_now(
++xfs_reclaim_inode(
+       struct xfs_inode        *ip,
+       struct xfs_perag        *pag,
+-      int                     flags)
++      int                     sync_mode)
+ {
+-      /* ignore if already under reclaim */
+-      if (xfs_iflags_test(ip, XFS_IRECLAIM)) {
+-              read_unlock(&pag->pag_ici_lock);
++      /*
++       * The radix tree lock here protects a thread in xfs_iget from racing
++       * with us starting reclaim on the inode.  Once we have the
++       * XFS_IRECLAIM flag set it will not touch us.
++       */
++      spin_lock(&ip->i_flags_lock);
++      ASSERT_ALWAYS(__xfs_iflags_test(ip, XFS_IRECLAIMABLE));
++      if (__xfs_iflags_test(ip, XFS_IRECLAIM)) {
++              /* ignore as it is already under reclaim */
++              spin_unlock(&ip->i_flags_lock);
++              write_unlock(&pag->pag_ici_lock);
+               return 0;
+       }
+-      read_unlock(&pag->pag_ici_lock);
++      __xfs_iflags_set(ip, XFS_IRECLAIM);
++      spin_unlock(&ip->i_flags_lock);
++      write_unlock(&pag->pag_ici_lock);
+-      return xfs_reclaim_inode(ip, flags);
++      /*
++       * If the inode is still dirty, then flush it out.  If the inode
++       * is not in the AIL, then it will be OK to flush it delwri as
++       * long as xfs_iflush() does not keep any references to the inode.
++       * We leave that decision up to xfs_iflush() since it has the
++       * knowledge of whether it's OK to simply do a delwri flush of
++       * the inode or whether we need to wait until the inode is
++       * pulled from the AIL.
++       * We get the flush lock regardless, though, just to make sure
++       * we don't free it while it is being flushed.
++       */
++      xfs_ilock(ip, XFS_ILOCK_EXCL);
++      xfs_iflock(ip);
++
++      /*
++       * In the case of a forced shutdown we rely on xfs_iflush() to
++       * wait for the inode to be unpinned before returning an error.
++       */
++      if (!is_bad_inode(VFS_I(ip)) && xfs_iflush(ip, sync_mode) == 0) {
++              /* synchronize with xfs_iflush_done */
++              xfs_iflock(ip);
++              xfs_ifunlock(ip);
++      }
++
++      xfs_iunlock(ip, XFS_ILOCK_EXCL);
++      xfs_ireclaim(ip);
++      return 0;
+ }
+ int
+@@ -779,6 +763,6 @@ xfs_reclaim_inodes(
+       xfs_mount_t     *mp,
+       int             mode)
+ {
+-      return xfs_inode_ag_iterator(mp, xfs_reclaim_inode_now, mode,
+-                                      XFS_ICI_RECLAIM_TAG);
++      return xfs_inode_ag_iterator(mp, xfs_reclaim_inode, mode,
++                                      XFS_ICI_RECLAIM_TAG, 1);
+ }
+--- a/fs/xfs/linux-2.6/xfs_sync.h
++++ b/fs/xfs/linux-2.6/xfs_sync.h
+@@ -54,6 +54,6 @@ void __xfs_inode_clear_reclaim_tag(struc
+ int xfs_sync_inode_valid(struct xfs_inode *ip, struct xfs_perag *pag);
+ int xfs_inode_ag_iterator(struct xfs_mount *mp,
+       int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag, int flags),
+-      int flags, int tag);
++      int flags, int tag, int write_lock);
+ #endif
+--- a/fs/xfs/quota/xfs_qm_syscalls.c
++++ b/fs/xfs/quota/xfs_qm_syscalls.c
+@@ -893,7 +893,7 @@ xfs_qm_dqrele_all_inodes(
+       uint             flags)
+ {
+       ASSERT(mp->m_quotainfo);
+-      xfs_inode_ag_iterator(mp, xfs_dqrele_inode, flags, XFS_ICI_NO_TAG);
++      xfs_inode_ag_iterator(mp, xfs_dqrele_inode, flags, XFS_ICI_NO_TAG, 0);
+ }
+ /*------------------------------------------------------------------------*/
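As a sketch of the interface change above: the walk takes the per-AG lock in
read or write mode depending on an exclusive flag and leaves it to the
callback to drop the lock, which is the convention the patch adopts. pthread
rwlocks and plain integers stand in for the kernel rwlock and the inodes;
none of this is XFS code.

#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t pag_ici_lock = PTHREAD_RWLOCK_INITIALIZER;
static const int inodes[] = { 128, 129, 130 };

/* callback runs with pag_ici_lock held and must release it */
static int reclaim_one(int ino, int exclusive)
{
        printf("reclaiming inode %d under %s lock\n",
               ino, exclusive ? "write" : "read");
        pthread_rwlock_unlock(&pag_ici_lock);
        return 0;
}

static int ag_walk(int (*execute)(int ino, int exclusive), int exclusive)
{
        unsigned i;

        for (i = 0; i < sizeof(inodes) / sizeof(inodes[0]); i++) {
                if (exclusive)
                        pthread_rwlock_wrlock(&pag_ici_lock);
                else
                        pthread_rwlock_rdlock(&pag_ici_lock);
                execute(inodes[i], exclusive);  /* drops pag_ici_lock */
        }
        return 0;
}

int main(void)
{
        return ag_walk(reclaim_one, 1); /* the reclaim walk wants exclusivity */
}

A sync or quota walk would pass 0 here and take the lock shared, which is
what the trailing-argument changes to xfs_sync_data(), xfs_sync_attr() and
xfs_qm_dqrele_all_inodes() express in the patch.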
diff --git a/queue-2.6.32/xfs-remove-invalid-barrier-optimization-from-xfs_fsync.patch b/queue-2.6.32/xfs-remove-invalid-barrier-optimization-from-xfs_fsync.patch
new file mode 100644 (file)
index 0000000..d7d38b2
--- /dev/null
@@ -0,0 +1,64 @@
+From david@fromorbit.com  Fri Apr  2 11:12:06 2010
+From: Christoph Hellwig <hch@lst.de>
+Date: Fri, 12 Mar 2010 09:42:15 +1100
+Subject: xfs: remove invalid barrier optimization from xfs_fsync
+To: stable@kernel.org
+Cc: xfs@oss.sgi.com
+Message-ID: <1268347337-7160-18-git-send-email-david@fromorbit.com>
+
+
+From: Christoph Hellwig <hch@lst.de>
+
+commit e8b217e7530c6a073ac69f1c85b922d93fdf5647 upstream
+
+Date: Tue, 2 Feb 2010 10:16:26 +1100
+We always need to flush the disk write cache and can't skip it just
+because no inode attributes have changed.
+
+Signed-off-by: Christoph Hellwig <hch@lst.de>
+Reviewed-by: Dave Chinner <david@fromorbit.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/xfs/xfs_vnodeops.c |   12 ++----------
+ 1 file changed, 2 insertions(+), 10 deletions(-)
+
+--- a/fs/xfs/xfs_vnodeops.c
++++ b/fs/xfs/xfs_vnodeops.c
+@@ -597,7 +597,7 @@ xfs_fsync(
+ {
+       xfs_trans_t     *tp;
+       int             error = 0;
+-      int             log_flushed = 0, changed = 1;
++      int             log_flushed = 0;
+       xfs_itrace_entry(ip);
+@@ -627,19 +627,11 @@ xfs_fsync(
+                * disk yet, the inode will be still be pinned.  If it is,
+                * force the log.
+                */
+-
+               xfs_iunlock(ip, XFS_ILOCK_SHARED);
+-
+               if (xfs_ipincount(ip)) {
+                       error = _xfs_log_force(ip->i_mount, (xfs_lsn_t)0,
+                                     XFS_LOG_FORCE | XFS_LOG_SYNC,
+                                     &log_flushed);
+-              } else {
+-                      /*
+-                       * If the inode is not pinned and nothing has changed
+-                       * we don't need to flush the cache.
+-                       */
+-                      changed = 0;
+               }
+       } else  {
+               /*
+@@ -674,7 +666,7 @@ xfs_fsync(
+               xfs_iunlock(ip, XFS_ILOCK_EXCL);
+       }
+-      if ((ip->i_mount->m_flags & XFS_MOUNT_BARRIER) && changed) {
++      if (ip->i_mount->m_flags & XFS_MOUNT_BARRIER) {
+               /*
+                * If the log write didn't issue an ordered tag we need
+                * to flush the disk cache for the data device now.
diff --git a/queue-2.6.32/xfs-simplify-inode-teardown.patch b/queue-2.6.32/xfs-simplify-inode-teardown.patch
new file mode 100644 (file)
index 0000000..6af994e
--- /dev/null
@@ -0,0 +1,206 @@
+From david@fromorbit.com  Fri Apr  2 11:04:20 2010
+From: Christoph Hellwig <hch@infradead.org>
+Date: Fri, 12 Mar 2010 09:41:59 +1100
+Subject: xfs: simplify inode teardown
+To: stable@kernel.org
+Cc: xfs@oss.sgi.com
+Message-ID: <1268347337-7160-2-git-send-email-david@fromorbit.com>
+
+
+From: Christoph Hellwig <hch@infradead.org>
+
+commit 848ce8f731aed0a2d4ab5884a4f6664af73d2dd0 upstream
+
+Currently the reclaim code for the case where we don't delay the
+final reclaim is overly complicated.  We know that the inode is clean
+but instead of just directly reclaiming the clean inode we go through
+the whole process of marking the inode reclaimable just to directly
+reclaim it from the calling context.  Besides being overly complicated,
+this introduces a race where iget could recycle an inode between being
+marked reclaimable and actually being reclaimed, leading to panics.
+
+This patch gets rid of the existing reclaim path, and replaces it with
+a simple call to xfs_ireclaim if the inode was clean.  While we're at
+it we also use the slightly more lax xfs_inode_clean check we'd use
+later to determine if we need to flush the inode here.
+
+Finally get rid of xfs_reclaim function and place the remaining small
+bits of reclaim code directly into xfs_fs_destroy_inode.
+
+Signed-off-by: Christoph Hellwig <hch@lst.de>
+Reported-by: Patrick Schreurs <patrick@news-service.com>
+Reported-by: Tommy van Leeuwen <tommy@news-service.com>
+Tested-by: Patrick Schreurs <patrick@news-service.com>
+Reviewed-by: Alex Elder <aelder@sgi.com>
+Signed-off-by: Alex Elder <aelder@sgi.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/xfs/linux-2.6/xfs_super.c |   34 ++++++++++++++++++++++++++++++----
+ fs/xfs/linux-2.6/xfs_sync.c  |   15 ++++-----------
+ fs/xfs/linux-2.6/xfs_sync.h  |    1 -
+ fs/xfs/xfs_vnodeops.c        |   40 ----------------------------------------
+ fs/xfs/xfs_vnodeops.h        |    1 -
+ 5 files changed, 34 insertions(+), 57 deletions(-)
+
+--- a/fs/xfs/linux-2.6/xfs_super.c
++++ b/fs/xfs/linux-2.6/xfs_super.c
+@@ -930,13 +930,39 @@ xfs_fs_alloc_inode(
+  */
+ STATIC void
+ xfs_fs_destroy_inode(
+-      struct inode    *inode)
++      struct inode            *inode)
+ {
+-      xfs_inode_t             *ip = XFS_I(inode);
++      struct xfs_inode        *ip = XFS_I(inode);
++
++      xfs_itrace_entry(ip);
+       XFS_STATS_INC(vn_reclaim);
+-      if (xfs_reclaim(ip))
+-              panic("%s: cannot reclaim 0x%p\n", __func__, inode);
++
++      /* bad inode, get out here ASAP */
++      if (is_bad_inode(inode))
++              goto out_reclaim;
++
++      xfs_ioend_wait(ip);
++
++      ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || ip->i_delayed_blks == 0);
++
++      /*
++       * We should never get here with one of the reclaim flags already set.
++       */
++      ASSERT_ALWAYS(!xfs_iflags_test(ip, XFS_IRECLAIMABLE));
++      ASSERT_ALWAYS(!xfs_iflags_test(ip, XFS_IRECLAIM));
++
++      /*
++       * If we have nothing to flush with this inode then complete the
++       * teardown now, otherwise delay the flush operation.
++       */
++      if (!xfs_inode_clean(ip)) {
++              xfs_inode_set_reclaim_tag(ip);
++              return;
++      }
++
++out_reclaim:
++      xfs_ireclaim(ip);
+ }
+ /*
+--- a/fs/xfs/linux-2.6/xfs_sync.c
++++ b/fs/xfs/linux-2.6/xfs_sync.c
+@@ -663,10 +663,9 @@ xfs_syncd_stop(
+       kthread_stop(mp->m_sync_task);
+ }
+-int
++STATIC int
+ xfs_reclaim_inode(
+       xfs_inode_t     *ip,
+-      int             locked,
+       int             sync_mode)
+ {
+       xfs_perag_t     *pag = xfs_get_perag(ip->i_mount, ip->i_ino);
+@@ -682,10 +681,6 @@ xfs_reclaim_inode(
+           !__xfs_iflags_test(ip, XFS_IRECLAIMABLE)) {
+               spin_unlock(&ip->i_flags_lock);
+               write_unlock(&pag->pag_ici_lock);
+-              if (locked) {
+-                      xfs_ifunlock(ip);
+-                      xfs_iunlock(ip, XFS_ILOCK_EXCL);
+-              }
+               return -EAGAIN;
+       }
+       __xfs_iflags_set(ip, XFS_IRECLAIM);
+@@ -704,10 +699,8 @@ xfs_reclaim_inode(
+        * We get the flush lock regardless, though, just to make sure
+        * we don't free it while it is being flushed.
+        */
+-      if (!locked) {
+-              xfs_ilock(ip, XFS_ILOCK_EXCL);
+-              xfs_iflock(ip);
+-      }
++      xfs_ilock(ip, XFS_ILOCK_EXCL);
++      xfs_iflock(ip);
+       /*
+        * In the case of a forced shutdown we rely on xfs_iflush() to
+@@ -778,7 +771,7 @@ xfs_reclaim_inode_now(
+       }
+       read_unlock(&pag->pag_ici_lock);
+-      return xfs_reclaim_inode(ip, 0, flags);
++      return xfs_reclaim_inode(ip, flags);
+ }
+ int
+--- a/fs/xfs/linux-2.6/xfs_sync.h
++++ b/fs/xfs/linux-2.6/xfs_sync.h
+@@ -44,7 +44,6 @@ void xfs_quiesce_attr(struct xfs_mount *
+ void xfs_flush_inodes(struct xfs_inode *ip);
+-int xfs_reclaim_inode(struct xfs_inode *ip, int locked, int sync_mode);
+ int xfs_reclaim_inodes(struct xfs_mount *mp, int mode);
+ void xfs_inode_set_reclaim_tag(struct xfs_inode *ip);
+--- a/fs/xfs/xfs_vnodeops.c
++++ b/fs/xfs/xfs_vnodeops.c
+@@ -2456,46 +2456,6 @@ xfs_set_dmattrs(
+       return error;
+ }
+-int
+-xfs_reclaim(
+-      xfs_inode_t     *ip)
+-{
+-
+-      xfs_itrace_entry(ip);
+-
+-      ASSERT(!VN_MAPPED(VFS_I(ip)));
+-
+-      /* bad inode, get out here ASAP */
+-      if (is_bad_inode(VFS_I(ip))) {
+-              xfs_ireclaim(ip);
+-              return 0;
+-      }
+-
+-      xfs_ioend_wait(ip);
+-
+-      ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || ip->i_delayed_blks == 0);
+-
+-      /*
+-       * If we have nothing to flush with this inode then complete the
+-       * teardown now, otherwise break the link between the xfs inode and the
+-       * linux inode and clean up the xfs inode later. This avoids flushing
+-       * the inode to disk during the delete operation itself.
+-       *
+-       * When breaking the link, we need to set the XFS_IRECLAIMABLE flag
+-       * first to ensure that xfs_iunpin() will never see an xfs inode
+-       * that has a linux inode being reclaimed. Synchronisation is provided
+-       * by the i_flags_lock.
+-       */
+-      if (!ip->i_update_core && (ip->i_itemp == NULL)) {
+-              xfs_ilock(ip, XFS_ILOCK_EXCL);
+-              xfs_iflock(ip);
+-              xfs_iflags_set(ip, XFS_IRECLAIMABLE);
+-              return xfs_reclaim_inode(ip, 1, XFS_IFLUSH_DELWRI_ELSE_SYNC);
+-      }
+-      xfs_inode_set_reclaim_tag(ip);
+-      return 0;
+-}
+-
+ /*
+  * xfs_alloc_file_space()
+  *      This routine allocates disk space for the given file.
+--- a/fs/xfs/xfs_vnodeops.h
++++ b/fs/xfs/xfs_vnodeops.h
+@@ -38,7 +38,6 @@ int xfs_symlink(struct xfs_inode *dp, st
+               const char *target_path, mode_t mode, struct xfs_inode **ipp,
+               cred_t *credp);
+ int xfs_set_dmattrs(struct xfs_inode *ip, u_int evmask, u_int16_t state);
+-int xfs_reclaim(struct xfs_inode *ip);
+ int xfs_change_file_space(struct xfs_inode *ip, int cmd,
+               xfs_flock64_t *bf, xfs_off_t offset, int attr_flags);
+ int xfs_rename(struct xfs_inode *src_dp, struct xfs_name *src_name,
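For orientation, the new xfs_fs_destroy_inode() boils down to the decision
tree sketched below: bad inodes are reclaimed immediately, clean inodes are
torn down on the spot, and anything else is tagged for later reclaim. The
helpers here are placeholders that only print what the real functions would
do; treat this as a reading aid, not as the kernel code.

#include <stdbool.h>
#include <stdio.h>

struct toy_inode {
        bool bad;
        bool clean;
};

static void ireclaim(struct toy_inode *ip)
{
        (void)ip;
        puts("reclaim inode now");
}

static void set_reclaim_tag(struct toy_inode *ip)
{
        (void)ip;
        puts("tag inode, background reclaim will flush and free it");
}

static void destroy_inode(struct toy_inode *ip)
{
        if (ip->bad) {                  /* bad inode, get out ASAP */
                ireclaim(ip);
                return;
        }
        puts("wait for pending ioends");
        if (!ip->clean) {               /* dirty: delay the flush */
                set_reclaim_tag(ip);
                return;
        }
        ireclaim(ip);                   /* clean: complete teardown now */
}

int main(void)
{
        struct toy_inode dirty = { .clean = false };
        struct toy_inode clean = { .clean = true };

        destroy_inode(&dirty);
        destroy_inode(&clean);
        return 0;
}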
diff --git a/queue-2.6.32/xfs-wrapped-journal-record-corruption-on-read-at-recovery.patch b/queue-2.6.32/xfs-wrapped-journal-record-corruption-on-read-at-recovery.patch
new file mode 100644 (file)
index 0000000..519d11e
--- /dev/null
@@ -0,0 +1,130 @@
+From david@fromorbit.com  Fri Apr  2 11:06:08 2010
+From: Andy Poling <andy@realbig.com>
+Date: Fri, 12 Mar 2010 09:42:02 +1100
+Subject: xfs: Wrapped journal record corruption on read at recovery
+To: stable@kernel.org
+Cc: xfs@oss.sgi.com
+Message-ID: <1268347337-7160-5-git-send-email-david@fromorbit.com>
+
+
+From: Andy Poling <andy@realbig.com>
+
+commit fc5bc4c85c45f0bf854404e5736aa8b65720a18d upstream
+
+Summary of problem:
+
+If a journal record wraps at the physical end of the journal, it has to be
+read in two parts in xlog_do_recovery_pass(): a read at the physical end and a
+read at the physical beginning.  If xlog_bread() has to re-align the first
+read, the second read request does not take that re-alignment into account.
+If the first read was re-aligned, the second read over-writes the end of the
+data from the first read, effectively corrupting it.  This can happen either
+when reading the record header or reading the record data.
+
+The first sanity check in xlog_recover_process_data() is to check for a valid
+clientid, so that is the error reported.
+
+Summary of fix:
+
+If there was a first read at the physical end, XFS_BUF_PTR() returns where the
+data was requested to begin.  Conversely, because it is the result of
+xlog_align(), offset indicates where the requested data for the first read
+actually begins - whether or not xlog_bread() has re-aligned it.
+
+Using offset as the base for the calculation of where to place the second read
+data ensures that it will be correctly placed immediately following the data
+from the first read instead of sometimes over-writing the end of it.
+
+The attached patch has resolved the reported problem of occasional inability
+to recover the journal (reporting "bad clientid").
+
+Signed-off-by: Andy Poling <andy@realbig.com>
+Reviewed-by: Alex Elder <aelder@sgi.com>
+Signed-off-by: Alex Elder <aelder@sgi.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/xfs/xfs_log_recover.c |   24 +++++++-----------------
+ 1 file changed, 7 insertions(+), 17 deletions(-)
+
+--- a/fs/xfs/xfs_log_recover.c
++++ b/fs/xfs/xfs_log_recover.c
+@@ -3517,7 +3517,7 @@ xlog_do_recovery_pass(
+ {
+       xlog_rec_header_t       *rhead;
+       xfs_daddr_t             blk_no;
+-      xfs_caddr_t             bufaddr, offset;
++      xfs_caddr_t             offset;
+       xfs_buf_t               *hbp, *dbp;
+       int                     error = 0, h_size;
+       int                     bblks, split_bblks;
+@@ -3610,7 +3610,7 @@ xlog_do_recovery_pass(
+                       /*
+                        * Check for header wrapping around physical end-of-log
+                        */
+-                      offset = NULL;
++                      offset = XFS_BUF_PTR(hbp);
+                       split_hblks = 0;
+                       wrapped_hblks = 0;
+                       if (blk_no + hblks <= log->l_logBBsize) {
+@@ -3646,9 +3646,8 @@ xlog_do_recovery_pass(
+                                *   - order is important.
+                                */
+                               wrapped_hblks = hblks - split_hblks;
+-                              bufaddr = XFS_BUF_PTR(hbp);
+                               error = XFS_BUF_SET_PTR(hbp,
+-                                              bufaddr + BBTOB(split_hblks),
++                                              offset + BBTOB(split_hblks),
+                                               BBTOB(hblks - split_hblks));
+                               if (error)
+                                       goto bread_err2;
+@@ -3658,14 +3657,10 @@ xlog_do_recovery_pass(
+                               if (error)
+                                       goto bread_err2;
+-                              error = XFS_BUF_SET_PTR(hbp, bufaddr,
++                              error = XFS_BUF_SET_PTR(hbp, offset,
+                                                       BBTOB(hblks));
+                               if (error)
+                                       goto bread_err2;
+-
+-                              if (!offset)
+-                                      offset = xlog_align(log, 0,
+-                                                      wrapped_hblks, hbp);
+                       }
+                       rhead = (xlog_rec_header_t *)offset;
+                       error = xlog_valid_rec_header(log, rhead,
+@@ -3685,7 +3680,7 @@ xlog_do_recovery_pass(
+                       } else {
+                               /* This log record is split across the
+                                * physical end of log */
+-                              offset = NULL;
++                              offset = XFS_BUF_PTR(dbp);
+                               split_bblks = 0;
+                               if (blk_no != log->l_logBBsize) {
+                                       /* some data is before the physical
+@@ -3714,9 +3709,8 @@ xlog_do_recovery_pass(
+                                *   _first_, then the log start (LR header end)
+                                *   - order is important.
+                                */
+-                              bufaddr = XFS_BUF_PTR(dbp);
+                               error = XFS_BUF_SET_PTR(dbp,
+-                                              bufaddr + BBTOB(split_bblks),
++                                              offset + BBTOB(split_bblks),
+                                               BBTOB(bblks - split_bblks));
+                               if (error)
+                                       goto bread_err2;
+@@ -3727,13 +3721,9 @@ xlog_do_recovery_pass(
+                               if (error)
+                                       goto bread_err2;
+-                              error = XFS_BUF_SET_PTR(dbp, bufaddr, h_size);
++                              error = XFS_BUF_SET_PTR(dbp, offset, h_size);
+                               if (error)
+                                       goto bread_err2;
+-
+-                              if (!offset)
+-                                      offset = xlog_align(log, wrapped_hblks,
+-                                              bblks - split_bblks, dbp);
+                       }
+                       xlog_unpack_data(rhead, offset, log);
+                       if ((error = xlog_recover_process_data(log, rhash,
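The heart of the fix is where the second half of a wrapped record gets
copied: it has to land right after where the first half actually begins
(offset, the xlog_align() result), not after where the first read was merely
requested (the old bufaddr). A few lines of plain C illustrate the
difference; the buffer size, split size and strings are invented.

#include <stdio.h>
#include <string.h>

int main(void)
{
        char buf[32];
        char *bufaddr = buf;            /* where the first read was requested */
        char *offset  = buf + 4;        /* where it landed after re-alignment */
        size_t split  = 8;              /* bytes read from the end of the log */

        memset(buf, '.', sizeof(buf) - 1);
        buf[sizeof(buf) - 1] = '\0';

        memcpy(offset, "ENDOFLOG", split);      /* first (end-of-log) read */

        /* a second read placed at bufaddr + split would start four bytes
         * too early and overwrite the tail of the first read */
        printf("buggy target = buf[%td], fixed target = buf[%td]\n",
               (bufaddr + split) - buf, (offset + split) - buf);

        memcpy(offset + split, "STARTLOG", 8);  /* second (start-of-log) read */
        printf("%.16s\n", offset);              /* prints ENDOFLOGSTARTLOG */
        return 0;
}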
diff --git a/queue-2.6.32/xfs-xfs_swap_extents-needs-to-handle-dynamic-fork-offsets.patch b/queue-2.6.32/xfs-xfs_swap_extents-needs-to-handle-dynamic-fork-offsets.patch
new file mode 100644 (file)
index 0000000..e722537
--- /dev/null
@@ -0,0 +1,185 @@
+From david@fromorbit.com  Fri Apr  2 11:10:52 2010
+From: Dave Chinner <david@fromorbit.com>
+Date: Fri, 12 Mar 2010 09:42:12 +1100
+Subject: xfs: xfs_swap_extents needs to handle dynamic fork offsets
+To: stable@kernel.org
+Cc: xfs@oss.sgi.com
+Message-ID: <1268347337-7160-15-git-send-email-david@fromorbit.com>
+
+From: Dave Chinner <david@fromorbit.com>
+
+commit e09f98606dcc156de1146c209d45a0d6d5f51c3f upstream
+
+When swapping extents, we can corrupt inodes by swapping data forks
+that are in incompatible formats.  This is caused by the two inodes
+having different fork offsets due to the presence of an attribute
+fork on an attr2 filesystem.  xfs_fsr tries to be smart about
+setting the fork offset, but the trick it plays only works on attr1
+(old fixed format attribute fork) filesystems.
+
+Changing the way xfs_fsr sets up the attribute fork will prevent
+this situation from ever occurring, so in the kernel code we can get
+by with a preventative fix - check that the data fork in the
+defragmented inode is in a format valid for the inode it is being
+swapped into.  This will lead to files that will silently and
+potentially repeatedly fail defragmentation, so issue a warning to
+the log when this particular failure occurs to let us know that
+xfs_fsr needs updating/fixing.
+
+To help identify how to improve xfs_fsr to avoid this issue, add
+trace points for the inodes being swapped so that we can determine
+why the swap was rejected and to confirm that the code is making the
+right decisions and modifications when swapping forks.
+
+A further complication is that even when the swap is allowed to
+proceed with different fork offsets between the two inodes, the value
+for the maximum number of extents the data fork can hold can be
+wrong.  Make sure these are also set correctly after the swap occurs.
+
+Signed-off-by: Dave Chinner <david@fromorbit.com>
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Signed-off-by: Alex Elder <aelder@sgi.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ fs/xfs/xfs_dfrag.c |  106 +++++++++++++++++++++++++++++++++++++++++++++--------
+ 1 file changed, 90 insertions(+), 16 deletions(-)
+
+--- a/fs/xfs/xfs_dfrag.c
++++ b/fs/xfs/xfs_dfrag.c
+@@ -113,10 +113,82 @@ xfs_swapext(
+       return error;
+ }
++/*
++ * We need to check that the format of the data fork in the temporary inode is
++ * valid for the target inode before doing the swap. This is not a problem with
++ * attr1 because of the fixed fork offset, but attr2 has a dynamically sized
++ * data fork depending on the space the attribute fork is taking so we can get
++ * invalid formats on the target inode.
++ *
++ * E.g. target has space for 7 extents in extent format, temp inode only has
++ * space for 6.  If we defragment down to 7 extents, then the tmp format is a
++ * btree, but when swapped it needs to be in extent format. Hence we can't just
++ * blindly swap data forks on attr2 filesystems.
++ *
++ * Note that we check the swap in both directions so that we don't end up with
++ * a corrupt temporary inode, either.
++ *
++ * Note that fixing the way xfs_fsr sets up the attribute fork in the source
++ * inode will prevent this situation from occurring, so all we do here is
++ * reject and log the attempt. basically we are putting the responsibility on
++ * userspace to get this right.
++ */
++static int
++xfs_swap_extents_check_format(
++      xfs_inode_t     *ip,    /* target inode */
++      xfs_inode_t     *tip)   /* tmp inode */
++{
++
++      /* Should never get a local format */
++      if (ip->i_d.di_format == XFS_DINODE_FMT_LOCAL ||
++          tip->i_d.di_format == XFS_DINODE_FMT_LOCAL)
++              return EINVAL;
++
++      /*
++       * if the target inode has less extents that then temporary inode then
++       * why did userspace call us?
++       */
++      if (ip->i_d.di_nextents < tip->i_d.di_nextents)
++              return EINVAL;
++
++      /*
++       * if the target inode is in extent form and the temp inode is in btree
++       * form then we will end up with the target inode in the wrong format
++       * as we already know there are less extents in the temp inode.
++       */
++      if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
++          tip->i_d.di_format == XFS_DINODE_FMT_BTREE)
++              return EINVAL;
++
++      /* Check temp in extent form to max in target */
++      if (tip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
++          XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) > ip->i_df.if_ext_max)
++              return EINVAL;
++
++      /* Check target in extent form to max in temp */
++      if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
++          XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) > tip->i_df.if_ext_max)
++              return EINVAL;
++
++      /* Check root block of temp in btree form to max in target */
++      if (tip->i_d.di_format == XFS_DINODE_FMT_BTREE &&
++          XFS_IFORK_BOFF(ip) &&
++          tip->i_df.if_broot_bytes > XFS_IFORK_BOFF(ip))
++              return EINVAL;
++
++      /* Check root block of target in btree form to max in temp */
++      if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE &&
++          XFS_IFORK_BOFF(tip) &&
++          ip->i_df.if_broot_bytes > XFS_IFORK_BOFF(tip))
++              return EINVAL;
++
++      return 0;
++}
++
+ int
+ xfs_swap_extents(
+-      xfs_inode_t     *ip,
+-      xfs_inode_t     *tip,
++      xfs_inode_t     *ip,    /* target inode */
++      xfs_inode_t     *tip,   /* tmp inode */
+       xfs_swapext_t   *sxp)
+ {
+       xfs_mount_t     *mp;
+@@ -160,13 +232,6 @@ xfs_swap_extents(
+               goto out_unlock;
+       }
+-      /* Should never get a local format */
+-      if (ip->i_d.di_format == XFS_DINODE_FMT_LOCAL ||
+-          tip->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
+-              error = XFS_ERROR(EINVAL);
+-              goto out_unlock;
+-      }
+-
+       if (VN_CACHED(VFS_I(tip)) != 0) {
+               xfs_inval_cached_trace(tip, 0, -1, 0, -1);
+               error = xfs_flushinval_pages(tip, 0, -1,
+@@ -189,13 +254,12 @@ xfs_swap_extents(
+               goto out_unlock;
+       }
+-      /*
+-       * If the target has extended attributes, the tmp file
+-       * must also in order to ensure the correct data fork
+-       * format.
+-       */
+-      if ( XFS_IFORK_Q(ip) != XFS_IFORK_Q(tip) ) {
+-              error = XFS_ERROR(EINVAL);
++      /* check inode formats now that data is flushed */
++      error = xfs_swap_extents_check_format(ip, tip);
++      if (error) {
++              xfs_fs_cmn_err(CE_NOTE, mp,
++                  "%s: inode 0x%llx format is incompatible for exchanging.",
++                              __FILE__, ip->i_ino);
+               goto out_unlock;
+       }
+@@ -276,6 +340,16 @@ xfs_swap_extents(
+       *tifp = *tempifp;       /* struct copy */
+       /*
++       * Fix the in-memory data fork values that are dependent on the fork
++       * offset in the inode. We can't assume they remain the same as attr2
++       * has dynamic fork offsets.
++       */
++      ifp->if_ext_max = XFS_IFORK_SIZE(ip, XFS_DATA_FORK) /
++                                      (uint)sizeof(xfs_bmbt_rec_t);
++      tifp->if_ext_max = XFS_IFORK_SIZE(tip, XFS_DATA_FORK) /
++                                      (uint)sizeof(xfs_bmbt_rec_t);
++
++      /*
+        * Fix the on-disk inode values
+        */
+       tmp = (__uint64_t)ip->i_d.di_nblocks;