From: Greg Kroah-Hartman Date: Fri, 2 Apr 2010 18:13:51 +0000 (-0700) Subject: .32 xfs patches X-Git-Tag: v2.6.32.12~60 X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=cc67517a3f3a86de7188d8a16dd26cb91e5a31da;p=thirdparty%2Fkernel%2Fstable-queue.git .32 xfs patches --- diff --git a/queue-2.6.32/series b/queue-2.6.32/series index 8e4a7266c65..f15d7868dc1 100644 --- a/queue-2.6.32/series +++ b/queue-2.6.32/series @@ -6,3 +6,22 @@ oom-fix-the-unsafe-usage-of-badness-in-proc_oom_score.patch drm-radeon-kms-don-t-print-error-on-erestartsys.patch drm-radeon-kms-fix-pal-tv-out-support-on-legacy-igp-chips.patch drm-return-enodev-if-the-inode-mapping-changes.patch +xfs-simplify-inode-teardown.patch +xfs-fix-mmap_sem-iolock-inversion-in-xfs_free_eofblocks.patch +xfs-i-o-completion-handlers-must-use-nofs-allocations.patch +xfs-wrapped-journal-record-corruption-on-read-at-recovery.patch +xfs-fix-error-return-for-fallocate-on-xfs.patch +xfs-check-for-not-fully-initialized-inodes-in-xfs_ireclaim.patch +xfs-fix-timestamp-handling-in-xfs_setattr.patch +xfs-don-t-flush-stale-inodes.patch +xfs-ensure-we-force-all-busy-extents-in-range-to-disk.patch +xfs-reclaim-inodes-under-a-write-lock.patch +xfs-avoid-inodes-in-reclaim-when-flushing-from-inode-cache.patch +xfs-reclaim-all-inodes-by-background-tree-walks.patch +xfs-fix-stale-inode-flush-avoidance.patch +xfs-xfs_swap_extents-needs-to-handle-dynamic-fork-offsets.patch +xfs-quota-limit-statvfs-available-blocks.patch +xfs-don-t-hold-onto-reserved-blocks-on-remount-ro.patch +xfs-remove-invalid-barrier-optimization-from-xfs_fsync.patch +xfs-non-blocking-inode-locking-in-io-completion.patch +xfs-fix-locking-for-inode-cache-radix-tree-tag-updates.patch diff --git a/queue-2.6.32/xfs-avoid-inodes-in-reclaim-when-flushing-from-inode-cache.patch b/queue-2.6.32/xfs-avoid-inodes-in-reclaim-when-flushing-from-inode-cache.patch new file mode 100644 index 00000000000..09eea8643c3 --- /dev/null +++ b/queue-2.6.32/xfs-avoid-inodes-in-reclaim-when-flushing-from-inode-cache.patch @@ -0,0 +1,73 @@ +From david@fromorbit.com Fri Apr 2 11:09:28 2010 +From: Dave Chinner +Date: Fri, 12 Mar 2010 09:42:09 +1100 +Subject: xfs: Avoid inodes in reclaim when flushing from inode cache +To: stable@kernel.org +Cc: xfs@oss.sgi.com +Message-ID: <1268347337-7160-12-git-send-email-david@fromorbit.com> + +From: Dave Chinner + +commit 018027be90a6946e8cf3f9b17b5582384f7ed117 upstream + +The reclaim code will handle flushing of dirty inodes before reclaim +occurs, so avoid them when determining whether an inode is a +candidate for flushing to disk when walking the radix trees. This +is based on a test patch from Christoph Hellwig. + +Signed-off-by: Dave Chinner +Reviewed-by: Christoph Hellwig +Signed-off-by: Alex Elder +Signed-off-by: Greg Kroah-Hartman + +--- + fs/xfs/linux-2.6/xfs_sync.c | 31 ++++++++++++++++++------------- + 1 file changed, 18 insertions(+), 13 deletions(-) + +--- a/fs/xfs/linux-2.6/xfs_sync.c ++++ b/fs/xfs/linux-2.6/xfs_sync.c +@@ -179,26 +179,31 @@ xfs_sync_inode_valid( + struct xfs_perag *pag) + { + struct inode *inode = VFS_I(ip); ++ int error = EFSCORRUPTED; + + /* nothing to sync during shutdown */ +- if (XFS_FORCED_SHUTDOWN(ip->i_mount)) { +- read_unlock(&pag->pag_ici_lock); +- return EFSCORRUPTED; +- } ++ if (XFS_FORCED_SHUTDOWN(ip->i_mount)) ++ goto out_unlock; + +- /* If we can't get a reference on the inode, it must be in reclaim. 
*/ +- if (!igrab(inode)) { +- read_unlock(&pag->pag_ici_lock); +- return ENOENT; +- } +- read_unlock(&pag->pag_ici_lock); ++ /* avoid new or reclaimable inodes. Leave for reclaim code to flush */ ++ error = ENOENT; ++ if (xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM)) ++ goto out_unlock; + +- if (is_bad_inode(inode) || xfs_iflags_test(ip, XFS_INEW)) { ++ /* If we can't grab the inode, it must on it's way to reclaim. */ ++ if (!igrab(inode)) ++ goto out_unlock; ++ ++ if (is_bad_inode(inode)) { + IRELE(ip); +- return ENOENT; ++ goto out_unlock; + } + +- return 0; ++ /* inode is valid */ ++ error = 0; ++out_unlock: ++ read_unlock(&pag->pag_ici_lock); ++ return error; + } + + STATIC int diff --git a/queue-2.6.32/xfs-check-for-not-fully-initialized-inodes-in-xfs_ireclaim.patch b/queue-2.6.32/xfs-check-for-not-fully-initialized-inodes-in-xfs_ireclaim.patch new file mode 100644 index 00000000000..5ff112ec0d3 --- /dev/null +++ b/queue-2.6.32/xfs-check-for-not-fully-initialized-inodes-in-xfs_ireclaim.patch @@ -0,0 +1,52 @@ +From david@fromorbit.com Fri Apr 2 11:07:09 2010 +From: Christoph Hellwig +Date: Fri, 12 Mar 2010 09:42:04 +1100 +Subject: xfs: check for not fully initialized inodes in xfs_ireclaim +To: stable@kernel.org +Cc: xfs@oss.sgi.com +Message-ID: <1268347337-7160-7-git-send-email-david@fromorbit.com> + + +From: Christoph Hellwig + +commit b44b1126279b60597f96bbe77507b1650f88a969 upstream + +Add an assert for inodes not added to the inode cache in xfs_ireclaim, +to make sure we're not going to introduce something like the +famous nfsd inode cache bug again. + +Signed-off-by: Christoph Hellwig +Signed-off-by: Alex Elder +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/xfs_iget.c | 12 ++++++++---- + 1 file changed, 8 insertions(+), 4 deletions(-) + +--- a/fs/xfs/xfs_iget.c ++++ b/fs/xfs/xfs_iget.c +@@ -511,17 +511,21 @@ xfs_ireclaim( + { + struct xfs_mount *mp = ip->i_mount; + struct xfs_perag *pag; ++ xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ip->i_ino); + + XFS_STATS_INC(xs_ig_reclaims); + + /* +- * Remove the inode from the per-AG radix tree. It doesn't matter +- * if it was never added to it because radix_tree_delete can deal +- * with that case just fine. ++ * Remove the inode from the per-AG radix tree. ++ * ++ * Because radix_tree_delete won't complain even if the item was never ++ * added to the tree assert that it's been there before to catch ++ * problems with the inode life time early on. + */ + pag = xfs_get_perag(mp, ip->i_ino); + write_lock(&pag->pag_ici_lock); +- radix_tree_delete(&pag->pag_ici_root, XFS_INO_TO_AGINO(mp, ip->i_ino)); ++ if (!radix_tree_delete(&pag->pag_ici_root, agino)) ++ ASSERT(0); + write_unlock(&pag->pag_ici_lock); + xfs_put_perag(mp, pag); + diff --git a/queue-2.6.32/xfs-don-t-flush-stale-inodes.patch b/queue-2.6.32/xfs-don-t-flush-stale-inodes.patch new file mode 100644 index 00000000000..0960321ea07 --- /dev/null +++ b/queue-2.6.32/xfs-don-t-flush-stale-inodes.patch @@ -0,0 +1,46 @@ +From david@fromorbit.com Fri Apr 2 11:08:07 2010 +From: Dave Chinner +Date: Fri, 12 Mar 2010 09:42:06 +1100 +Subject: xfs: Don't flush stale inodes +To: stable@kernel.org +Cc: xfs@oss.sgi.com +Message-ID: <1268347337-7160-9-git-send-email-david@fromorbit.com> + +From: Dave Chinner + +commit 44e08c45cc14e6190a424be8d450070c8e508fad upstream + +Because inodes remain in cache much longer than inode buffers do +under memory pressure, we can get the situation where we have +stale, dirty inodes being reclaimed but the backing storage has +been freed. 
Hence we should never, ever flush XFS_ISTALE inodes +to disk as there is no guarantee that the backing buffer is in +cache and still marked stale when the flush occurs. + +Signed-off-by: Dave Chinner +Signed-off-by: Alex Elder +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/xfs_inode.c | 10 +++++++--- + 1 file changed, 7 insertions(+), 3 deletions(-) + +--- a/fs/xfs/xfs_inode.c ++++ b/fs/xfs/xfs_inode.c +@@ -2877,10 +2877,14 @@ xfs_iflush( + mp = ip->i_mount; + + /* +- * If the inode isn't dirty, then just release the inode +- * flush lock and do nothing. ++ * If the inode isn't dirty, then just release the inode flush lock and ++ * do nothing. Treat stale inodes the same; we cannot rely on the ++ * backing buffer remaining stale in cache for the remaining life of ++ * the stale inode and so xfs_itobp() below may give us a buffer that ++ * no longer contains inodes below. Doing this stale check here also ++ * avoids forcing the log on pinned, stale inodes. + */ +- if (xfs_inode_clean(ip)) { ++ if (xfs_inode_clean(ip) || xfs_iflags_test(ip, XFS_ISTALE)) { + xfs_ifunlock(ip); + return 0; + } diff --git a/queue-2.6.32/xfs-don-t-hold-onto-reserved-blocks-on-remount-ro.patch b/queue-2.6.32/xfs-don-t-hold-onto-reserved-blocks-on-remount-ro.patch new file mode 100644 index 00000000000..d71b3ac33ae --- /dev/null +++ b/queue-2.6.32/xfs-don-t-hold-onto-reserved-blocks-on-remount-ro.patch @@ -0,0 +1,94 @@ +From david@fromorbit.com Fri Apr 2 11:11:43 2010 +From: Dave Chinner +Date: Fri, 12 Mar 2010 09:42:14 +1100 +Subject: xfs: don't hold onto reserved blocks on remount, ro +To: stable@kernel.org +Cc: xfs@oss.sgi.com +Message-ID: <1268347337-7160-17-git-send-email-david@fromorbit.com> + +From: Dave Chinner + +commit cbe132a8bdcff0f9afd9060948fb50597c7400b8 upstream + +If we hold onto reserved blocks when doing a remount,ro we end +up writing the blocks used count to disk that includes the reserved +blocks. Reserved blocks are not actually used, so this results in +the values in the superblock being incorrect. + +Hence if we run xfs_check or xfs_repair -n while the filesystem is +mounted remount,ro we end up with an inconsistent filesystem being +reported. Also, running xfs_copy on the remount,ro filesystem will +result in an inconsistent image being generated. + +To fix this, unreserve the blocks when doing the remount,ro, and +reserved them again on remount,rw. This way a remount,ro filesystem +will appear consistent on disk to all utilities. + +Signed-off-by: Dave Chinner +Reviewed-by: Christoph Hellwig +Signed-off-by: Greg Kroah-Hartman + +--- + fs/xfs/linux-2.6/xfs_super.c | 28 ++++++++++++++++++++++++++++ + fs/xfs/xfs_mount.h | 1 + + 2 files changed, 29 insertions(+) + +--- a/fs/xfs/linux-2.6/xfs_super.c ++++ b/fs/xfs/linux-2.6/xfs_super.c +@@ -1323,6 +1323,8 @@ xfs_fs_remount( + + /* ro -> rw */ + if ((mp->m_flags & XFS_MOUNT_RDONLY) && !(*flags & MS_RDONLY)) { ++ __uint64_t resblks; ++ + mp->m_flags &= ~XFS_MOUNT_RDONLY; + if (mp->m_flags & XFS_MOUNT_BARRIER) + xfs_mountfs_check_barriers(mp); +@@ -1340,11 +1342,37 @@ xfs_fs_remount( + } + mp->m_update_flags = 0; + } ++ ++ /* ++ * Fill out the reserve pool if it is empty. Use the stashed ++ * value if it is non-zero, otherwise go with the default. 
++ */ ++ if (mp->m_resblks_save) { ++ resblks = mp->m_resblks_save; ++ mp->m_resblks_save = 0; ++ } else { ++ resblks = mp->m_sb.sb_dblocks; ++ do_div(resblks, 20); ++ resblks = min_t(__uint64_t, resblks, 1024); ++ } ++ xfs_reserve_blocks(mp, &resblks, NULL); + } + + /* rw -> ro */ + if (!(mp->m_flags & XFS_MOUNT_RDONLY) && (*flags & MS_RDONLY)) { ++ /* ++ * After we have synced the data but before we sync the ++ * metadata, we need to free up the reserve block pool so that ++ * the used block count in the superblock on disk is correct at ++ * the end of the remount. Stash the current reserve pool size ++ * so that if we get remounted rw, we can return it to the same ++ * size. ++ */ ++ __uint64_t resblks = 0; ++ + xfs_quiesce_data(mp); ++ mp->m_resblks_save = mp->m_resblks; ++ xfs_reserve_blocks(mp, &resblks, NULL); + xfs_quiesce_attr(mp); + mp->m_flags |= XFS_MOUNT_RDONLY; + } +--- a/fs/xfs/xfs_mount.h ++++ b/fs/xfs/xfs_mount.h +@@ -209,6 +209,7 @@ typedef struct xfs_mount { + __uint64_t m_maxioffset; /* maximum inode offset */ + __uint64_t m_resblks; /* total reserved blocks */ + __uint64_t m_resblks_avail;/* available reserved blocks */ ++ __uint64_t m_resblks_save; /* reserved blks @ remount,ro */ + int m_dalign; /* stripe unit */ + int m_swidth; /* stripe width */ + int m_sinoalign; /* stripe unit inode alignment */ diff --git a/queue-2.6.32/xfs-ensure-we-force-all-busy-extents-in-range-to-disk.patch b/queue-2.6.32/xfs-ensure-we-force-all-busy-extents-in-range-to-disk.patch new file mode 100644 index 00000000000..78ea50a50af --- /dev/null +++ b/queue-2.6.32/xfs-ensure-we-force-all-busy-extents-in-range-to-disk.patch @@ -0,0 +1,115 @@ +From david@fromorbit.com Fri Apr 2 11:08:31 2010 +From: Dave Chinner +Date: Fri, 12 Mar 2010 09:42:07 +1100 +Subject: xfs: Ensure we force all busy extents in range to disk +To: stable@kernel.org +Cc: xfs@oss.sgi.com +Message-ID: <1268347337-7160-10-git-send-email-david@fromorbit.com> + +From: Dave Chinner + +commit fd45e4784164d1017521086524e3442318c67370 upstream + +When we search for and find a busy extent during allocation we +force the log out to ensure the extent free transaction is on +disk before the allocation transaction. The current implementation +has a subtle bug in it--it does not handle multiple overlapping +ranges. + +That is, if we free lots of little extents into a single +contiguous extent, then allocate the contiguous extent, the busy +search code stops searching at the first extent it finds that +overlaps the allocated range. It then uses the commit LSN of the +transaction to force the log out to. + +Unfortunately, the other busy ranges might have more recent +commit LSNs than the first busy extent that is found, and this +results in xfs_alloc_search_busy() returning before all the +extent free transactions are on disk for the range being +allocated. This can lead to potential metadata corruption or +stale data exposure after a crash because log replay won't replay +all the extent free transactions that cover the allocation range. + +Modified-by: Alex Elder + +(Dropped the "found" argument from the xfs_alloc_busysearch trace +event.) 
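
For illustration only, a userspace sketch of the rule this change enforces (the types and names below are stand-ins, not the kernel's busy-extent structures): the log-force target has to be the newest commit LSN over every busy range that overlaps the allocation, not just the first overlap found.

#include <stdint.h>
#include <stddef.h>

struct busy_extent {
	uint64_t start;		/* first block of the busy range */
	uint64_t len;		/* length of the busy range */
	uint64_t commit_lsn;	/* LSN of the transaction that freed it */
};

/*
 * Return the LSN the log must be forced to before reusing
 * [bno, bno + len): the maximum commit LSN over all overlapping busy
 * ranges, or 0 if nothing overlaps.
 */
uint64_t busy_search_force_lsn(const struct busy_extent *busy, size_t count,
			       uint64_t bno, uint64_t len)
{
	uint64_t uend = bno + len - 1;
	uint64_t lsn = 0;
	size_t i;

	for (i = 0; i < count; i++) {
		uint64_t bend = busy[i].start + busy[i].len - 1;

		if (bno > bend || uend < busy[i].start)
			continue;			/* no overlap */
		if (busy[i].commit_lsn > lsn)
			lsn = busy[i].commit_lsn;	/* keep the newest */
	}
	return lsn;
}
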
+ +Signed-off-by: Dave Chinner +Reviewed-by: Christoph Hellwig +Signed-off-by: Alex Elder +Signed-off-by: Greg Kroah-Hartman + +--- + fs/xfs/xfs_alloc.c | 52 +++++++++++++++++++++------------------------------- + 1 file changed, 21 insertions(+), 31 deletions(-) + +--- a/fs/xfs/xfs_alloc.c ++++ b/fs/xfs/xfs_alloc.c +@@ -2703,45 +2703,35 @@ xfs_alloc_search_busy(xfs_trans_t *tp, + xfs_mount_t *mp; + xfs_perag_busy_t *bsy; + xfs_agblock_t uend, bend; +- xfs_lsn_t lsn; ++ xfs_lsn_t lsn = 0; + int cnt; + + mp = tp->t_mountp; + + spin_lock(&mp->m_perag[agno].pagb_lock); +- cnt = mp->m_perag[agno].pagb_count; +- + uend = bno + len - 1; + +- /* search pagb_list for this slot, skipping open slots */ +- for (bsy = mp->m_perag[agno].pagb_list; cnt; bsy++) { +- +- /* +- * (start1,length1) within (start2, length2) +- */ +- if (bsy->busy_tp != NULL) { +- bend = bsy->busy_start + bsy->busy_length - 1; +- if ((bno > bend) || (uend < bsy->busy_start)) { +- cnt--; +- } else { +- TRACE_BUSYSEARCH("xfs_alloc_search_busy", +- "found1", agno, bno, len, tp); +- break; +- } +- } +- } +- + /* +- * If a block was found, force the log through the LSN of the +- * transaction that freed the block ++ * search pagb_list for this slot, skipping open slots. We have to ++ * search the entire array as there may be multiple overlaps and ++ * we have to get the most recent LSN for the log force to push out ++ * all the transactions that span the range. + */ +- if (cnt) { +- TRACE_BUSYSEARCH("xfs_alloc_search_busy", "found", agno, bno, len, tp); +- lsn = bsy->busy_tp->t_commit_lsn; +- spin_unlock(&mp->m_perag[agno].pagb_lock); +- xfs_log_force(mp, lsn, XFS_LOG_FORCE|XFS_LOG_SYNC); +- } else { +- TRACE_BUSYSEARCH("xfs_alloc_search_busy", "not-found", agno, bno, len, tp); +- spin_unlock(&mp->m_perag[agno].pagb_lock); ++ for (cnt = 0; cnt < mp->m_perag[agno].pagb_count; cnt++) { ++ bsy = &mp->m_perag[agno].pagb_list[cnt]; ++ if (!bsy->busy_tp) ++ continue; ++ bend = bsy->busy_start + bsy->busy_length - 1; ++ if (bno > bend || uend < bsy->busy_start) ++ continue; ++ ++ /* (start1,length1) within (start2, length2) */ ++ if (XFS_LSN_CMP(bsy->busy_tp->t_commit_lsn, lsn) > 0) ++ lsn = bsy->busy_tp->t_commit_lsn; + } ++ spin_unlock(&mp->m_perag[agno].pagb_lock); ++ TRACE_BUSYSEARCH("xfs_alloc_search_busy", lsn ? "found" : "not-found", ++ agno, bno, len, tp); ++ if (lsn) ++ xfs_log_force(mp, lsn, XFS_LOG_FORCE|XFS_LOG_SYNC); + } diff --git a/queue-2.6.32/xfs-fix-error-return-for-fallocate-on-xfs.patch b/queue-2.6.32/xfs-fix-error-return-for-fallocate-on-xfs.patch new file mode 100644 index 00000000000..24c65714c5f --- /dev/null +++ b/queue-2.6.32/xfs-fix-error-return-for-fallocate-on-xfs.patch @@ -0,0 +1,49 @@ +From david@fromorbit.com Fri Apr 2 11:06:34 2010 +From: Jason Gunthorpe +Date: Fri, 12 Mar 2010 09:42:03 +1100 +Subject: xfs: Fix error return for fallocate() on XFS +To: stable@kernel.org +Cc: xfs@oss.sgi.com +Message-ID: <1268347337-7160-6-git-send-email-david@fromorbit.com> + + +From: Jason Gunthorpe + +commit 44a743f68705c681439f264deb05f8f38e9048d3 upstream + +Noticed that through glibc fallocate would return 28 rather than -1 +and errno = 28 for ENOSPC. The xfs routines uses XFS_ERROR format +positive return error codes while the syscalls use negative return +codes. Fixup the two cases in xfs_vn_fallocate syscall to convert to +negative. 
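
For illustration only, a small userspace sketch of the sign convention at issue (the helper names are stand-ins, not real XFS functions): internal XFS routines hand back positive errno values, so the syscall boundary must negate them before returning to the VFS.

#include <errno.h>
#include <stdio.h>

/* Stand-in for an XFS-internal routine: returns 0 or a positive errno. */
int internal_reserve_space(int out_of_space)
{
	return out_of_space ? ENOSPC : 0;
}

/* Syscall-style wrapper: must return 0 or a negative errno. */
int fallocate_style_wrapper(int out_of_space)
{
	return -internal_reserve_space(out_of_space);	/* 28 becomes -28 */
}

int main(void)
{
	printf("success: %d, no space: %d\n",
	       fallocate_style_wrapper(0), fallocate_style_wrapper(1));
	return 0;
}
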
+ +Signed-off-by: Jason Gunthorpe +Reviewed-by: Eric Sandeen +Signed-off-by: Alex Elder +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/linux-2.6/xfs_iops.c | 6 +++--- + 1 file changed, 3 insertions(+), 3 deletions(-) + +--- a/fs/xfs/linux-2.6/xfs_iops.c ++++ b/fs/xfs/linux-2.6/xfs_iops.c +@@ -573,8 +573,8 @@ xfs_vn_fallocate( + bf.l_len = len; + + xfs_ilock(ip, XFS_IOLOCK_EXCL); +- error = xfs_change_file_space(ip, XFS_IOC_RESVSP, &bf, +- 0, XFS_ATTR_NOLOCK); ++ error = -xfs_change_file_space(ip, XFS_IOC_RESVSP, &bf, ++ 0, XFS_ATTR_NOLOCK); + if (!error && !(mode & FALLOC_FL_KEEP_SIZE) && + offset + len > i_size_read(inode)) + new_size = offset + len; +@@ -585,7 +585,7 @@ xfs_vn_fallocate( + + iattr.ia_valid = ATTR_SIZE; + iattr.ia_size = new_size; +- error = xfs_setattr(ip, &iattr, XFS_ATTR_NOLOCK); ++ error = -xfs_setattr(ip, &iattr, XFS_ATTR_NOLOCK); + } + + xfs_iunlock(ip, XFS_IOLOCK_EXCL); diff --git a/queue-2.6.32/xfs-fix-locking-for-inode-cache-radix-tree-tag-updates.patch b/queue-2.6.32/xfs-fix-locking-for-inode-cache-radix-tree-tag-updates.patch new file mode 100644 index 00000000000..92f60429b7b --- /dev/null +++ b/queue-2.6.32/xfs-fix-locking-for-inode-cache-radix-tree-tag-updates.patch @@ -0,0 +1,84 @@ +From david@fromorbit.com Fri Apr 2 11:12:53 2010 +From: Christoph Hellwig +Date: Fri, 12 Mar 2010 09:42:17 +1100 +Subject: xfs: fix locking for inode cache radix tree tag updates +To: stable@kernel.org +Cc: xfs@oss.sgi.com +Message-ID: <1268347337-7160-20-git-send-email-david@fromorbit.com> + + +From: Christoph Hellwig + +commit f1f724e4b523d444c5a598d74505aefa3d6844d2 upstream + +The radix-tree code requires it's users to serialize tag updates +against other updates to the tree. While XFS protects tag updates +against each other it does not serialize them against updates of the +tree contents, which can lead to tag corruption. Fix the inode +cache to always take pag_ici_lock in exclusive mode when updating +radix tree tags. + +Signed-off-by: Christoph Hellwig +Reported-by: Patrick Schreurs +Tested-by: Patrick Schreurs +Signed-off-by: Alex Elder +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/linux-2.6/xfs_sync.c | 4 ++-- + fs/xfs/xfs_iget.c | 19 +++++++++++++------ + 2 files changed, 15 insertions(+), 8 deletions(-) + +--- a/fs/xfs/linux-2.6/xfs_sync.c ++++ b/fs/xfs/linux-2.6/xfs_sync.c +@@ -692,12 +692,12 @@ xfs_inode_set_reclaim_tag( + xfs_mount_t *mp = ip->i_mount; + xfs_perag_t *pag = xfs_get_perag(mp, ip->i_ino); + +- read_lock(&pag->pag_ici_lock); ++ write_lock(&pag->pag_ici_lock); + spin_lock(&ip->i_flags_lock); + __xfs_inode_set_reclaim_tag(pag, ip); + __xfs_iflags_set(ip, XFS_IRECLAIMABLE); + spin_unlock(&ip->i_flags_lock); +- read_unlock(&pag->pag_ici_lock); ++ write_unlock(&pag->pag_ici_lock); + xfs_put_perag(mp, pag); + } + +--- a/fs/xfs/xfs_iget.c ++++ b/fs/xfs/xfs_iget.c +@@ -228,13 +228,12 @@ xfs_iget_cache_hit( + xfs_itrace_exit_tag(ip, "xfs_iget.alloc"); + + /* +- * We need to set XFS_INEW atomically with clearing the +- * reclaimable tag so that we do have an indicator of the +- * inode still being initialized. ++ * We need to set XFS_IRECLAIM to prevent xfs_reclaim_inode ++ * from stomping over us while we recycle the inode. We can't ++ * clear the radix tree reclaimable tag yet as it requires ++ * pag_ici_lock to be held exclusive. 
+ */ +- ip->i_flags |= XFS_INEW; +- ip->i_flags &= ~XFS_IRECLAIMABLE; +- __xfs_inode_clear_reclaim_tag(mp, pag, ip); ++ ip->i_flags |= XFS_IRECLAIM; + + spin_unlock(&ip->i_flags_lock); + read_unlock(&pag->pag_ici_lock); +@@ -253,7 +252,15 @@ xfs_iget_cache_hit( + __xfs_inode_set_reclaim_tag(pag, ip); + goto out_error; + } ++ ++ write_lock(&pag->pag_ici_lock); ++ spin_lock(&ip->i_flags_lock); ++ ip->i_flags &= ~(XFS_IRECLAIMABLE | XFS_IRECLAIM); ++ ip->i_flags |= XFS_INEW; ++ __xfs_inode_clear_reclaim_tag(mp, pag, ip); + inode->i_state = I_LOCK|I_NEW; ++ spin_unlock(&ip->i_flags_lock); ++ write_unlock(&pag->pag_ici_lock); + } else { + /* If the VFS inode is being torn down, pause and try again. */ + if (!igrab(inode)) { diff --git a/queue-2.6.32/xfs-fix-mmap_sem-iolock-inversion-in-xfs_free_eofblocks.patch b/queue-2.6.32/xfs-fix-mmap_sem-iolock-inversion-in-xfs_free_eofblocks.patch new file mode 100644 index 00000000000..aef2a50cc28 --- /dev/null +++ b/queue-2.6.32/xfs-fix-mmap_sem-iolock-inversion-in-xfs_free_eofblocks.patch @@ -0,0 +1,129 @@ +From david@fromorbit.com Fri Apr 2 11:05:07 2010 +From: Christoph Hellwig +Date: Fri, 12 Mar 2010 09:42:00 +1100 +Subject: xfs: fix mmap_sem/iolock inversion in xfs_free_eofblocks +To: stable@kernel.org +Cc: xfs@oss.sgi.com +Message-ID: <1268347337-7160-3-git-send-email-david@fromorbit.com> + + +From: Christoph Hellwig + +commit c56c9631cbe88f08854a56ff9776c1f310916830 upstream + +When xfs_free_eofblocks is called from ->release the VM might already +hold the mmap_sem, but in the write path we take the iolock before +taking the mmap_sem in the generic write code. + +Switch xfs_free_eofblocks to only trylock the iolock if called from +->release and skip trimming the prellocated blocks in that case. +We'll still free them later on the final iput. + +Signed-off-by: Christoph Hellwig +Reviewed-by: Alex Elder +Signed-off-by: Alex Elder +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/xfs_rw.h | 7 ------- + fs/xfs/xfs_vnodeops.c | 34 ++++++++++++++++++++++++++-------- + 2 files changed, 26 insertions(+), 15 deletions(-) + +--- a/fs/xfs/xfs_rw.h ++++ b/fs/xfs/xfs_rw.h +@@ -37,13 +37,6 @@ xfs_fsb_to_db(struct xfs_inode *ip, xfs_ + } + + /* +- * Flags for xfs_free_eofblocks +- */ +-#define XFS_FREE_EOF_LOCK (1<<0) +-#define XFS_FREE_EOF_NOLOCK (1<<1) +- +- +-/* + * helper function to extract extent size hint from inode + */ + STATIC_INLINE xfs_extlen_t +--- a/fs/xfs/xfs_vnodeops.c ++++ b/fs/xfs/xfs_vnodeops.c +@@ -709,6 +709,11 @@ xfs_fsync( + } + + /* ++ * Flags for xfs_free_eofblocks ++ */ ++#define XFS_FREE_EOF_TRYLOCK (1<<0) ++ ++/* + * This is called by xfs_inactive to free any blocks beyond eof + * when the link count isn't zero and by xfs_dm_punch_hole() when + * punching a hole to EOF. +@@ -726,7 +731,6 @@ xfs_free_eofblocks( + xfs_filblks_t map_len; + int nimaps; + xfs_bmbt_irec_t imap; +- int use_iolock = (flags & XFS_FREE_EOF_LOCK); + + /* + * Figure out if there are any blocks beyond the end +@@ -768,14 +772,19 @@ xfs_free_eofblocks( + * cache and we can't + * do that within a transaction. 
+ */ +- if (use_iolock) ++ if (flags & XFS_FREE_EOF_TRYLOCK) { ++ if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) { ++ xfs_trans_cancel(tp, 0); ++ return 0; ++ } ++ } else { + xfs_ilock(ip, XFS_IOLOCK_EXCL); ++ } + error = xfs_itruncate_start(ip, XFS_ITRUNC_DEFINITE, + ip->i_size); + if (error) { + xfs_trans_cancel(tp, 0); +- if (use_iolock) +- xfs_iunlock(ip, XFS_IOLOCK_EXCL); ++ xfs_iunlock(ip, XFS_IOLOCK_EXCL); + return error; + } + +@@ -812,8 +821,7 @@ xfs_free_eofblocks( + error = xfs_trans_commit(tp, + XFS_TRANS_RELEASE_LOG_RES); + } +- xfs_iunlock(ip, (use_iolock ? (XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL) +- : XFS_ILOCK_EXCL)); ++ xfs_iunlock(ip, XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL); + } + return error; + } +@@ -1113,7 +1121,17 @@ xfs_release( + (ip->i_df.if_flags & XFS_IFEXTENTS)) && + (!(ip->i_d.di_flags & + (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)))) { +- error = xfs_free_eofblocks(mp, ip, XFS_FREE_EOF_LOCK); ++ ++ /* ++ * If we can't get the iolock just skip truncating ++ * the blocks past EOF because we could deadlock ++ * with the mmap_sem otherwise. We'll get another ++ * chance to drop them once the last reference to ++ * the inode is dropped, so we'll never leak blocks ++ * permanently. ++ */ ++ error = xfs_free_eofblocks(mp, ip, ++ XFS_FREE_EOF_TRYLOCK); + if (error) + return error; + } +@@ -1184,7 +1202,7 @@ xfs_inactive( + (!(ip->i_d.di_flags & + (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)) || + (ip->i_delayed_blks != 0)))) { +- error = xfs_free_eofblocks(mp, ip, XFS_FREE_EOF_LOCK); ++ error = xfs_free_eofblocks(mp, ip, 0); + if (error) + return VN_INACTIVE_CACHE; + } diff --git a/queue-2.6.32/xfs-fix-stale-inode-flush-avoidance.patch b/queue-2.6.32/xfs-fix-stale-inode-flush-avoidance.patch new file mode 100644 index 00000000000..6cc19afd3e3 --- /dev/null +++ b/queue-2.6.32/xfs-fix-stale-inode-flush-avoidance.patch @@ -0,0 +1,64 @@ +From david@fromorbit.com Fri Apr 2 11:10:21 2010 +From: Dave Chinner +Date: Fri, 12 Mar 2010 09:42:11 +1100 +Subject: xfs: fix stale inode flush avoidance +To: stable@kernel.org +Cc: xfs@oss.sgi.com +Message-ID: <1268347337-7160-14-git-send-email-david@fromorbit.com> + +From: Dave Chinner + +commit 4b6a46882cca8349e8942e2650c33b11bc571c92 upstream + +When reclaiming stale inodes, we need to guarantee that inodes are +unpinned before returning with a "clean" status. If we don't we can +reclaim inodes that are pinned, leading to use after free in the +transaction subsystem as transactions complete. + +Signed-off-by: Dave Chinner +Reviewed-by: Christoph Hellwig +Signed-off-by: Alex Elder +Signed-off-by: Greg Kroah-Hartman + +--- + fs/xfs/xfs_inode.c | 21 +++++++++++++++------ + 1 file changed, 15 insertions(+), 6 deletions(-) + +--- a/fs/xfs/xfs_inode.c ++++ b/fs/xfs/xfs_inode.c +@@ -2878,13 +2878,9 @@ xfs_iflush( + + /* + * If the inode isn't dirty, then just release the inode flush lock and +- * do nothing. Treat stale inodes the same; we cannot rely on the +- * backing buffer remaining stale in cache for the remaining life of +- * the stale inode and so xfs_itobp() below may give us a buffer that +- * no longer contains inodes below. Doing this stale check here also +- * avoids forcing the log on pinned, stale inodes. ++ * do nothing. 
+ */ +- if (xfs_inode_clean(ip) || xfs_iflags_test(ip, XFS_ISTALE)) { ++ if (xfs_inode_clean(ip)) { + xfs_ifunlock(ip); + return 0; + } +@@ -2908,6 +2904,19 @@ xfs_iflush( + xfs_iunpin_wait(ip); + + /* ++ * For stale inodes we cannot rely on the backing buffer remaining ++ * stale in cache for the remaining life of the stale inode and so ++ * xfs_itobp() below may give us a buffer that no longer contains ++ * inodes below. We have to check this after ensuring the inode is ++ * unpinned so that it is safe to reclaim the stale inode after the ++ * flush call. ++ */ ++ if (xfs_iflags_test(ip, XFS_ISTALE)) { ++ xfs_ifunlock(ip); ++ return 0; ++ } ++ ++ /* + * This may have been unpinned because the filesystem is shutting + * down forcibly. If that's the case we must not write this inode + * to disk, because the log record didn't make it to disk! diff --git a/queue-2.6.32/xfs-fix-timestamp-handling-in-xfs_setattr.patch b/queue-2.6.32/xfs-fix-timestamp-handling-in-xfs_setattr.patch new file mode 100644 index 00000000000..025a4a1033b --- /dev/null +++ b/queue-2.6.32/xfs-fix-timestamp-handling-in-xfs_setattr.patch @@ -0,0 +1,215 @@ +From david@fromorbit.com Fri Apr 2 11:07:34 2010 +From: Christoph Hellwig +Date: Fri, 12 Mar 2010 09:42:05 +1100 +Subject: xfs: fix timestamp handling in xfs_setattr +To: stable@kernel.org +Cc: xfs@oss.sgi.com +Message-ID: <1268347337-7160-8-git-send-email-david@fromorbit.com> + + +From: Christoph Hellwig + +commit d6d59bada372bcf8bd36c3bbc71c485c29dd2a4b upstream + +We currently have some rather odd code in xfs_setattr for +updating the a/c/mtime timestamps: + + - first we do a non-transaction update if all three are updated + together + - second we implicitly update the ctime for various changes + instead of relying on the ATTR_CTIME flag + - third we set the timestamps to the current time instead of the + arguments in the iattr structure in many cases. + +This patch makes sure we update it in a consistent way: + + - always transactional + - ctime is only updated if ATTR_CTIME is set or we do a size + update, which is a special case + - always to the times passed in from the caller instead of the + current time + +The only non-size caller of xfs_setattr that doesn't come from +the VFS is updated to set ATTR_CTIME and pass in a valid ctime +value. 
+ +Reported-by: Eric Blake +Signed-off-by: Christoph Hellwig +Signed-off-by: Alex Elder +Signed-off-by: Greg Kroah-Hartman + +--- + fs/xfs/linux-2.6/xfs_acl.c | 3 - + fs/xfs/xfs_vnodeops.c | 93 ++++++++++++++++++--------------------------- + 2 files changed, 41 insertions(+), 55 deletions(-) + +--- a/fs/xfs/linux-2.6/xfs_acl.c ++++ b/fs/xfs/linux-2.6/xfs_acl.c +@@ -250,8 +250,9 @@ xfs_set_mode(struct inode *inode, mode_t + if (mode != inode->i_mode) { + struct iattr iattr; + +- iattr.ia_valid = ATTR_MODE; ++ iattr.ia_valid = ATTR_MODE | ATTR_CTIME; + iattr.ia_mode = mode; ++ iattr.ia_ctime = current_fs_time(inode->i_sb); + + error = -xfs_setattr(XFS_I(inode), &iattr, XFS_ATTR_NOACL); + } +--- a/fs/xfs/xfs_vnodeops.c ++++ b/fs/xfs/xfs_vnodeops.c +@@ -69,7 +69,6 @@ xfs_setattr( + uint commit_flags=0; + uid_t uid=0, iuid=0; + gid_t gid=0, igid=0; +- int timeflags = 0; + struct xfs_dquot *udqp, *gdqp, *olddquot1, *olddquot2; + int need_iolock = 1; + +@@ -134,16 +133,13 @@ xfs_setattr( + if (flags & XFS_ATTR_NOLOCK) + need_iolock = 0; + if (!(mask & ATTR_SIZE)) { +- if ((mask != (ATTR_CTIME|ATTR_ATIME|ATTR_MTIME)) || +- (mp->m_flags & XFS_MOUNT_WSYNC)) { +- tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE); +- commit_flags = 0; +- if ((code = xfs_trans_reserve(tp, 0, +- XFS_ICHANGE_LOG_RES(mp), 0, +- 0, 0))) { +- lock_flags = 0; +- goto error_return; +- } ++ tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE); ++ commit_flags = 0; ++ code = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp), ++ 0, 0, 0); ++ if (code) { ++ lock_flags = 0; ++ goto error_return; + } + } else { + if (DM_EVENT_ENABLED(ip, DM_EVENT_TRUNCATE) && +@@ -294,15 +290,23 @@ xfs_setattr( + * or we are explicitly asked to change it. This handles + * the semantic difference between truncate() and ftruncate() + * as implemented in the VFS. ++ * ++ * The regular truncate() case without ATTR_CTIME and ATTR_MTIME ++ * is a special case where we need to update the times despite ++ * not having these flags set. For all other operations the ++ * VFS set these flags explicitly if it wants a timestamp ++ * update. + */ +- if (iattr->ia_size != ip->i_size || (mask & ATTR_CTIME)) +- timeflags |= XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG; ++ if (iattr->ia_size != ip->i_size && ++ (!(mask & (ATTR_CTIME | ATTR_MTIME)))) { ++ iattr->ia_ctime = iattr->ia_mtime = ++ current_fs_time(inode->i_sb); ++ mask |= ATTR_CTIME | ATTR_MTIME; ++ } + + if (iattr->ia_size > ip->i_size) { + ip->i_d.di_size = iattr->ia_size; + ip->i_size = iattr->ia_size; +- if (!(flags & XFS_ATTR_DMI)) +- xfs_ichgtime(ip, XFS_ICHGTIME_CHG); + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + } else if (iattr->ia_size <= ip->i_size || + (iattr->ia_size == 0 && ip->i_d.di_nextents)) { +@@ -373,9 +377,6 @@ xfs_setattr( + ip->i_d.di_gid = gid; + inode->i_gid = gid; + } +- +- xfs_trans_log_inode (tp, ip, XFS_ILOG_CORE); +- timeflags |= XFS_ICHGTIME_CHG; + } + + /* +@@ -392,51 +393,37 @@ xfs_setattr( + + inode->i_mode &= S_IFMT; + inode->i_mode |= mode & ~S_IFMT; +- +- xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); +- timeflags |= XFS_ICHGTIME_CHG; + } + + /* + * Change file access or modified times. 
+ */ +- if (mask & (ATTR_ATIME|ATTR_MTIME)) { +- if (mask & ATTR_ATIME) { +- inode->i_atime = iattr->ia_atime; +- ip->i_d.di_atime.t_sec = iattr->ia_atime.tv_sec; +- ip->i_d.di_atime.t_nsec = iattr->ia_atime.tv_nsec; +- ip->i_update_core = 1; +- } +- if (mask & ATTR_MTIME) { +- inode->i_mtime = iattr->ia_mtime; +- ip->i_d.di_mtime.t_sec = iattr->ia_mtime.tv_sec; +- ip->i_d.di_mtime.t_nsec = iattr->ia_mtime.tv_nsec; +- timeflags &= ~XFS_ICHGTIME_MOD; +- timeflags |= XFS_ICHGTIME_CHG; +- } +- if (tp && (mask & (ATTR_MTIME_SET|ATTR_ATIME_SET))) +- xfs_trans_log_inode (tp, ip, XFS_ILOG_CORE); ++ if (mask & ATTR_ATIME) { ++ inode->i_atime = iattr->ia_atime; ++ ip->i_d.di_atime.t_sec = iattr->ia_atime.tv_sec; ++ ip->i_d.di_atime.t_nsec = iattr->ia_atime.tv_nsec; ++ ip->i_update_core = 1; + } +- +- /* +- * Change file inode change time only if ATTR_CTIME set +- * AND we have been called by a DMI function. +- */ +- +- if ((flags & XFS_ATTR_DMI) && (mask & ATTR_CTIME)) { ++ if (mask & ATTR_CTIME) { + inode->i_ctime = iattr->ia_ctime; + ip->i_d.di_ctime.t_sec = iattr->ia_ctime.tv_sec; + ip->i_d.di_ctime.t_nsec = iattr->ia_ctime.tv_nsec; + ip->i_update_core = 1; +- timeflags &= ~XFS_ICHGTIME_CHG; ++ } ++ if (mask & ATTR_MTIME) { ++ inode->i_mtime = iattr->ia_mtime; ++ ip->i_d.di_mtime.t_sec = iattr->ia_mtime.tv_sec; ++ ip->i_d.di_mtime.t_nsec = iattr->ia_mtime.tv_nsec; ++ ip->i_update_core = 1; + } + + /* +- * Send out timestamp changes that need to be set to the +- * current time. Not done when called by a DMI function. ++ * And finally, log the inode core if any attribute in it ++ * has been changed. + */ +- if (timeflags && !(flags & XFS_ATTR_DMI)) +- xfs_ichgtime(ip, timeflags); ++ if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE| ++ ATTR_ATIME|ATTR_CTIME|ATTR_MTIME)) ++ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + + XFS_STATS_INC(xs_ig_attrchg); + +@@ -451,12 +438,10 @@ xfs_setattr( + * mix so this probably isn't worth the trouble to optimize. + */ + code = 0; +- if (tp) { +- if (mp->m_flags & XFS_MOUNT_WSYNC) +- xfs_trans_set_sync(tp); ++ if (mp->m_flags & XFS_MOUNT_WSYNC) ++ xfs_trans_set_sync(tp); + +- code = xfs_trans_commit(tp, commit_flags); +- } ++ code = xfs_trans_commit(tp, commit_flags); + + xfs_iunlock(ip, lock_flags); + diff --git a/queue-2.6.32/xfs-i-o-completion-handlers-must-use-nofs-allocations.patch b/queue-2.6.32/xfs-i-o-completion-handlers-must-use-nofs-allocations.patch new file mode 100644 index 00000000000..f215decd317 --- /dev/null +++ b/queue-2.6.32/xfs-i-o-completion-handlers-must-use-nofs-allocations.patch @@ -0,0 +1,112 @@ +From david@fromorbit.com Fri Apr 2 11:05:39 2010 +From: Christoph Hellwig +Date: Fri, 12 Mar 2010 09:42:01 +1100 +Subject: xfs: I/O completion handlers must use NOFS allocations +To: stable@kernel.org +Cc: xfs@oss.sgi.com +Message-ID: <1268347337-7160-4-git-send-email-david@fromorbit.com> + + +From: Christoph Hellwig + +commit 80641dc66a2d6dfb22af4413227a92b8ab84c7bb upstream + +When completing I/O requests we must not allow the memory allocator to +recurse into the filesystem, as we might deadlock on waiting for the +I/O completion otherwise. The only thing currently allocating normal +GFP_KERNEL memory is the allocation of the transaction structure for +the unwritten extent conversion. Add a memflags argument to +_xfs_trans_alloc to allow controlling the allocator behaviour. 
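
For illustration only, a minimal sketch of the pattern this change introduces (the names below are stand-ins, not the real kmem or transaction API): the transaction allocator grows an allocation-mode argument so that I/O completion callers can request allocations that never recurse back into the filesystem.

#include <stdlib.h>

/* Stand-ins for GFP_KERNEL/GFP_NOFS style allocation contexts. */
enum alloc_mode {
	ALLOC_MAY_ENTER_FS,	/* normal callers: reclaim may write back */
	ALLOC_NOFS		/* I/O completion: never re-enter the fs */
};

struct trans {
	int type;
};

/* The allocator now takes the caller's allocation context explicitly. */
struct trans *trans_alloc(int type, enum alloc_mode mode)
{
	struct trans *tp = calloc(1, sizeof(*tp));

	if (tp)
		tp->type = type;
	(void)mode;	/* a real implementation would map this to GFP flags */
	return tp;
}

/* I/O completion paths must use the non-recursing mode. */
struct trans *trans_alloc_for_io_completion(int type)
{
	return trans_alloc(type, ALLOC_NOFS);
}
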
+ +Signed-off-by: Christoph Hellwig +Reported-by: Thomas Neumann +Tested-by: Thomas Neumann +Reviewed-by: Alex Elder +Signed-off-by: Alex Elder +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/xfs_fsops.c | 2 +- + fs/xfs/xfs_iomap.c | 9 ++++++++- + fs/xfs/xfs_mount.c | 2 +- + fs/xfs/xfs_trans.c | 7 ++++--- + fs/xfs/xfs_trans.h | 2 +- + 5 files changed, 15 insertions(+), 7 deletions(-) + +--- a/fs/xfs/xfs_fsops.c ++++ b/fs/xfs/xfs_fsops.c +@@ -611,7 +611,7 @@ xfs_fs_log_dummy( + xfs_inode_t *ip; + int error; + +- tp = _xfs_trans_alloc(mp, XFS_TRANS_DUMMY1); ++ tp = _xfs_trans_alloc(mp, XFS_TRANS_DUMMY1, KM_SLEEP); + error = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp), 0, 0, 0); + if (error) { + xfs_trans_cancel(tp, 0); +--- a/fs/xfs/xfs_iomap.c ++++ b/fs/xfs/xfs_iomap.c +@@ -860,8 +860,15 @@ xfs_iomap_write_unwritten( + * set up a transaction to convert the range of extents + * from unwritten to real. Do allocations in a loop until + * we have covered the range passed in. ++ * ++ * Note that we open code the transaction allocation here ++ * to pass KM_NOFS--we can't risk to recursing back into ++ * the filesystem here as we might be asked to write out ++ * the same inode that we complete here and might deadlock ++ * on the iolock. + */ +- tp = xfs_trans_alloc(mp, XFS_TRANS_STRAT_WRITE); ++ xfs_wait_for_freeze(mp, SB_FREEZE_TRANS); ++ tp = _xfs_trans_alloc(mp, XFS_TRANS_STRAT_WRITE, KM_NOFS); + tp->t_flags |= XFS_TRANS_RESERVE; + error = xfs_trans_reserve(tp, resblks, + XFS_WRITE_LOG_RES(mp), 0, +--- a/fs/xfs/xfs_mount.c ++++ b/fs/xfs/xfs_mount.c +@@ -1471,7 +1471,7 @@ xfs_log_sbcount( + if (!xfs_sb_version_haslazysbcount(&mp->m_sb)) + return 0; + +- tp = _xfs_trans_alloc(mp, XFS_TRANS_SB_COUNT); ++ tp = _xfs_trans_alloc(mp, XFS_TRANS_SB_COUNT, KM_SLEEP); + error = xfs_trans_reserve(tp, 0, mp->m_sb.sb_sectsize + 128, 0, 0, + XFS_DEFAULT_LOG_COUNT); + if (error) { +--- a/fs/xfs/xfs_trans.c ++++ b/fs/xfs/xfs_trans.c +@@ -236,19 +236,20 @@ xfs_trans_alloc( + uint type) + { + xfs_wait_for_freeze(mp, SB_FREEZE_TRANS); +- return _xfs_trans_alloc(mp, type); ++ return _xfs_trans_alloc(mp, type, KM_SLEEP); + } + + xfs_trans_t * + _xfs_trans_alloc( + xfs_mount_t *mp, +- uint type) ++ uint type, ++ uint memflags) + { + xfs_trans_t *tp; + + atomic_inc(&mp->m_active_trans); + +- tp = kmem_zone_zalloc(xfs_trans_zone, KM_SLEEP); ++ tp = kmem_zone_zalloc(xfs_trans_zone, memflags); + tp->t_magic = XFS_TRANS_MAGIC; + tp->t_type = type; + tp->t_mountp = mp; +--- a/fs/xfs/xfs_trans.h ++++ b/fs/xfs/xfs_trans.h +@@ -924,7 +924,7 @@ typedef struct xfs_trans { + * XFS transaction mechanism exported interfaces. 
+ */ + xfs_trans_t *xfs_trans_alloc(struct xfs_mount *, uint); +-xfs_trans_t *_xfs_trans_alloc(struct xfs_mount *, uint); ++xfs_trans_t *_xfs_trans_alloc(struct xfs_mount *, uint, uint); + xfs_trans_t *xfs_trans_dup(xfs_trans_t *); + int xfs_trans_reserve(xfs_trans_t *, uint, uint, uint, + uint, uint); diff --git a/queue-2.6.32/xfs-non-blocking-inode-locking-in-io-completion.patch b/queue-2.6.32/xfs-non-blocking-inode-locking-in-io-completion.patch new file mode 100644 index 00000000000..f445c700246 --- /dev/null +++ b/queue-2.6.32/xfs-non-blocking-inode-locking-in-io-completion.patch @@ -0,0 +1,222 @@ +From david@fromorbit.com Fri Apr 2 11:12:28 2010 +From: Dave Chinner +Date: Fri, 12 Mar 2010 09:42:16 +1100 +Subject: xfs: Non-blocking inode locking in IO completion +To: stable@kernel.org +Cc: xfs@oss.sgi.com +Message-ID: <1268347337-7160-19-git-send-email-david@fromorbit.com> + +From: Dave Chinner + +commit 77d7a0c2eeb285c9069e15396703d0cb9690ac50 upstream + +The introduction of barriers to loop devices has created a new IO +order completion dependency that XFS does not handle. The loop +device implements barriers using fsync and so turns a log IO in the +XFS filesystem on the loop device into a data IO in the backing +filesystem. That is, the completion of log IOs in the loop +filesystem are now dependent on completion of data IO in the backing +filesystem. + +This can cause deadlocks when a flush daemon issues a log force with +an inode locked because the IO completion of IO on the inode is +blocked by the inode lock. This in turn prevents further data IO +completion from occuring on all XFS filesystems on that CPU (due to +the shared nature of the completion queues). This then prevents the +log IO from completing because the log is waiting for data IO +completion as well. + +The fix for this new completion order dependency issue is to make +the IO completion inode locking non-blocking. If the inode lock +can't be grabbed, simply requeue the IO completion back to the work +queue so that it can be processed later. This prevents the +completion queue from being blocked and allows data IO completion on +other inodes to proceed, hence avoiding completion order dependent +deadlocks. + +Signed-off-by: Dave Chinner +Reviewed-by: Christoph Hellwig +Signed-off-by: Alex Elder +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/linux-2.6/xfs_aops.c | 118 ++++++++++++++++++++++++++++++-------------- + 1 file changed, 82 insertions(+), 36 deletions(-) + +--- a/fs/xfs/linux-2.6/xfs_aops.c ++++ b/fs/xfs/linux-2.6/xfs_aops.c +@@ -204,14 +204,17 @@ xfs_ioend_new_eof( + } + + /* +- * Update on-disk file size now that data has been written to disk. +- * The current in-memory file size is i_size. If a write is beyond +- * eof i_new_size will be the intended file size until i_size is +- * updated. If this write does not extend all the way to the valid +- * file size then restrict this update to the end of the write. ++ * Update on-disk file size now that data has been written to disk. The ++ * current in-memory file size is i_size. If a write is beyond eof i_new_size ++ * will be the intended file size until i_size is updated. If this write does ++ * not extend all the way to the valid file size then restrict this update to ++ * the end of the write. ++ * ++ * This function does not block as blocking on the inode lock in IO completion ++ * can lead to IO completion order dependency deadlocks.. If it can't get the ++ * inode ilock it will return EAGAIN. Callers must handle this. 
+ */ +- +-STATIC void ++STATIC int + xfs_setfilesize( + xfs_ioend_t *ioend) + { +@@ -222,9 +225,11 @@ xfs_setfilesize( + ASSERT(ioend->io_type != IOMAP_READ); + + if (unlikely(ioend->io_error)) +- return; ++ return 0; ++ ++ if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) ++ return EAGAIN; + +- xfs_ilock(ip, XFS_ILOCK_EXCL); + isize = xfs_ioend_new_eof(ioend); + if (isize) { + ip->i_d.di_size = isize; +@@ -232,6 +237,28 @@ xfs_setfilesize( + } + + xfs_iunlock(ip, XFS_ILOCK_EXCL); ++ return 0; ++} ++ ++/* ++ * Schedule IO completion handling on a xfsdatad if this was ++ * the final hold on this ioend. If we are asked to wait, ++ * flush the workqueue. ++ */ ++STATIC void ++xfs_finish_ioend( ++ xfs_ioend_t *ioend, ++ int wait) ++{ ++ if (atomic_dec_and_test(&ioend->io_remaining)) { ++ struct workqueue_struct *wq; ++ ++ wq = (ioend->io_type == IOMAP_UNWRITTEN) ? ++ xfsconvertd_workqueue : xfsdatad_workqueue; ++ queue_work(wq, &ioend->io_work); ++ if (wait) ++ flush_workqueue(wq); ++ } + } + + /* +@@ -243,9 +270,23 @@ xfs_end_bio_delalloc( + { + xfs_ioend_t *ioend = + container_of(work, xfs_ioend_t, io_work); ++ int error; + +- xfs_setfilesize(ioend); +- xfs_destroy_ioend(ioend); ++ /* ++ * If we didn't complete processing of the ioend, requeue it to the ++ * tail of the workqueue for another attempt later. Otherwise destroy ++ * it. ++ */ ++ error = xfs_setfilesize(ioend); ++ if (error == EAGAIN) { ++ atomic_inc(&ioend->io_remaining); ++ xfs_finish_ioend(ioend, 0); ++ /* ensure we don't spin on blocked ioends */ ++ delay(1); ++ } else { ++ ASSERT(!error); ++ xfs_destroy_ioend(ioend); ++ } + } + + /* +@@ -257,9 +298,23 @@ xfs_end_bio_written( + { + xfs_ioend_t *ioend = + container_of(work, xfs_ioend_t, io_work); ++ int error; + +- xfs_setfilesize(ioend); +- xfs_destroy_ioend(ioend); ++ /* ++ * If we didn't complete processing of the ioend, requeue it to the ++ * tail of the workqueue for another attempt later. Otherwise destroy ++ * it. ++ */ ++ error = xfs_setfilesize(ioend); ++ if (error == EAGAIN) { ++ atomic_inc(&ioend->io_remaining); ++ xfs_finish_ioend(ioend, 0); ++ /* ensure we don't spin on blocked ioends */ ++ delay(1); ++ } else { ++ ASSERT(!error); ++ xfs_destroy_ioend(ioend); ++ } + } + + /* +@@ -279,13 +334,25 @@ xfs_end_bio_unwritten( + size_t size = ioend->io_size; + + if (likely(!ioend->io_error)) { ++ int error; + if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) { +- int error; + error = xfs_iomap_write_unwritten(ip, offset, size); + if (error) + ioend->io_error = error; + } +- xfs_setfilesize(ioend); ++ /* ++ * If we didn't complete processing of the ioend, requeue it to the ++ * tail of the workqueue for another attempt later. Otherwise destroy ++ * it. ++ */ ++ error = xfs_setfilesize(ioend); ++ if (error == EAGAIN) { ++ atomic_inc(&ioend->io_remaining); ++ xfs_finish_ioend(ioend, 0); ++ /* ensure we don't spin on blocked ioends */ ++ delay(1); ++ return; ++ } + } + xfs_destroy_ioend(ioend); + } +@@ -304,27 +371,6 @@ xfs_end_bio_read( + } + + /* +- * Schedule IO completion handling on a xfsdatad if this was +- * the final hold on this ioend. If we are asked to wait, +- * flush the workqueue. 
+- */ +-STATIC void +-xfs_finish_ioend( +- xfs_ioend_t *ioend, +- int wait) +-{ +- if (atomic_dec_and_test(&ioend->io_remaining)) { +- struct workqueue_struct *wq = xfsdatad_workqueue; +- if (ioend->io_work.func == xfs_end_bio_unwritten) +- wq = xfsconvertd_workqueue; +- +- queue_work(wq, &ioend->io_work); +- if (wait) +- flush_workqueue(wq); +- } +-} +- +-/* + * Allocate and initialise an IO completion structure. + * We need to track unwritten extent write completion here initially. + * We'll need to extend this for updating the ondisk inode size later diff --git a/queue-2.6.32/xfs-quota-limit-statvfs-available-blocks.patch b/queue-2.6.32/xfs-quota-limit-statvfs-available-blocks.patch new file mode 100644 index 00000000000..055cfcc9bdb --- /dev/null +++ b/queue-2.6.32/xfs-quota-limit-statvfs-available-blocks.patch @@ -0,0 +1,38 @@ +From david@fromorbit.com Fri Apr 2 11:11:19 2010 +From: Christoph Hellwig +Date: Fri, 12 Mar 2010 09:42:13 +1100 +Subject: xfs: quota limit statvfs available blocks +To: stable@kernel.org +Cc: xfs@oss.sgi.com +Message-ID: <1268347337-7160-16-git-send-email-david@fromorbit.com> + + +From: Christoph Hellwig + +commit 9b00f30762fe9f914eb6e03057a616ed63a4e8ca upstream + +A "df" run on an NFS client of an exported XFS file system reports +the wrong information for "available" blocks. When a block quota is +enforced, the amount reported as free is limited by the quota, but +the amount reported available is not (and should be). + +Reported-by: Guk-Bong, Kwon +Signed-off-by: Christoph Hellwig +Signed-off-by: Alex Elder +Signed-off-by: Greg Kroah-Hartman + +--- + fs/xfs/quota/xfs_qm_bhv.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/fs/xfs/quota/xfs_qm_bhv.c ++++ b/fs/xfs/quota/xfs_qm_bhv.c +@@ -59,7 +59,7 @@ xfs_fill_statvfs_from_dquot( + be64_to_cpu(dp->d_blk_hardlimit); + if (limit && statp->f_blocks > limit) { + statp->f_blocks = limit; +- statp->f_bfree = ++ statp->f_bfree = statp->f_bavail = + (statp->f_blocks > be64_to_cpu(dp->d_bcount)) ? + (statp->f_blocks - be64_to_cpu(dp->d_bcount)) : 0; + } diff --git a/queue-2.6.32/xfs-reclaim-all-inodes-by-background-tree-walks.patch b/queue-2.6.32/xfs-reclaim-all-inodes-by-background-tree-walks.patch new file mode 100644 index 00000000000..7610deba7b0 --- /dev/null +++ b/queue-2.6.32/xfs-reclaim-all-inodes-by-background-tree-walks.patch @@ -0,0 +1,64 @@ +From david@fromorbit.com Fri Apr 2 11:09:55 2010 +From: Dave Chinner +Date: Fri, 12 Mar 2010 09:42:10 +1100 +Subject: xfs: reclaim all inodes by background tree walks +To: stable@kernel.org +Cc: xfs@oss.sgi.com +Message-ID: <1268347337-7160-13-git-send-email-david@fromorbit.com> + +From: Dave Chinner + +commit 57817c68229984818fea9e614d6f95249c3fb098 upstream + +We cannot do direct inode reclaim without taking the flush lock to +ensure that we do not reclaim an inode under IO. We check the inode +is clean before doing direct reclaim, but this is not good enough +because the inode flush code marks the inode clean once it has +copied the in-core dirty state to the backing buffer. + +It is the flush lock that determines whether the inode is still +under IO, even though it is marked clean, and the inode is still +required at IO completion so we can't reclaim it even though it is +clean in core. Hence the requirement that we need to take the flush +lock even on clean inodes because this guarantees that the inode +writeback IO has completed and it is safe to reclaim the inode. 
+ +With delayed write inode flushing, we could end up waiting a long +time on the flush lock even for a clean inode. The background +reclaim already handles this efficiently, so avoid all the problems +by killing the direct reclaim path altogether. + +Signed-off-by: Dave Chinner +Reviewed-by: Christoph Hellwig +Signed-off-by: Alex Elder +Signed-off-by: Greg Kroah-Hartman + +--- + fs/xfs/linux-2.6/xfs_super.c | 14 ++++++-------- + 1 file changed, 6 insertions(+), 8 deletions(-) + +--- a/fs/xfs/linux-2.6/xfs_super.c ++++ b/fs/xfs/linux-2.6/xfs_super.c +@@ -953,16 +953,14 @@ xfs_fs_destroy_inode( + ASSERT_ALWAYS(!xfs_iflags_test(ip, XFS_IRECLAIM)); + + /* +- * If we have nothing to flush with this inode then complete the +- * teardown now, otherwise delay the flush operation. ++ * We always use background reclaim here because even if the ++ * inode is clean, it still may be under IO and hence we have ++ * to take the flush lock. The background reclaim path handles ++ * this more efficiently than we can here, so simply let background ++ * reclaim tear down all inodes. + */ +- if (!xfs_inode_clean(ip)) { +- xfs_inode_set_reclaim_tag(ip); +- return; +- } +- + out_reclaim: +- xfs_ireclaim(ip); ++ xfs_inode_set_reclaim_tag(ip); + } + + /* diff --git a/queue-2.6.32/xfs-reclaim-inodes-under-a-write-lock.patch b/queue-2.6.32/xfs-reclaim-inodes-under-a-write-lock.patch new file mode 100644 index 00000000000..dbe7c012113 --- /dev/null +++ b/queue-2.6.32/xfs-reclaim-inodes-under-a-write-lock.patch @@ -0,0 +1,309 @@ +From david@fromorbit.com Fri Apr 2 11:09:00 2010 +From: Dave Chinner +Date: Fri, 12 Mar 2010 09:42:08 +1100 +Subject: xfs: reclaim inodes under a write lock +To: stable@kernel.org +Cc: xfs@oss.sgi.com +Message-ID: <1268347337-7160-11-git-send-email-david@fromorbit.com> + +From: Dave Chinner + +commit c8e20be020f234c8d492927a424a7d8bbefd5b5d upstream + +Make the inode tree reclaim walk exclusive to avoid races with +concurrent sync walkers and lookups. This is a version of a patch +posted by Christoph Hellwig that avoids all the code duplication. + +Signed-off-by: Dave Chinner +Reviewed-by: Christoph Hellwig +Signed-off-by: Alex Elder +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/linux-2.6/xfs_sync.c | 154 ++++++++++++++++++----------------------- + fs/xfs/linux-2.6/xfs_sync.h | 2 + fs/xfs/quota/xfs_qm_syscalls.c | 2 + 3 files changed, 71 insertions(+), 87 deletions(-) + +--- a/fs/xfs/linux-2.6/xfs_sync.c ++++ b/fs/xfs/linux-2.6/xfs_sync.c +@@ -64,7 +64,6 @@ xfs_inode_ag_lookup( + * as the tree is sparse and a gang lookup walks to find + * the number of objects requested. + */ +- read_lock(&pag->pag_ici_lock); + if (tag == XFS_ICI_NO_TAG) { + nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, + (void **)&ip, *first_index, 1); +@@ -73,7 +72,7 @@ xfs_inode_ag_lookup( + (void **)&ip, *first_index, 1, tag); + } + if (!nr_found) +- goto unlock; ++ return NULL; + + /* + * Update the index for the next lookup. 
Catch overflows +@@ -83,13 +82,8 @@ xfs_inode_ag_lookup( + */ + *first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1); + if (*first_index < XFS_INO_TO_AGINO(mp, ip->i_ino)) +- goto unlock; +- ++ return NULL; + return ip; +- +-unlock: +- read_unlock(&pag->pag_ici_lock); +- return NULL; + } + + STATIC int +@@ -99,7 +93,8 @@ xfs_inode_ag_walk( + int (*execute)(struct xfs_inode *ip, + struct xfs_perag *pag, int flags), + int flags, +- int tag) ++ int tag, ++ int exclusive) + { + struct xfs_perag *pag = &mp->m_perag[ag]; + uint32_t first_index; +@@ -113,10 +108,20 @@ restart: + int error = 0; + xfs_inode_t *ip; + ++ if (exclusive) ++ write_lock(&pag->pag_ici_lock); ++ else ++ read_lock(&pag->pag_ici_lock); + ip = xfs_inode_ag_lookup(mp, pag, &first_index, tag); +- if (!ip) ++ if (!ip) { ++ if (exclusive) ++ write_unlock(&pag->pag_ici_lock); ++ else ++ read_unlock(&pag->pag_ici_lock); + break; ++ } + ++ /* execute releases pag->pag_ici_lock */ + error = execute(ip, pag, flags); + if (error == EAGAIN) { + skipped++; +@@ -124,9 +129,8 @@ restart: + } + if (error) + last_error = error; +- /* +- * bail out if the filesystem is corrupted. +- */ ++ ++ /* bail out if the filesystem is corrupted. */ + if (error == EFSCORRUPTED) + break; + +@@ -147,7 +151,8 @@ xfs_inode_ag_iterator( + int (*execute)(struct xfs_inode *ip, + struct xfs_perag *pag, int flags), + int flags, +- int tag) ++ int tag, ++ int exclusive) + { + int error = 0; + int last_error = 0; +@@ -156,7 +161,8 @@ xfs_inode_ag_iterator( + for (ag = 0; ag < mp->m_sb.sb_agcount; ag++) { + if (!mp->m_perag[ag].pag_ici_init) + continue; +- error = xfs_inode_ag_walk(mp, ag, execute, flags, tag); ++ error = xfs_inode_ag_walk(mp, ag, execute, flags, tag, ++ exclusive); + if (error) { + last_error = error; + if (error == EFSCORRUPTED) +@@ -180,11 +186,7 @@ xfs_sync_inode_valid( + return EFSCORRUPTED; + } + +- /* +- * If we can't get a reference on the inode, it must be in reclaim. +- * Leave it for the reclaim code to flush. Also avoid inodes that +- * haven't been fully initialised. +- */ ++ /* If we can't get a reference on the inode, it must be in reclaim. */ + if (!igrab(inode)) { + read_unlock(&pag->pag_ici_lock); + return ENOENT; +@@ -281,7 +283,7 @@ xfs_sync_data( + ASSERT((flags & ~(SYNC_TRYLOCK|SYNC_WAIT)) == 0); + + error = xfs_inode_ag_iterator(mp, xfs_sync_inode_data, flags, +- XFS_ICI_NO_TAG); ++ XFS_ICI_NO_TAG, 0); + if (error) + return XFS_ERROR(error); + +@@ -303,7 +305,7 @@ xfs_sync_attr( + ASSERT((flags & ~SYNC_WAIT) == 0); + + return xfs_inode_ag_iterator(mp, xfs_sync_inode_attr, flags, +- XFS_ICI_NO_TAG); ++ XFS_ICI_NO_TAG, 0); + } + + STATIC int +@@ -663,60 +665,6 @@ xfs_syncd_stop( + kthread_stop(mp->m_sync_task); + } + +-STATIC int +-xfs_reclaim_inode( +- xfs_inode_t *ip, +- int sync_mode) +-{ +- xfs_perag_t *pag = xfs_get_perag(ip->i_mount, ip->i_ino); +- +- /* The hash lock here protects a thread in xfs_iget_core from +- * racing with us on linking the inode back with a vnode. +- * Once we have the XFS_IRECLAIM flag set it will not touch +- * us. +- */ +- write_lock(&pag->pag_ici_lock); +- spin_lock(&ip->i_flags_lock); +- if (__xfs_iflags_test(ip, XFS_IRECLAIM) || +- !__xfs_iflags_test(ip, XFS_IRECLAIMABLE)) { +- spin_unlock(&ip->i_flags_lock); +- write_unlock(&pag->pag_ici_lock); +- return -EAGAIN; +- } +- __xfs_iflags_set(ip, XFS_IRECLAIM); +- spin_unlock(&ip->i_flags_lock); +- write_unlock(&pag->pag_ici_lock); +- xfs_put_perag(ip->i_mount, pag); +- +- /* +- * If the inode is still dirty, then flush it out. 
If the inode +- * is not in the AIL, then it will be OK to flush it delwri as +- * long as xfs_iflush() does not keep any references to the inode. +- * We leave that decision up to xfs_iflush() since it has the +- * knowledge of whether it's OK to simply do a delwri flush of +- * the inode or whether we need to wait until the inode is +- * pulled from the AIL. +- * We get the flush lock regardless, though, just to make sure +- * we don't free it while it is being flushed. +- */ +- xfs_ilock(ip, XFS_ILOCK_EXCL); +- xfs_iflock(ip); +- +- /* +- * In the case of a forced shutdown we rely on xfs_iflush() to +- * wait for the inode to be unpinned before returning an error. +- */ +- if (!is_bad_inode(VFS_I(ip)) && xfs_iflush(ip, sync_mode) == 0) { +- /* synchronize with xfs_iflush_done */ +- xfs_iflock(ip); +- xfs_ifunlock(ip); +- } +- +- xfs_iunlock(ip, XFS_ILOCK_EXCL); +- xfs_ireclaim(ip); +- return 0; +-} +- + void + __xfs_inode_set_reclaim_tag( + struct xfs_perag *pag, +@@ -759,19 +707,55 @@ __xfs_inode_clear_reclaim_tag( + } + + STATIC int +-xfs_reclaim_inode_now( ++xfs_reclaim_inode( + struct xfs_inode *ip, + struct xfs_perag *pag, +- int flags) ++ int sync_mode) + { +- /* ignore if already under reclaim */ +- if (xfs_iflags_test(ip, XFS_IRECLAIM)) { +- read_unlock(&pag->pag_ici_lock); ++ /* ++ * The radix tree lock here protects a thread in xfs_iget from racing ++ * with us starting reclaim on the inode. Once we have the ++ * XFS_IRECLAIM flag set it will not touch us. ++ */ ++ spin_lock(&ip->i_flags_lock); ++ ASSERT_ALWAYS(__xfs_iflags_test(ip, XFS_IRECLAIMABLE)); ++ if (__xfs_iflags_test(ip, XFS_IRECLAIM)) { ++ /* ignore as it is already under reclaim */ ++ spin_unlock(&ip->i_flags_lock); ++ write_unlock(&pag->pag_ici_lock); + return 0; + } +- read_unlock(&pag->pag_ici_lock); ++ __xfs_iflags_set(ip, XFS_IRECLAIM); ++ spin_unlock(&ip->i_flags_lock); ++ write_unlock(&pag->pag_ici_lock); + +- return xfs_reclaim_inode(ip, flags); ++ /* ++ * If the inode is still dirty, then flush it out. If the inode ++ * is not in the AIL, then it will be OK to flush it delwri as ++ * long as xfs_iflush() does not keep any references to the inode. ++ * We leave that decision up to xfs_iflush() since it has the ++ * knowledge of whether it's OK to simply do a delwri flush of ++ * the inode or whether we need to wait until the inode is ++ * pulled from the AIL. ++ * We get the flush lock regardless, though, just to make sure ++ * we don't free it while it is being flushed. ++ */ ++ xfs_ilock(ip, XFS_ILOCK_EXCL); ++ xfs_iflock(ip); ++ ++ /* ++ * In the case of a forced shutdown we rely on xfs_iflush() to ++ * wait for the inode to be unpinned before returning an error. 
++ */ ++ if (!is_bad_inode(VFS_I(ip)) && xfs_iflush(ip, sync_mode) == 0) { ++ /* synchronize with xfs_iflush_done */ ++ xfs_iflock(ip); ++ xfs_ifunlock(ip); ++ } ++ ++ xfs_iunlock(ip, XFS_ILOCK_EXCL); ++ xfs_ireclaim(ip); ++ return 0; + } + + int +@@ -779,6 +763,6 @@ xfs_reclaim_inodes( + xfs_mount_t *mp, + int mode) + { +- return xfs_inode_ag_iterator(mp, xfs_reclaim_inode_now, mode, +- XFS_ICI_RECLAIM_TAG); ++ return xfs_inode_ag_iterator(mp, xfs_reclaim_inode, mode, ++ XFS_ICI_RECLAIM_TAG, 1); + } +--- a/fs/xfs/linux-2.6/xfs_sync.h ++++ b/fs/xfs/linux-2.6/xfs_sync.h +@@ -54,6 +54,6 @@ void __xfs_inode_clear_reclaim_tag(struc + int xfs_sync_inode_valid(struct xfs_inode *ip, struct xfs_perag *pag); + int xfs_inode_ag_iterator(struct xfs_mount *mp, + int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag, int flags), +- int flags, int tag); ++ int flags, int tag, int write_lock); + + #endif +--- a/fs/xfs/quota/xfs_qm_syscalls.c ++++ b/fs/xfs/quota/xfs_qm_syscalls.c +@@ -893,7 +893,7 @@ xfs_qm_dqrele_all_inodes( + uint flags) + { + ASSERT(mp->m_quotainfo); +- xfs_inode_ag_iterator(mp, xfs_dqrele_inode, flags, XFS_ICI_NO_TAG); ++ xfs_inode_ag_iterator(mp, xfs_dqrele_inode, flags, XFS_ICI_NO_TAG, 0); + } + + /*------------------------------------------------------------------------*/ diff --git a/queue-2.6.32/xfs-remove-invalid-barrier-optimization-from-xfs_fsync.patch b/queue-2.6.32/xfs-remove-invalid-barrier-optimization-from-xfs_fsync.patch new file mode 100644 index 00000000000..d7d38b26bb9 --- /dev/null +++ b/queue-2.6.32/xfs-remove-invalid-barrier-optimization-from-xfs_fsync.patch @@ -0,0 +1,64 @@ +From david@fromorbit.com Fri Apr 2 11:12:06 2010 +From: Christoph Hellwig +Date: Fri, 12 Mar 2010 09:42:15 +1100 +Subject: xfs: remove invalid barrier optimization from xfs_fsync +To: stable@kernel.org +Cc: xfs@oss.sgi.com +Message-ID: <1268347337-7160-18-git-send-email-david@fromorbit.com> + + +From: Christoph Hellwig + +commit e8b217e7530c6a073ac69f1c85b922d93fdf5647 upstream + +Date: Tue, 2 Feb 2010 10:16:26 +1100 +We always need to flush the disk write cache and can't skip it just because +the no inode attributes have changed. + +Signed-off-by: Christoph Hellwig +Reviewed-by: Dave Chinner +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/xfs_vnodeops.c | 12 ++---------- + 1 file changed, 2 insertions(+), 10 deletions(-) + +--- a/fs/xfs/xfs_vnodeops.c ++++ b/fs/xfs/xfs_vnodeops.c +@@ -597,7 +597,7 @@ xfs_fsync( + { + xfs_trans_t *tp; + int error = 0; +- int log_flushed = 0, changed = 1; ++ int log_flushed = 0; + + xfs_itrace_entry(ip); + +@@ -627,19 +627,11 @@ xfs_fsync( + * disk yet, the inode will be still be pinned. If it is, + * force the log. + */ +- + xfs_iunlock(ip, XFS_ILOCK_SHARED); +- + if (xfs_ipincount(ip)) { + error = _xfs_log_force(ip->i_mount, (xfs_lsn_t)0, + XFS_LOG_FORCE | XFS_LOG_SYNC, + &log_flushed); +- } else { +- /* +- * If the inode is not pinned and nothing has changed +- * we don't need to flush the cache. +- */ +- changed = 0; + } + } else { + /* +@@ -674,7 +666,7 @@ xfs_fsync( + xfs_iunlock(ip, XFS_ILOCK_EXCL); + } + +- if ((ip->i_mount->m_flags & XFS_MOUNT_BARRIER) && changed) { ++ if (ip->i_mount->m_flags & XFS_MOUNT_BARRIER) { + /* + * If the log write didn't issue an ordered tag we need + * to flush the disk cache for the data device now. 
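The effect of dropping the "changed" flag is easiest to see in isolation: once barriers are in use, the only thing that can make the device cache flush unnecessary is the log force having already issued one, not the apparent cleanliness of the inode, because previously written data can still sit in the drive's volatile write cache. A minimal sketch of that decision follows; the helper and field names are illustrative stand-ins, not the kernel's own.

#include <stdbool.h>

struct mount_opts {
	bool barriers_enabled;	/* corresponds to the XFS_MOUNT_BARRIER mount flag */
};

/* hypothetical stand-in for issuing a cache flush to the data device */
static void flush_device_write_cache(void)
{
}

/*
 * Post-patch rule: flush whenever barriers are enabled and the log force
 * did not already push an ordered write.  Whether the in-core inode looks
 * modified no longer enters the decision.
 */
static void fsync_cache_flush(const struct mount_opts *mp, bool log_flushed)
{
	if (mp->barriers_enabled && !log_flushed)
		flush_device_write_cache();
}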
diff --git a/queue-2.6.32/xfs-simplify-inode-teardown.patch b/queue-2.6.32/xfs-simplify-inode-teardown.patch new file mode 100644 index 00000000000..6af994eab54 --- /dev/null +++ b/queue-2.6.32/xfs-simplify-inode-teardown.patch @@ -0,0 +1,206 @@ +From david@fromorbit.com Fri Apr 2 11:04:20 2010 +From: Christoph Hellwig +Date: Fri, 12 Mar 2010 09:41:59 +1100 +Subject: xfs: simplify inode teardown +To: stable@kernel.org +Cc: xfs@oss.sgi.com +Message-ID: <1268347337-7160-2-git-send-email-david@fromorbit.com> + + +From: Christoph Hellwig + +commit 848ce8f731aed0a2d4ab5884a4f6664af73d2dd0 upstream + +Currently the reclaim code for the case where we don't reclaim the +final reclaim is overly complicated. We know that the inode is clean +but instead of just directly reclaiming the clean inode we go through +the whole process of marking the inode reclaimable just to directly +reclaim it from the calling context. Besides being overly complicated +this introduces a race where iget could recycle an inode between +marked reclaimable and actually being reclaimed leading to panics. + +This patch gets rid of the existing reclaim path, and replaces it with +a simple call to xfs_ireclaim if the inode was clean. While we're at +it we also use the slightly more lax xfs_inode_clean check we'd use +later to determine if we need to flush the inode here. + +Finally get rid of xfs_reclaim function and place the remaining small +bits of reclaim code directly into xfs_fs_destroy_inode. + +Signed-off-by: Christoph Hellwig +Reported-by: Patrick Schreurs +Reported-by: Tommy van Leeuwen +Tested-by: Patrick Schreurs +Reviewed-by: Alex Elder +Signed-off-by: Alex Elder +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/linux-2.6/xfs_super.c | 34 ++++++++++++++++++++++++++++++---- + fs/xfs/linux-2.6/xfs_sync.c | 15 ++++----------- + fs/xfs/linux-2.6/xfs_sync.h | 1 - + fs/xfs/xfs_vnodeops.c | 40 ---------------------------------------- + fs/xfs/xfs_vnodeops.h | 1 - + 5 files changed, 34 insertions(+), 57 deletions(-) + +--- a/fs/xfs/linux-2.6/xfs_super.c ++++ b/fs/xfs/linux-2.6/xfs_super.c +@@ -930,13 +930,39 @@ xfs_fs_alloc_inode( + */ + STATIC void + xfs_fs_destroy_inode( +- struct inode *inode) ++ struct inode *inode) + { +- xfs_inode_t *ip = XFS_I(inode); ++ struct xfs_inode *ip = XFS_I(inode); ++ ++ xfs_itrace_entry(ip); + + XFS_STATS_INC(vn_reclaim); +- if (xfs_reclaim(ip)) +- panic("%s: cannot reclaim 0x%p\n", __func__, inode); ++ ++ /* bad inode, get out here ASAP */ ++ if (is_bad_inode(inode)) ++ goto out_reclaim; ++ ++ xfs_ioend_wait(ip); ++ ++ ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || ip->i_delayed_blks == 0); ++ ++ /* ++ * We should never get here with one of the reclaim flags already set. ++ */ ++ ASSERT_ALWAYS(!xfs_iflags_test(ip, XFS_IRECLAIMABLE)); ++ ASSERT_ALWAYS(!xfs_iflags_test(ip, XFS_IRECLAIM)); ++ ++ /* ++ * If we have nothing to flush with this inode then complete the ++ * teardown now, otherwise delay the flush operation. 
++ */ ++ if (!xfs_inode_clean(ip)) { ++ xfs_inode_set_reclaim_tag(ip); ++ return; ++ } ++ ++out_reclaim: ++ xfs_ireclaim(ip); + } + + /* +--- a/fs/xfs/linux-2.6/xfs_sync.c ++++ b/fs/xfs/linux-2.6/xfs_sync.c +@@ -663,10 +663,9 @@ xfs_syncd_stop( + kthread_stop(mp->m_sync_task); + } + +-int ++STATIC int + xfs_reclaim_inode( + xfs_inode_t *ip, +- int locked, + int sync_mode) + { + xfs_perag_t *pag = xfs_get_perag(ip->i_mount, ip->i_ino); +@@ -682,10 +681,6 @@ xfs_reclaim_inode( + !__xfs_iflags_test(ip, XFS_IRECLAIMABLE)) { + spin_unlock(&ip->i_flags_lock); + write_unlock(&pag->pag_ici_lock); +- if (locked) { +- xfs_ifunlock(ip); +- xfs_iunlock(ip, XFS_ILOCK_EXCL); +- } + return -EAGAIN; + } + __xfs_iflags_set(ip, XFS_IRECLAIM); +@@ -704,10 +699,8 @@ xfs_reclaim_inode( + * We get the flush lock regardless, though, just to make sure + * we don't free it while it is being flushed. + */ +- if (!locked) { +- xfs_ilock(ip, XFS_ILOCK_EXCL); +- xfs_iflock(ip); +- } ++ xfs_ilock(ip, XFS_ILOCK_EXCL); ++ xfs_iflock(ip); + + /* + * In the case of a forced shutdown we rely on xfs_iflush() to +@@ -778,7 +771,7 @@ xfs_reclaim_inode_now( + } + read_unlock(&pag->pag_ici_lock); + +- return xfs_reclaim_inode(ip, 0, flags); ++ return xfs_reclaim_inode(ip, flags); + } + + int +--- a/fs/xfs/linux-2.6/xfs_sync.h ++++ b/fs/xfs/linux-2.6/xfs_sync.h +@@ -44,7 +44,6 @@ void xfs_quiesce_attr(struct xfs_mount * + + void xfs_flush_inodes(struct xfs_inode *ip); + +-int xfs_reclaim_inode(struct xfs_inode *ip, int locked, int sync_mode); + int xfs_reclaim_inodes(struct xfs_mount *mp, int mode); + + void xfs_inode_set_reclaim_tag(struct xfs_inode *ip); +--- a/fs/xfs/xfs_vnodeops.c ++++ b/fs/xfs/xfs_vnodeops.c +@@ -2456,46 +2456,6 @@ xfs_set_dmattrs( + return error; + } + +-int +-xfs_reclaim( +- xfs_inode_t *ip) +-{ +- +- xfs_itrace_entry(ip); +- +- ASSERT(!VN_MAPPED(VFS_I(ip))); +- +- /* bad inode, get out here ASAP */ +- if (is_bad_inode(VFS_I(ip))) { +- xfs_ireclaim(ip); +- return 0; +- } +- +- xfs_ioend_wait(ip); +- +- ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || ip->i_delayed_blks == 0); +- +- /* +- * If we have nothing to flush with this inode then complete the +- * teardown now, otherwise break the link between the xfs inode and the +- * linux inode and clean up the xfs inode later. This avoids flushing +- * the inode to disk during the delete operation itself. +- * +- * When breaking the link, we need to set the XFS_IRECLAIMABLE flag +- * first to ensure that xfs_iunpin() will never see an xfs inode +- * that has a linux inode being reclaimed. Synchronisation is provided +- * by the i_flags_lock. +- */ +- if (!ip->i_update_core && (ip->i_itemp == NULL)) { +- xfs_ilock(ip, XFS_ILOCK_EXCL); +- xfs_iflock(ip); +- xfs_iflags_set(ip, XFS_IRECLAIMABLE); +- return xfs_reclaim_inode(ip, 1, XFS_IFLUSH_DELWRI_ELSE_SYNC); +- } +- xfs_inode_set_reclaim_tag(ip); +- return 0; +-} +- + /* + * xfs_alloc_file_space() + * This routine allocates disk space for the given file. 
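Taken together, the destroy_inode path above reduces to a single decision per inode. The sketch below restates it with stub types and helpers (the names are illustrative; the real calls are is_bad_inode, xfs_inode_clean, xfs_ireclaim and xfs_inode_set_reclaim_tag as shown in the hunks above).

#include <stdbool.h>

struct inode_stub {
	bool bad;	/* what is_bad_inode() reports        */
	bool clean;	/* what xfs_inode_clean() reports      */
};

static void reclaim_now(struct inode_stub *ip)                  { (void)ip; }	/* xfs_ireclaim()             */
static void tag_for_background_reclaim(struct inode_stub *ip)   { (void)ip; }	/* xfs_inode_set_reclaim_tag()*/

/*
 * The whole teardown decision after the patch: bad or clean inodes are
 * reclaimed immediately; only dirty inodes are tagged and left for the
 * background reclaim walk to flush later.
 */
static void destroy_inode_sketch(struct inode_stub *ip)
{
	if (ip->bad || ip->clean) {
		reclaim_now(ip);
		return;
	}
	tag_for_background_reclaim(ip);
}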
+--- a/fs/xfs/xfs_vnodeops.h ++++ b/fs/xfs/xfs_vnodeops.h +@@ -38,7 +38,6 @@ int xfs_symlink(struct xfs_inode *dp, st + const char *target_path, mode_t mode, struct xfs_inode **ipp, + cred_t *credp); + int xfs_set_dmattrs(struct xfs_inode *ip, u_int evmask, u_int16_t state); +-int xfs_reclaim(struct xfs_inode *ip); + int xfs_change_file_space(struct xfs_inode *ip, int cmd, + xfs_flock64_t *bf, xfs_off_t offset, int attr_flags); + int xfs_rename(struct xfs_inode *src_dp, struct xfs_name *src_name, diff --git a/queue-2.6.32/xfs-wrapped-journal-record-corruption-on-read-at-recovery.patch b/queue-2.6.32/xfs-wrapped-journal-record-corruption-on-read-at-recovery.patch new file mode 100644 index 00000000000..519d11ea65b --- /dev/null +++ b/queue-2.6.32/xfs-wrapped-journal-record-corruption-on-read-at-recovery.patch @@ -0,0 +1,130 @@ +From david@fromorbit.com Fri Apr 2 11:06:08 2010 +From: Andy Poling +Date: Fri, 12 Mar 2010 09:42:02 +1100 +Subject: xfs: Wrapped journal record corruption on read at recovery +To: stable@kernel.org +Cc: xfs@oss.sgi.com +Message-ID: <1268347337-7160-5-git-send-email-david@fromorbit.com> + + +From: Andy Poling + +commit fc5bc4c85c45f0bf854404e5736aa8b65720a18d upstream + +Summary of problem: + +If a journal record wraps at the physical end of the journal, it has to be +read in two parts in xlog_do_recovery_pass(): a read at the physical end and a +read at the physical beginning. If xlog_bread() has to re-align the first +read, the second read request does not take that re-alignment into account. +If the first read was re-aligned, the second read over-writes the end of the +data from the first read, effectively corrupting it. This can happen either +when reading the record header or reading the record data. + +The first sanity check in xlog_recover_process_data() is to check for a valid +clientid, so that is the error reported. + +Summary of fix: + +If there was a first read at the physical end, XFS_BUF_PTR() returns where the +data was requested to begin. Conversely, because it is the result of +xlog_align(), offset indicates where the requested data for the first read +actually begins - whether or not xlog_bread() has re-aligned it. + +Using offset as the base for the calculation of where to place the second read +data ensures that it will be correctly placed immediately following the data +from the first read instead of sometimes over-writing the end of it. + +The attached patch has resolved the reported problem of occasional inability +to recover the journal (reporting "bad clientid"). + +Signed-off-by: Andy Poling +Reviewed-by: Alex Elder +Signed-off-by: Alex Elder +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/xfs_log_recover.c | 24 +++++++----------------- + 1 file changed, 7 insertions(+), 17 deletions(-) + +--- a/fs/xfs/xfs_log_recover.c ++++ b/fs/xfs/xfs_log_recover.c +@@ -3517,7 +3517,7 @@ xlog_do_recovery_pass( + { + xlog_rec_header_t *rhead; + xfs_daddr_t blk_no; +- xfs_caddr_t bufaddr, offset; ++ xfs_caddr_t offset; + xfs_buf_t *hbp, *dbp; + int error = 0, h_size; + int bblks, split_bblks; +@@ -3610,7 +3610,7 @@ xlog_do_recovery_pass( + /* + * Check for header wrapping around physical end-of-log + */ +- offset = NULL; ++ offset = XFS_BUF_PTR(hbp); + split_hblks = 0; + wrapped_hblks = 0; + if (blk_no + hblks <= log->l_logBBsize) { +@@ -3646,9 +3646,8 @@ xlog_do_recovery_pass( + * - order is important. 
+ */ + wrapped_hblks = hblks - split_hblks; +- bufaddr = XFS_BUF_PTR(hbp); + error = XFS_BUF_SET_PTR(hbp, +- bufaddr + BBTOB(split_hblks), ++ offset + BBTOB(split_hblks), + BBTOB(hblks - split_hblks)); + if (error) + goto bread_err2; +@@ -3658,14 +3657,10 @@ xlog_do_recovery_pass( + if (error) + goto bread_err2; + +- error = XFS_BUF_SET_PTR(hbp, bufaddr, ++ error = XFS_BUF_SET_PTR(hbp, offset, + BBTOB(hblks)); + if (error) + goto bread_err2; +- +- if (!offset) +- offset = xlog_align(log, 0, +- wrapped_hblks, hbp); + } + rhead = (xlog_rec_header_t *)offset; + error = xlog_valid_rec_header(log, rhead, +@@ -3685,7 +3680,7 @@ xlog_do_recovery_pass( + } else { + /* This log record is split across the + * physical end of log */ +- offset = NULL; ++ offset = XFS_BUF_PTR(dbp); + split_bblks = 0; + if (blk_no != log->l_logBBsize) { + /* some data is before the physical +@@ -3714,9 +3709,8 @@ xlog_do_recovery_pass( + * _first_, then the log start (LR header end) + * - order is important. + */ +- bufaddr = XFS_BUF_PTR(dbp); + error = XFS_BUF_SET_PTR(dbp, +- bufaddr + BBTOB(split_bblks), ++ offset + BBTOB(split_bblks), + BBTOB(bblks - split_bblks)); + if (error) + goto bread_err2; +@@ -3727,13 +3721,9 @@ xlog_do_recovery_pass( + if (error) + goto bread_err2; + +- error = XFS_BUF_SET_PTR(dbp, bufaddr, h_size); ++ error = XFS_BUF_SET_PTR(dbp, offset, h_size); + if (error) + goto bread_err2; +- +- if (!offset) +- offset = xlog_align(log, wrapped_hblks, +- bblks - split_bblks, dbp); + } + xlog_unpack_data(rhead, offset, log); + if ((error = xlog_recover_process_data(log, rhash, diff --git a/queue-2.6.32/xfs-xfs_swap_extents-needs-to-handle-dynamic-fork-offsets.patch b/queue-2.6.32/xfs-xfs_swap_extents-needs-to-handle-dynamic-fork-offsets.patch new file mode 100644 index 00000000000..e722537945f --- /dev/null +++ b/queue-2.6.32/xfs-xfs_swap_extents-needs-to-handle-dynamic-fork-offsets.patch @@ -0,0 +1,185 @@ +From david@fromorbit.com Fri Apr 2 11:10:52 2010 +From: Dave Chinner +Date: Fri, 12 Mar 2010 09:42:12 +1100 +Subject: xfs: xfs_swap_extents needs to handle dynamic fork offsets +To: stable@kernel.org +Cc: xfs@oss.sgi.com +Message-ID: <1268347337-7160-15-git-send-email-david@fromorbit.com> + +From: Dave Chinner + +commit e09f98606dcc156de1146c209d45a0d6d5f51c3f upstream + +When swapping extents, we can corrupt inodes by swapping data forks +that are in incompatible formats. This is caused by the two indoes +having different fork offsets due to the presence of an attribute +fork on an attr2 filesystem. xfs_fsr tries to be smart about +setting the fork offset, but the trick it plays only works on attr1 +(old fixed format attribute fork) filesystems. + +Changing the way xfs_fsr sets up the attribute fork will prevent +this situation from ever occurring, so in the kernel code we can get +by with a preventative fix - check that the data fork in the +defragmented inode is in a format valid for the inode it is being +swapped into. This will lead to files that will silently and +potentially repeatedly fail defragmentation, so issue a warning to +the log when this particular failure occurs to let us know that +xfs_fsr needs updating/fixing. + +To help identify how to improve xfs_fsr to avoid this issue, add +trace points for the inodes being swapped so that we can determine +why the swap was rejected and to confirm that the code is making the +right decisions and modifications when swapping forks. 
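The incompatibility being guarded against can be restated concretely: with attr2 the space available to the data fork differs between the two inodes, so a fork layout that fits one inode may not fit the other, and the test has to run in both directions. A simplified model of that fit test follows; the types, record size and field names are illustrative only, not the on-disk XFS structures used by the patch itself.

#include <stdbool.h>
#include <stddef.h>

enum fork_format { FMT_LOCAL, FMT_EXTENTS, FMT_BTREE };

struct data_fork {
	enum fork_format format;
	size_t nextents;	/* extent records held in the fork        */
	size_t broot_bytes;	/* in-core btree root size, if FMT_BTREE  */
	size_t fork_bytes;	/* bytes available before the attr fork   */
};

#define EXTENT_REC_BYTES 16	/* illustrative size of one extent record */

/* would src's data fork still be representable in dst's fork space? */
static bool fork_fits(const struct data_fork *src, const struct data_fork *dst)
{
	if (src->format == FMT_LOCAL || dst->format == FMT_LOCAL)
		return false;	/* local-format forks are never swapped here */
	if (src->format == FMT_EXTENTS)
		return src->nextents * EXTENT_REC_BYTES <= dst->fork_bytes;
	return src->broot_bytes <= dst->fork_bytes;	/* FMT_BTREE root must fit */
}

/* the check runs both ways so neither inode ends up mis-formatted */
static bool swap_allowed(const struct data_fork *a, const struct data_fork *b)
{
	return fork_fits(a, b) && fork_fits(b, a);
}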
+ +A further complication is even when the swap is allowed to proceed +when the fork offset is different between the two inodes then value +for the maximum number of extents the data fork can hold can be +wrong. Make sure these are also set correctly after the swap occurs. + +Signed-off-by: Dave Chinner +Reviewed-by: Christoph Hellwig +Signed-off-by: Alex Elder +Signed-off-by: Greg Kroah-Hartman + +--- + fs/xfs/xfs_dfrag.c | 106 +++++++++++++++++++++++++++++++++++++++++++++-------- + 1 file changed, 90 insertions(+), 16 deletions(-) + +--- a/fs/xfs/xfs_dfrag.c ++++ b/fs/xfs/xfs_dfrag.c +@@ -113,10 +113,82 @@ xfs_swapext( + return error; + } + ++/* ++ * We need to check that the format of the data fork in the temporary inode is ++ * valid for the target inode before doing the swap. This is not a problem with ++ * attr1 because of the fixed fork offset, but attr2 has a dynamically sized ++ * data fork depending on the space the attribute fork is taking so we can get ++ * invalid formats on the target inode. ++ * ++ * E.g. target has space for 7 extents in extent format, temp inode only has ++ * space for 6. If we defragment down to 7 extents, then the tmp format is a ++ * btree, but when swapped it needs to be in extent format. Hence we can't just ++ * blindly swap data forks on attr2 filesystems. ++ * ++ * Note that we check the swap in both directions so that we don't end up with ++ * a corrupt temporary inode, either. ++ * ++ * Note that fixing the way xfs_fsr sets up the attribute fork in the source ++ * inode will prevent this situation from occurring, so all we do here is ++ * reject and log the attempt. basically we are putting the responsibility on ++ * userspace to get this right. ++ */ ++static int ++xfs_swap_extents_check_format( ++ xfs_inode_t *ip, /* target inode */ ++ xfs_inode_t *tip) /* tmp inode */ ++{ ++ ++ /* Should never get a local format */ ++ if (ip->i_d.di_format == XFS_DINODE_FMT_LOCAL || ++ tip->i_d.di_format == XFS_DINODE_FMT_LOCAL) ++ return EINVAL; ++ ++ /* ++ * if the target inode has less extents that then temporary inode then ++ * why did userspace call us? ++ */ ++ if (ip->i_d.di_nextents < tip->i_d.di_nextents) ++ return EINVAL; ++ ++ /* ++ * if the target inode is in extent form and the temp inode is in btree ++ * form then we will end up with the target inode in the wrong format ++ * as we already know there are less extents in the temp inode. 
++ */ ++ if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && ++ tip->i_d.di_format == XFS_DINODE_FMT_BTREE) ++ return EINVAL; ++ ++ /* Check temp in extent form to max in target */ ++ if (tip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && ++ XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) > ip->i_df.if_ext_max) ++ return EINVAL; ++ ++ /* Check target in extent form to max in temp */ ++ if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && ++ XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) > tip->i_df.if_ext_max) ++ return EINVAL; ++ ++ /* Check root block of temp in btree form to max in target */ ++ if (tip->i_d.di_format == XFS_DINODE_FMT_BTREE && ++ XFS_IFORK_BOFF(ip) && ++ tip->i_df.if_broot_bytes > XFS_IFORK_BOFF(ip)) ++ return EINVAL; ++ ++ /* Check root block of target in btree form to max in temp */ ++ if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE && ++ XFS_IFORK_BOFF(tip) && ++ ip->i_df.if_broot_bytes > XFS_IFORK_BOFF(tip)) ++ return EINVAL; ++ ++ return 0; ++} ++ + int + xfs_swap_extents( +- xfs_inode_t *ip, +- xfs_inode_t *tip, ++ xfs_inode_t *ip, /* target inode */ ++ xfs_inode_t *tip, /* tmp inode */ + xfs_swapext_t *sxp) + { + xfs_mount_t *mp; +@@ -160,13 +232,6 @@ xfs_swap_extents( + goto out_unlock; + } + +- /* Should never get a local format */ +- if (ip->i_d.di_format == XFS_DINODE_FMT_LOCAL || +- tip->i_d.di_format == XFS_DINODE_FMT_LOCAL) { +- error = XFS_ERROR(EINVAL); +- goto out_unlock; +- } +- + if (VN_CACHED(VFS_I(tip)) != 0) { + xfs_inval_cached_trace(tip, 0, -1, 0, -1); + error = xfs_flushinval_pages(tip, 0, -1, +@@ -189,13 +254,12 @@ xfs_swap_extents( + goto out_unlock; + } + +- /* +- * If the target has extended attributes, the tmp file +- * must also in order to ensure the correct data fork +- * format. +- */ +- if ( XFS_IFORK_Q(ip) != XFS_IFORK_Q(tip) ) { +- error = XFS_ERROR(EINVAL); ++ /* check inode formats now that data is flushed */ ++ error = xfs_swap_extents_check_format(ip, tip); ++ if (error) { ++ xfs_fs_cmn_err(CE_NOTE, mp, ++ "%s: inode 0x%llx format is incompatible for exchanging.", ++ __FILE__, ip->i_ino); + goto out_unlock; + } + +@@ -276,6 +340,16 @@ xfs_swap_extents( + *tifp = *tempifp; /* struct copy */ + + /* ++ * Fix the in-memory data fork values that are dependent on the fork ++ * offset in the inode. We can't assume they remain the same as attr2 ++ * has dynamic fork offsets. ++ */ ++ ifp->if_ext_max = XFS_IFORK_SIZE(ip, XFS_DATA_FORK) / ++ (uint)sizeof(xfs_bmbt_rec_t); ++ tifp->if_ext_max = XFS_IFORK_SIZE(tip, XFS_DATA_FORK) / ++ (uint)sizeof(xfs_bmbt_rec_t); ++ ++ /* + * Fix the on-disk inode values + */ + tmp = (__uint64_t)ip->i_d.di_nblocks;