drm-radeon-kms-don-t-print-error-on-erestartsys.patch
drm-radeon-kms-fix-pal-tv-out-support-on-legacy-igp-chips.patch
drm-return-enodev-if-the-inode-mapping-changes.patch
+xfs-simplify-inode-teardown.patch
+xfs-fix-mmap_sem-iolock-inversion-in-xfs_free_eofblocks.patch
+xfs-i-o-completion-handlers-must-use-nofs-allocations.patch
+xfs-wrapped-journal-record-corruption-on-read-at-recovery.patch
+xfs-fix-error-return-for-fallocate-on-xfs.patch
+xfs-check-for-not-fully-initialized-inodes-in-xfs_ireclaim.patch
+xfs-fix-timestamp-handling-in-xfs_setattr.patch
+xfs-don-t-flush-stale-inodes.patch
+xfs-ensure-we-force-all-busy-extents-in-range-to-disk.patch
+xfs-reclaim-inodes-under-a-write-lock.patch
+xfs-avoid-inodes-in-reclaim-when-flushing-from-inode-cache.patch
+xfs-reclaim-all-inodes-by-background-tree-walks.patch
+xfs-fix-stale-inode-flush-avoidance.patch
+xfs-xfs_swap_extents-needs-to-handle-dynamic-fork-offsets.patch
+xfs-quota-limit-statvfs-available-blocks.patch
+xfs-don-t-hold-onto-reserved-blocks-on-remount-ro.patch
+xfs-remove-invalid-barrier-optimization-from-xfs_fsync.patch
+xfs-non-blocking-inode-locking-in-io-completion.patch
+xfs-fix-locking-for-inode-cache-radix-tree-tag-updates.patch
--- /dev/null
+From david@fromorbit.com Fri Apr 2 11:09:28 2010
+From: Dave Chinner <david@fromorbit.com>
+Date: Fri, 12 Mar 2010 09:42:09 +1100
+Subject: xfs: Avoid inodes in reclaim when flushing from inode cache
+To: stable@kernel.org
+Cc: xfs@oss.sgi.com
+Message-ID: <1268347337-7160-12-git-send-email-david@fromorbit.com>
+
+From: Dave Chinner <david@fromorbit.com>
+
+commit 018027be90a6946e8cf3f9b17b5582384f7ed117 upstream
+
+The reclaim code will handle flushing of dirty inodes before reclaim
+occurs, so avoid them when determining whether an inode is a
+candidate for flushing to disk when walking the radix trees. This
+is based on a test patch from Christoph Hellwig.
+
+Signed-off-by: Dave Chinner <david@fromorbit.com>
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Signed-off-by: Alex Elder <aelder@sgi.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ fs/xfs/linux-2.6/xfs_sync.c | 31 ++++++++++++++++++-------------
+ 1 file changed, 18 insertions(+), 13 deletions(-)
+
+--- a/fs/xfs/linux-2.6/xfs_sync.c
++++ b/fs/xfs/linux-2.6/xfs_sync.c
+@@ -179,26 +179,31 @@ xfs_sync_inode_valid(
+ struct xfs_perag *pag)
+ {
+ struct inode *inode = VFS_I(ip);
++ int error = EFSCORRUPTED;
+
+ /* nothing to sync during shutdown */
+- if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
+- read_unlock(&pag->pag_ici_lock);
+- return EFSCORRUPTED;
+- }
++ if (XFS_FORCED_SHUTDOWN(ip->i_mount))
++ goto out_unlock;
+
+- /* If we can't get a reference on the inode, it must be in reclaim. */
+- if (!igrab(inode)) {
+- read_unlock(&pag->pag_ici_lock);
+- return ENOENT;
+- }
+- read_unlock(&pag->pag_ici_lock);
++ /* avoid new or reclaimable inodes. Leave for reclaim code to flush */
++ error = ENOENT;
++ if (xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM))
++ goto out_unlock;
+
+- if (is_bad_inode(inode) || xfs_iflags_test(ip, XFS_INEW)) {
++ /* If we can't grab the inode, it must on it's way to reclaim. */
++ if (!igrab(inode))
++ goto out_unlock;
++
++ if (is_bad_inode(inode)) {
+ IRELE(ip);
+- return ENOENT;
++ goto out_unlock;
+ }
+
+- return 0;
++ /* inode is valid */
++ error = 0;
++out_unlock:
++ read_unlock(&pag->pag_ici_lock);
++ return error;
+ }
+
+ STATIC int
--- /dev/null
+From david@fromorbit.com Fri Apr 2 11:07:09 2010
+From: Christoph Hellwig <hch@infradead.org>
+Date: Fri, 12 Mar 2010 09:42:04 +1100
+Subject: xfs: check for not fully initialized inodes in xfs_ireclaim
+To: stable@kernel.org
+Cc: xfs@oss.sgi.com
+Message-ID: <1268347337-7160-7-git-send-email-david@fromorbit.com>
+
+
+From: Christoph Hellwig <hch@infradead.org>
+
+commit b44b1126279b60597f96bbe77507b1650f88a969 upstream
+
+Add an assert for inodes not added to the inode cache in xfs_ireclaim,
+to make sure we're not going to introduce something like the
+famous nfsd inode cache bug again.
+
+Signed-off-by: Christoph Hellwig <hch@lst.de>
+Signed-off-by: Alex Elder <aelder@sgi.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/xfs/xfs_iget.c | 12 ++++++++----
+ 1 file changed, 8 insertions(+), 4 deletions(-)
+
+--- a/fs/xfs/xfs_iget.c
++++ b/fs/xfs/xfs_iget.c
+@@ -511,17 +511,21 @@ xfs_ireclaim(
+ {
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_perag *pag;
++ xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
+
+ XFS_STATS_INC(xs_ig_reclaims);
+
+ /*
+- * Remove the inode from the per-AG radix tree. It doesn't matter
+- * if it was never added to it because radix_tree_delete can deal
+- * with that case just fine.
++ * Remove the inode from the per-AG radix tree.
++ *
++ * Because radix_tree_delete won't complain even if the item was never
++ * added to the tree assert that it's been there before to catch
++ * problems with the inode life time early on.
+ */
+ pag = xfs_get_perag(mp, ip->i_ino);
+ write_lock(&pag->pag_ici_lock);
+- radix_tree_delete(&pag->pag_ici_root, XFS_INO_TO_AGINO(mp, ip->i_ino));
++ if (!radix_tree_delete(&pag->pag_ici_root, agino))
++ ASSERT(0);
+ write_unlock(&pag->pag_ici_lock);
+ xfs_put_perag(mp, pag);
+
--- /dev/null
+From david@fromorbit.com Fri Apr 2 11:08:07 2010
+From: Dave Chinner <david@fromorbit.com>
+Date: Fri, 12 Mar 2010 09:42:06 +1100
+Subject: xfs: Don't flush stale inodes
+To: stable@kernel.org
+Cc: xfs@oss.sgi.com
+Message-ID: <1268347337-7160-9-git-send-email-david@fromorbit.com>
+
+From: Dave Chinner <david@fromorbit.com>
+
+commit 44e08c45cc14e6190a424be8d450070c8e508fad upstream
+
+Because inodes remain in cache much longer than inode buffers do
+under memory pressure, we can get the situation where we have
+stale, dirty inodes being reclaimed but the backing storage has
+been freed. Hence we should never, ever flush XFS_ISTALE inodes
+to disk as there is no guarantee that the backing buffer is in
+cache and still marked stale when the flush occurs.
+
+Signed-off-by: Dave Chinner <david@fromorbit.com>
+Signed-off-by: Alex Elder <aelder@sgi.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/xfs/xfs_inode.c | 10 +++++++---
+ 1 file changed, 7 insertions(+), 3 deletions(-)
+
+--- a/fs/xfs/xfs_inode.c
++++ b/fs/xfs/xfs_inode.c
+@@ -2877,10 +2877,14 @@ xfs_iflush(
+ mp = ip->i_mount;
+
+ /*
+- * If the inode isn't dirty, then just release the inode
+- * flush lock and do nothing.
++ * If the inode isn't dirty, then just release the inode flush lock and
++ * do nothing. Treat stale inodes the same; we cannot rely on the
++ * backing buffer remaining stale in cache for the remaining life of
++ * the stale inode and so xfs_itobp() below may give us a buffer that
++ * no longer contains inodes below. Doing this stale check here also
++ * avoids forcing the log on pinned, stale inodes.
+ */
+- if (xfs_inode_clean(ip)) {
++ if (xfs_inode_clean(ip) || xfs_iflags_test(ip, XFS_ISTALE)) {
+ xfs_ifunlock(ip);
+ return 0;
+ }
--- /dev/null
+From david@fromorbit.com Fri Apr 2 11:11:43 2010
+From: Dave Chinner <david@fromorbit.com>
+Date: Fri, 12 Mar 2010 09:42:14 +1100
+Subject: xfs: don't hold onto reserved blocks on remount, ro
+To: stable@kernel.org
+Cc: xfs@oss.sgi.com
+Message-ID: <1268347337-7160-17-git-send-email-david@fromorbit.com>
+
+From: Dave Chinner <david@fromorbit.com>
+
+commit cbe132a8bdcff0f9afd9060948fb50597c7400b8 upstream
+
+If we hold onto reserved blocks when doing a remount,ro we end
+up writing the blocks used count to disk that includes the reserved
+blocks. Reserved blocks are not actually used, so this results in
+the values in the superblock being incorrect.
+
+Hence if we run xfs_check or xfs_repair -n while the filesystem is
+mounted remount,ro we end up with an inconsistent filesystem being
+reported. Also, running xfs_copy on the remount,ro filesystem will
+result in an inconsistent image being generated.
+
+To fix this, unreserve the blocks when doing the remount,ro, and
+reserve them again on remount,rw. This way a remount,ro filesystem
+will appear consistent on disk to all utilities.
+
+Signed-off-by: Dave Chinner <david@fromorbit.com>
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ fs/xfs/linux-2.6/xfs_super.c | 28 ++++++++++++++++++++++++++++
+ fs/xfs/xfs_mount.h | 1 +
+ 2 files changed, 29 insertions(+)
+
+--- a/fs/xfs/linux-2.6/xfs_super.c
++++ b/fs/xfs/linux-2.6/xfs_super.c
+@@ -1323,6 +1323,8 @@ xfs_fs_remount(
+
+ /* ro -> rw */
+ if ((mp->m_flags & XFS_MOUNT_RDONLY) && !(*flags & MS_RDONLY)) {
++ __uint64_t resblks;
++
+ mp->m_flags &= ~XFS_MOUNT_RDONLY;
+ if (mp->m_flags & XFS_MOUNT_BARRIER)
+ xfs_mountfs_check_barriers(mp);
+@@ -1340,11 +1342,37 @@ xfs_fs_remount(
+ }
+ mp->m_update_flags = 0;
+ }
++
++ /*
++ * Fill out the reserve pool if it is empty. Use the stashed
++ * value if it is non-zero, otherwise go with the default.
++ */
++ if (mp->m_resblks_save) {
++ resblks = mp->m_resblks_save;
++ mp->m_resblks_save = 0;
++ } else {
++ resblks = mp->m_sb.sb_dblocks;
++ do_div(resblks, 20);
++ resblks = min_t(__uint64_t, resblks, 1024);
++ }
++ xfs_reserve_blocks(mp, &resblks, NULL);
+ }
+
+ /* rw -> ro */
+ if (!(mp->m_flags & XFS_MOUNT_RDONLY) && (*flags & MS_RDONLY)) {
++ /*
++ * After we have synced the data but before we sync the
++ * metadata, we need to free up the reserve block pool so that
++ * the used block count in the superblock on disk is correct at
++ * the end of the remount. Stash the current reserve pool size
++ * so that if we get remounted rw, we can return it to the same
++ * size.
++ */
++ __uint64_t resblks = 0;
++
+ xfs_quiesce_data(mp);
++ mp->m_resblks_save = mp->m_resblks;
++ xfs_reserve_blocks(mp, &resblks, NULL);
+ xfs_quiesce_attr(mp);
+ mp->m_flags |= XFS_MOUNT_RDONLY;
+ }
+--- a/fs/xfs/xfs_mount.h
++++ b/fs/xfs/xfs_mount.h
+@@ -209,6 +209,7 @@ typedef struct xfs_mount {
+ __uint64_t m_maxioffset; /* maximum inode offset */
+ __uint64_t m_resblks; /* total reserved blocks */
+ __uint64_t m_resblks_avail;/* available reserved blocks */
++ __uint64_t m_resblks_save; /* reserved blks @ remount,ro */
+ int m_dalign; /* stripe unit */
+ int m_swidth; /* stripe width */
+ int m_sinoalign; /* stripe unit inode alignment */
--- /dev/null
+From david@fromorbit.com Fri Apr 2 11:08:31 2010
+From: Dave Chinner <david@fromorbit.com>
+Date: Fri, 12 Mar 2010 09:42:07 +1100
+Subject: xfs: Ensure we force all busy extents in range to disk
+To: stable@kernel.org
+Cc: xfs@oss.sgi.com
+Message-ID: <1268347337-7160-10-git-send-email-david@fromorbit.com>
+
+From: Dave Chinner <david@fromorbit.com>
+
+commit fd45e4784164d1017521086524e3442318c67370 upstream
+
+When we search for and find a busy extent during allocation we
+force the log out to ensure the extent free transaction is on
+disk before the allocation transaction. The current implementation
+has a subtle bug in it--it does not handle multiple overlapping
+ranges.
+
+That is, if we free lots of little extents into a single
+contiguous extent, then allocate the contiguous extent, the busy
+search code stops searching at the first extent it finds that
+overlaps the allocated range. It then uses the commit LSN of the
+transaction to force the log out to.
+
+Unfortunately, the other busy ranges might have more recent
+commit LSNs than the first busy extent that is found, and this
+results in xfs_alloc_search_busy() returning before all the
+extent free transactions are on disk for the range being
+allocated. This can lead to potential metadata corruption or
+stale data exposure after a crash because log replay won't replay
+all the extent free transactions that cover the allocation range.
+
+Modified-by: Alex Elder <aelder@sgi.com>
+
+(Dropped the "found" argument from the xfs_alloc_busysearch trace
+event.)
+
+Signed-off-by: Dave Chinner <david@fromorbit.com>
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Signed-off-by: Alex Elder <aelder@sgi.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ fs/xfs/xfs_alloc.c | 52 +++++++++++++++++++++-------------------------------
+ 1 file changed, 21 insertions(+), 31 deletions(-)
+
+--- a/fs/xfs/xfs_alloc.c
++++ b/fs/xfs/xfs_alloc.c
+@@ -2703,45 +2703,35 @@ xfs_alloc_search_busy(xfs_trans_t *tp,
+ xfs_mount_t *mp;
+ xfs_perag_busy_t *bsy;
+ xfs_agblock_t uend, bend;
+- xfs_lsn_t lsn;
++ xfs_lsn_t lsn = 0;
+ int cnt;
+
+ mp = tp->t_mountp;
+
+ spin_lock(&mp->m_perag[agno].pagb_lock);
+- cnt = mp->m_perag[agno].pagb_count;
+-
+ uend = bno + len - 1;
+
+- /* search pagb_list for this slot, skipping open slots */
+- for (bsy = mp->m_perag[agno].pagb_list; cnt; bsy++) {
+-
+- /*
+- * (start1,length1) within (start2, length2)
+- */
+- if (bsy->busy_tp != NULL) {
+- bend = bsy->busy_start + bsy->busy_length - 1;
+- if ((bno > bend) || (uend < bsy->busy_start)) {
+- cnt--;
+- } else {
+- TRACE_BUSYSEARCH("xfs_alloc_search_busy",
+- "found1", agno, bno, len, tp);
+- break;
+- }
+- }
+- }
+-
+ /*
+- * If a block was found, force the log through the LSN of the
+- * transaction that freed the block
++ * search pagb_list for this slot, skipping open slots. We have to
++ * search the entire array as there may be multiple overlaps and
++ * we have to get the most recent LSN for the log force to push out
++ * all the transactions that span the range.
+ */
+- if (cnt) {
+- TRACE_BUSYSEARCH("xfs_alloc_search_busy", "found", agno, bno, len, tp);
+- lsn = bsy->busy_tp->t_commit_lsn;
+- spin_unlock(&mp->m_perag[agno].pagb_lock);
+- xfs_log_force(mp, lsn, XFS_LOG_FORCE|XFS_LOG_SYNC);
+- } else {
+- TRACE_BUSYSEARCH("xfs_alloc_search_busy", "not-found", agno, bno, len, tp);
+- spin_unlock(&mp->m_perag[agno].pagb_lock);
++ for (cnt = 0; cnt < mp->m_perag[agno].pagb_count; cnt++) {
++ bsy = &mp->m_perag[agno].pagb_list[cnt];
++ if (!bsy->busy_tp)
++ continue;
++ bend = bsy->busy_start + bsy->busy_length - 1;
++ if (bno > bend || uend < bsy->busy_start)
++ continue;
++
++ /* (start1,length1) within (start2, length2) */
++ if (XFS_LSN_CMP(bsy->busy_tp->t_commit_lsn, lsn) > 0)
++ lsn = bsy->busy_tp->t_commit_lsn;
+ }
++ spin_unlock(&mp->m_perag[agno].pagb_lock);
++ TRACE_BUSYSEARCH("xfs_alloc_search_busy", lsn ? "found" : "not-found",
++ agno, bno, len, tp);
++ if (lsn)
++ xfs_log_force(mp, lsn, XFS_LOG_FORCE|XFS_LOG_SYNC);
+ }
--- /dev/null
+From david@fromorbit.com Fri Apr 2 11:06:34 2010
+From: Jason Gunthorpe <jgunthorpe@obsidianresearch.com>
+Date: Fri, 12 Mar 2010 09:42:03 +1100
+Subject: xfs: Fix error return for fallocate() on XFS
+To: stable@kernel.org
+Cc: xfs@oss.sgi.com
+Message-ID: <1268347337-7160-6-git-send-email-david@fromorbit.com>
+
+
+From: Jason Gunthorpe <jgunthorpe@obsidianresearch.com>
+
+commit 44a743f68705c681439f264deb05f8f38e9048d3 upstream
+
+Noticed that through glibc fallocate would return 28 rather than -1
+and errno = 28 for ENOSPC. The xfs routines use XFS_ERROR format
+positive return error codes while the syscalls use negative return
+codes. Fixup the two cases in xfs_vn_fallocate syscall to convert to
+negative.
+
+Signed-off-by: Jason Gunthorpe <jgunthorpe@obsidianresearch.com>
+Reviewed-by: Eric Sandeen <sandeen@sandeen.net>
+Signed-off-by: Alex Elder <aelder@sgi.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/xfs/linux-2.6/xfs_iops.c | 6 +++---
+ 1 file changed, 3 insertions(+), 3 deletions(-)
+
+--- a/fs/xfs/linux-2.6/xfs_iops.c
++++ b/fs/xfs/linux-2.6/xfs_iops.c
+@@ -573,8 +573,8 @@ xfs_vn_fallocate(
+ bf.l_len = len;
+
+ xfs_ilock(ip, XFS_IOLOCK_EXCL);
+- error = xfs_change_file_space(ip, XFS_IOC_RESVSP, &bf,
+- 0, XFS_ATTR_NOLOCK);
++ error = -xfs_change_file_space(ip, XFS_IOC_RESVSP, &bf,
++ 0, XFS_ATTR_NOLOCK);
+ if (!error && !(mode & FALLOC_FL_KEEP_SIZE) &&
+ offset + len > i_size_read(inode))
+ new_size = offset + len;
+@@ -585,7 +585,7 @@ xfs_vn_fallocate(
+
+ iattr.ia_valid = ATTR_SIZE;
+ iattr.ia_size = new_size;
+- error = xfs_setattr(ip, &iattr, XFS_ATTR_NOLOCK);
++ error = -xfs_setattr(ip, &iattr, XFS_ATTR_NOLOCK);
+ }
+
+ xfs_iunlock(ip, XFS_IOLOCK_EXCL);
--- /dev/null
+From david@fromorbit.com Fri Apr 2 11:12:53 2010
+From: Christoph Hellwig <hch@infradead.org>
+Date: Fri, 12 Mar 2010 09:42:17 +1100
+Subject: xfs: fix locking for inode cache radix tree tag updates
+To: stable@kernel.org
+Cc: xfs@oss.sgi.com
+Message-ID: <1268347337-7160-20-git-send-email-david@fromorbit.com>
+
+
+From: Christoph Hellwig <hch@infradead.org>
+
+commit f1f724e4b523d444c5a598d74505aefa3d6844d2 upstream
+
+The radix-tree code requires its users to serialize tag updates
+against other updates to the tree. While XFS protects tag updates
+against each other it does not serialize them against updates of the
+tree contents, which can lead to tag corruption. Fix the inode
+cache to always take pag_ici_lock in exclusive mode when updating
+radix tree tags.
+
+Signed-off-by: Christoph Hellwig <hch@lst.de>
+Reported-by: Patrick Schreurs <patrick@news-service.com>
+Tested-by: Patrick Schreurs <patrick@news-service.com>
+Signed-off-by: Alex Elder <aelder@sgi.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/xfs/linux-2.6/xfs_sync.c | 4 ++--
+ fs/xfs/xfs_iget.c | 19 +++++++++++++------
+ 2 files changed, 15 insertions(+), 8 deletions(-)
+
+--- a/fs/xfs/linux-2.6/xfs_sync.c
++++ b/fs/xfs/linux-2.6/xfs_sync.c
+@@ -692,12 +692,12 @@ xfs_inode_set_reclaim_tag(
+ xfs_mount_t *mp = ip->i_mount;
+ xfs_perag_t *pag = xfs_get_perag(mp, ip->i_ino);
+
+- read_lock(&pag->pag_ici_lock);
++ write_lock(&pag->pag_ici_lock);
+ spin_lock(&ip->i_flags_lock);
+ __xfs_inode_set_reclaim_tag(pag, ip);
+ __xfs_iflags_set(ip, XFS_IRECLAIMABLE);
+ spin_unlock(&ip->i_flags_lock);
+- read_unlock(&pag->pag_ici_lock);
++ write_unlock(&pag->pag_ici_lock);
+ xfs_put_perag(mp, pag);
+ }
+
+--- a/fs/xfs/xfs_iget.c
++++ b/fs/xfs/xfs_iget.c
+@@ -228,13 +228,12 @@ xfs_iget_cache_hit(
+ xfs_itrace_exit_tag(ip, "xfs_iget.alloc");
+
+ /*
+- * We need to set XFS_INEW atomically with clearing the
+- * reclaimable tag so that we do have an indicator of the
+- * inode still being initialized.
++ * We need to set XFS_IRECLAIM to prevent xfs_reclaim_inode
++ * from stomping over us while we recycle the inode. We can't
++ * clear the radix tree reclaimable tag yet as it requires
++ * pag_ici_lock to be held exclusive.
+ */
+- ip->i_flags |= XFS_INEW;
+- ip->i_flags &= ~XFS_IRECLAIMABLE;
+- __xfs_inode_clear_reclaim_tag(mp, pag, ip);
++ ip->i_flags |= XFS_IRECLAIM;
+
+ spin_unlock(&ip->i_flags_lock);
+ read_unlock(&pag->pag_ici_lock);
+@@ -253,7 +252,15 @@ xfs_iget_cache_hit(
+ __xfs_inode_set_reclaim_tag(pag, ip);
+ goto out_error;
+ }
++
++ write_lock(&pag->pag_ici_lock);
++ spin_lock(&ip->i_flags_lock);
++ ip->i_flags &= ~(XFS_IRECLAIMABLE | XFS_IRECLAIM);
++ ip->i_flags |= XFS_INEW;
++ __xfs_inode_clear_reclaim_tag(mp, pag, ip);
+ inode->i_state = I_LOCK|I_NEW;
++ spin_unlock(&ip->i_flags_lock);
++ write_unlock(&pag->pag_ici_lock);
+ } else {
+ /* If the VFS inode is being torn down, pause and try again. */
+ if (!igrab(inode)) {
--- /dev/null
+From david@fromorbit.com Fri Apr 2 11:05:07 2010
+From: Christoph Hellwig <hch@infradead.org>
+Date: Fri, 12 Mar 2010 09:42:00 +1100
+Subject: xfs: fix mmap_sem/iolock inversion in xfs_free_eofblocks
+To: stable@kernel.org
+Cc: xfs@oss.sgi.com
+Message-ID: <1268347337-7160-3-git-send-email-david@fromorbit.com>
+
+
+From: Christoph Hellwig <hch@infradead.org>
+
+commit c56c9631cbe88f08854a56ff9776c1f310916830 upstream
+
+When xfs_free_eofblocks is called from ->release the VM might already
+hold the mmap_sem, but in the write path we take the iolock before
+taking the mmap_sem in the generic write code.
+
+Switch xfs_free_eofblocks to only trylock the iolock if called from
+->release and skip trimming the preallocated blocks in that case.
+We'll still free them later on the final iput.
+
+Signed-off-by: Christoph Hellwig <hch@lst.de>
+Reviewed-by: Alex Elder <aelder@sgi.com>
+Signed-off-by: Alex Elder <aelder@sgi.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/xfs/xfs_rw.h | 7 -------
+ fs/xfs/xfs_vnodeops.c | 34 ++++++++++++++++++++++++++--------
+ 2 files changed, 26 insertions(+), 15 deletions(-)
+
+--- a/fs/xfs/xfs_rw.h
++++ b/fs/xfs/xfs_rw.h
+@@ -37,13 +37,6 @@ xfs_fsb_to_db(struct xfs_inode *ip, xfs_
+ }
+
+ /*
+- * Flags for xfs_free_eofblocks
+- */
+-#define XFS_FREE_EOF_LOCK (1<<0)
+-#define XFS_FREE_EOF_NOLOCK (1<<1)
+-
+-
+-/*
+ * helper function to extract extent size hint from inode
+ */
+ STATIC_INLINE xfs_extlen_t
+--- a/fs/xfs/xfs_vnodeops.c
++++ b/fs/xfs/xfs_vnodeops.c
+@@ -709,6 +709,11 @@ xfs_fsync(
+ }
+
+ /*
++ * Flags for xfs_free_eofblocks
++ */
++#define XFS_FREE_EOF_TRYLOCK (1<<0)
++
++/*
+ * This is called by xfs_inactive to free any blocks beyond eof
+ * when the link count isn't zero and by xfs_dm_punch_hole() when
+ * punching a hole to EOF.
+@@ -726,7 +731,6 @@ xfs_free_eofblocks(
+ xfs_filblks_t map_len;
+ int nimaps;
+ xfs_bmbt_irec_t imap;
+- int use_iolock = (flags & XFS_FREE_EOF_LOCK);
+
+ /*
+ * Figure out if there are any blocks beyond the end
+@@ -768,14 +772,19 @@ xfs_free_eofblocks(
+ * cache and we can't
+ * do that within a transaction.
+ */
+- if (use_iolock)
++ if (flags & XFS_FREE_EOF_TRYLOCK) {
++ if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) {
++ xfs_trans_cancel(tp, 0);
++ return 0;
++ }
++ } else {
+ xfs_ilock(ip, XFS_IOLOCK_EXCL);
++ }
+ error = xfs_itruncate_start(ip, XFS_ITRUNC_DEFINITE,
+ ip->i_size);
+ if (error) {
+ xfs_trans_cancel(tp, 0);
+- if (use_iolock)
+- xfs_iunlock(ip, XFS_IOLOCK_EXCL);
++ xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+ return error;
+ }
+
+@@ -812,8 +821,7 @@ xfs_free_eofblocks(
+ error = xfs_trans_commit(tp,
+ XFS_TRANS_RELEASE_LOG_RES);
+ }
+- xfs_iunlock(ip, (use_iolock ? (XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL)
+- : XFS_ILOCK_EXCL));
++ xfs_iunlock(ip, XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL);
+ }
+ return error;
+ }
+@@ -1113,7 +1121,17 @@ xfs_release(
+ (ip->i_df.if_flags & XFS_IFEXTENTS)) &&
+ (!(ip->i_d.di_flags &
+ (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)))) {
+- error = xfs_free_eofblocks(mp, ip, XFS_FREE_EOF_LOCK);
++
++ /*
++ * If we can't get the iolock just skip truncating
++ * the blocks past EOF because we could deadlock
++ * with the mmap_sem otherwise. We'll get another
++ * chance to drop them once the last reference to
++ * the inode is dropped, so we'll never leak blocks
++ * permanently.
++ */
++ error = xfs_free_eofblocks(mp, ip,
++ XFS_FREE_EOF_TRYLOCK);
+ if (error)
+ return error;
+ }
+@@ -1184,7 +1202,7 @@ xfs_inactive(
+ (!(ip->i_d.di_flags &
+ (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)) ||
+ (ip->i_delayed_blks != 0)))) {
+- error = xfs_free_eofblocks(mp, ip, XFS_FREE_EOF_LOCK);
++ error = xfs_free_eofblocks(mp, ip, 0);
+ if (error)
+ return VN_INACTIVE_CACHE;
+ }
--- /dev/null
+From david@fromorbit.com Fri Apr 2 11:10:21 2010
+From: Dave Chinner <david@fromorbit.com>
+Date: Fri, 12 Mar 2010 09:42:11 +1100
+Subject: xfs: fix stale inode flush avoidance
+To: stable@kernel.org
+Cc: xfs@oss.sgi.com
+Message-ID: <1268347337-7160-14-git-send-email-david@fromorbit.com>
+
+From: Dave Chinner <david@fromorbit.com>
+
+commit 4b6a46882cca8349e8942e2650c33b11bc571c92 upstream
+
+When reclaiming stale inodes, we need to guarantee that inodes are
+unpinned before returning with a "clean" status. If we don't we can
+reclaim inodes that are pinned, leading to use after free in the
+transaction subsystem as transactions complete.
+
+Signed-off-by: Dave Chinner <david@fromorbit.com>
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Signed-off-by: Alex Elder <aelder@sgi.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ fs/xfs/xfs_inode.c | 21 +++++++++++++++------
+ 1 file changed, 15 insertions(+), 6 deletions(-)
+
+--- a/fs/xfs/xfs_inode.c
++++ b/fs/xfs/xfs_inode.c
+@@ -2878,13 +2878,9 @@ xfs_iflush(
+
+ /*
+ * If the inode isn't dirty, then just release the inode flush lock and
+- * do nothing. Treat stale inodes the same; we cannot rely on the
+- * backing buffer remaining stale in cache for the remaining life of
+- * the stale inode and so xfs_itobp() below may give us a buffer that
+- * no longer contains inodes below. Doing this stale check here also
+- * avoids forcing the log on pinned, stale inodes.
++ * do nothing.
+ */
+- if (xfs_inode_clean(ip) || xfs_iflags_test(ip, XFS_ISTALE)) {
++ if (xfs_inode_clean(ip)) {
+ xfs_ifunlock(ip);
+ return 0;
+ }
+@@ -2908,6 +2904,19 @@ xfs_iflush(
+ xfs_iunpin_wait(ip);
+
+ /*
++ * For stale inodes we cannot rely on the backing buffer remaining
++ * stale in cache for the remaining life of the stale inode and so
++ * xfs_itobp() below may give us a buffer that no longer contains
++ * inodes below. We have to check this after ensuring the inode is
++ * unpinned so that it is safe to reclaim the stale inode after the
++ * flush call.
++ */
++ if (xfs_iflags_test(ip, XFS_ISTALE)) {
++ xfs_ifunlock(ip);
++ return 0;
++ }
++
++ /*
+ * This may have been unpinned because the filesystem is shutting
+ * down forcibly. If that's the case we must not write this inode
+ * to disk, because the log record didn't make it to disk!
--- /dev/null
+From david@fromorbit.com Fri Apr 2 11:07:34 2010
+From: Christoph Hellwig <hch@infradead.org>
+Date: Fri, 12 Mar 2010 09:42:05 +1100
+Subject: xfs: fix timestamp handling in xfs_setattr
+To: stable@kernel.org
+Cc: xfs@oss.sgi.com
+Message-ID: <1268347337-7160-8-git-send-email-david@fromorbit.com>
+
+
+From: Christoph Hellwig <hch@infradead.org>
+
+commit d6d59bada372bcf8bd36c3bbc71c485c29dd2a4b upstream
+
+We currently have some rather odd code in xfs_setattr for
+updating the a/c/mtime timestamps:
+
+ - first we do a non-transaction update if all three are updated
+ together
+ - second we implicitly update the ctime for various changes
+ instead of relying on the ATTR_CTIME flag
+ - third we set the timestamps to the current time instead of the
+ arguments in the iattr structure in many cases.
+
+This patch makes sure we update it in a consistent way:
+
+ - always transactional
+ - ctime is only updated if ATTR_CTIME is set or we do a size
+ update, which is a special case
+ - always to the times passed in from the caller instead of the
+ current time
+
+The only non-size caller of xfs_setattr that doesn't come from
+the VFS is updated to set ATTR_CTIME and pass in a valid ctime
+value.
+
+Reported-by: Eric Blake <ebb9@byu.net>
+Signed-off-by: Christoph Hellwig <hch@lst.de>
+Signed-off-by: Alex Elder <aelder@sgi.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ fs/xfs/linux-2.6/xfs_acl.c | 3 -
+ fs/xfs/xfs_vnodeops.c | 93 ++++++++++++++++++---------------------------
+ 2 files changed, 41 insertions(+), 55 deletions(-)
+
+--- a/fs/xfs/linux-2.6/xfs_acl.c
++++ b/fs/xfs/linux-2.6/xfs_acl.c
+@@ -250,8 +250,9 @@ xfs_set_mode(struct inode *inode, mode_t
+ if (mode != inode->i_mode) {
+ struct iattr iattr;
+
+- iattr.ia_valid = ATTR_MODE;
++ iattr.ia_valid = ATTR_MODE | ATTR_CTIME;
+ iattr.ia_mode = mode;
++ iattr.ia_ctime = current_fs_time(inode->i_sb);
+
+ error = -xfs_setattr(XFS_I(inode), &iattr, XFS_ATTR_NOACL);
+ }
+--- a/fs/xfs/xfs_vnodeops.c
++++ b/fs/xfs/xfs_vnodeops.c
+@@ -69,7 +69,6 @@ xfs_setattr(
+ uint commit_flags=0;
+ uid_t uid=0, iuid=0;
+ gid_t gid=0, igid=0;
+- int timeflags = 0;
+ struct xfs_dquot *udqp, *gdqp, *olddquot1, *olddquot2;
+ int need_iolock = 1;
+
+@@ -134,16 +133,13 @@ xfs_setattr(
+ if (flags & XFS_ATTR_NOLOCK)
+ need_iolock = 0;
+ if (!(mask & ATTR_SIZE)) {
+- if ((mask != (ATTR_CTIME|ATTR_ATIME|ATTR_MTIME)) ||
+- (mp->m_flags & XFS_MOUNT_WSYNC)) {
+- tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE);
+- commit_flags = 0;
+- if ((code = xfs_trans_reserve(tp, 0,
+- XFS_ICHANGE_LOG_RES(mp), 0,
+- 0, 0))) {
+- lock_flags = 0;
+- goto error_return;
+- }
++ tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE);
++ commit_flags = 0;
++ code = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp),
++ 0, 0, 0);
++ if (code) {
++ lock_flags = 0;
++ goto error_return;
+ }
+ } else {
+ if (DM_EVENT_ENABLED(ip, DM_EVENT_TRUNCATE) &&
+@@ -294,15 +290,23 @@ xfs_setattr(
+ * or we are explicitly asked to change it. This handles
+ * the semantic difference between truncate() and ftruncate()
+ * as implemented in the VFS.
++ *
++ * The regular truncate() case without ATTR_CTIME and ATTR_MTIME
++ * is a special case where we need to update the times despite
++ * not having these flags set. For all other operations the
++ * VFS set these flags explicitly if it wants a timestamp
++ * update.
+ */
+- if (iattr->ia_size != ip->i_size || (mask & ATTR_CTIME))
+- timeflags |= XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG;
++ if (iattr->ia_size != ip->i_size &&
++ (!(mask & (ATTR_CTIME | ATTR_MTIME)))) {
++ iattr->ia_ctime = iattr->ia_mtime =
++ current_fs_time(inode->i_sb);
++ mask |= ATTR_CTIME | ATTR_MTIME;
++ }
+
+ if (iattr->ia_size > ip->i_size) {
+ ip->i_d.di_size = iattr->ia_size;
+ ip->i_size = iattr->ia_size;
+- if (!(flags & XFS_ATTR_DMI))
+- xfs_ichgtime(ip, XFS_ICHGTIME_CHG);
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+ } else if (iattr->ia_size <= ip->i_size ||
+ (iattr->ia_size == 0 && ip->i_d.di_nextents)) {
+@@ -373,9 +377,6 @@ xfs_setattr(
+ ip->i_d.di_gid = gid;
+ inode->i_gid = gid;
+ }
+-
+- xfs_trans_log_inode (tp, ip, XFS_ILOG_CORE);
+- timeflags |= XFS_ICHGTIME_CHG;
+ }
+
+ /*
+@@ -392,51 +393,37 @@ xfs_setattr(
+
+ inode->i_mode &= S_IFMT;
+ inode->i_mode |= mode & ~S_IFMT;
+-
+- xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+- timeflags |= XFS_ICHGTIME_CHG;
+ }
+
+ /*
+ * Change file access or modified times.
+ */
+- if (mask & (ATTR_ATIME|ATTR_MTIME)) {
+- if (mask & ATTR_ATIME) {
+- inode->i_atime = iattr->ia_atime;
+- ip->i_d.di_atime.t_sec = iattr->ia_atime.tv_sec;
+- ip->i_d.di_atime.t_nsec = iattr->ia_atime.tv_nsec;
+- ip->i_update_core = 1;
+- }
+- if (mask & ATTR_MTIME) {
+- inode->i_mtime = iattr->ia_mtime;
+- ip->i_d.di_mtime.t_sec = iattr->ia_mtime.tv_sec;
+- ip->i_d.di_mtime.t_nsec = iattr->ia_mtime.tv_nsec;
+- timeflags &= ~XFS_ICHGTIME_MOD;
+- timeflags |= XFS_ICHGTIME_CHG;
+- }
+- if (tp && (mask & (ATTR_MTIME_SET|ATTR_ATIME_SET)))
+- xfs_trans_log_inode (tp, ip, XFS_ILOG_CORE);
++ if (mask & ATTR_ATIME) {
++ inode->i_atime = iattr->ia_atime;
++ ip->i_d.di_atime.t_sec = iattr->ia_atime.tv_sec;
++ ip->i_d.di_atime.t_nsec = iattr->ia_atime.tv_nsec;
++ ip->i_update_core = 1;
+ }
+-
+- /*
+- * Change file inode change time only if ATTR_CTIME set
+- * AND we have been called by a DMI function.
+- */
+-
+- if ((flags & XFS_ATTR_DMI) && (mask & ATTR_CTIME)) {
++ if (mask & ATTR_CTIME) {
+ inode->i_ctime = iattr->ia_ctime;
+ ip->i_d.di_ctime.t_sec = iattr->ia_ctime.tv_sec;
+ ip->i_d.di_ctime.t_nsec = iattr->ia_ctime.tv_nsec;
+ ip->i_update_core = 1;
+- timeflags &= ~XFS_ICHGTIME_CHG;
++ }
++ if (mask & ATTR_MTIME) {
++ inode->i_mtime = iattr->ia_mtime;
++ ip->i_d.di_mtime.t_sec = iattr->ia_mtime.tv_sec;
++ ip->i_d.di_mtime.t_nsec = iattr->ia_mtime.tv_nsec;
++ ip->i_update_core = 1;
+ }
+
+ /*
+- * Send out timestamp changes that need to be set to the
+- * current time. Not done when called by a DMI function.
++ * And finally, log the inode core if any attribute in it
++ * has been changed.
+ */
+- if (timeflags && !(flags & XFS_ATTR_DMI))
+- xfs_ichgtime(ip, timeflags);
++ if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE|
++ ATTR_ATIME|ATTR_CTIME|ATTR_MTIME))
++ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+
+ XFS_STATS_INC(xs_ig_attrchg);
+
+@@ -451,12 +438,10 @@ xfs_setattr(
+ * mix so this probably isn't worth the trouble to optimize.
+ */
+ code = 0;
+- if (tp) {
+- if (mp->m_flags & XFS_MOUNT_WSYNC)
+- xfs_trans_set_sync(tp);
++ if (mp->m_flags & XFS_MOUNT_WSYNC)
++ xfs_trans_set_sync(tp);
+
+- code = xfs_trans_commit(tp, commit_flags);
+- }
++ code = xfs_trans_commit(tp, commit_flags);
+
+ xfs_iunlock(ip, lock_flags);
+
--- /dev/null
+From david@fromorbit.com Fri Apr 2 11:05:39 2010
+From: Christoph Hellwig <hch@infradead.org>
+Date: Fri, 12 Mar 2010 09:42:01 +1100
+Subject: xfs: I/O completion handlers must use NOFS allocations
+To: stable@kernel.org
+Cc: xfs@oss.sgi.com
+Message-ID: <1268347337-7160-4-git-send-email-david@fromorbit.com>
+
+
+From: Christoph Hellwig <hch@infradead.org>
+
+commit 80641dc66a2d6dfb22af4413227a92b8ab84c7bb upstream
+
+When completing I/O requests we must not allow the memory allocator to
+recurse into the filesystem, as we might deadlock on waiting for the
+I/O completion otherwise. The only thing currently allocating normal
+GFP_KERNEL memory is the allocation of the transaction structure for
+the unwritten extent conversion. Add a memflags argument to
+_xfs_trans_alloc to allow controlling the allocator behaviour.
+
+Signed-off-by: Christoph Hellwig <hch@lst.de>
+Reported-by: Thomas Neumann <tneumann@users.sourceforge.net>
+Tested-by: Thomas Neumann <tneumann@users.sourceforge.net>
+Reviewed-by: Alex Elder <aelder@sgi.com>
+Signed-off-by: Alex Elder <aelder@sgi.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/xfs/xfs_fsops.c | 2 +-
+ fs/xfs/xfs_iomap.c | 9 ++++++++-
+ fs/xfs/xfs_mount.c | 2 +-
+ fs/xfs/xfs_trans.c | 7 ++++---
+ fs/xfs/xfs_trans.h | 2 +-
+ 5 files changed, 15 insertions(+), 7 deletions(-)
+
+--- a/fs/xfs/xfs_fsops.c
++++ b/fs/xfs/xfs_fsops.c
+@@ -611,7 +611,7 @@ xfs_fs_log_dummy(
+ xfs_inode_t *ip;
+ int error;
+
+- tp = _xfs_trans_alloc(mp, XFS_TRANS_DUMMY1);
++ tp = _xfs_trans_alloc(mp, XFS_TRANS_DUMMY1, KM_SLEEP);
+ error = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp), 0, 0, 0);
+ if (error) {
+ xfs_trans_cancel(tp, 0);
+--- a/fs/xfs/xfs_iomap.c
++++ b/fs/xfs/xfs_iomap.c
+@@ -860,8 +860,15 @@ xfs_iomap_write_unwritten(
+ * set up a transaction to convert the range of extents
+ * from unwritten to real. Do allocations in a loop until
+ * we have covered the range passed in.
++ *
++ * Note that we open code the transaction allocation here
++ * to pass KM_NOFS--we can't risk to recursing back into
++ * the filesystem here as we might be asked to write out
++ * the same inode that we complete here and might deadlock
++ * on the iolock.
+ */
+- tp = xfs_trans_alloc(mp, XFS_TRANS_STRAT_WRITE);
++ xfs_wait_for_freeze(mp, SB_FREEZE_TRANS);
++ tp = _xfs_trans_alloc(mp, XFS_TRANS_STRAT_WRITE, KM_NOFS);
+ tp->t_flags |= XFS_TRANS_RESERVE;
+ error = xfs_trans_reserve(tp, resblks,
+ XFS_WRITE_LOG_RES(mp), 0,
+--- a/fs/xfs/xfs_mount.c
++++ b/fs/xfs/xfs_mount.c
+@@ -1471,7 +1471,7 @@ xfs_log_sbcount(
+ if (!xfs_sb_version_haslazysbcount(&mp->m_sb))
+ return 0;
+
+- tp = _xfs_trans_alloc(mp, XFS_TRANS_SB_COUNT);
++ tp = _xfs_trans_alloc(mp, XFS_TRANS_SB_COUNT, KM_SLEEP);
+ error = xfs_trans_reserve(tp, 0, mp->m_sb.sb_sectsize + 128, 0, 0,
+ XFS_DEFAULT_LOG_COUNT);
+ if (error) {
+--- a/fs/xfs/xfs_trans.c
++++ b/fs/xfs/xfs_trans.c
+@@ -236,19 +236,20 @@ xfs_trans_alloc(
+ uint type)
+ {
+ xfs_wait_for_freeze(mp, SB_FREEZE_TRANS);
+- return _xfs_trans_alloc(mp, type);
++ return _xfs_trans_alloc(mp, type, KM_SLEEP);
+ }
+
+ xfs_trans_t *
+ _xfs_trans_alloc(
+ xfs_mount_t *mp,
+- uint type)
++ uint type,
++ uint memflags)
+ {
+ xfs_trans_t *tp;
+
+ atomic_inc(&mp->m_active_trans);
+
+- tp = kmem_zone_zalloc(xfs_trans_zone, KM_SLEEP);
++ tp = kmem_zone_zalloc(xfs_trans_zone, memflags);
+ tp->t_magic = XFS_TRANS_MAGIC;
+ tp->t_type = type;
+ tp->t_mountp = mp;
+--- a/fs/xfs/xfs_trans.h
++++ b/fs/xfs/xfs_trans.h
+@@ -924,7 +924,7 @@ typedef struct xfs_trans {
+ * XFS transaction mechanism exported interfaces.
+ */
+ xfs_trans_t *xfs_trans_alloc(struct xfs_mount *, uint);
+-xfs_trans_t *_xfs_trans_alloc(struct xfs_mount *, uint);
++xfs_trans_t *_xfs_trans_alloc(struct xfs_mount *, uint, uint);
+ xfs_trans_t *xfs_trans_dup(xfs_trans_t *);
+ int xfs_trans_reserve(xfs_trans_t *, uint, uint, uint,
+ uint, uint);
--- /dev/null
+From david@fromorbit.com Fri Apr 2 11:12:28 2010
+From: Dave Chinner <david@fromorbit.com>
+Date: Fri, 12 Mar 2010 09:42:16 +1100
+Subject: xfs: Non-blocking inode locking in IO completion
+To: stable@kernel.org
+Cc: xfs@oss.sgi.com
+Message-ID: <1268347337-7160-19-git-send-email-david@fromorbit.com>
+
+From: Dave Chinner <david@fromorbit.com>
+
+commit 77d7a0c2eeb285c9069e15396703d0cb9690ac50 upstream
+
+The introduction of barriers to loop devices has created a new IO
+order completion dependency that XFS does not handle. The loop
+device implements barriers using fsync and so turns a log IO in the
+XFS filesystem on the loop device into a data IO in the backing
+filesystem. That is, the completion of log IOs in the loop
+filesystem are now dependent on completion of data IO in the backing
+filesystem.
+
+This can cause deadlocks when a flush daemon issues a log force with
+an inode locked because the IO completion of IO on the inode is
+blocked by the inode lock. This in turn prevents further data IO
+completion from occurring on all XFS filesystems on that CPU (due to
+the shared nature of the completion queues). This then prevents the
+log IO from completing because the log is waiting for data IO
+completion as well.
+
+The fix for this new completion order dependency issue is to make
+the IO completion inode locking non-blocking. If the inode lock
+can't be grabbed, simply requeue the IO completion back to the work
+queue so that it can be processed later. This prevents the
+completion queue from being blocked and allows data IO completion on
+other inodes to proceed, hence avoiding completion order dependent
+deadlocks.
+
+Signed-off-by: Dave Chinner <david@fromorbit.com>
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Signed-off-by: Alex Elder <aelder@sgi.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/xfs/linux-2.6/xfs_aops.c | 118 ++++++++++++++++++++++++++++++--------------
+ 1 file changed, 82 insertions(+), 36 deletions(-)
+
+--- a/fs/xfs/linux-2.6/xfs_aops.c
++++ b/fs/xfs/linux-2.6/xfs_aops.c
+@@ -204,14 +204,17 @@ xfs_ioend_new_eof(
+ }
+
+ /*
+- * Update on-disk file size now that data has been written to disk.
+- * The current in-memory file size is i_size. If a write is beyond
+- * eof i_new_size will be the intended file size until i_size is
+- * updated. If this write does not extend all the way to the valid
+- * file size then restrict this update to the end of the write.
++ * Update on-disk file size now that data has been written to disk. The
++ * current in-memory file size is i_size. If a write is beyond eof i_new_size
++ * will be the intended file size until i_size is updated. If this write does
++ * not extend all the way to the valid file size then restrict this update to
++ * the end of the write.
++ *
++ * This function does not block as blocking on the inode lock in IO completion
++ * can lead to IO completion order dependency deadlocks.. If it can't get the
++ * inode ilock it will return EAGAIN. Callers must handle this.
+ */
+-
+-STATIC void
++STATIC int
+ xfs_setfilesize(
+ xfs_ioend_t *ioend)
+ {
+@@ -222,9 +225,11 @@ xfs_setfilesize(
+ ASSERT(ioend->io_type != IOMAP_READ);
+
+ if (unlikely(ioend->io_error))
+- return;
++ return 0;
++
++ if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL))
++ return EAGAIN;
+
+- xfs_ilock(ip, XFS_ILOCK_EXCL);
+ isize = xfs_ioend_new_eof(ioend);
+ if (isize) {
+ ip->i_d.di_size = isize;
+@@ -232,6 +237,28 @@ xfs_setfilesize(
+ }
+
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
++ return 0;
++}
++
++/*
++ * Schedule IO completion handling on a xfsdatad if this was
++ * the final hold on this ioend. If we are asked to wait,
++ * flush the workqueue.
++ */
++STATIC void
++xfs_finish_ioend(
++ xfs_ioend_t *ioend,
++ int wait)
++{
++ if (atomic_dec_and_test(&ioend->io_remaining)) {
++ struct workqueue_struct *wq;
++
++ wq = (ioend->io_type == IOMAP_UNWRITTEN) ?
++ xfsconvertd_workqueue : xfsdatad_workqueue;
++ queue_work(wq, &ioend->io_work);
++ if (wait)
++ flush_workqueue(wq);
++ }
+ }
+
+ /*
+@@ -243,9 +270,23 @@ xfs_end_bio_delalloc(
+ {
+ xfs_ioend_t *ioend =
+ container_of(work, xfs_ioend_t, io_work);
++ int error;
+
+- xfs_setfilesize(ioend);
+- xfs_destroy_ioend(ioend);
++ /*
++ * If we didn't complete processing of the ioend, requeue it to the
++ * tail of the workqueue for another attempt later. Otherwise destroy
++ * it.
++ */
++ error = xfs_setfilesize(ioend);
++ if (error == EAGAIN) {
++ atomic_inc(&ioend->io_remaining);
++ xfs_finish_ioend(ioend, 0);
++ /* ensure we don't spin on blocked ioends */
++ delay(1);
++ } else {
++ ASSERT(!error);
++ xfs_destroy_ioend(ioend);
++ }
+ }
+
+ /*
+@@ -257,9 +298,23 @@ xfs_end_bio_written(
+ {
+ xfs_ioend_t *ioend =
+ container_of(work, xfs_ioend_t, io_work);
++ int error;
+
+- xfs_setfilesize(ioend);
+- xfs_destroy_ioend(ioend);
++ /*
++ * If we didn't complete processing of the ioend, requeue it to the
++ * tail of the workqueue for another attempt later. Otherwise destroy
++ * it.
++ */
++ error = xfs_setfilesize(ioend);
++ if (error == EAGAIN) {
++ atomic_inc(&ioend->io_remaining);
++ xfs_finish_ioend(ioend, 0);
++ /* ensure we don't spin on blocked ioends */
++ delay(1);
++ } else {
++ ASSERT(!error);
++ xfs_destroy_ioend(ioend);
++ }
+ }
+
+ /*
+@@ -279,13 +334,25 @@ xfs_end_bio_unwritten(
+ size_t size = ioend->io_size;
+
+ if (likely(!ioend->io_error)) {
++ int error;
+ if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
+- int error;
+ error = xfs_iomap_write_unwritten(ip, offset, size);
+ if (error)
+ ioend->io_error = error;
+ }
+- xfs_setfilesize(ioend);
++ /*
++ * If we didn't complete processing of the ioend, requeue it to the
++ * tail of the workqueue for another attempt later. Otherwise destroy
++ * it.
++ */
++ error = xfs_setfilesize(ioend);
++ if (error == EAGAIN) {
++ atomic_inc(&ioend->io_remaining);
++ xfs_finish_ioend(ioend, 0);
++ /* ensure we don't spin on blocked ioends */
++ delay(1);
++ return;
++ }
+ }
+ xfs_destroy_ioend(ioend);
+ }
+@@ -304,27 +371,6 @@ xfs_end_bio_read(
+ }
+
+ /*
+- * Schedule IO completion handling on a xfsdatad if this was
+- * the final hold on this ioend. If we are asked to wait,
+- * flush the workqueue.
+- */
+-STATIC void
+-xfs_finish_ioend(
+- xfs_ioend_t *ioend,
+- int wait)
+-{
+- if (atomic_dec_and_test(&ioend->io_remaining)) {
+- struct workqueue_struct *wq = xfsdatad_workqueue;
+- if (ioend->io_work.func == xfs_end_bio_unwritten)
+- wq = xfsconvertd_workqueue;
+-
+- queue_work(wq, &ioend->io_work);
+- if (wait)
+- flush_workqueue(wq);
+- }
+-}
+-
+-/*
+ * Allocate and initialise an IO completion structure.
+ * We need to track unwritten extent write completion here initially.
+ * We'll need to extend this for updating the ondisk inode size later
--- /dev/null
+From david@fromorbit.com Fri Apr 2 11:11:19 2010
+From: Christoph Hellwig <hch@infradead.org>
+Date: Fri, 12 Mar 2010 09:42:13 +1100
+Subject: xfs: quota limit statvfs available blocks
+To: stable@kernel.org
+Cc: xfs@oss.sgi.com
+Message-ID: <1268347337-7160-16-git-send-email-david@fromorbit.com>
+
+
+From: Christoph Hellwig <hch@infradead.org>
+
+commit 9b00f30762fe9f914eb6e03057a616ed63a4e8ca upstream
+
+A "df" run on an NFS client of an exported XFS file system reports
+the wrong information for "available" blocks. When a block quota is
+enforced, the amount reported as free is limited by the quota, but
+the amount reported available is not (and should be).
+
+Reported-by: Guk-Bong, Kwon <gbkwon@gmail.com>
+Signed-off-by: Christoph Hellwig <hch@lst.de>
+Signed-off-by: Alex Elder <aelder@sgi.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ fs/xfs/quota/xfs_qm_bhv.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/fs/xfs/quota/xfs_qm_bhv.c
++++ b/fs/xfs/quota/xfs_qm_bhv.c
+@@ -59,7 +59,7 @@ xfs_fill_statvfs_from_dquot(
+ be64_to_cpu(dp->d_blk_hardlimit);
+ if (limit && statp->f_blocks > limit) {
+ statp->f_blocks = limit;
+- statp->f_bfree =
++ statp->f_bfree = statp->f_bavail =
+ (statp->f_blocks > be64_to_cpu(dp->d_bcount)) ?
+ (statp->f_blocks - be64_to_cpu(dp->d_bcount)) : 0;
+ }
--- /dev/null
+From david@fromorbit.com Fri Apr 2 11:09:55 2010
+From: Dave Chinner <david@fromorbit.com>
+Date: Fri, 12 Mar 2010 09:42:10 +1100
+Subject: xfs: reclaim all inodes by background tree walks
+To: stable@kernel.org
+Cc: xfs@oss.sgi.com
+Message-ID: <1268347337-7160-13-git-send-email-david@fromorbit.com>
+
+From: Dave Chinner <david@fromorbit.com>
+
+commit 57817c68229984818fea9e614d6f95249c3fb098 upstream
+
+We cannot do direct inode reclaim without taking the flush lock to
+ensure that we do not reclaim an inode under IO. We check the inode
+is clean before doing direct reclaim, but this is not good enough
+because the inode flush code marks the inode clean once it has
+copied the in-core dirty state to the backing buffer.
+
+It is the flush lock that determines whether the inode is still
+under IO, even though it is marked clean, and the inode is still
+required at IO completion so we can't reclaim it even though it is
+clean in core. Hence the requirement that we need to take the flush
+lock even on clean inodes because this guarantees that the inode
+writeback IO has completed and it is safe to reclaim the inode.
+
+With delayed write inode flushing, we could end up waiting a long
+time on the flush lock even for a clean inode. The background
+reclaim already handles this efficiently, so avoid all the problems
+by killing the direct reclaim path altogether.
+
+Signed-off-by: Dave Chinner <david@fromorbit.com>
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Signed-off-by: Alex Elder <aelder@sgi.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ fs/xfs/linux-2.6/xfs_super.c | 14 ++++++--------
+ 1 file changed, 6 insertions(+), 8 deletions(-)
+
+--- a/fs/xfs/linux-2.6/xfs_super.c
++++ b/fs/xfs/linux-2.6/xfs_super.c
+@@ -953,16 +953,14 @@ xfs_fs_destroy_inode(
+ ASSERT_ALWAYS(!xfs_iflags_test(ip, XFS_IRECLAIM));
+
+ /*
+- * If we have nothing to flush with this inode then complete the
+- * teardown now, otherwise delay the flush operation.
++ * We always use background reclaim here because even if the
++ * inode is clean, it still may be under IO and hence we have
++ * to take the flush lock. The background reclaim path handles
++ * this more efficiently than we can here, so simply let background
++ * reclaim tear down all inodes.
+ */
+- if (!xfs_inode_clean(ip)) {
+- xfs_inode_set_reclaim_tag(ip);
+- return;
+- }
+-
+ out_reclaim:
+- xfs_ireclaim(ip);
++ xfs_inode_set_reclaim_tag(ip);
+ }
+
+ /*
--- /dev/null
+From david@fromorbit.com Fri Apr 2 11:09:00 2010
+From: Dave Chinner <david@fromorbit.com>
+Date: Fri, 12 Mar 2010 09:42:08 +1100
+Subject: xfs: reclaim inodes under a write lock
+To: stable@kernel.org
+Cc: xfs@oss.sgi.com
+Message-ID: <1268347337-7160-11-git-send-email-david@fromorbit.com>
+
+From: Dave Chinner <david@fromorbit.com>
+
+commit c8e20be020f234c8d492927a424a7d8bbefd5b5d upstream
+
+Make the inode tree reclaim walk exclusive to avoid races with
+concurrent sync walkers and lookups. This is a version of a patch
+posted by Christoph Hellwig that avoids all the code duplication.
+
+Signed-off-by: Dave Chinner <david@fromorbit.com>
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Signed-off-by: Alex Elder <aelder@sgi.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/xfs/linux-2.6/xfs_sync.c | 154 ++++++++++++++++++-----------------------
+ fs/xfs/linux-2.6/xfs_sync.h | 2
+ fs/xfs/quota/xfs_qm_syscalls.c | 2
+ 3 files changed, 71 insertions(+), 87 deletions(-)
+
+--- a/fs/xfs/linux-2.6/xfs_sync.c
++++ b/fs/xfs/linux-2.6/xfs_sync.c
+@@ -64,7 +64,6 @@ xfs_inode_ag_lookup(
+ * as the tree is sparse and a gang lookup walks to find
+ * the number of objects requested.
+ */
+- read_lock(&pag->pag_ici_lock);
+ if (tag == XFS_ICI_NO_TAG) {
+ nr_found = radix_tree_gang_lookup(&pag->pag_ici_root,
+ (void **)&ip, *first_index, 1);
+@@ -73,7 +72,7 @@ xfs_inode_ag_lookup(
+ (void **)&ip, *first_index, 1, tag);
+ }
+ if (!nr_found)
+- goto unlock;
++ return NULL;
+
+ /*
+ * Update the index for the next lookup. Catch overflows
+@@ -83,13 +82,8 @@ xfs_inode_ag_lookup(
+ */
+ *first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
+ if (*first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
+- goto unlock;
+-
++ return NULL;
+ return ip;
+-
+-unlock:
+- read_unlock(&pag->pag_ici_lock);
+- return NULL;
+ }
+
+ STATIC int
+@@ -99,7 +93,8 @@ xfs_inode_ag_walk(
+ int (*execute)(struct xfs_inode *ip,
+ struct xfs_perag *pag, int flags),
+ int flags,
+- int tag)
++ int tag,
++ int exclusive)
+ {
+ struct xfs_perag *pag = &mp->m_perag[ag];
+ uint32_t first_index;
+@@ -113,10 +108,20 @@ restart:
+ int error = 0;
+ xfs_inode_t *ip;
+
++ if (exclusive)
++ write_lock(&pag->pag_ici_lock);
++ else
++ read_lock(&pag->pag_ici_lock);
+ ip = xfs_inode_ag_lookup(mp, pag, &first_index, tag);
+- if (!ip)
++ if (!ip) {
++ if (exclusive)
++ write_unlock(&pag->pag_ici_lock);
++ else
++ read_unlock(&pag->pag_ici_lock);
+ break;
++ }
+
++ /* execute releases pag->pag_ici_lock */
+ error = execute(ip, pag, flags);
+ if (error == EAGAIN) {
+ skipped++;
+@@ -124,9 +129,8 @@ restart:
+ }
+ if (error)
+ last_error = error;
+- /*
+- * bail out if the filesystem is corrupted.
+- */
++
++ /* bail out if the filesystem is corrupted. */
+ if (error == EFSCORRUPTED)
+ break;
+
+@@ -147,7 +151,8 @@ xfs_inode_ag_iterator(
+ int (*execute)(struct xfs_inode *ip,
+ struct xfs_perag *pag, int flags),
+ int flags,
+- int tag)
++ int tag,
++ int exclusive)
+ {
+ int error = 0;
+ int last_error = 0;
+@@ -156,7 +161,8 @@ xfs_inode_ag_iterator(
+ for (ag = 0; ag < mp->m_sb.sb_agcount; ag++) {
+ if (!mp->m_perag[ag].pag_ici_init)
+ continue;
+- error = xfs_inode_ag_walk(mp, ag, execute, flags, tag);
++ error = xfs_inode_ag_walk(mp, ag, execute, flags, tag,
++ exclusive);
+ if (error) {
+ last_error = error;
+ if (error == EFSCORRUPTED)
+@@ -180,11 +186,7 @@ xfs_sync_inode_valid(
+ return EFSCORRUPTED;
+ }
+
+- /*
+- * If we can't get a reference on the inode, it must be in reclaim.
+- * Leave it for the reclaim code to flush. Also avoid inodes that
+- * haven't been fully initialised.
+- */
++ /* If we can't get a reference on the inode, it must be in reclaim. */
+ if (!igrab(inode)) {
+ read_unlock(&pag->pag_ici_lock);
+ return ENOENT;
+@@ -281,7 +283,7 @@ xfs_sync_data(
+ ASSERT((flags & ~(SYNC_TRYLOCK|SYNC_WAIT)) == 0);
+
+ error = xfs_inode_ag_iterator(mp, xfs_sync_inode_data, flags,
+- XFS_ICI_NO_TAG);
++ XFS_ICI_NO_TAG, 0);
+ if (error)
+ return XFS_ERROR(error);
+
+@@ -303,7 +305,7 @@ xfs_sync_attr(
+ ASSERT((flags & ~SYNC_WAIT) == 0);
+
+ return xfs_inode_ag_iterator(mp, xfs_sync_inode_attr, flags,
+- XFS_ICI_NO_TAG);
++ XFS_ICI_NO_TAG, 0);
+ }
+
+ STATIC int
+@@ -663,60 +665,6 @@ xfs_syncd_stop(
+ kthread_stop(mp->m_sync_task);
+ }
+
+-STATIC int
+-xfs_reclaim_inode(
+- xfs_inode_t *ip,
+- int sync_mode)
+-{
+- xfs_perag_t *pag = xfs_get_perag(ip->i_mount, ip->i_ino);
+-
+- /* The hash lock here protects a thread in xfs_iget_core from
+- * racing with us on linking the inode back with a vnode.
+- * Once we have the XFS_IRECLAIM flag set it will not touch
+- * us.
+- */
+- write_lock(&pag->pag_ici_lock);
+- spin_lock(&ip->i_flags_lock);
+- if (__xfs_iflags_test(ip, XFS_IRECLAIM) ||
+- !__xfs_iflags_test(ip, XFS_IRECLAIMABLE)) {
+- spin_unlock(&ip->i_flags_lock);
+- write_unlock(&pag->pag_ici_lock);
+- return -EAGAIN;
+- }
+- __xfs_iflags_set(ip, XFS_IRECLAIM);
+- spin_unlock(&ip->i_flags_lock);
+- write_unlock(&pag->pag_ici_lock);
+- xfs_put_perag(ip->i_mount, pag);
+-
+- /*
+- * If the inode is still dirty, then flush it out. If the inode
+- * is not in the AIL, then it will be OK to flush it delwri as
+- * long as xfs_iflush() does not keep any references to the inode.
+- * We leave that decision up to xfs_iflush() since it has the
+- * knowledge of whether it's OK to simply do a delwri flush of
+- * the inode or whether we need to wait until the inode is
+- * pulled from the AIL.
+- * We get the flush lock regardless, though, just to make sure
+- * we don't free it while it is being flushed.
+- */
+- xfs_ilock(ip, XFS_ILOCK_EXCL);
+- xfs_iflock(ip);
+-
+- /*
+- * In the case of a forced shutdown we rely on xfs_iflush() to
+- * wait for the inode to be unpinned before returning an error.
+- */
+- if (!is_bad_inode(VFS_I(ip)) && xfs_iflush(ip, sync_mode) == 0) {
+- /* synchronize with xfs_iflush_done */
+- xfs_iflock(ip);
+- xfs_ifunlock(ip);
+- }
+-
+- xfs_iunlock(ip, XFS_ILOCK_EXCL);
+- xfs_ireclaim(ip);
+- return 0;
+-}
+-
+ void
+ __xfs_inode_set_reclaim_tag(
+ struct xfs_perag *pag,
+@@ -759,19 +707,55 @@ __xfs_inode_clear_reclaim_tag(
+ }
+
+ STATIC int
+-xfs_reclaim_inode_now(
++xfs_reclaim_inode(
+ struct xfs_inode *ip,
+ struct xfs_perag *pag,
+- int flags)
++ int sync_mode)
+ {
+- /* ignore if already under reclaim */
+- if (xfs_iflags_test(ip, XFS_IRECLAIM)) {
+- read_unlock(&pag->pag_ici_lock);
++ /*
++ * The radix tree lock here protects a thread in xfs_iget from racing
++ * with us starting reclaim on the inode. Once we have the
++ * XFS_IRECLAIM flag set it will not touch us.
++ */
++ spin_lock(&ip->i_flags_lock);
++ ASSERT_ALWAYS(__xfs_iflags_test(ip, XFS_IRECLAIMABLE));
++ if (__xfs_iflags_test(ip, XFS_IRECLAIM)) {
++ /* ignore as it is already under reclaim */
++ spin_unlock(&ip->i_flags_lock);
++ write_unlock(&pag->pag_ici_lock);
+ return 0;
+ }
+- read_unlock(&pag->pag_ici_lock);
++ __xfs_iflags_set(ip, XFS_IRECLAIM);
++ spin_unlock(&ip->i_flags_lock);
++ write_unlock(&pag->pag_ici_lock);
+
+- return xfs_reclaim_inode(ip, flags);
++ /*
++ * If the inode is still dirty, then flush it out. If the inode
++ * is not in the AIL, then it will be OK to flush it delwri as
++ * long as xfs_iflush() does not keep any references to the inode.
++ * We leave that decision up to xfs_iflush() since it has the
++ * knowledge of whether it's OK to simply do a delwri flush of
++ * the inode or whether we need to wait until the inode is
++ * pulled from the AIL.
++ * We get the flush lock regardless, though, just to make sure
++ * we don't free it while it is being flushed.
++ */
++ xfs_ilock(ip, XFS_ILOCK_EXCL);
++ xfs_iflock(ip);
++
++ /*
++ * In the case of a forced shutdown we rely on xfs_iflush() to
++ * wait for the inode to be unpinned before returning an error.
++ */
++ if (!is_bad_inode(VFS_I(ip)) && xfs_iflush(ip, sync_mode) == 0) {
++ /* synchronize with xfs_iflush_done */
++ xfs_iflock(ip);
++ xfs_ifunlock(ip);
++ }
++
++ xfs_iunlock(ip, XFS_ILOCK_EXCL);
++ xfs_ireclaim(ip);
++ return 0;
+ }
+
+ int
+@@ -779,6 +763,6 @@ xfs_reclaim_inodes(
+ xfs_mount_t *mp,
+ int mode)
+ {
+- return xfs_inode_ag_iterator(mp, xfs_reclaim_inode_now, mode,
+- XFS_ICI_RECLAIM_TAG);
++ return xfs_inode_ag_iterator(mp, xfs_reclaim_inode, mode,
++ XFS_ICI_RECLAIM_TAG, 1);
+ }
+--- a/fs/xfs/linux-2.6/xfs_sync.h
++++ b/fs/xfs/linux-2.6/xfs_sync.h
+@@ -54,6 +54,6 @@ void __xfs_inode_clear_reclaim_tag(struc
+ int xfs_sync_inode_valid(struct xfs_inode *ip, struct xfs_perag *pag);
+ int xfs_inode_ag_iterator(struct xfs_mount *mp,
+ int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag, int flags),
+- int flags, int tag);
++ int flags, int tag, int write_lock);
+
+ #endif
+--- a/fs/xfs/quota/xfs_qm_syscalls.c
++++ b/fs/xfs/quota/xfs_qm_syscalls.c
+@@ -893,7 +893,7 @@ xfs_qm_dqrele_all_inodes(
+ uint flags)
+ {
+ ASSERT(mp->m_quotainfo);
+- xfs_inode_ag_iterator(mp, xfs_dqrele_inode, flags, XFS_ICI_NO_TAG);
++ xfs_inode_ag_iterator(mp, xfs_dqrele_inode, flags, XFS_ICI_NO_TAG, 0);
+ }
+
+ /*------------------------------------------------------------------------*/
--- /dev/null
+From david@fromorbit.com Fri Apr 2 11:12:06 2010
+From: Christoph Hellwig <hch@lst.de>
+Date: Fri, 12 Mar 2010 09:42:15 +1100
+Subject: xfs: remove invalid barrier optimization from xfs_fsync
+To: stable@kernel.org
+Cc: xfs@oss.sgi.com
+Message-ID: <1268347337-7160-18-git-send-email-david@fromorbit.com>
+
+
+From: Christoph Hellwig <hch@lst.de>
+
+commit e8b217e7530c6a073ac69f1c85b922d93fdf5647 upstream
+
+Date: Tue, 2 Feb 2010 10:16:26 +1100
+We always need to flush the disk write cache and can't skip it just because
+no inode attributes have changed.
+
+Signed-off-by: Christoph Hellwig <hch@lst.de>
+Reviewed-by: Dave Chinner <david@fromorbit.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/xfs/xfs_vnodeops.c | 12 ++----------
+ 1 file changed, 2 insertions(+), 10 deletions(-)
+
+--- a/fs/xfs/xfs_vnodeops.c
++++ b/fs/xfs/xfs_vnodeops.c
+@@ -597,7 +597,7 @@ xfs_fsync(
+ {
+ xfs_trans_t *tp;
+ int error = 0;
+- int log_flushed = 0, changed = 1;
++ int log_flushed = 0;
+
+ xfs_itrace_entry(ip);
+
+@@ -627,19 +627,11 @@ xfs_fsync(
+ * disk yet, the inode will be still be pinned. If it is,
+ * force the log.
+ */
+-
+ xfs_iunlock(ip, XFS_ILOCK_SHARED);
+-
+ if (xfs_ipincount(ip)) {
+ error = _xfs_log_force(ip->i_mount, (xfs_lsn_t)0,
+ XFS_LOG_FORCE | XFS_LOG_SYNC,
+ &log_flushed);
+- } else {
+- /*
+- * If the inode is not pinned and nothing has changed
+- * we don't need to flush the cache.
+- */
+- changed = 0;
+ }
+ } else {
+ /*
+@@ -674,7 +666,7 @@ xfs_fsync(
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ }
+
+- if ((ip->i_mount->m_flags & XFS_MOUNT_BARRIER) && changed) {
++ if (ip->i_mount->m_flags & XFS_MOUNT_BARRIER) {
+ /*
+ * If the log write didn't issue an ordered tag we need
+ * to flush the disk cache for the data device now.
--- /dev/null
+From david@fromorbit.com Fri Apr 2 11:04:20 2010
+From: Christoph Hellwig <hch@infradead.org>
+Date: Fri, 12 Mar 2010 09:41:59 +1100
+Subject: xfs: simplify inode teardown
+To: stable@kernel.org
+Cc: xfs@oss.sgi.com
+Message-ID: <1268347337-7160-2-git-send-email-david@fromorbit.com>
+
+
+From: Christoph Hellwig <hch@infradead.org>
+
+commit 848ce8f731aed0a2d4ab5884a4f6664af73d2dd0 upstream
+
+Currently the reclaim code for the case where we don't reclaim the
+final reclaim is overly complicated. We know that the inode is clean
+but instead of just directly reclaiming the clean inode we go through
+the whole process of marking the inode reclaimable just to directly
+reclaim it from the calling context. Besides being overly complicated
+this introduces a race where iget could recycle an inode between
+marked reclaimable and actually being reclaimed leading to panics.
+
+This patch gets rid of the existing reclaim path, and replaces it with
+a simple call to xfs_ireclaim if the inode was clean. While we're at
+it we also use the slightly more lax xfs_inode_clean check we'd use
+later to determine if we need to flush the inode here.
+
+Finally get rid of xfs_reclaim function and place the remaining small
+bits of reclaim code directly into xfs_fs_destroy_inode.
+
+Signed-off-by: Christoph Hellwig <hch@lst.de>
+Reported-by: Patrick Schreurs <patrick@news-service.com>
+Reported-by: Tommy van Leeuwen <tommy@news-service.com>
+Tested-by: Patrick Schreurs <patrick@news-service.com>
+Reviewed-by: Alex Elder <aelder@sgi.com>
+Signed-off-by: Alex Elder <aelder@sgi.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/xfs/linux-2.6/xfs_super.c | 34 ++++++++++++++++++++++++++++++----
+ fs/xfs/linux-2.6/xfs_sync.c | 15 ++++-----------
+ fs/xfs/linux-2.6/xfs_sync.h | 1 -
+ fs/xfs/xfs_vnodeops.c | 40 ----------------------------------------
+ fs/xfs/xfs_vnodeops.h | 1 -
+ 5 files changed, 34 insertions(+), 57 deletions(-)
+
+--- a/fs/xfs/linux-2.6/xfs_super.c
++++ b/fs/xfs/linux-2.6/xfs_super.c
+@@ -930,13 +930,39 @@ xfs_fs_alloc_inode(
+ */
+ STATIC void
+ xfs_fs_destroy_inode(
+- struct inode *inode)
++ struct inode *inode)
+ {
+- xfs_inode_t *ip = XFS_I(inode);
++ struct xfs_inode *ip = XFS_I(inode);
++
++ xfs_itrace_entry(ip);
+
+ XFS_STATS_INC(vn_reclaim);
+- if (xfs_reclaim(ip))
+- panic("%s: cannot reclaim 0x%p\n", __func__, inode);
++
++ /* bad inode, get out here ASAP */
++ if (is_bad_inode(inode))
++ goto out_reclaim;
++
++ xfs_ioend_wait(ip);
++
++ ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || ip->i_delayed_blks == 0);
++
++ /*
++ * We should never get here with one of the reclaim flags already set.
++ */
++ ASSERT_ALWAYS(!xfs_iflags_test(ip, XFS_IRECLAIMABLE));
++ ASSERT_ALWAYS(!xfs_iflags_test(ip, XFS_IRECLAIM));
++
++ /*
++ * If we have nothing to flush with this inode then complete the
++ * teardown now, otherwise delay the flush operation.
++ */
++ if (!xfs_inode_clean(ip)) {
++ xfs_inode_set_reclaim_tag(ip);
++ return;
++ }
++
++out_reclaim:
++ xfs_ireclaim(ip);
+ }
+
+ /*
+--- a/fs/xfs/linux-2.6/xfs_sync.c
++++ b/fs/xfs/linux-2.6/xfs_sync.c
+@@ -663,10 +663,9 @@ xfs_syncd_stop(
+ kthread_stop(mp->m_sync_task);
+ }
+
+-int
++STATIC int
+ xfs_reclaim_inode(
+ xfs_inode_t *ip,
+- int locked,
+ int sync_mode)
+ {
+ xfs_perag_t *pag = xfs_get_perag(ip->i_mount, ip->i_ino);
+@@ -682,10 +681,6 @@ xfs_reclaim_inode(
+ !__xfs_iflags_test(ip, XFS_IRECLAIMABLE)) {
+ spin_unlock(&ip->i_flags_lock);
+ write_unlock(&pag->pag_ici_lock);
+- if (locked) {
+- xfs_ifunlock(ip);
+- xfs_iunlock(ip, XFS_ILOCK_EXCL);
+- }
+ return -EAGAIN;
+ }
+ __xfs_iflags_set(ip, XFS_IRECLAIM);
+@@ -704,10 +699,8 @@ xfs_reclaim_inode(
+ * We get the flush lock regardless, though, just to make sure
+ * we don't free it while it is being flushed.
+ */
+- if (!locked) {
+- xfs_ilock(ip, XFS_ILOCK_EXCL);
+- xfs_iflock(ip);
+- }
++ xfs_ilock(ip, XFS_ILOCK_EXCL);
++ xfs_iflock(ip);
+
+ /*
+ * In the case of a forced shutdown we rely on xfs_iflush() to
+@@ -778,7 +771,7 @@ xfs_reclaim_inode_now(
+ }
+ read_unlock(&pag->pag_ici_lock);
+
+- return xfs_reclaim_inode(ip, 0, flags);
++ return xfs_reclaim_inode(ip, flags);
+ }
+
+ int
+--- a/fs/xfs/linux-2.6/xfs_sync.h
++++ b/fs/xfs/linux-2.6/xfs_sync.h
+@@ -44,7 +44,6 @@ void xfs_quiesce_attr(struct xfs_mount *
+
+ void xfs_flush_inodes(struct xfs_inode *ip);
+
+-int xfs_reclaim_inode(struct xfs_inode *ip, int locked, int sync_mode);
+ int xfs_reclaim_inodes(struct xfs_mount *mp, int mode);
+
+ void xfs_inode_set_reclaim_tag(struct xfs_inode *ip);
+--- a/fs/xfs/xfs_vnodeops.c
++++ b/fs/xfs/xfs_vnodeops.c
+@@ -2456,46 +2456,6 @@ xfs_set_dmattrs(
+ return error;
+ }
+
+-int
+-xfs_reclaim(
+- xfs_inode_t *ip)
+-{
+-
+- xfs_itrace_entry(ip);
+-
+- ASSERT(!VN_MAPPED(VFS_I(ip)));
+-
+- /* bad inode, get out here ASAP */
+- if (is_bad_inode(VFS_I(ip))) {
+- xfs_ireclaim(ip);
+- return 0;
+- }
+-
+- xfs_ioend_wait(ip);
+-
+- ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || ip->i_delayed_blks == 0);
+-
+- /*
+- * If we have nothing to flush with this inode then complete the
+- * teardown now, otherwise break the link between the xfs inode and the
+- * linux inode and clean up the xfs inode later. This avoids flushing
+- * the inode to disk during the delete operation itself.
+- *
+- * When breaking the link, we need to set the XFS_IRECLAIMABLE flag
+- * first to ensure that xfs_iunpin() will never see an xfs inode
+- * that has a linux inode being reclaimed. Synchronisation is provided
+- * by the i_flags_lock.
+- */
+- if (!ip->i_update_core && (ip->i_itemp == NULL)) {
+- xfs_ilock(ip, XFS_ILOCK_EXCL);
+- xfs_iflock(ip);
+- xfs_iflags_set(ip, XFS_IRECLAIMABLE);
+- return xfs_reclaim_inode(ip, 1, XFS_IFLUSH_DELWRI_ELSE_SYNC);
+- }
+- xfs_inode_set_reclaim_tag(ip);
+- return 0;
+-}
+-
+ /*
+ * xfs_alloc_file_space()
+ * This routine allocates disk space for the given file.
+--- a/fs/xfs/xfs_vnodeops.h
++++ b/fs/xfs/xfs_vnodeops.h
+@@ -38,7 +38,6 @@ int xfs_symlink(struct xfs_inode *dp, st
+ const char *target_path, mode_t mode, struct xfs_inode **ipp,
+ cred_t *credp);
+ int xfs_set_dmattrs(struct xfs_inode *ip, u_int evmask, u_int16_t state);
+-int xfs_reclaim(struct xfs_inode *ip);
+ int xfs_change_file_space(struct xfs_inode *ip, int cmd,
+ xfs_flock64_t *bf, xfs_off_t offset, int attr_flags);
+ int xfs_rename(struct xfs_inode *src_dp, struct xfs_name *src_name,
--- /dev/null
+From david@fromorbit.com Fri Apr 2 11:06:08 2010
+From: Andy Poling <andy@realbig.com>
+Date: Fri, 12 Mar 2010 09:42:02 +1100
+Subject: xfs: Wrapped journal record corruption on read at recovery
+To: stable@kernel.org
+Cc: xfs@oss.sgi.com
+Message-ID: <1268347337-7160-5-git-send-email-david@fromorbit.com>
+
+
+From: Andy Poling <andy@realbig.com>
+
+commit fc5bc4c85c45f0bf854404e5736aa8b65720a18d upstream
+
+Summary of problem:
+
+If a journal record wraps at the physical end of the journal, it has to be
+read in two parts in xlog_do_recovery_pass(): a read at the physical end and a
+read at the physical beginning. If xlog_bread() has to re-align the first
+read, the second read request does not take that re-alignment into account.
+If the first read was re-aligned, the second read over-writes the end of the
+data from the first read, effectively corrupting it. This can happen either
+when reading the record header or reading the record data.
+
+The first sanity check in xlog_recover_process_data() is to check for a valid
+clientid, so that is the error reported.
+
+Summary of fix:
+
+If there was a first read at the physical end, XFS_BUF_PTR() returns where the
+data was requested to begin. Conversely, because it is the result of
+xlog_align(), offset indicates where the requested data for the first read
+actually begins - whether or not xlog_bread() has re-aligned it.
+
+Using offset as the base for the calculation of where to place the second read
+data ensures that it will be correctly placed immediately following the data
+from the first read instead of sometimes over-writing the end of it.
+
+The attached patch has resolved the reported problem of occasional inability
+to recover the journal (reporting "bad clientid").
+
+Signed-off-by: Andy Poling <andy@realbig.com>
+Reviewed-by: Alex Elder <aelder@sgi.com>
+Signed-off-by: Alex Elder <aelder@sgi.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/xfs/xfs_log_recover.c | 24 +++++++-----------------
+ 1 file changed, 7 insertions(+), 17 deletions(-)
+
+--- a/fs/xfs/xfs_log_recover.c
++++ b/fs/xfs/xfs_log_recover.c
+@@ -3517,7 +3517,7 @@ xlog_do_recovery_pass(
+ {
+ xlog_rec_header_t *rhead;
+ xfs_daddr_t blk_no;
+- xfs_caddr_t bufaddr, offset;
++ xfs_caddr_t offset;
+ xfs_buf_t *hbp, *dbp;
+ int error = 0, h_size;
+ int bblks, split_bblks;
+@@ -3610,7 +3610,7 @@ xlog_do_recovery_pass(
+ /*
+ * Check for header wrapping around physical end-of-log
+ */
+- offset = NULL;
++ offset = XFS_BUF_PTR(hbp);
+ split_hblks = 0;
+ wrapped_hblks = 0;
+ if (blk_no + hblks <= log->l_logBBsize) {
+@@ -3646,9 +3646,8 @@ xlog_do_recovery_pass(
+ * - order is important.
+ */
+ wrapped_hblks = hblks - split_hblks;
+- bufaddr = XFS_BUF_PTR(hbp);
+ error = XFS_BUF_SET_PTR(hbp,
+- bufaddr + BBTOB(split_hblks),
++ offset + BBTOB(split_hblks),
+ BBTOB(hblks - split_hblks));
+ if (error)
+ goto bread_err2;
+@@ -3658,14 +3657,10 @@ xlog_do_recovery_pass(
+ if (error)
+ goto bread_err2;
+
+- error = XFS_BUF_SET_PTR(hbp, bufaddr,
++ error = XFS_BUF_SET_PTR(hbp, offset,
+ BBTOB(hblks));
+ if (error)
+ goto bread_err2;
+-
+- if (!offset)
+- offset = xlog_align(log, 0,
+- wrapped_hblks, hbp);
+ }
+ rhead = (xlog_rec_header_t *)offset;
+ error = xlog_valid_rec_header(log, rhead,
+@@ -3685,7 +3680,7 @@ xlog_do_recovery_pass(
+ } else {
+ /* This log record is split across the
+ * physical end of log */
+- offset = NULL;
++ offset = XFS_BUF_PTR(dbp);
+ split_bblks = 0;
+ if (blk_no != log->l_logBBsize) {
+ /* some data is before the physical
+@@ -3714,9 +3709,8 @@ xlog_do_recovery_pass(
+ * _first_, then the log start (LR header end)
+ * - order is important.
+ */
+- bufaddr = XFS_BUF_PTR(dbp);
+ error = XFS_BUF_SET_PTR(dbp,
+- bufaddr + BBTOB(split_bblks),
++ offset + BBTOB(split_bblks),
+ BBTOB(bblks - split_bblks));
+ if (error)
+ goto bread_err2;
+@@ -3727,13 +3721,9 @@ xlog_do_recovery_pass(
+ if (error)
+ goto bread_err2;
+
+- error = XFS_BUF_SET_PTR(dbp, bufaddr, h_size);
++ error = XFS_BUF_SET_PTR(dbp, offset, h_size);
+ if (error)
+ goto bread_err2;
+-
+- if (!offset)
+- offset = xlog_align(log, wrapped_hblks,
+- bblks - split_bblks, dbp);
+ }
+ xlog_unpack_data(rhead, offset, log);
+ if ((error = xlog_recover_process_data(log, rhash,
--- /dev/null
+From david@fromorbit.com Fri Apr 2 11:10:52 2010
+From: Dave Chinner <david@fromorbit.com>
+Date: Fri, 12 Mar 2010 09:42:12 +1100
+Subject: xfs: xfs_swap_extents needs to handle dynamic fork offsets
+To: stable@kernel.org
+Cc: xfs@oss.sgi.com
+Message-ID: <1268347337-7160-15-git-send-email-david@fromorbit.com>
+
+From: Dave Chinner <david@fromorbit.com>
+
+commit e09f98606dcc156de1146c209d45a0d6d5f51c3f upstream
+
+When swapping extents, we can corrupt inodes by swapping data forks
+that are in incompatible formats. This is caused by the two inodes
+having different fork offsets due to the presence of an attribute
+fork on an attr2 filesystem. xfs_fsr tries to be smart about
+setting the fork offset, but the trick it plays only works on attr1
+(old fixed format attribute fork) filesystems.
+
+Changing the way xfs_fsr sets up the attribute fork will prevent
+this situation from ever occurring, so in the kernel code we can get
+by with a preventative fix - check that the data fork in the
+defragmented inode is in a format valid for the inode it is being
+swapped into. This will lead to files that will silently and
+potentially repeatedly fail defragmentation, so issue a warning to
+the log when this particular failure occurs to let us know that
+xfs_fsr needs updating/fixing.
+
+To help identify how to improve xfs_fsr to avoid this issue, add
+trace points for the inodes being swapped so that we can determine
+why the swap was rejected and to confirm that the code is making the
+right decisions and modifications when swapping forks.
+
+A further complication is even when the swap is allowed to proceed
+when the fork offset is different between the two inodes then the value
+for the maximum number of extents the data fork can hold can be
+wrong. Make sure these are also set correctly after the swap occurs.
+
+Signed-off-by: Dave Chinner <david@fromorbit.com>
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Signed-off-by: Alex Elder <aelder@sgi.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ fs/xfs/xfs_dfrag.c | 106 +++++++++++++++++++++++++++++++++++++++++++++--------
+ 1 file changed, 90 insertions(+), 16 deletions(-)
+
+--- a/fs/xfs/xfs_dfrag.c
++++ b/fs/xfs/xfs_dfrag.c
+@@ -113,10 +113,82 @@ xfs_swapext(
+ return error;
+ }
+
++/*
++ * We need to check that the format of the data fork in the temporary inode is
++ * valid for the target inode before doing the swap. This is not a problem with
++ * attr1 because of the fixed fork offset, but attr2 has a dynamically sized
++ * data fork depending on the space the attribute fork is taking so we can get
++ * invalid formats on the target inode.
++ *
++ * E.g. target has space for 7 extents in extent format, temp inode only has
++ * space for 6. If we defragment down to 7 extents, then the tmp format is a
++ * btree, but when swapped it needs to be in extent format. Hence we can't just
++ * blindly swap data forks on attr2 filesystems.
++ *
++ * Note that we check the swap in both directions so that we don't end up with
++ * a corrupt temporary inode, either.
++ *
++ * Note that fixing the way xfs_fsr sets up the attribute fork in the source
++ * inode will prevent this situation from occurring, so all we do here is
++ * reject and log the attempt. basically we are putting the responsibility on
++ * userspace to get this right.
++ */
++static int
++xfs_swap_extents_check_format(
++ xfs_inode_t *ip, /* target inode */
++ xfs_inode_t *tip) /* tmp inode */
++{
++
++ /* Should never get a local format */
++ if (ip->i_d.di_format == XFS_DINODE_FMT_LOCAL ||
++ tip->i_d.di_format == XFS_DINODE_FMT_LOCAL)
++ return EINVAL;
++
++ /*
++ * if the target inode has fewer extents than the temporary inode then
++ * why did userspace call us?
++ */
++ if (ip->i_d.di_nextents < tip->i_d.di_nextents)
++ return EINVAL;
++
++ /*
++ * if the target inode is in extent form and the temp inode is in btree
++ * form then we will end up with the target inode in the wrong format
++ * as we already know there are less extents in the temp inode.
++ */
++ if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
++ tip->i_d.di_format == XFS_DINODE_FMT_BTREE)
++ return EINVAL;
++
++ /* Check temp in extent form to max in target */
++ if (tip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
++ XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) > ip->i_df.if_ext_max)
++ return EINVAL;
++
++ /* Check target in extent form to max in temp */
++ if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
++ XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) > tip->i_df.if_ext_max)
++ return EINVAL;
++
++ /* Check root block of temp in btree form to max in target */
++ if (tip->i_d.di_format == XFS_DINODE_FMT_BTREE &&
++ XFS_IFORK_BOFF(ip) &&
++ tip->i_df.if_broot_bytes > XFS_IFORK_BOFF(ip))
++ return EINVAL;
++
++ /* Check root block of target in btree form to max in temp */
++ if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE &&
++ XFS_IFORK_BOFF(tip) &&
++ ip->i_df.if_broot_bytes > XFS_IFORK_BOFF(tip))
++ return EINVAL;
++
++ return 0;
++}
++
+ int
+ xfs_swap_extents(
+- xfs_inode_t *ip,
+- xfs_inode_t *tip,
++ xfs_inode_t *ip, /* target inode */
++ xfs_inode_t *tip, /* tmp inode */
+ xfs_swapext_t *sxp)
+ {
+ xfs_mount_t *mp;
+@@ -160,13 +232,6 @@ xfs_swap_extents(
+ goto out_unlock;
+ }
+
+- /* Should never get a local format */
+- if (ip->i_d.di_format == XFS_DINODE_FMT_LOCAL ||
+- tip->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
+- error = XFS_ERROR(EINVAL);
+- goto out_unlock;
+- }
+-
+ if (VN_CACHED(VFS_I(tip)) != 0) {
+ xfs_inval_cached_trace(tip, 0, -1, 0, -1);
+ error = xfs_flushinval_pages(tip, 0, -1,
+@@ -189,13 +254,12 @@ xfs_swap_extents(
+ goto out_unlock;
+ }
+
+- /*
+- * If the target has extended attributes, the tmp file
+- * must also in order to ensure the correct data fork
+- * format.
+- */
+- if ( XFS_IFORK_Q(ip) != XFS_IFORK_Q(tip) ) {
+- error = XFS_ERROR(EINVAL);
++ /* check inode formats now that data is flushed */
++ error = xfs_swap_extents_check_format(ip, tip);
++ if (error) {
++ xfs_fs_cmn_err(CE_NOTE, mp,
++ "%s: inode 0x%llx format is incompatible for exchanging.",
++ __FILE__, ip->i_ino);
+ goto out_unlock;
+ }
+
+@@ -276,6 +340,16 @@ xfs_swap_extents(
+ *tifp = *tempifp; /* struct copy */
+
+ /*
++ * Fix the in-memory data fork values that are dependent on the fork
++ * offset in the inode. We can't assume they remain the same as attr2
++ * has dynamic fork offsets.
++ */
++ ifp->if_ext_max = XFS_IFORK_SIZE(ip, XFS_DATA_FORK) /
++ (uint)sizeof(xfs_bmbt_rec_t);
++ tifp->if_ext_max = XFS_IFORK_SIZE(tip, XFS_DATA_FORK) /
++ (uint)sizeof(xfs_bmbt_rec_t);
++
++ /*
+ * Fix the on-disk inode values
+ */
+ tmp = (__uint64_t)ip->i_d.di_nblocks;