From: Greg Kroah-Hartman Date: Fri, 2 Apr 2010 18:13:51 +0000 (-0700) Subject: .32 xfs patches X-Git-Tag: v2.6.32.12~60 X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=cc67517a3f3a86de7188d8a16dd26cb91e5a31da;p=thirdparty%2Fkernel%2Fstable-queue.git .32 xfs patches --- diff --git a/queue-2.6.32/series b/queue-2.6.32/series index 8e4a7266c65..f15d7868dc1 100644 --- a/queue-2.6.32/series +++ b/queue-2.6.32/series @@ -6,3 +6,22 @@ oom-fix-the-unsafe-usage-of-badness-in-proc_oom_score.patch drm-radeon-kms-don-t-print-error-on-erestartsys.patch drm-radeon-kms-fix-pal-tv-out-support-on-legacy-igp-chips.patch drm-return-enodev-if-the-inode-mapping-changes.patch +xfs-simplify-inode-teardown.patch +xfs-fix-mmap_sem-iolock-inversion-in-xfs_free_eofblocks.patch +xfs-i-o-completion-handlers-must-use-nofs-allocations.patch +xfs-wrapped-journal-record-corruption-on-read-at-recovery.patch +xfs-fix-error-return-for-fallocate-on-xfs.patch +xfs-check-for-not-fully-initialized-inodes-in-xfs_ireclaim.patch +xfs-fix-timestamp-handling-in-xfs_setattr.patch +xfs-don-t-flush-stale-inodes.patch +xfs-ensure-we-force-all-busy-extents-in-range-to-disk.patch +xfs-reclaim-inodes-under-a-write-lock.patch +xfs-avoid-inodes-in-reclaim-when-flushing-from-inode-cache.patch +xfs-reclaim-all-inodes-by-background-tree-walks.patch +xfs-fix-stale-inode-flush-avoidance.patch +xfs-xfs_swap_extents-needs-to-handle-dynamic-fork-offsets.patch +xfs-quota-limit-statvfs-available-blocks.patch +xfs-don-t-hold-onto-reserved-blocks-on-remount-ro.patch +xfs-remove-invalid-barrier-optimization-from-xfs_fsync.patch +xfs-non-blocking-inode-locking-in-io-completion.patch +xfs-fix-locking-for-inode-cache-radix-tree-tag-updates.patch diff --git a/queue-2.6.32/xfs-avoid-inodes-in-reclaim-when-flushing-from-inode-cache.patch b/queue-2.6.32/xfs-avoid-inodes-in-reclaim-when-flushing-from-inode-cache.patch new file mode 100644 index 00000000000..09eea8643c3 --- /dev/null +++ b/queue-2.6.32/xfs-avoid-inodes-in-reclaim-when-flushing-from-inode-cache.patch @@ -0,0 +1,73 @@ +From david@fromorbit.com Fri Apr 2 11:09:28 2010 +From: Dave Chinner +Date: Fri, 12 Mar 2010 09:42:09 +1100 +Subject: xfs: Avoid inodes in reclaim when flushing from inode cache +To: stable@kernel.org +Cc: xfs@oss.sgi.com +Message-ID: <1268347337-7160-12-git-send-email-david@fromorbit.com> + +From: Dave Chinner + +commit 018027be90a6946e8cf3f9b17b5582384f7ed117 upstream + +The reclaim code will handle flushing of dirty inodes before reclaim +occurs, so avoid them when determining whether an inode is a +candidate for flushing to disk when walking the radix trees. This +is based on a test patch from Christoph Hellwig. + +Signed-off-by: Dave Chinner +Reviewed-by: Christoph Hellwig +Signed-off-by: Alex Elder +Signed-off-by: Greg Kroah-Hartman + +--- + fs/xfs/linux-2.6/xfs_sync.c | 31 ++++++++++++++++++------------- + 1 file changed, 18 insertions(+), 13 deletions(-) + +--- a/fs/xfs/linux-2.6/xfs_sync.c ++++ b/fs/xfs/linux-2.6/xfs_sync.c +@@ -179,26 +179,31 @@ xfs_sync_inode_valid( + struct xfs_perag *pag) + { + struct inode *inode = VFS_I(ip); ++ int error = EFSCORRUPTED; + + /* nothing to sync during shutdown */ +- if (XFS_FORCED_SHUTDOWN(ip->i_mount)) { +- read_unlock(&pag->pag_ici_lock); +- return EFSCORRUPTED; +- } ++ if (XFS_FORCED_SHUTDOWN(ip->i_mount)) ++ goto out_unlock; + +- /* If we can't get a reference on the inode, it must be in reclaim. 
*/ +- if (!igrab(inode)) { +- read_unlock(&pag->pag_ici_lock); +- return ENOENT; +- } +- read_unlock(&pag->pag_ici_lock); ++ /* avoid new or reclaimable inodes. Leave for reclaim code to flush */ ++ error = ENOENT; ++ if (xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM)) ++ goto out_unlock; + +- if (is_bad_inode(inode) || xfs_iflags_test(ip, XFS_INEW)) { ++ /* If we can't grab the inode, it must on it's way to reclaim. */ ++ if (!igrab(inode)) ++ goto out_unlock; ++ ++ if (is_bad_inode(inode)) { + IRELE(ip); +- return ENOENT; ++ goto out_unlock; + } + +- return 0; ++ /* inode is valid */ ++ error = 0; ++out_unlock: ++ read_unlock(&pag->pag_ici_lock); ++ return error; + } + + STATIC int diff --git a/queue-2.6.32/xfs-check-for-not-fully-initialized-inodes-in-xfs_ireclaim.patch b/queue-2.6.32/xfs-check-for-not-fully-initialized-inodes-in-xfs_ireclaim.patch new file mode 100644 index 00000000000..5ff112ec0d3 --- /dev/null +++ b/queue-2.6.32/xfs-check-for-not-fully-initialized-inodes-in-xfs_ireclaim.patch @@ -0,0 +1,52 @@ +From david@fromorbit.com Fri Apr 2 11:07:09 2010 +From: Christoph Hellwig +Date: Fri, 12 Mar 2010 09:42:04 +1100 +Subject: xfs: check for not fully initialized inodes in xfs_ireclaim +To: stable@kernel.org +Cc: xfs@oss.sgi.com +Message-ID: <1268347337-7160-7-git-send-email-david@fromorbit.com> + + +From: Christoph Hellwig + +commit b44b1126279b60597f96bbe77507b1650f88a969 upstream + +Add an assert for inodes not added to the inode cache in xfs_ireclaim, +to make sure we're not going to introduce something like the +famous nfsd inode cache bug again. + +Signed-off-by: Christoph Hellwig +Signed-off-by: Alex Elder +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/xfs_iget.c | 12 ++++++++---- + 1 file changed, 8 insertions(+), 4 deletions(-) + +--- a/fs/xfs/xfs_iget.c ++++ b/fs/xfs/xfs_iget.c +@@ -511,17 +511,21 @@ xfs_ireclaim( + { + struct xfs_mount *mp = ip->i_mount; + struct xfs_perag *pag; ++ xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ip->i_ino); + + XFS_STATS_INC(xs_ig_reclaims); + + /* +- * Remove the inode from the per-AG radix tree. It doesn't matter +- * if it was never added to it because radix_tree_delete can deal +- * with that case just fine. ++ * Remove the inode from the per-AG radix tree. ++ * ++ * Because radix_tree_delete won't complain even if the item was never ++ * added to the tree assert that it's been there before to catch ++ * problems with the inode life time early on. + */ + pag = xfs_get_perag(mp, ip->i_ino); + write_lock(&pag->pag_ici_lock); +- radix_tree_delete(&pag->pag_ici_root, XFS_INO_TO_AGINO(mp, ip->i_ino)); ++ if (!radix_tree_delete(&pag->pag_ici_root, agino)) ++ ASSERT(0); + write_unlock(&pag->pag_ici_lock); + xfs_put_perag(mp, pag); + diff --git a/queue-2.6.32/xfs-don-t-flush-stale-inodes.patch b/queue-2.6.32/xfs-don-t-flush-stale-inodes.patch new file mode 100644 index 00000000000..0960321ea07 --- /dev/null +++ b/queue-2.6.32/xfs-don-t-flush-stale-inodes.patch @@ -0,0 +1,46 @@ +From david@fromorbit.com Fri Apr 2 11:08:07 2010 +From: Dave Chinner +Date: Fri, 12 Mar 2010 09:42:06 +1100 +Subject: xfs: Don't flush stale inodes +To: stable@kernel.org +Cc: xfs@oss.sgi.com +Message-ID: <1268347337-7160-9-git-send-email-david@fromorbit.com> + +From: Dave Chinner + +commit 44e08c45cc14e6190a424be8d450070c8e508fad upstream + +Because inodes remain in cache much longer than inode buffers do +under memory pressure, we can get the situation where we have +stale, dirty inodes being reclaimed but the backing storage has +been freed. 
Hence we should never, ever flush XFS_ISTALE inodes +to disk as there is no guarantee that the backing buffer is in +cache and still marked stale when the flush occurs. + +Signed-off-by: Dave Chinner +Signed-off-by: Alex Elder +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/xfs_inode.c | 10 +++++++--- + 1 file changed, 7 insertions(+), 3 deletions(-) + +--- a/fs/xfs/xfs_inode.c ++++ b/fs/xfs/xfs_inode.c +@@ -2877,10 +2877,14 @@ xfs_iflush( + mp = ip->i_mount; + + /* +- * If the inode isn't dirty, then just release the inode +- * flush lock and do nothing. ++ * If the inode isn't dirty, then just release the inode flush lock and ++ * do nothing. Treat stale inodes the same; we cannot rely on the ++ * backing buffer remaining stale in cache for the remaining life of ++ * the stale inode and so xfs_itobp() below may give us a buffer that ++ * no longer contains inodes below. Doing this stale check here also ++ * avoids forcing the log on pinned, stale inodes. + */ +- if (xfs_inode_clean(ip)) { ++ if (xfs_inode_clean(ip) || xfs_iflags_test(ip, XFS_ISTALE)) { + xfs_ifunlock(ip); + return 0; + } diff --git a/queue-2.6.32/xfs-don-t-hold-onto-reserved-blocks-on-remount-ro.patch b/queue-2.6.32/xfs-don-t-hold-onto-reserved-blocks-on-remount-ro.patch new file mode 100644 index 00000000000..d71b3ac33ae --- /dev/null +++ b/queue-2.6.32/xfs-don-t-hold-onto-reserved-blocks-on-remount-ro.patch @@ -0,0 +1,94 @@ +From david@fromorbit.com Fri Apr 2 11:11:43 2010 +From: Dave Chinner +Date: Fri, 12 Mar 2010 09:42:14 +1100 +Subject: xfs: don't hold onto reserved blocks on remount, ro +To: stable@kernel.org +Cc: xfs@oss.sgi.com +Message-ID: <1268347337-7160-17-git-send-email-david@fromorbit.com> + +From: Dave Chinner + +commit cbe132a8bdcff0f9afd9060948fb50597c7400b8 upstream + +If we hold onto reserved blocks when doing a remount,ro we end +up writing the blocks used count to disk that includes the reserved +blocks. Reserved blocks are not actually used, so this results in +the values in the superblock being incorrect. + +Hence if we run xfs_check or xfs_repair -n while the filesystem is +mounted remount,ro we end up with an inconsistent filesystem being +reported. Also, running xfs_copy on the remount,ro filesystem will +result in an inconsistent image being generated. + +To fix this, unreserve the blocks when doing the remount,ro, and +reserved them again on remount,rw. This way a remount,ro filesystem +will appear consistent on disk to all utilities. + +Signed-off-by: Dave Chinner +Reviewed-by: Christoph Hellwig +Signed-off-by: Greg Kroah-Hartman + +--- + fs/xfs/linux-2.6/xfs_super.c | 28 ++++++++++++++++++++++++++++ + fs/xfs/xfs_mount.h | 1 + + 2 files changed, 29 insertions(+) + +--- a/fs/xfs/linux-2.6/xfs_super.c ++++ b/fs/xfs/linux-2.6/xfs_super.c +@@ -1323,6 +1323,8 @@ xfs_fs_remount( + + /* ro -> rw */ + if ((mp->m_flags & XFS_MOUNT_RDONLY) && !(*flags & MS_RDONLY)) { ++ __uint64_t resblks; ++ + mp->m_flags &= ~XFS_MOUNT_RDONLY; + if (mp->m_flags & XFS_MOUNT_BARRIER) + xfs_mountfs_check_barriers(mp); +@@ -1340,11 +1342,37 @@ xfs_fs_remount( + } + mp->m_update_flags = 0; + } ++ ++ /* ++ * Fill out the reserve pool if it is empty. Use the stashed ++ * value if it is non-zero, otherwise go with the default. 
++ */ ++ if (mp->m_resblks_save) { ++ resblks = mp->m_resblks_save; ++ mp->m_resblks_save = 0; ++ } else { ++ resblks = mp->m_sb.sb_dblocks; ++ do_div(resblks, 20); ++ resblks = min_t(__uint64_t, resblks, 1024); ++ } ++ xfs_reserve_blocks(mp, &resblks, NULL); + } + + /* rw -> ro */ + if (!(mp->m_flags & XFS_MOUNT_RDONLY) && (*flags & MS_RDONLY)) { ++ /* ++ * After we have synced the data but before we sync the ++ * metadata, we need to free up the reserve block pool so that ++ * the used block count in the superblock on disk is correct at ++ * the end of the remount. Stash the current reserve pool size ++ * so that if we get remounted rw, we can return it to the same ++ * size. ++ */ ++ __uint64_t resblks = 0; ++ + xfs_quiesce_data(mp); ++ mp->m_resblks_save = mp->m_resblks; ++ xfs_reserve_blocks(mp, &resblks, NULL); + xfs_quiesce_attr(mp); + mp->m_flags |= XFS_MOUNT_RDONLY; + } +--- a/fs/xfs/xfs_mount.h ++++ b/fs/xfs/xfs_mount.h +@@ -209,6 +209,7 @@ typedef struct xfs_mount { + __uint64_t m_maxioffset; /* maximum inode offset */ + __uint64_t m_resblks; /* total reserved blocks */ + __uint64_t m_resblks_avail;/* available reserved blocks */ ++ __uint64_t m_resblks_save; /* reserved blks @ remount,ro */ + int m_dalign; /* stripe unit */ + int m_swidth; /* stripe width */ + int m_sinoalign; /* stripe unit inode alignment */ diff --git a/queue-2.6.32/xfs-ensure-we-force-all-busy-extents-in-range-to-disk.patch b/queue-2.6.32/xfs-ensure-we-force-all-busy-extents-in-range-to-disk.patch new file mode 100644 index 00000000000..78ea50a50af --- /dev/null +++ b/queue-2.6.32/xfs-ensure-we-force-all-busy-extents-in-range-to-disk.patch @@ -0,0 +1,115 @@ +From david@fromorbit.com Fri Apr 2 11:08:31 2010 +From: Dave Chinner +Date: Fri, 12 Mar 2010 09:42:07 +1100 +Subject: xfs: Ensure we force all busy extents in range to disk +To: stable@kernel.org +Cc: xfs@oss.sgi.com +Message-ID: <1268347337-7160-10-git-send-email-david@fromorbit.com> + +From: Dave Chinner + +commit fd45e4784164d1017521086524e3442318c67370 upstream + +When we search for and find a busy extent during allocation we +force the log out to ensure the extent free transaction is on +disk before the allocation transaction. The current implementation +has a subtle bug in it--it does not handle multiple overlapping +ranges. + +That is, if we free lots of little extents into a single +contiguous extent, then allocate the contiguous extent, the busy +search code stops searching at the first extent it finds that +overlaps the allocated range. It then uses the commit LSN of the +transaction to force the log out to. + +Unfortunately, the other busy ranges might have more recent +commit LSNs than the first busy extent that is found, and this +results in xfs_alloc_search_busy() returning before all the +extent free transactions are on disk for the range being +allocated. This can lead to potential metadata corruption or +stale data exposure after a crash because log replay won't replay +all the extent free transactions that cover the allocation range. + +Modified-by: Alex Elder + +(Dropped the "found" argument from the xfs_alloc_busysearch trace +event.) 
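
For illustration only, a userspace sketch of the rule this change enforces (the types and names below are stand-ins, not the kernel's busy-extent structures): the log-force target has to be the newest commit LSN over every busy range that overlaps the allocation, not just the first overlap found.

#include <stdint.h>
#include <stddef.h>

struct busy_extent {
	uint64_t start;		/* first block of the busy range */
	uint64_t len;		/* length of the busy range */
	uint64_t commit_lsn;	/* LSN of the transaction that freed it */
};

/*
 * Return the LSN the log must be forced to before reusing
 * [bno, bno + len): the maximum commit LSN over all overlapping busy
 * ranges, or 0 if nothing overlaps.
 */
uint64_t busy_search_force_lsn(const struct busy_extent *busy, size_t count,
			       uint64_t bno, uint64_t len)
{
	uint64_t uend = bno + len - 1;
	uint64_t lsn = 0;
	size_t i;

	for (i = 0; i < count; i++) {
		uint64_t bend = busy[i].start + busy[i].len - 1;

		if (bno > bend || uend < busy[i].start)
			continue;			/* no overlap */
		if (busy[i].commit_lsn > lsn)
			lsn = busy[i].commit_lsn;	/* keep the newest */
	}
	return lsn;
}
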
+ +Signed-off-by: Dave Chinner +Reviewed-by: Christoph Hellwig +Signed-off-by: Alex Elder +Signed-off-by: Greg Kroah-Hartman + +--- + fs/xfs/xfs_alloc.c | 52 +++++++++++++++++++++------------------------------- + 1 file changed, 21 insertions(+), 31 deletions(-) + +--- a/fs/xfs/xfs_alloc.c ++++ b/fs/xfs/xfs_alloc.c +@@ -2703,45 +2703,35 @@ xfs_alloc_search_busy(xfs_trans_t *tp, + xfs_mount_t *mp; + xfs_perag_busy_t *bsy; + xfs_agblock_t uend, bend; +- xfs_lsn_t lsn; ++ xfs_lsn_t lsn = 0; + int cnt; + + mp = tp->t_mountp; + + spin_lock(&mp->m_perag[agno].pagb_lock); +- cnt = mp->m_perag[agno].pagb_count; +- + uend = bno + len - 1; + +- /* search pagb_list for this slot, skipping open slots */ +- for (bsy = mp->m_perag[agno].pagb_list; cnt; bsy++) { +- +- /* +- * (start1,length1) within (start2, length2) +- */ +- if (bsy->busy_tp != NULL) { +- bend = bsy->busy_start + bsy->busy_length - 1; +- if ((bno > bend) || (uend < bsy->busy_start)) { +- cnt--; +- } else { +- TRACE_BUSYSEARCH("xfs_alloc_search_busy", +- "found1", agno, bno, len, tp); +- break; +- } +- } +- } +- + /* +- * If a block was found, force the log through the LSN of the +- * transaction that freed the block ++ * search pagb_list for this slot, skipping open slots. We have to ++ * search the entire array as there may be multiple overlaps and ++ * we have to get the most recent LSN for the log force to push out ++ * all the transactions that span the range. + */ +- if (cnt) { +- TRACE_BUSYSEARCH("xfs_alloc_search_busy", "found", agno, bno, len, tp); +- lsn = bsy->busy_tp->t_commit_lsn; +- spin_unlock(&mp->m_perag[agno].pagb_lock); +- xfs_log_force(mp, lsn, XFS_LOG_FORCE|XFS_LOG_SYNC); +- } else { +- TRACE_BUSYSEARCH("xfs_alloc_search_busy", "not-found", agno, bno, len, tp); +- spin_unlock(&mp->m_perag[agno].pagb_lock); ++ for (cnt = 0; cnt < mp->m_perag[agno].pagb_count; cnt++) { ++ bsy = &mp->m_perag[agno].pagb_list[cnt]; ++ if (!bsy->busy_tp) ++ continue; ++ bend = bsy->busy_start + bsy->busy_length - 1; ++ if (bno > bend || uend < bsy->busy_start) ++ continue; ++ ++ /* (start1,length1) within (start2, length2) */ ++ if (XFS_LSN_CMP(bsy->busy_tp->t_commit_lsn, lsn) > 0) ++ lsn = bsy->busy_tp->t_commit_lsn; + } ++ spin_unlock(&mp->m_perag[agno].pagb_lock); ++ TRACE_BUSYSEARCH("xfs_alloc_search_busy", lsn ? "found" : "not-found", ++ agno, bno, len, tp); ++ if (lsn) ++ xfs_log_force(mp, lsn, XFS_LOG_FORCE|XFS_LOG_SYNC); + } diff --git a/queue-2.6.32/xfs-fix-error-return-for-fallocate-on-xfs.patch b/queue-2.6.32/xfs-fix-error-return-for-fallocate-on-xfs.patch new file mode 100644 index 00000000000..24c65714c5f --- /dev/null +++ b/queue-2.6.32/xfs-fix-error-return-for-fallocate-on-xfs.patch @@ -0,0 +1,49 @@ +From david@fromorbit.com Fri Apr 2 11:06:34 2010 +From: Jason Gunthorpe +Date: Fri, 12 Mar 2010 09:42:03 +1100 +Subject: xfs: Fix error return for fallocate() on XFS +To: stable@kernel.org +Cc: xfs@oss.sgi.com +Message-ID: <1268347337-7160-6-git-send-email-david@fromorbit.com> + + +From: Jason Gunthorpe + +commit 44a743f68705c681439f264deb05f8f38e9048d3 upstream + +Noticed that through glibc fallocate would return 28 rather than -1 +and errno = 28 for ENOSPC. The xfs routines uses XFS_ERROR format +positive return error codes while the syscalls use negative return +codes. Fixup the two cases in xfs_vn_fallocate syscall to convert to +negative. 
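
For illustration only, a small userspace sketch of the sign convention at issue (the helper names are stand-ins, not real XFS functions): internal XFS routines hand back positive errno values, so the syscall boundary must negate them before returning to the VFS.

#include <errno.h>
#include <stdio.h>

/* Stand-in for an XFS-internal routine: returns 0 or a positive errno. */
int internal_reserve_space(int out_of_space)
{
	return out_of_space ? ENOSPC : 0;
}

/* Syscall-style wrapper: must return 0 or a negative errno. */
int fallocate_style_wrapper(int out_of_space)
{
	return -internal_reserve_space(out_of_space);	/* 28 becomes -28 */
}

int main(void)
{
	printf("success: %d, no space: %d\n",
	       fallocate_style_wrapper(0), fallocate_style_wrapper(1));
	return 0;
}
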
+ +Signed-off-by: Jason Gunthorpe +Reviewed-by: Eric Sandeen +Signed-off-by: Alex Elder +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/linux-2.6/xfs_iops.c | 6 +++--- + 1 file changed, 3 insertions(+), 3 deletions(-) + +--- a/fs/xfs/linux-2.6/xfs_iops.c ++++ b/fs/xfs/linux-2.6/xfs_iops.c +@@ -573,8 +573,8 @@ xfs_vn_fallocate( + bf.l_len = len; + + xfs_ilock(ip, XFS_IOLOCK_EXCL); +- error = xfs_change_file_space(ip, XFS_IOC_RESVSP, &bf, +- 0, XFS_ATTR_NOLOCK); ++ error = -xfs_change_file_space(ip, XFS_IOC_RESVSP, &bf, ++ 0, XFS_ATTR_NOLOCK); + if (!error && !(mode & FALLOC_FL_KEEP_SIZE) && + offset + len > i_size_read(inode)) + new_size = offset + len; +@@ -585,7 +585,7 @@ xfs_vn_fallocate( + + iattr.ia_valid = ATTR_SIZE; + iattr.ia_size = new_size; +- error = xfs_setattr(ip, &iattr, XFS_ATTR_NOLOCK); ++ error = -xfs_setattr(ip, &iattr, XFS_ATTR_NOLOCK); + } + + xfs_iunlock(ip, XFS_IOLOCK_EXCL); diff --git a/queue-2.6.32/xfs-fix-locking-for-inode-cache-radix-tree-tag-updates.patch b/queue-2.6.32/xfs-fix-locking-for-inode-cache-radix-tree-tag-updates.patch new file mode 100644 index 00000000000..92f60429b7b --- /dev/null +++ b/queue-2.6.32/xfs-fix-locking-for-inode-cache-radix-tree-tag-updates.patch @@ -0,0 +1,84 @@ +From david@fromorbit.com Fri Apr 2 11:12:53 2010 +From: Christoph Hellwig +Date: Fri, 12 Mar 2010 09:42:17 +1100 +Subject: xfs: fix locking for inode cache radix tree tag updates +To: stable@kernel.org +Cc: xfs@oss.sgi.com +Message-ID: <1268347337-7160-20-git-send-email-david@fromorbit.com> + + +From: Christoph Hellwig + +commit f1f724e4b523d444c5a598d74505aefa3d6844d2 upstream + +The radix-tree code requires it's users to serialize tag updates +against other updates to the tree. While XFS protects tag updates +against each other it does not serialize them against updates of the +tree contents, which can lead to tag corruption. Fix the inode +cache to always take pag_ici_lock in exclusive mode when updating +radix tree tags. + +Signed-off-by: Christoph Hellwig +Reported-by: Patrick Schreurs +Tested-by: Patrick Schreurs +Signed-off-by: Alex Elder +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/linux-2.6/xfs_sync.c | 4 ++-- + fs/xfs/xfs_iget.c | 19 +++++++++++++------ + 2 files changed, 15 insertions(+), 8 deletions(-) + +--- a/fs/xfs/linux-2.6/xfs_sync.c ++++ b/fs/xfs/linux-2.6/xfs_sync.c +@@ -692,12 +692,12 @@ xfs_inode_set_reclaim_tag( + xfs_mount_t *mp = ip->i_mount; + xfs_perag_t *pag = xfs_get_perag(mp, ip->i_ino); + +- read_lock(&pag->pag_ici_lock); ++ write_lock(&pag->pag_ici_lock); + spin_lock(&ip->i_flags_lock); + __xfs_inode_set_reclaim_tag(pag, ip); + __xfs_iflags_set(ip, XFS_IRECLAIMABLE); + spin_unlock(&ip->i_flags_lock); +- read_unlock(&pag->pag_ici_lock); ++ write_unlock(&pag->pag_ici_lock); + xfs_put_perag(mp, pag); + } + +--- a/fs/xfs/xfs_iget.c ++++ b/fs/xfs/xfs_iget.c +@@ -228,13 +228,12 @@ xfs_iget_cache_hit( + xfs_itrace_exit_tag(ip, "xfs_iget.alloc"); + + /* +- * We need to set XFS_INEW atomically with clearing the +- * reclaimable tag so that we do have an indicator of the +- * inode still being initialized. ++ * We need to set XFS_IRECLAIM to prevent xfs_reclaim_inode ++ * from stomping over us while we recycle the inode. We can't ++ * clear the radix tree reclaimable tag yet as it requires ++ * pag_ici_lock to be held exclusive. 
+ */ +- ip->i_flags |= XFS_INEW; +- ip->i_flags &= ~XFS_IRECLAIMABLE; +- __xfs_inode_clear_reclaim_tag(mp, pag, ip); ++ ip->i_flags |= XFS_IRECLAIM; + + spin_unlock(&ip->i_flags_lock); + read_unlock(&pag->pag_ici_lock); +@@ -253,7 +252,15 @@ xfs_iget_cache_hit( + __xfs_inode_set_reclaim_tag(pag, ip); + goto out_error; + } ++ ++ write_lock(&pag->pag_ici_lock); ++ spin_lock(&ip->i_flags_lock); ++ ip->i_flags &= ~(XFS_IRECLAIMABLE | XFS_IRECLAIM); ++ ip->i_flags |= XFS_INEW; ++ __xfs_inode_clear_reclaim_tag(mp, pag, ip); + inode->i_state = I_LOCK|I_NEW; ++ spin_unlock(&ip->i_flags_lock); ++ write_unlock(&pag->pag_ici_lock); + } else { + /* If the VFS inode is being torn down, pause and try again. */ + if (!igrab(inode)) { diff --git a/queue-2.6.32/xfs-fix-mmap_sem-iolock-inversion-in-xfs_free_eofblocks.patch b/queue-2.6.32/xfs-fix-mmap_sem-iolock-inversion-in-xfs_free_eofblocks.patch new file mode 100644 index 00000000000..aef2a50cc28 --- /dev/null +++ b/queue-2.6.32/xfs-fix-mmap_sem-iolock-inversion-in-xfs_free_eofblocks.patch @@ -0,0 +1,129 @@ +From david@fromorbit.com Fri Apr 2 11:05:07 2010 +From: Christoph Hellwig +Date: Fri, 12 Mar 2010 09:42:00 +1100 +Subject: xfs: fix mmap_sem/iolock inversion in xfs_free_eofblocks +To: stable@kernel.org +Cc: xfs@oss.sgi.com +Message-ID: <1268347337-7160-3-git-send-email-david@fromorbit.com> + + +From: Christoph Hellwig + +commit c56c9631cbe88f08854a56ff9776c1f310916830 upstream + +When xfs_free_eofblocks is called from ->release the VM might already +hold the mmap_sem, but in the write path we take the iolock before +taking the mmap_sem in the generic write code. + +Switch xfs_free_eofblocks to only trylock the iolock if called from +->release and skip trimming the prellocated blocks in that case. +We'll still free them later on the final iput. + +Signed-off-by: Christoph Hellwig +Reviewed-by: Alex Elder +Signed-off-by: Alex Elder +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/xfs_rw.h | 7 ------- + fs/xfs/xfs_vnodeops.c | 34 ++++++++++++++++++++++++++-------- + 2 files changed, 26 insertions(+), 15 deletions(-) + +--- a/fs/xfs/xfs_rw.h ++++ b/fs/xfs/xfs_rw.h +@@ -37,13 +37,6 @@ xfs_fsb_to_db(struct xfs_inode *ip, xfs_ + } + + /* +- * Flags for xfs_free_eofblocks +- */ +-#define XFS_FREE_EOF_LOCK (1<<0) +-#define XFS_FREE_EOF_NOLOCK (1<<1) +- +- +-/* + * helper function to extract extent size hint from inode + */ + STATIC_INLINE xfs_extlen_t +--- a/fs/xfs/xfs_vnodeops.c ++++ b/fs/xfs/xfs_vnodeops.c +@@ -709,6 +709,11 @@ xfs_fsync( + } + + /* ++ * Flags for xfs_free_eofblocks ++ */ ++#define XFS_FREE_EOF_TRYLOCK (1<<0) ++ ++/* + * This is called by xfs_inactive to free any blocks beyond eof + * when the link count isn't zero and by xfs_dm_punch_hole() when + * punching a hole to EOF. +@@ -726,7 +731,6 @@ xfs_free_eofblocks( + xfs_filblks_t map_len; + int nimaps; + xfs_bmbt_irec_t imap; +- int use_iolock = (flags & XFS_FREE_EOF_LOCK); + + /* + * Figure out if there are any blocks beyond the end +@@ -768,14 +772,19 @@ xfs_free_eofblocks( + * cache and we can't + * do that within a transaction. 
+ */ +- if (use_iolock) ++ if (flags & XFS_FREE_EOF_TRYLOCK) { ++ if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) { ++ xfs_trans_cancel(tp, 0); ++ return 0; ++ } ++ } else { + xfs_ilock(ip, XFS_IOLOCK_EXCL); ++ } + error = xfs_itruncate_start(ip, XFS_ITRUNC_DEFINITE, + ip->i_size); + if (error) { + xfs_trans_cancel(tp, 0); +- if (use_iolock) +- xfs_iunlock(ip, XFS_IOLOCK_EXCL); ++ xfs_iunlock(ip, XFS_IOLOCK_EXCL); + return error; + } + +@@ -812,8 +821,7 @@ xfs_free_eofblocks( + error = xfs_trans_commit(tp, + XFS_TRANS_RELEASE_LOG_RES); + } +- xfs_iunlock(ip, (use_iolock ? (XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL) +- : XFS_ILOCK_EXCL)); ++ xfs_iunlock(ip, XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL); + } + return error; + } +@@ -1113,7 +1121,17 @@ xfs_release( + (ip->i_df.if_flags & XFS_IFEXTENTS)) && + (!(ip->i_d.di_flags & + (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)))) { +- error = xfs_free_eofblocks(mp, ip, XFS_FREE_EOF_LOCK); ++ ++ /* ++ * If we can't get the iolock just skip truncating ++ * the blocks past EOF because we could deadlock ++ * with the mmap_sem otherwise. We'll get another ++ * chance to drop them once the last reference to ++ * the inode is dropped, so we'll never leak blocks ++ * permanently. ++ */ ++ error = xfs_free_eofblocks(mp, ip, ++ XFS_FREE_EOF_TRYLOCK); + if (error) + return error; + } +@@ -1184,7 +1202,7 @@ xfs_inactive( + (!(ip->i_d.di_flags & + (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)) || + (ip->i_delayed_blks != 0)))) { +- error = xfs_free_eofblocks(mp, ip, XFS_FREE_EOF_LOCK); ++ error = xfs_free_eofblocks(mp, ip, 0); + if (error) + return VN_INACTIVE_CACHE; + } diff --git a/queue-2.6.32/xfs-fix-stale-inode-flush-avoidance.patch b/queue-2.6.32/xfs-fix-stale-inode-flush-avoidance.patch new file mode 100644 index 00000000000..6cc19afd3e3 --- /dev/null +++ b/queue-2.6.32/xfs-fix-stale-inode-flush-avoidance.patch @@ -0,0 +1,64 @@ +From david@fromorbit.com Fri Apr 2 11:10:21 2010 +From: Dave Chinner +Date: Fri, 12 Mar 2010 09:42:11 +1100 +Subject: xfs: fix stale inode flush avoidance +To: stable@kernel.org +Cc: xfs@oss.sgi.com +Message-ID: <1268347337-7160-14-git-send-email-david@fromorbit.com> + +From: Dave Chinner + +commit 4b6a46882cca8349e8942e2650c33b11bc571c92 upstream + +When reclaiming stale inodes, we need to guarantee that inodes are +unpinned before returning with a "clean" status. If we don't we can +reclaim inodes that are pinned, leading to use after free in the +transaction subsystem as transactions complete. + +Signed-off-by: Dave Chinner +Reviewed-by: Christoph Hellwig +Signed-off-by: Alex Elder +Signed-off-by: Greg Kroah-Hartman + +--- + fs/xfs/xfs_inode.c | 21 +++++++++++++++------ + 1 file changed, 15 insertions(+), 6 deletions(-) + +--- a/fs/xfs/xfs_inode.c ++++ b/fs/xfs/xfs_inode.c +@@ -2878,13 +2878,9 @@ xfs_iflush( + + /* + * If the inode isn't dirty, then just release the inode flush lock and +- * do nothing. Treat stale inodes the same; we cannot rely on the +- * backing buffer remaining stale in cache for the remaining life of +- * the stale inode and so xfs_itobp() below may give us a buffer that +- * no longer contains inodes below. Doing this stale check here also +- * avoids forcing the log on pinned, stale inodes. ++ * do nothing. 
+ */ +- if (xfs_inode_clean(ip) || xfs_iflags_test(ip, XFS_ISTALE)) { ++ if (xfs_inode_clean(ip)) { + xfs_ifunlock(ip); + return 0; + } +@@ -2908,6 +2904,19 @@ xfs_iflush( + xfs_iunpin_wait(ip); + + /* ++ * For stale inodes we cannot rely on the backing buffer remaining ++ * stale in cache for the remaining life of the stale inode and so ++ * xfs_itobp() below may give us a buffer that no longer contains ++ * inodes below. We have to check this after ensuring the inode is ++ * unpinned so that it is safe to reclaim the stale inode after the ++ * flush call. ++ */ ++ if (xfs_iflags_test(ip, XFS_ISTALE)) { ++ xfs_ifunlock(ip); ++ return 0; ++ } ++ ++ /* + * This may have been unpinned because the filesystem is shutting + * down forcibly. If that's the case we must not write this inode + * to disk, because the log record didn't make it to disk! diff --git a/queue-2.6.32/xfs-fix-timestamp-handling-in-xfs_setattr.patch b/queue-2.6.32/xfs-fix-timestamp-handling-in-xfs_setattr.patch new file mode 100644 index 00000000000..025a4a1033b --- /dev/null +++ b/queue-2.6.32/xfs-fix-timestamp-handling-in-xfs_setattr.patch @@ -0,0 +1,215 @@ +From david@fromorbit.com Fri Apr 2 11:07:34 2010 +From: Christoph Hellwig +Date: Fri, 12 Mar 2010 09:42:05 +1100 +Subject: xfs: fix timestamp handling in xfs_setattr +To: stable@kernel.org +Cc: xfs@oss.sgi.com +Message-ID: <1268347337-7160-8-git-send-email-david@fromorbit.com> + + +From: Christoph Hellwig + +commit d6d59bada372bcf8bd36c3bbc71c485c29dd2a4b upstream + +We currently have some rather odd code in xfs_setattr for +updating the a/c/mtime timestamps: + + - first we do a non-transaction update if all three are updated + together + - second we implicitly update the ctime for various changes + instead of relying on the ATTR_CTIME flag + - third we set the timestamps to the current time instead of the + arguments in the iattr structure in many cases. + +This patch makes sure we update it in a consistent way: + + - always transactional + - ctime is only updated if ATTR_CTIME is set or we do a size + update, which is a special case + - always to the times passed in from the caller instead of the + current time + +The only non-size caller of xfs_setattr that doesn't come from +the VFS is updated to set ATTR_CTIME and pass in a valid ctime +value. 
+ +Reported-by: Eric Blake +Signed-off-by: Christoph Hellwig +Signed-off-by: Alex Elder +Signed-off-by: Greg Kroah-Hartman + +--- + fs/xfs/linux-2.6/xfs_acl.c | 3 - + fs/xfs/xfs_vnodeops.c | 93 ++++++++++++++++++--------------------------- + 2 files changed, 41 insertions(+), 55 deletions(-) + +--- a/fs/xfs/linux-2.6/xfs_acl.c ++++ b/fs/xfs/linux-2.6/xfs_acl.c +@@ -250,8 +250,9 @@ xfs_set_mode(struct inode *inode, mode_t + if (mode != inode->i_mode) { + struct iattr iattr; + +- iattr.ia_valid = ATTR_MODE; ++ iattr.ia_valid = ATTR_MODE | ATTR_CTIME; + iattr.ia_mode = mode; ++ iattr.ia_ctime = current_fs_time(inode->i_sb); + + error = -xfs_setattr(XFS_I(inode), &iattr, XFS_ATTR_NOACL); + } +--- a/fs/xfs/xfs_vnodeops.c ++++ b/fs/xfs/xfs_vnodeops.c +@@ -69,7 +69,6 @@ xfs_setattr( + uint commit_flags=0; + uid_t uid=0, iuid=0; + gid_t gid=0, igid=0; +- int timeflags = 0; + struct xfs_dquot *udqp, *gdqp, *olddquot1, *olddquot2; + int need_iolock = 1; + +@@ -134,16 +133,13 @@ xfs_setattr( + if (flags & XFS_ATTR_NOLOCK) + need_iolock = 0; + if (!(mask & ATTR_SIZE)) { +- if ((mask != (ATTR_CTIME|ATTR_ATIME|ATTR_MTIME)) || +- (mp->m_flags & XFS_MOUNT_WSYNC)) { +- tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE); +- commit_flags = 0; +- if ((code = xfs_trans_reserve(tp, 0, +- XFS_ICHANGE_LOG_RES(mp), 0, +- 0, 0))) { +- lock_flags = 0; +- goto error_return; +- } ++ tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE); ++ commit_flags = 0; ++ code = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp), ++ 0, 0, 0); ++ if (code) { ++ lock_flags = 0; ++ goto error_return; + } + } else { + if (DM_EVENT_ENABLED(ip, DM_EVENT_TRUNCATE) && +@@ -294,15 +290,23 @@ xfs_setattr( + * or we are explicitly asked to change it. This handles + * the semantic difference between truncate() and ftruncate() + * as implemented in the VFS. ++ * ++ * The regular truncate() case without ATTR_CTIME and ATTR_MTIME ++ * is a special case where we need to update the times despite ++ * not having these flags set. For all other operations the ++ * VFS set these flags explicitly if it wants a timestamp ++ * update. + */ +- if (iattr->ia_size != ip->i_size || (mask & ATTR_CTIME)) +- timeflags |= XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG; ++ if (iattr->ia_size != ip->i_size && ++ (!(mask & (ATTR_CTIME | ATTR_MTIME)))) { ++ iattr->ia_ctime = iattr->ia_mtime = ++ current_fs_time(inode->i_sb); ++ mask |= ATTR_CTIME | ATTR_MTIME; ++ } + + if (iattr->ia_size > ip->i_size) { + ip->i_d.di_size = iattr->ia_size; + ip->i_size = iattr->ia_size; +- if (!(flags & XFS_ATTR_DMI)) +- xfs_ichgtime(ip, XFS_ICHGTIME_CHG); + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + } else if (iattr->ia_size <= ip->i_size || + (iattr->ia_size == 0 && ip->i_d.di_nextents)) { +@@ -373,9 +377,6 @@ xfs_setattr( + ip->i_d.di_gid = gid; + inode->i_gid = gid; + } +- +- xfs_trans_log_inode (tp, ip, XFS_ILOG_CORE); +- timeflags |= XFS_ICHGTIME_CHG; + } + + /* +@@ -392,51 +393,37 @@ xfs_setattr( + + inode->i_mode &= S_IFMT; + inode->i_mode |= mode & ~S_IFMT; +- +- xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); +- timeflags |= XFS_ICHGTIME_CHG; + } + + /* + * Change file access or modified times. 
+ */ +- if (mask & (ATTR_ATIME|ATTR_MTIME)) { +- if (mask & ATTR_ATIME) { +- inode->i_atime = iattr->ia_atime; +- ip->i_d.di_atime.t_sec = iattr->ia_atime.tv_sec; +- ip->i_d.di_atime.t_nsec = iattr->ia_atime.tv_nsec; +- ip->i_update_core = 1; +- } +- if (mask & ATTR_MTIME) { +- inode->i_mtime = iattr->ia_mtime; +- ip->i_d.di_mtime.t_sec = iattr->ia_mtime.tv_sec; +- ip->i_d.di_mtime.t_nsec = iattr->ia_mtime.tv_nsec; +- timeflags &= ~XFS_ICHGTIME_MOD; +- timeflags |= XFS_ICHGTIME_CHG; +- } +- if (tp && (mask & (ATTR_MTIME_SET|ATTR_ATIME_SET))) +- xfs_trans_log_inode (tp, ip, XFS_ILOG_CORE); ++ if (mask & ATTR_ATIME) { ++ inode->i_atime = iattr->ia_atime; ++ ip->i_d.di_atime.t_sec = iattr->ia_atime.tv_sec; ++ ip->i_d.di_atime.t_nsec = iattr->ia_atime.tv_nsec; ++ ip->i_update_core = 1; + } +- +- /* +- * Change file inode change time only if ATTR_CTIME set +- * AND we have been called by a DMI function. +- */ +- +- if ((flags & XFS_ATTR_DMI) && (mask & ATTR_CTIME)) { ++ if (mask & ATTR_CTIME) { + inode->i_ctime = iattr->ia_ctime; + ip->i_d.di_ctime.t_sec = iattr->ia_ctime.tv_sec; + ip->i_d.di_ctime.t_nsec = iattr->ia_ctime.tv_nsec; + ip->i_update_core = 1; +- timeflags &= ~XFS_ICHGTIME_CHG; ++ } ++ if (mask & ATTR_MTIME) { ++ inode->i_mtime = iattr->ia_mtime; ++ ip->i_d.di_mtime.t_sec = iattr->ia_mtime.tv_sec; ++ ip->i_d.di_mtime.t_nsec = iattr->ia_mtime.tv_nsec; ++ ip->i_update_core = 1; + } + + /* +- * Send out timestamp changes that need to be set to the +- * current time. Not done when called by a DMI function. ++ * And finally, log the inode core if any attribute in it ++ * has been changed. + */ +- if (timeflags && !(flags & XFS_ATTR_DMI)) +- xfs_ichgtime(ip, timeflags); ++ if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE| ++ ATTR_ATIME|ATTR_CTIME|ATTR_MTIME)) ++ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + + XFS_STATS_INC(xs_ig_attrchg); + +@@ -451,12 +438,10 @@ xfs_setattr( + * mix so this probably isn't worth the trouble to optimize. + */ + code = 0; +- if (tp) { +- if (mp->m_flags & XFS_MOUNT_WSYNC) +- xfs_trans_set_sync(tp); ++ if (mp->m_flags & XFS_MOUNT_WSYNC) ++ xfs_trans_set_sync(tp); + +- code = xfs_trans_commit(tp, commit_flags); +- } ++ code = xfs_trans_commit(tp, commit_flags); + + xfs_iunlock(ip, lock_flags); + diff --git a/queue-2.6.32/xfs-i-o-completion-handlers-must-use-nofs-allocations.patch b/queue-2.6.32/xfs-i-o-completion-handlers-must-use-nofs-allocations.patch new file mode 100644 index 00000000000..f215decd317 --- /dev/null +++ b/queue-2.6.32/xfs-i-o-completion-handlers-must-use-nofs-allocations.patch @@ -0,0 +1,112 @@ +From david@fromorbit.com Fri Apr 2 11:05:39 2010 +From: Christoph Hellwig +Date: Fri, 12 Mar 2010 09:42:01 +1100 +Subject: xfs: I/O completion handlers must use NOFS allocations +To: stable@kernel.org +Cc: xfs@oss.sgi.com +Message-ID: <1268347337-7160-4-git-send-email-david@fromorbit.com> + + +From: Christoph Hellwig + +commit 80641dc66a2d6dfb22af4413227a92b8ab84c7bb upstream + +When completing I/O requests we must not allow the memory allocator to +recurse into the filesystem, as we might deadlock on waiting for the +I/O completion otherwise. The only thing currently allocating normal +GFP_KERNEL memory is the allocation of the transaction structure for +the unwritten extent conversion. Add a memflags argument to +_xfs_trans_alloc to allow controlling the allocator behaviour. 
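
For illustration only, a minimal sketch of the pattern this change introduces (the names below are stand-ins, not the real kmem or transaction API): the transaction allocator grows an allocation-mode argument so that I/O completion callers can request allocations that never recurse back into the filesystem.

#include <stdlib.h>

/* Stand-ins for GFP_KERNEL/GFP_NOFS style allocation contexts. */
enum alloc_mode {
	ALLOC_MAY_ENTER_FS,	/* normal callers: reclaim may write back */
	ALLOC_NOFS		/* I/O completion: never re-enter the fs */
};

struct trans {
	int type;
};

/* The allocator now takes the caller's allocation context explicitly. */
struct trans *trans_alloc(int type, enum alloc_mode mode)
{
	struct trans *tp = calloc(1, sizeof(*tp));

	if (tp)
		tp->type = type;
	(void)mode;	/* a real implementation would map this to GFP flags */
	return tp;
}

/* I/O completion paths must use the non-recursing mode. */
struct trans *trans_alloc_for_io_completion(int type)
{
	return trans_alloc(type, ALLOC_NOFS);
}
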
+ +Signed-off-by: Christoph Hellwig +Reported-by: Thomas Neumann +Tested-by: Thomas Neumann +Reviewed-by: Alex Elder +Signed-off-by: Alex Elder +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/xfs_fsops.c | 2 +- + fs/xfs/xfs_iomap.c | 9 ++++++++- + fs/xfs/xfs_mount.c | 2 +- + fs/xfs/xfs_trans.c | 7 ++++--- + fs/xfs/xfs_trans.h | 2 +- + 5 files changed, 15 insertions(+), 7 deletions(-) + +--- a/fs/xfs/xfs_fsops.c ++++ b/fs/xfs/xfs_fsops.c +@@ -611,7 +611,7 @@ xfs_fs_log_dummy( + xfs_inode_t *ip; + int error; + +- tp = _xfs_trans_alloc(mp, XFS_TRANS_DUMMY1); ++ tp = _xfs_trans_alloc(mp, XFS_TRANS_DUMMY1, KM_SLEEP); + error = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp), 0, 0, 0); + if (error) { + xfs_trans_cancel(tp, 0); +--- a/fs/xfs/xfs_iomap.c ++++ b/fs/xfs/xfs_iomap.c +@@ -860,8 +860,15 @@ xfs_iomap_write_unwritten( + * set up a transaction to convert the range of extents + * from unwritten to real. Do allocations in a loop until + * we have covered the range passed in. ++ * ++ * Note that we open code the transaction allocation here ++ * to pass KM_NOFS--we can't risk to recursing back into ++ * the filesystem here as we might be asked to write out ++ * the same inode that we complete here and might deadlock ++ * on the iolock. + */ +- tp = xfs_trans_alloc(mp, XFS_TRANS_STRAT_WRITE); ++ xfs_wait_for_freeze(mp, SB_FREEZE_TRANS); ++ tp = _xfs_trans_alloc(mp, XFS_TRANS_STRAT_WRITE, KM_NOFS); + tp->t_flags |= XFS_TRANS_RESERVE; + error = xfs_trans_reserve(tp, resblks, + XFS_WRITE_LOG_RES(mp), 0, +--- a/fs/xfs/xfs_mount.c ++++ b/fs/xfs/xfs_mount.c +@@ -1471,7 +1471,7 @@ xfs_log_sbcount( + if (!xfs_sb_version_haslazysbcount(&mp->m_sb)) + return 0; + +- tp = _xfs_trans_alloc(mp, XFS_TRANS_SB_COUNT); ++ tp = _xfs_trans_alloc(mp, XFS_TRANS_SB_COUNT, KM_SLEEP); + error = xfs_trans_reserve(tp, 0, mp->m_sb.sb_sectsize + 128, 0, 0, + XFS_DEFAULT_LOG_COUNT); + if (error) { +--- a/fs/xfs/xfs_trans.c ++++ b/fs/xfs/xfs_trans.c +@@ -236,19 +236,20 @@ xfs_trans_alloc( + uint type) + { + xfs_wait_for_freeze(mp, SB_FREEZE_TRANS); +- return _xfs_trans_alloc(mp, type); ++ return _xfs_trans_alloc(mp, type, KM_SLEEP); + } + + xfs_trans_t * + _xfs_trans_alloc( + xfs_mount_t *mp, +- uint type) ++ uint type, ++ uint memflags) + { + xfs_trans_t *tp; + + atomic_inc(&mp->m_active_trans); + +- tp = kmem_zone_zalloc(xfs_trans_zone, KM_SLEEP); ++ tp = kmem_zone_zalloc(xfs_trans_zone, memflags); + tp->t_magic = XFS_TRANS_MAGIC; + tp->t_type = type; + tp->t_mountp = mp; +--- a/fs/xfs/xfs_trans.h ++++ b/fs/xfs/xfs_trans.h +@@ -924,7 +924,7 @@ typedef struct xfs_trans { + * XFS transaction mechanism exported interfaces. 
+ */ + xfs_trans_t *xfs_trans_alloc(struct xfs_mount *, uint); +-xfs_trans_t *_xfs_trans_alloc(struct xfs_mount *, uint); ++xfs_trans_t *_xfs_trans_alloc(struct xfs_mount *, uint, uint); + xfs_trans_t *xfs_trans_dup(xfs_trans_t *); + int xfs_trans_reserve(xfs_trans_t *, uint, uint, uint, + uint, uint); diff --git a/queue-2.6.32/xfs-non-blocking-inode-locking-in-io-completion.patch b/queue-2.6.32/xfs-non-blocking-inode-locking-in-io-completion.patch new file mode 100644 index 00000000000..f445c700246 --- /dev/null +++ b/queue-2.6.32/xfs-non-blocking-inode-locking-in-io-completion.patch @@ -0,0 +1,222 @@ +From david@fromorbit.com Fri Apr 2 11:12:28 2010 +From: Dave Chinner +Date: Fri, 12 Mar 2010 09:42:16 +1100 +Subject: xfs: Non-blocking inode locking in IO completion +To: stable@kernel.org +Cc: xfs@oss.sgi.com +Message-ID: <1268347337-7160-19-git-send-email-david@fromorbit.com> + +From: Dave Chinner + +commit 77d7a0c2eeb285c9069e15396703d0cb9690ac50 upstream + +The introduction of barriers to loop devices has created a new IO +order completion dependency that XFS does not handle. The loop +device implements barriers using fsync and so turns a log IO in the +XFS filesystem on the loop device into a data IO in the backing +filesystem. That is, the completion of log IOs in the loop +filesystem are now dependent on completion of data IO in the backing +filesystem. + +This can cause deadlocks when a flush daemon issues a log force with +an inode locked because the IO completion of IO on the inode is +blocked by the inode lock. This in turn prevents further data IO +completion from occuring on all XFS filesystems on that CPU (due to +the shared nature of the completion queues). This then prevents the +log IO from completing because the log is waiting for data IO +completion as well. + +The fix for this new completion order dependency issue is to make +the IO completion inode locking non-blocking. If the inode lock +can't be grabbed, simply requeue the IO completion back to the work +queue so that it can be processed later. This prevents the +completion queue from being blocked and allows data IO completion on +other inodes to proceed, hence avoiding completion order dependent +deadlocks. + +Signed-off-by: Dave Chinner +Reviewed-by: Christoph Hellwig +Signed-off-by: Alex Elder +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/linux-2.6/xfs_aops.c | 118 ++++++++++++++++++++++++++++++-------------- + 1 file changed, 82 insertions(+), 36 deletions(-) + +--- a/fs/xfs/linux-2.6/xfs_aops.c ++++ b/fs/xfs/linux-2.6/xfs_aops.c +@@ -204,14 +204,17 @@ xfs_ioend_new_eof( + } + + /* +- * Update on-disk file size now that data has been written to disk. +- * The current in-memory file size is i_size. If a write is beyond +- * eof i_new_size will be the intended file size until i_size is +- * updated. If this write does not extend all the way to the valid +- * file size then restrict this update to the end of the write. ++ * Update on-disk file size now that data has been written to disk. The ++ * current in-memory file size is i_size. If a write is beyond eof i_new_size ++ * will be the intended file size until i_size is updated. If this write does ++ * not extend all the way to the valid file size then restrict this update to ++ * the end of the write. ++ * ++ * This function does not block as blocking on the inode lock in IO completion ++ * can lead to IO completion order dependency deadlocks.. If it can't get the ++ * inode ilock it will return EAGAIN. Callers must handle this. 
+ */ +- +-STATIC void ++STATIC int + xfs_setfilesize( + xfs_ioend_t *ioend) + { +@@ -222,9 +225,11 @@ xfs_setfilesize( + ASSERT(ioend->io_type != IOMAP_READ); + + if (unlikely(ioend->io_error)) +- return; ++ return 0; ++ ++ if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) ++ return EAGAIN; + +- xfs_ilock(ip, XFS_ILOCK_EXCL); + isize = xfs_ioend_new_eof(ioend); + if (isize) { + ip->i_d.di_size = isize; +@@ -232,6 +237,28 @@ xfs_setfilesize( + } + + xfs_iunlock(ip, XFS_ILOCK_EXCL); ++ return 0; ++} ++ ++/* ++ * Schedule IO completion handling on a xfsdatad if this was ++ * the final hold on this ioend. If we are asked to wait, ++ * flush the workqueue. ++ */ ++STATIC void ++xfs_finish_ioend( ++ xfs_ioend_t *ioend, ++ int wait) ++{ ++ if (atomic_dec_and_test(&ioend->io_remaining)) { ++ struct workqueue_struct *wq; ++ ++ wq = (ioend->io_type == IOMAP_UNWRITTEN) ? ++ xfsconvertd_workqueue : xfsdatad_workqueue; ++ queue_work(wq, &ioend->io_work); ++ if (wait) ++ flush_workqueue(wq); ++ } + } + + /* +@@ -243,9 +270,23 @@ xfs_end_bio_delalloc( + { + xfs_ioend_t *ioend = + container_of(work, xfs_ioend_t, io_work); ++ int error; + +- xfs_setfilesize(ioend); +- xfs_destroy_ioend(ioend); ++ /* ++ * If we didn't complete processing of the ioend, requeue it to the ++ * tail of the workqueue for another attempt later. Otherwise destroy ++ * it. ++ */ ++ error = xfs_setfilesize(ioend); ++ if (error == EAGAIN) { ++ atomic_inc(&ioend->io_remaining); ++ xfs_finish_ioend(ioend, 0); ++ /* ensure we don't spin on blocked ioends */ ++ delay(1); ++ } else { ++ ASSERT(!error); ++ xfs_destroy_ioend(ioend); ++ } + } + + /* +@@ -257,9 +298,23 @@ xfs_end_bio_written( + { + xfs_ioend_t *ioend = + container_of(work, xfs_ioend_t, io_work); ++ int error; + +- xfs_setfilesize(ioend); +- xfs_destroy_ioend(ioend); ++ /* ++ * If we didn't complete processing of the ioend, requeue it to the ++ * tail of the workqueue for another attempt later. Otherwise destroy ++ * it. ++ */ ++ error = xfs_setfilesize(ioend); ++ if (error == EAGAIN) { ++ atomic_inc(&ioend->io_remaining); ++ xfs_finish_ioend(ioend, 0); ++ /* ensure we don't spin on blocked ioends */ ++ delay(1); ++ } else { ++ ASSERT(!error); ++ xfs_destroy_ioend(ioend); ++ } + } + + /* +@@ -279,13 +334,25 @@ xfs_end_bio_unwritten( + size_t size = ioend->io_size; + + if (likely(!ioend->io_error)) { ++ int error; + if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) { +- int error; + error = xfs_iomap_write_unwritten(ip, offset, size); + if (error) + ioend->io_error = error; + } +- xfs_setfilesize(ioend); ++ /* ++ * If we didn't complete processing of the ioend, requeue it to the ++ * tail of the workqueue for another attempt later. Otherwise destroy ++ * it. ++ */ ++ error = xfs_setfilesize(ioend); ++ if (error == EAGAIN) { ++ atomic_inc(&ioend->io_remaining); ++ xfs_finish_ioend(ioend, 0); ++ /* ensure we don't spin on blocked ioends */ ++ delay(1); ++ return; ++ } + } + xfs_destroy_ioend(ioend); + } +@@ -304,27 +371,6 @@ xfs_end_bio_read( + } + + /* +- * Schedule IO completion handling on a xfsdatad if this was +- * the final hold on this ioend. If we are asked to wait, +- * flush the workqueue. 
+- */ +-STATIC void +-xfs_finish_ioend( +- xfs_ioend_t *ioend, +- int wait) +-{ +- if (atomic_dec_and_test(&ioend->io_remaining)) { +- struct workqueue_struct *wq = xfsdatad_workqueue; +- if (ioend->io_work.func == xfs_end_bio_unwritten) +- wq = xfsconvertd_workqueue; +- +- queue_work(wq, &ioend->io_work); +- if (wait) +- flush_workqueue(wq); +- } +-} +- +-/* + * Allocate and initialise an IO completion structure. + * We need to track unwritten extent write completion here initially. + * We'll need to extend this for updating the ondisk inode size later diff --git a/queue-2.6.32/xfs-quota-limit-statvfs-available-blocks.patch b/queue-2.6.32/xfs-quota-limit-statvfs-available-blocks.patch new file mode 100644 index 00000000000..055cfcc9bdb --- /dev/null +++ b/queue-2.6.32/xfs-quota-limit-statvfs-available-blocks.patch @@ -0,0 +1,38 @@ +From david@fromorbit.com Fri Apr 2 11:11:19 2010 +From: Christoph Hellwig +Date: Fri, 12 Mar 2010 09:42:13 +1100 +Subject: xfs: quota limit statvfs available blocks +To: stable@kernel.org +Cc: xfs@oss.sgi.com +Message-ID: <1268347337-7160-16-git-send-email-david@fromorbit.com> + + +From: Christoph Hellwig + +commit 9b00f30762fe9f914eb6e03057a616ed63a4e8ca upstream + +A "df" run on an NFS client of an exported XFS file system reports +the wrong information for "available" blocks. When a block quota is +enforced, the amount reported as free is limited by the quota, but +the amount reported available is not (and should be). + +Reported-by: Guk-Bong, Kwon +Signed-off-by: Christoph Hellwig +Signed-off-by: Alex Elder +Signed-off-by: Greg Kroah-Hartman + +--- + fs/xfs/quota/xfs_qm_bhv.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/fs/xfs/quota/xfs_qm_bhv.c ++++ b/fs/xfs/quota/xfs_qm_bhv.c +@@ -59,7 +59,7 @@ xfs_fill_statvfs_from_dquot( + be64_to_cpu(dp->d_blk_hardlimit); + if (limit && statp->f_blocks > limit) { + statp->f_blocks = limit; +- statp->f_bfree = ++ statp->f_bfree = statp->f_bavail = + (statp->f_blocks > be64_to_cpu(dp->d_bcount)) ? + (statp->f_blocks - be64_to_cpu(dp->d_bcount)) : 0; + } diff --git a/queue-2.6.32/xfs-reclaim-all-inodes-by-background-tree-walks.patch b/queue-2.6.32/xfs-reclaim-all-inodes-by-background-tree-walks.patch new file mode 100644 index 00000000000..7610deba7b0 --- /dev/null +++ b/queue-2.6.32/xfs-reclaim-all-inodes-by-background-tree-walks.patch @@ -0,0 +1,64 @@ +From david@fromorbit.com Fri Apr 2 11:09:55 2010 +From: Dave Chinner +Date: Fri, 12 Mar 2010 09:42:10 +1100 +Subject: xfs: reclaim all inodes by background tree walks +To: stable@kernel.org +Cc: xfs@oss.sgi.com +Message-ID: <1268347337-7160-13-git-send-email-david@fromorbit.com> + +From: Dave Chinner + +commit 57817c68229984818fea9e614d6f95249c3fb098 upstream + +We cannot do direct inode reclaim without taking the flush lock to +ensure that we do not reclaim an inode under IO. We check the inode +is clean before doing direct reclaim, but this is not good enough +because the inode flush code marks the inode clean once it has +copied the in-core dirty state to the backing buffer. + +It is the flush lock that determines whether the inode is still +under IO, even though it is marked clean, and the inode is still +required at IO completion so we can't reclaim it even though it is +clean in core. Hence the requirement that we need to take the flush +lock even on clean inodes because this guarantees that the inode +writeback IO has completed and it is safe to reclaim the inode. 
+ +With delayed write inode flushing, we could end up waiting a long +time on the flush lock even for a clean inode. The background +reclaim already handles this efficiently, so avoid all the problems +by killing the direct reclaim path altogether. + +Signed-off-by: Dave Chinner +Reviewed-by: Christoph Hellwig +Signed-off-by: Alex Elder +Signed-off-by: Greg Kroah-Hartman + +--- + fs/xfs/linux-2.6/xfs_super.c | 14 ++++++-------- + 1 file changed, 6 insertions(+), 8 deletions(-) + +--- a/fs/xfs/linux-2.6/xfs_super.c ++++ b/fs/xfs/linux-2.6/xfs_super.c +@@ -953,16 +953,14 @@ xfs_fs_destroy_inode( + ASSERT_ALWAYS(!xfs_iflags_test(ip, XFS_IRECLAIM)); + + /* +- * If we have nothing to flush with this inode then complete the +- * teardown now, otherwise delay the flush operation. ++ * We always use background reclaim here because even if the ++ * inode is clean, it still may be under IO and hence we have ++ * to take the flush lock. The background reclaim path handles ++ * this more efficiently than we can here, so simply let background ++ * reclaim tear down all inodes. + */ +- if (!xfs_inode_clean(ip)) { +- xfs_inode_set_reclaim_tag(ip); +- return; +- } +- + out_reclaim: +- xfs_ireclaim(ip); ++ xfs_inode_set_reclaim_tag(ip); + } + + /* diff --git a/queue-2.6.32/xfs-reclaim-inodes-under-a-write-lock.patch b/queue-2.6.32/xfs-reclaim-inodes-under-a-write-lock.patch new file mode 100644 index 00000000000..dbe7c012113 --- /dev/null +++ b/queue-2.6.32/xfs-reclaim-inodes-under-a-write-lock.patch @@ -0,0 +1,309 @@ +From david@fromorbit.com Fri Apr 2 11:09:00 2010 +From: Dave Chinner +Date: Fri, 12 Mar 2010 09:42:08 +1100 +Subject: xfs: reclaim inodes under a write lock +To: stable@kernel.org +Cc: xfs@oss.sgi.com +Message-ID: <1268347337-7160-11-git-send-email-david@fromorbit.com> + +From: Dave Chinner + +commit c8e20be020f234c8d492927a424a7d8bbefd5b5d upstream + +Make the inode tree reclaim walk exclusive to avoid races with +concurrent sync walkers and lookups. This is a version of a patch +posted by Christoph Hellwig that avoids all the code duplication. + +Signed-off-by: Dave Chinner +Reviewed-by: Christoph Hellwig +Signed-off-by: Alex Elder +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/linux-2.6/xfs_sync.c | 154 ++++++++++++++++++----------------------- + fs/xfs/linux-2.6/xfs_sync.h | 2 + fs/xfs/quota/xfs_qm_syscalls.c | 2 + 3 files changed, 71 insertions(+), 87 deletions(-) + +--- a/fs/xfs/linux-2.6/xfs_sync.c ++++ b/fs/xfs/linux-2.6/xfs_sync.c +@@ -64,7 +64,6 @@ xfs_inode_ag_lookup( + * as the tree is sparse and a gang lookup walks to find + * the number of objects requested. + */ +- read_lock(&pag->pag_ici_lock); + if (tag == XFS_ICI_NO_TAG) { + nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, + (void **)&ip, *first_index, 1); +@@ -73,7 +72,7 @@ xfs_inode_ag_lookup( + (void **)&ip, *first_index, 1, tag); + } + if (!nr_found) +- goto unlock; ++ return NULL; + + /* + * Update the index for the next lookup. 
Catch overflows +@@ -83,13 +82,8 @@ xfs_inode_ag_lookup( + */ + *first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1); + if (*first_index < XFS_INO_TO_AGINO(mp, ip->i_ino)) +- goto unlock; +- ++ return NULL; + return ip; +- +-unlock: +- read_unlock(&pag->pag_ici_lock); +- return NULL; + } + + STATIC int +@@ -99,7 +93,8 @@ xfs_inode_ag_walk( + int (*execute)(struct xfs_inode *ip, + struct xfs_perag *pag, int flags), + int flags, +- int tag) ++ int tag, ++ int exclusive) + { + struct xfs_perag *pag = &mp->m_perag[ag]; + uint32_t first_index; +@@ -113,10 +108,20 @@ restart: + int error = 0; + xfs_inode_t *ip; + ++ if (exclusive) ++ write_lock(&pag->pag_ici_lock); ++ else ++ read_lock(&pag->pag_ici_lock); + ip = xfs_inode_ag_lookup(mp, pag, &first_index, tag); +- if (!ip) ++ if (!ip) { ++ if (exclusive) ++ write_unlock(&pag->pag_ici_lock); ++ else ++ read_unlock(&pag->pag_ici_lock); + break; ++ } + ++ /* execute releases pag->pag_ici_lock */ + error = execute(ip, pag, flags); + if (error == EAGAIN) { + skipped++; +@@ -124,9 +129,8 @@ restart: + } + if (error) + last_error = error; +- /* +- * bail out if the filesystem is corrupted. +- */ ++ ++ /* bail out if the filesystem is corrupted. */ + if (error == EFSCORRUPTED) + break; + +@@ -147,7 +151,8 @@ xfs_inode_ag_iterator( + int (*execute)(struct xfs_inode *ip, + struct xfs_perag *pag, int flags), + int flags, +- int tag) ++ int tag, ++ int exclusive) + { + int error = 0; + int last_error = 0; +@@ -156,7 +161,8 @@ xfs_inode_ag_iterator( + for (ag = 0; ag < mp->m_sb.sb_agcount; ag++) { + if (!mp->m_perag[ag].pag_ici_init) + continue; +- error = xfs_inode_ag_walk(mp, ag, execute, flags, tag); ++ error = xfs_inode_ag_walk(mp, ag, execute, flags, tag, ++ exclusive); + if (error) { + last_error = error; + if (error == EFSCORRUPTED) +@@ -180,11 +186,7 @@ xfs_sync_inode_valid( + return EFSCORRUPTED; + } + +- /* +- * If we can't get a reference on the inode, it must be in reclaim. +- * Leave it for the reclaim code to flush. Also avoid inodes that +- * haven't been fully initialised. +- */ ++ /* If we can't get a reference on the inode, it must be in reclaim. */ + if (!igrab(inode)) { + read_unlock(&pag->pag_ici_lock); + return ENOENT; +@@ -281,7 +283,7 @@ xfs_sync_data( + ASSERT((flags & ~(SYNC_TRYLOCK|SYNC_WAIT)) == 0); + + error = xfs_inode_ag_iterator(mp, xfs_sync_inode_data, flags, +- XFS_ICI_NO_TAG); ++ XFS_ICI_NO_TAG, 0); + if (error) + return XFS_ERROR(error); + +@@ -303,7 +305,7 @@ xfs_sync_attr( + ASSERT((flags & ~SYNC_WAIT) == 0); + + return xfs_inode_ag_iterator(mp, xfs_sync_inode_attr, flags, +- XFS_ICI_NO_TAG); ++ XFS_ICI_NO_TAG, 0); + } + + STATIC int +@@ -663,60 +665,6 @@ xfs_syncd_stop( + kthread_stop(mp->m_sync_task); + } + +-STATIC int +-xfs_reclaim_inode( +- xfs_inode_t *ip, +- int sync_mode) +-{ +- xfs_perag_t *pag = xfs_get_perag(ip->i_mount, ip->i_ino); +- +- /* The hash lock here protects a thread in xfs_iget_core from +- * racing with us on linking the inode back with a vnode. +- * Once we have the XFS_IRECLAIM flag set it will not touch +- * us. +- */ +- write_lock(&pag->pag_ici_lock); +- spin_lock(&ip->i_flags_lock); +- if (__xfs_iflags_test(ip, XFS_IRECLAIM) || +- !__xfs_iflags_test(ip, XFS_IRECLAIMABLE)) { +- spin_unlock(&ip->i_flags_lock); +- write_unlock(&pag->pag_ici_lock); +- return -EAGAIN; +- } +- __xfs_iflags_set(ip, XFS_IRECLAIM); +- spin_unlock(&ip->i_flags_lock); +- write_unlock(&pag->pag_ici_lock); +- xfs_put_perag(ip->i_mount, pag); +- +- /* +- * If the inode is still dirty, then flush it out. 
If the inode +- * is not in the AIL, then it will be OK to flush it delwri as +- * long as xfs_iflush() does not keep any references to the inode. +- * We leave that decision up to xfs_iflush() since it has the +- * knowledge of whether it's OK to simply do a delwri flush of +- * the inode or whether we need to wait until the inode is +- * pulled from the AIL. +- * We get the flush lock regardless, though, just to make sure +- * we don't free it while it is being flushed. +- */ +- xfs_ilock(ip, XFS_ILOCK_EXCL); +- xfs_iflock(ip); +- +- /* +- * In the case of a forced shutdown we rely on xfs_iflush() to +- * wait for the inode to be unpinned before returning an error. +- */ +- if (!is_bad_inode(VFS_I(ip)) && xfs_iflush(ip, sync_mode) == 0) { +- /* synchronize with xfs_iflush_done */ +- xfs_iflock(ip); +- xfs_ifunlock(ip); +- } +- +- xfs_iunlock(ip, XFS_ILOCK_EXCL); +- xfs_ireclaim(ip); +- return 0; +-} +- + void + __xfs_inode_set_reclaim_tag( + struct xfs_perag *pag, +@@ -759,19 +707,55 @@ __xfs_inode_clear_reclaim_tag( + } + + STATIC int +-xfs_reclaim_inode_now( ++xfs_reclaim_inode( + struct xfs_inode *ip, + struct xfs_perag *pag, +- int flags) ++ int sync_mode) + { +- /* ignore if already under reclaim */ +- if (xfs_iflags_test(ip, XFS_IRECLAIM)) { +- read_unlock(&pag->pag_ici_lock); ++ /* ++ * The radix tree lock here protects a thread in xfs_iget from racing ++ * with us starting reclaim on the inode. Once we have the ++ * XFS_IRECLAIM flag set it will not touch us. ++ */ ++ spin_lock(&ip->i_flags_lock); ++ ASSERT_ALWAYS(__xfs_iflags_test(ip, XFS_IRECLAIMABLE)); ++ if (__xfs_iflags_test(ip, XFS_IRECLAIM)) { ++ /* ignore as it is already under reclaim */ ++ spin_unlock(&ip->i_flags_lock); ++ write_unlock(&pag->pag_ici_lock); + return 0; + } +- read_unlock(&pag->pag_ici_lock); ++ __xfs_iflags_set(ip, XFS_IRECLAIM); ++ spin_unlock(&ip->i_flags_lock); ++ write_unlock(&pag->pag_ici_lock); + +- return xfs_reclaim_inode(ip, flags); ++ /* ++ * If the inode is still dirty, then flush it out. If the inode ++ * is not in the AIL, then it will be OK to flush it delwri as ++ * long as xfs_iflush() does not keep any references to the inode. ++ * We leave that decision up to xfs_iflush() since it has the ++ * knowledge of whether it's OK to simply do a delwri flush of ++ * the inode or whether we need to wait until the inode is ++ * pulled from the AIL. ++ * We get the flush lock regardless, though, just to make sure ++ * we don't free it while it is being flushed. ++ */ ++ xfs_ilock(ip, XFS_ILOCK_EXCL); ++ xfs_iflock(ip); ++ ++ /* ++ * In the case of a forced shutdown we rely on xfs_iflush() to ++ * wait for the inode to be unpinned before returning an error. 
++ */ ++ if (!is_bad_inode(VFS_I(ip)) && xfs_iflush(ip, sync_mode) == 0) { ++ /* synchronize with xfs_iflush_done */ ++ xfs_iflock(ip); ++ xfs_ifunlock(ip); ++ } ++ ++ xfs_iunlock(ip, XFS_ILOCK_EXCL); ++ xfs_ireclaim(ip); ++ return 0; + } + + int +@@ -779,6 +763,6 @@ xfs_reclaim_inodes( + xfs_mount_t *mp, + int mode) + { +- return xfs_inode_ag_iterator(mp, xfs_reclaim_inode_now, mode, +- XFS_ICI_RECLAIM_TAG); ++ return xfs_inode_ag_iterator(mp, xfs_reclaim_inode, mode, ++ XFS_ICI_RECLAIM_TAG, 1); + } +--- a/fs/xfs/linux-2.6/xfs_sync.h ++++ b/fs/xfs/linux-2.6/xfs_sync.h +@@ -54,6 +54,6 @@ void __xfs_inode_clear_reclaim_tag(struc + int xfs_sync_inode_valid(struct xfs_inode *ip, struct xfs_perag *pag); + int xfs_inode_ag_iterator(struct xfs_mount *mp, + int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag, int flags), +- int flags, int tag); ++ int flags, int tag, int write_lock); + + #endif +--- a/fs/xfs/quota/xfs_qm_syscalls.c ++++ b/fs/xfs/quota/xfs_qm_syscalls.c +@@ -893,7 +893,7 @@ xfs_qm_dqrele_all_inodes( + uint flags) + { + ASSERT(mp->m_quotainfo); +- xfs_inode_ag_iterator(mp, xfs_dqrele_inode, flags, XFS_ICI_NO_TAG); ++ xfs_inode_ag_iterator(mp, xfs_dqrele_inode, flags, XFS_ICI_NO_TAG, 0); + } + + /*------------------------------------------------------------------------*/ diff --git a/queue-2.6.32/xfs-remove-invalid-barrier-optimization-from-xfs_fsync.patch b/queue-2.6.32/xfs-remove-invalid-barrier-optimization-from-xfs_fsync.patch new file mode 100644 index 00000000000..d7d38b26bb9 --- /dev/null +++ b/queue-2.6.32/xfs-remove-invalid-barrier-optimization-from-xfs_fsync.patch @@ -0,0 +1,64 @@ +From david@fromorbit.com Fri Apr 2 11:12:06 2010 +From: Christoph Hellwig +Date: Fri, 12 Mar 2010 09:42:15 +1100 +Subject: xfs: remove invalid barrier optimization from xfs_fsync +To: stable@kernel.org +Cc: xfs@oss.sgi.com +Message-ID: <1268347337-7160-18-git-send-email-david@fromorbit.com> + + +From: Christoph Hellwig + +commit e8b217e7530c6a073ac69f1c85b922d93fdf5647 upstream + +Date: Tue, 2 Feb 2010 10:16:26 +1100 +We always need to flush the disk write cache and can't skip it just because +the no inode attributes have changed. + +Signed-off-by: Christoph Hellwig +Reviewed-by: Dave Chinner +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/xfs_vnodeops.c | 12 ++---------- + 1 file changed, 2 insertions(+), 10 deletions(-) + +--- a/fs/xfs/xfs_vnodeops.c ++++ b/fs/xfs/xfs_vnodeops.c +@@ -597,7 +597,7 @@ xfs_fsync( + { + xfs_trans_t *tp; + int error = 0; +- int log_flushed = 0, changed = 1; ++ int log_flushed = 0; + + xfs_itrace_entry(ip); + +@@ -627,19 +627,11 @@ xfs_fsync( + * disk yet, the inode will be still be pinned. If it is, + * force the log. + */ +- + xfs_iunlock(ip, XFS_ILOCK_SHARED); +- + if (xfs_ipincount(ip)) { + error = _xfs_log_force(ip->i_mount, (xfs_lsn_t)0, + XFS_LOG_FORCE | XFS_LOG_SYNC, + &log_flushed); +- } else { +- /* +- * If the inode is not pinned and nothing has changed +- * we don't need to flush the cache. +- */ +- changed = 0; + } + } else { + /* +@@ -674,7 +666,7 @@ xfs_fsync( + xfs_iunlock(ip, XFS_ILOCK_EXCL); + } + +- if ((ip->i_mount->m_flags & XFS_MOUNT_BARRIER) && changed) { ++ if (ip->i_mount->m_flags & XFS_MOUNT_BARRIER) { + /* + * If the log write didn't issue an ordered tag we need + * to flush the disk cache for the data device now. 
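The effect of dropping the "changed" flag is easiest to see in isolation: once barriers are in use, the only thing that can make the device cache flush unnecessary is the log force having already issued one, not the apparent cleanliness of the inode, because previously written data can still sit in the drive's volatile write cache. A minimal sketch of that decision follows; the helper and field names are illustrative stand-ins, not the kernel's own.

#include <stdbool.h>

struct mount_opts {
	bool barriers_enabled;	/* corresponds to the XFS_MOUNT_BARRIER mount flag */
};

/* hypothetical stand-in for issuing a cache flush to the data device */
static void flush_device_write_cache(void)
{
}

/*
 * Post-patch rule: flush whenever barriers are enabled and the log force
 * did not already push an ordered write.  Whether the in-core inode looks
 * modified no longer enters the decision.
 */
static void fsync_cache_flush(const struct mount_opts *mp, bool log_flushed)
{
	if (mp->barriers_enabled && !log_flushed)
		flush_device_write_cache();
}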
diff --git a/queue-2.6.32/xfs-simplify-inode-teardown.patch b/queue-2.6.32/xfs-simplify-inode-teardown.patch new file mode 100644 index 00000000000..6af994eab54 --- /dev/null +++ b/queue-2.6.32/xfs-simplify-inode-teardown.patch @@ -0,0 +1,206 @@ +From david@fromorbit.com Fri Apr 2 11:04:20 2010 +From: Christoph Hellwig +Date: Fri, 12 Mar 2010 09:41:59 +1100 +Subject: xfs: simplify inode teardown +To: stable@kernel.org +Cc: xfs@oss.sgi.com +Message-ID: <1268347337-7160-2-git-send-email-david@fromorbit.com> + + +From: Christoph Hellwig + +commit 848ce8f731aed0a2d4ab5884a4f6664af73d2dd0 upstream + +Currently the reclaim code for the case where we don't reclaim the +final reclaim is overly complicated. We know that the inode is clean +but instead of just directly reclaiming the clean inode we go through +the whole process of marking the inode reclaimable just to directly +reclaim it from the calling context. Besides being overly complicated +this introduces a race where iget could recycle an inode between +marked reclaimable and actually being reclaimed leading to panics. + +This patch gets rid of the existing reclaim path, and replaces it with +a simple call to xfs_ireclaim if the inode was clean. While we're at +it we also use the slightly more lax xfs_inode_clean check we'd use +later to determine if we need to flush the inode here. + +Finally get rid of xfs_reclaim function and place the remaining small +bits of reclaim code directly into xfs_fs_destroy_inode. + +Signed-off-by: Christoph Hellwig +Reported-by: Patrick Schreurs +Reported-by: Tommy van Leeuwen +Tested-by: Patrick Schreurs +Reviewed-by: Alex Elder +Signed-off-by: Alex Elder +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/linux-2.6/xfs_super.c | 34 ++++++++++++++++++++++++++++++---- + fs/xfs/linux-2.6/xfs_sync.c | 15 ++++----------- + fs/xfs/linux-2.6/xfs_sync.h | 1 - + fs/xfs/xfs_vnodeops.c | 40 ---------------------------------------- + fs/xfs/xfs_vnodeops.h | 1 - + 5 files changed, 34 insertions(+), 57 deletions(-) + +--- a/fs/xfs/linux-2.6/xfs_super.c ++++ b/fs/xfs/linux-2.6/xfs_super.c +@@ -930,13 +930,39 @@ xfs_fs_alloc_inode( + */ + STATIC void + xfs_fs_destroy_inode( +- struct inode *inode) ++ struct inode *inode) + { +- xfs_inode_t *ip = XFS_I(inode); ++ struct xfs_inode *ip = XFS_I(inode); ++ ++ xfs_itrace_entry(ip); + + XFS_STATS_INC(vn_reclaim); +- if (xfs_reclaim(ip)) +- panic("%s: cannot reclaim 0x%p\n", __func__, inode); ++ ++ /* bad inode, get out here ASAP */ ++ if (is_bad_inode(inode)) ++ goto out_reclaim; ++ ++ xfs_ioend_wait(ip); ++ ++ ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || ip->i_delayed_blks == 0); ++ ++ /* ++ * We should never get here with one of the reclaim flags already set. ++ */ ++ ASSERT_ALWAYS(!xfs_iflags_test(ip, XFS_IRECLAIMABLE)); ++ ASSERT_ALWAYS(!xfs_iflags_test(ip, XFS_IRECLAIM)); ++ ++ /* ++ * If we have nothing to flush with this inode then complete the ++ * teardown now, otherwise delay the flush operation. 
++ */ ++ if (!xfs_inode_clean(ip)) { ++ xfs_inode_set_reclaim_tag(ip); ++ return; ++ } ++ ++out_reclaim: ++ xfs_ireclaim(ip); + } + + /* +--- a/fs/xfs/linux-2.6/xfs_sync.c ++++ b/fs/xfs/linux-2.6/xfs_sync.c +@@ -663,10 +663,9 @@ xfs_syncd_stop( + kthread_stop(mp->m_sync_task); + } + +-int ++STATIC int + xfs_reclaim_inode( + xfs_inode_t *ip, +- int locked, + int sync_mode) + { + xfs_perag_t *pag = xfs_get_perag(ip->i_mount, ip->i_ino); +@@ -682,10 +681,6 @@ xfs_reclaim_inode( + !__xfs_iflags_test(ip, XFS_IRECLAIMABLE)) { + spin_unlock(&ip->i_flags_lock); + write_unlock(&pag->pag_ici_lock); +- if (locked) { +- xfs_ifunlock(ip); +- xfs_iunlock(ip, XFS_ILOCK_EXCL); +- } + return -EAGAIN; + } + __xfs_iflags_set(ip, XFS_IRECLAIM); +@@ -704,10 +699,8 @@ xfs_reclaim_inode( + * We get the flush lock regardless, though, just to make sure + * we don't free it while it is being flushed. + */ +- if (!locked) { +- xfs_ilock(ip, XFS_ILOCK_EXCL); +- xfs_iflock(ip); +- } ++ xfs_ilock(ip, XFS_ILOCK_EXCL); ++ xfs_iflock(ip); + + /* + * In the case of a forced shutdown we rely on xfs_iflush() to +@@ -778,7 +771,7 @@ xfs_reclaim_inode_now( + } + read_unlock(&pag->pag_ici_lock); + +- return xfs_reclaim_inode(ip, 0, flags); ++ return xfs_reclaim_inode(ip, flags); + } + + int +--- a/fs/xfs/linux-2.6/xfs_sync.h ++++ b/fs/xfs/linux-2.6/xfs_sync.h +@@ -44,7 +44,6 @@ void xfs_quiesce_attr(struct xfs_mount * + + void xfs_flush_inodes(struct xfs_inode *ip); + +-int xfs_reclaim_inode(struct xfs_inode *ip, int locked, int sync_mode); + int xfs_reclaim_inodes(struct xfs_mount *mp, int mode); + + void xfs_inode_set_reclaim_tag(struct xfs_inode *ip); +--- a/fs/xfs/xfs_vnodeops.c ++++ b/fs/xfs/xfs_vnodeops.c +@@ -2456,46 +2456,6 @@ xfs_set_dmattrs( + return error; + } + +-int +-xfs_reclaim( +- xfs_inode_t *ip) +-{ +- +- xfs_itrace_entry(ip); +- +- ASSERT(!VN_MAPPED(VFS_I(ip))); +- +- /* bad inode, get out here ASAP */ +- if (is_bad_inode(VFS_I(ip))) { +- xfs_ireclaim(ip); +- return 0; +- } +- +- xfs_ioend_wait(ip); +- +- ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || ip->i_delayed_blks == 0); +- +- /* +- * If we have nothing to flush with this inode then complete the +- * teardown now, otherwise break the link between the xfs inode and the +- * linux inode and clean up the xfs inode later. This avoids flushing +- * the inode to disk during the delete operation itself. +- * +- * When breaking the link, we need to set the XFS_IRECLAIMABLE flag +- * first to ensure that xfs_iunpin() will never see an xfs inode +- * that has a linux inode being reclaimed. Synchronisation is provided +- * by the i_flags_lock. +- */ +- if (!ip->i_update_core && (ip->i_itemp == NULL)) { +- xfs_ilock(ip, XFS_ILOCK_EXCL); +- xfs_iflock(ip); +- xfs_iflags_set(ip, XFS_IRECLAIMABLE); +- return xfs_reclaim_inode(ip, 1, XFS_IFLUSH_DELWRI_ELSE_SYNC); +- } +- xfs_inode_set_reclaim_tag(ip); +- return 0; +-} +- + /* + * xfs_alloc_file_space() + * This routine allocates disk space for the given file. 
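Taken together, the destroy_inode path above reduces to a single decision per inode. The sketch below restates it with stub types and helpers (the names are illustrative; the real calls are is_bad_inode, xfs_inode_clean, xfs_ireclaim and xfs_inode_set_reclaim_tag as shown in the hunks above).

#include <stdbool.h>

struct inode_stub {
	bool bad;	/* what is_bad_inode() reports        */
	bool clean;	/* what xfs_inode_clean() reports      */
};

static void reclaim_now(struct inode_stub *ip)                  { (void)ip; }	/* xfs_ireclaim()             */
static void tag_for_background_reclaim(struct inode_stub *ip)   { (void)ip; }	/* xfs_inode_set_reclaim_tag()*/

/*
 * The whole teardown decision after the patch: bad or clean inodes are
 * reclaimed immediately; only dirty inodes are tagged and left for the
 * background reclaim walk to flush later.
 */
static void destroy_inode_sketch(struct inode_stub *ip)
{
	if (ip->bad || ip->clean) {
		reclaim_now(ip);
		return;
	}
	tag_for_background_reclaim(ip);
}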
+--- a/fs/xfs/xfs_vnodeops.h ++++ b/fs/xfs/xfs_vnodeops.h +@@ -38,7 +38,6 @@ int xfs_symlink(struct xfs_inode *dp, st + const char *target_path, mode_t mode, struct xfs_inode **ipp, + cred_t *credp); + int xfs_set_dmattrs(struct xfs_inode *ip, u_int evmask, u_int16_t state); +-int xfs_reclaim(struct xfs_inode *ip); + int xfs_change_file_space(struct xfs_inode *ip, int cmd, + xfs_flock64_t *bf, xfs_off_t offset, int attr_flags); + int xfs_rename(struct xfs_inode *src_dp, struct xfs_name *src_name, diff --git a/queue-2.6.32/xfs-wrapped-journal-record-corruption-on-read-at-recovery.patch b/queue-2.6.32/xfs-wrapped-journal-record-corruption-on-read-at-recovery.patch new file mode 100644 index 00000000000..519d11ea65b --- /dev/null +++ b/queue-2.6.32/xfs-wrapped-journal-record-corruption-on-read-at-recovery.patch @@ -0,0 +1,130 @@ +From david@fromorbit.com Fri Apr 2 11:06:08 2010 +From: Andy Poling +Date: Fri, 12 Mar 2010 09:42:02 +1100 +Subject: xfs: Wrapped journal record corruption on read at recovery +To: stable@kernel.org +Cc: xfs@oss.sgi.com +Message-ID: <1268347337-7160-5-git-send-email-david@fromorbit.com> + + +From: Andy Poling + +commit fc5bc4c85c45f0bf854404e5736aa8b65720a18d upstream + +Summary of problem: + +If a journal record wraps at the physical end of the journal, it has to be +read in two parts in xlog_do_recovery_pass(): a read at the physical end and a +read at the physical beginning. If xlog_bread() has to re-align the first +read, the second read request does not take that re-alignment into account. +If the first read was re-aligned, the second read over-writes the end of the +data from the first read, effectively corrupting it. This can happen either +when reading the record header or reading the record data. + +The first sanity check in xlog_recover_process_data() is to check for a valid +clientid, so that is the error reported. + +Summary of fix: + +If there was a first read at the physical end, XFS_BUF_PTR() returns where the +data was requested to begin. Conversely, because it is the result of +xlog_align(), offset indicates where the requested data for the first read +actually begins - whether or not xlog_bread() has re-aligned it. + +Using offset as the base for the calculation of where to place the second read +data ensures that it will be correctly placed immediately following the data +from the first read instead of sometimes over-writing the end of it. + +The attached patch has resolved the reported problem of occasional inability +to recover the journal (reporting "bad clientid"). + +Signed-off-by: Andy Poling +Reviewed-by: Alex Elder +Signed-off-by: Alex Elder +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/xfs_log_recover.c | 24 +++++++----------------- + 1 file changed, 7 insertions(+), 17 deletions(-) + +--- a/fs/xfs/xfs_log_recover.c ++++ b/fs/xfs/xfs_log_recover.c +@@ -3517,7 +3517,7 @@ xlog_do_recovery_pass( + { + xlog_rec_header_t *rhead; + xfs_daddr_t blk_no; +- xfs_caddr_t bufaddr, offset; ++ xfs_caddr_t offset; + xfs_buf_t *hbp, *dbp; + int error = 0, h_size; + int bblks, split_bblks; +@@ -3610,7 +3610,7 @@ xlog_do_recovery_pass( + /* + * Check for header wrapping around physical end-of-log + */ +- offset = NULL; ++ offset = XFS_BUF_PTR(hbp); + split_hblks = 0; + wrapped_hblks = 0; + if (blk_no + hblks <= log->l_logBBsize) { +@@ -3646,9 +3646,8 @@ xlog_do_recovery_pass( + * - order is important. 
+ */ + wrapped_hblks = hblks - split_hblks; +- bufaddr = XFS_BUF_PTR(hbp); + error = XFS_BUF_SET_PTR(hbp, +- bufaddr + BBTOB(split_hblks), ++ offset + BBTOB(split_hblks), + BBTOB(hblks - split_hblks)); + if (error) + goto bread_err2; +@@ -3658,14 +3657,10 @@ xlog_do_recovery_pass( + if (error) + goto bread_err2; + +- error = XFS_BUF_SET_PTR(hbp, bufaddr, ++ error = XFS_BUF_SET_PTR(hbp, offset, + BBTOB(hblks)); + if (error) + goto bread_err2; +- +- if (!offset) +- offset = xlog_align(log, 0, +- wrapped_hblks, hbp); + } + rhead = (xlog_rec_header_t *)offset; + error = xlog_valid_rec_header(log, rhead, +@@ -3685,7 +3680,7 @@ xlog_do_recovery_pass( + } else { + /* This log record is split across the + * physical end of log */ +- offset = NULL; ++ offset = XFS_BUF_PTR(dbp); + split_bblks = 0; + if (blk_no != log->l_logBBsize) { + /* some data is before the physical +@@ -3714,9 +3709,8 @@ xlog_do_recovery_pass( + * _first_, then the log start (LR header end) + * - order is important. + */ +- bufaddr = XFS_BUF_PTR(dbp); + error = XFS_BUF_SET_PTR(dbp, +- bufaddr + BBTOB(split_bblks), ++ offset + BBTOB(split_bblks), + BBTOB(bblks - split_bblks)); + if (error) + goto bread_err2; +@@ -3727,13 +3721,9 @@ xlog_do_recovery_pass( + if (error) + goto bread_err2; + +- error = XFS_BUF_SET_PTR(dbp, bufaddr, h_size); ++ error = XFS_BUF_SET_PTR(dbp, offset, h_size); + if (error) + goto bread_err2; +- +- if (!offset) +- offset = xlog_align(log, wrapped_hblks, +- bblks - split_bblks, dbp); + } + xlog_unpack_data(rhead, offset, log); + if ((error = xlog_recover_process_data(log, rhash, diff --git a/queue-2.6.32/xfs-xfs_swap_extents-needs-to-handle-dynamic-fork-offsets.patch b/queue-2.6.32/xfs-xfs_swap_extents-needs-to-handle-dynamic-fork-offsets.patch new file mode 100644 index 00000000000..e722537945f --- /dev/null +++ b/queue-2.6.32/xfs-xfs_swap_extents-needs-to-handle-dynamic-fork-offsets.patch @@ -0,0 +1,185 @@ +From david@fromorbit.com Fri Apr 2 11:10:52 2010 +From: Dave Chinner +Date: Fri, 12 Mar 2010 09:42:12 +1100 +Subject: xfs: xfs_swap_extents needs to handle dynamic fork offsets +To: stable@kernel.org +Cc: xfs@oss.sgi.com +Message-ID: <1268347337-7160-15-git-send-email-david@fromorbit.com> + +From: Dave Chinner + +commit e09f98606dcc156de1146c209d45a0d6d5f51c3f upstream + +When swapping extents, we can corrupt inodes by swapping data forks +that are in incompatible formats. This is caused by the two indoes +having different fork offsets due to the presence of an attribute +fork on an attr2 filesystem. xfs_fsr tries to be smart about +setting the fork offset, but the trick it plays only works on attr1 +(old fixed format attribute fork) filesystems. + +Changing the way xfs_fsr sets up the attribute fork will prevent +this situation from ever occurring, so in the kernel code we can get +by with a preventative fix - check that the data fork in the +defragmented inode is in a format valid for the inode it is being +swapped into. This will lead to files that will silently and +potentially repeatedly fail defragmentation, so issue a warning to +the log when this particular failure occurs to let us know that +xfs_fsr needs updating/fixing. + +To help identify how to improve xfs_fsr to avoid this issue, add +trace points for the inodes being swapped so that we can determine +why the swap was rejected and to confirm that the code is making the +right decisions and modifications when swapping forks. 
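The incompatibility being guarded against can be restated concretely: with attr2 the space available to the data fork differs between the two inodes, so a fork layout that fits one inode may not fit the other, and the test has to run in both directions. A simplified model of that fit test follows; the types, record size and field names are illustrative only, not the on-disk XFS structures used by the patch itself.

#include <stdbool.h>
#include <stddef.h>

enum fork_format { FMT_LOCAL, FMT_EXTENTS, FMT_BTREE };

struct data_fork {
	enum fork_format format;
	size_t nextents;	/* extent records held in the fork        */
	size_t broot_bytes;	/* in-core btree root size, if FMT_BTREE  */
	size_t fork_bytes;	/* bytes available before the attr fork   */
};

#define EXTENT_REC_BYTES 16	/* illustrative size of one extent record */

/* would src's data fork still be representable in dst's fork space? */
static bool fork_fits(const struct data_fork *src, const struct data_fork *dst)
{
	if (src->format == FMT_LOCAL || dst->format == FMT_LOCAL)
		return false;	/* local-format forks are never swapped here */
	if (src->format == FMT_EXTENTS)
		return src->nextents * EXTENT_REC_BYTES <= dst->fork_bytes;
	return src->broot_bytes <= dst->fork_bytes;	/* FMT_BTREE root must fit */
}

/* the check runs both ways so neither inode ends up mis-formatted */
static bool swap_allowed(const struct data_fork *a, const struct data_fork *b)
{
	return fork_fits(a, b) && fork_fits(b, a);
}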
+ +A further complication is even when the swap is allowed to proceed +when the fork offset is different between the two inodes then value +for the maximum number of extents the data fork can hold can be +wrong. Make sure these are also set correctly after the swap occurs. + +Signed-off-by: Dave Chinner +Reviewed-by: Christoph Hellwig +Signed-off-by: Alex Elder +Signed-off-by: Greg Kroah-Hartman + +--- + fs/xfs/xfs_dfrag.c | 106 +++++++++++++++++++++++++++++++++++++++++++++-------- + 1 file changed, 90 insertions(+), 16 deletions(-) + +--- a/fs/xfs/xfs_dfrag.c ++++ b/fs/xfs/xfs_dfrag.c +@@ -113,10 +113,82 @@ xfs_swapext( + return error; + } + ++/* ++ * We need to check that the format of the data fork in the temporary inode is ++ * valid for the target inode before doing the swap. This is not a problem with ++ * attr1 because of the fixed fork offset, but attr2 has a dynamically sized ++ * data fork depending on the space the attribute fork is taking so we can get ++ * invalid formats on the target inode. ++ * ++ * E.g. target has space for 7 extents in extent format, temp inode only has ++ * space for 6. If we defragment down to 7 extents, then the tmp format is a ++ * btree, but when swapped it needs to be in extent format. Hence we can't just ++ * blindly swap data forks on attr2 filesystems. ++ * ++ * Note that we check the swap in both directions so that we don't end up with ++ * a corrupt temporary inode, either. ++ * ++ * Note that fixing the way xfs_fsr sets up the attribute fork in the source ++ * inode will prevent this situation from occurring, so all we do here is ++ * reject and log the attempt. basically we are putting the responsibility on ++ * userspace to get this right. ++ */ ++static int ++xfs_swap_extents_check_format( ++ xfs_inode_t *ip, /* target inode */ ++ xfs_inode_t *tip) /* tmp inode */ ++{ ++ ++ /* Should never get a local format */ ++ if (ip->i_d.di_format == XFS_DINODE_FMT_LOCAL || ++ tip->i_d.di_format == XFS_DINODE_FMT_LOCAL) ++ return EINVAL; ++ ++ /* ++ * if the target inode has less extents that then temporary inode then ++ * why did userspace call us? ++ */ ++ if (ip->i_d.di_nextents < tip->i_d.di_nextents) ++ return EINVAL; ++ ++ /* ++ * if the target inode is in extent form and the temp inode is in btree ++ * form then we will end up with the target inode in the wrong format ++ * as we already know there are less extents in the temp inode. 
++ */ ++ if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && ++ tip->i_d.di_format == XFS_DINODE_FMT_BTREE) ++ return EINVAL; ++ ++ /* Check temp in extent form to max in target */ ++ if (tip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && ++ XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) > ip->i_df.if_ext_max) ++ return EINVAL; ++ ++ /* Check target in extent form to max in temp */ ++ if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && ++ XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) > tip->i_df.if_ext_max) ++ return EINVAL; ++ ++ /* Check root block of temp in btree form to max in target */ ++ if (tip->i_d.di_format == XFS_DINODE_FMT_BTREE && ++ XFS_IFORK_BOFF(ip) && ++ tip->i_df.if_broot_bytes > XFS_IFORK_BOFF(ip)) ++ return EINVAL; ++ ++ /* Check root block of target in btree form to max in temp */ ++ if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE && ++ XFS_IFORK_BOFF(tip) && ++ ip->i_df.if_broot_bytes > XFS_IFORK_BOFF(tip)) ++ return EINVAL; ++ ++ return 0; ++} ++ + int + xfs_swap_extents( +- xfs_inode_t *ip, +- xfs_inode_t *tip, ++ xfs_inode_t *ip, /* target inode */ ++ xfs_inode_t *tip, /* tmp inode */ + xfs_swapext_t *sxp) + { + xfs_mount_t *mp; +@@ -160,13 +232,6 @@ xfs_swap_extents( + goto out_unlock; + } + +- /* Should never get a local format */ +- if (ip->i_d.di_format == XFS_DINODE_FMT_LOCAL || +- tip->i_d.di_format == XFS_DINODE_FMT_LOCAL) { +- error = XFS_ERROR(EINVAL); +- goto out_unlock; +- } +- + if (VN_CACHED(VFS_I(tip)) != 0) { + xfs_inval_cached_trace(tip, 0, -1, 0, -1); + error = xfs_flushinval_pages(tip, 0, -1, +@@ -189,13 +254,12 @@ xfs_swap_extents( + goto out_unlock; + } + +- /* +- * If the target has extended attributes, the tmp file +- * must also in order to ensure the correct data fork +- * format. +- */ +- if ( XFS_IFORK_Q(ip) != XFS_IFORK_Q(tip) ) { +- error = XFS_ERROR(EINVAL); ++ /* check inode formats now that data is flushed */ ++ error = xfs_swap_extents_check_format(ip, tip); ++ if (error) { ++ xfs_fs_cmn_err(CE_NOTE, mp, ++ "%s: inode 0x%llx format is incompatible for exchanging.", ++ __FILE__, ip->i_ino); + goto out_unlock; + } + +@@ -276,6 +340,16 @@ xfs_swap_extents( + *tifp = *tempifp; /* struct copy */ + + /* ++ * Fix the in-memory data fork values that are dependent on the fork ++ * offset in the inode. We can't assume they remain the same as attr2 ++ * has dynamic fork offsets. ++ */ ++ ifp->if_ext_max = XFS_IFORK_SIZE(ip, XFS_DATA_FORK) / ++ (uint)sizeof(xfs_bmbt_rec_t); ++ tifp->if_ext_max = XFS_IFORK_SIZE(tip, XFS_DATA_FORK) / ++ (uint)sizeof(xfs_bmbt_rec_t); ++ ++ /* + * Fix the on-disk inode values + */ + tmp = (__uint64_t)ip->i_d.di_nblocks;