]> git.ipfire.org Git - thirdparty/xfsprogs-dev.git/commitdiff
libxfs: bmap btree owner swap support
authorDave Chinner <dchinner@redhat.com>
Wed, 13 Nov 2013 06:40:31 +0000 (06:40 +0000)
committerRich Johnston <rjohnston@sgi.com>
Wed, 13 Nov 2013 17:04:09 +0000 (11:04 -0600)
For CRC enabled filesystems, we can't just swap inode forks from one
inode to another when defragmenting a file - the blocks in the inode
fork bmap btree contain pointers back to the owner inode. Hence if
we are to swap the inode forks we have to atomically modify every
block in the btree during the transaction.

This patch brings across the kernel code for doing the owner
swap of an entire fork - something that we are likely to end up
needing in xfs_repair when reparenting stray inodes to lost+found -
without all the associated swap extents transaction and recovery
cruft as those parts are not needed in userspace.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Rich Johnston <rjohnston@sgi.com>
include/xfs_bmap_btree.h
include/xfs_btree.h
include/xfs_inode_buf.h
include/xfs_log_format.h
libxfs/xfs_bmap_btree.c
libxfs/xfs_btree.c

index 2379d334712b72efd3cdfea687ce11c7b474076a..6e42e1e50b89394e2c0a0a003c339ac03d127445 100644 (file)
@@ -133,6 +133,10 @@ extern int xfs_bmbt_get_maxrecs(struct xfs_btree_cur *, int level);
 extern int xfs_bmdr_maxrecs(struct xfs_mount *, int blocklen, int leaf);
 extern int xfs_bmbt_maxrecs(struct xfs_mount *, int blocklen, int leaf);
 
+extern int xfs_bmbt_change_owner(struct xfs_trans *tp, struct xfs_inode *ip,
+                                int whichfork, xfs_ino_t new_owner,
+                                struct list_head *buffer_list);
+
 extern struct xfs_btree_cur *xfs_bmbt_init_cursor(struct xfs_mount *,
                struct xfs_trans *, struct xfs_inode *, int);
 
index 227bfa583dc8c2867cfed6b67aa54128d6e88cac..6afe0b2f8788dcbf926a01a2e0bc4c12cd743af1 100644 (file)
@@ -41,15 +41,18 @@ extern kmem_zone_t  *xfs_btree_cur_zone;
 /*
  * For logging record fields.
  */
-#define        XFS_BB_MAGIC            0x01
-#define        XFS_BB_LEVEL            0x02
-#define        XFS_BB_NUMRECS          0x04
-#define        XFS_BB_LEFTSIB          0x08
-#define        XFS_BB_RIGHTSIB         0x10
-#define        XFS_BB_BLKNO            0x20
+#define        XFS_BB_MAGIC            (1 << 0)
+#define        XFS_BB_LEVEL            (1 << 1)
+#define        XFS_BB_NUMRECS          (1 << 2)
+#define        XFS_BB_LEFTSIB          (1 << 3)
+#define        XFS_BB_RIGHTSIB         (1 << 4)
+#define        XFS_BB_BLKNO            (1 << 5)
+#define        XFS_BB_LSN              (1 << 6)
+#define        XFS_BB_UUID             (1 << 7)
+#define        XFS_BB_OWNER            (1 << 8)
 #define        XFS_BB_NUM_BITS         5
 #define        XFS_BB_ALL_BITS         ((1 << XFS_BB_NUM_BITS) - 1)
-#define        XFS_BB_NUM_BITS_CRC     8
+#define        XFS_BB_NUM_BITS_CRC     9
 #define        XFS_BB_ALL_BITS_CRC     ((1 << XFS_BB_NUM_BITS_CRC) - 1)
 
 /*
@@ -381,6 +384,8 @@ int xfs_btree_new_iroot(struct xfs_btree_cur *, int *, int *);
 int xfs_btree_insert(struct xfs_btree_cur *, int *);
 int xfs_btree_delete(struct xfs_btree_cur *, int *);
 int xfs_btree_get_rec(struct xfs_btree_cur *, union xfs_btree_rec **, int *);
+int xfs_btree_change_owner(struct xfs_btree_cur *cur, __uint64_t new_owner,
+                          struct list_head *buffer_list);
 
 /*
  * btree block CRC helpers
index e8fd3bd11d8e51dd73228acb708bb721e9d2f2c0..9308c47f2a527dc08b75b66de5d064e0b13e0cfe 100644 (file)
@@ -32,17 +32,17 @@ struct xfs_imap {
        ushort          im_boffset;     /* inode offset in block in bytes */
 };
 
-int            xfs_imap_to_bp(struct xfs_mount *, struct xfs_trans *,
-                              struct xfs_imap *, struct xfs_dinode **,
-                              struct xfs_buf **, uint, uint);
-int            xfs_iread(struct xfs_mount *, struct xfs_trans *,
-                         struct xfs_inode *, uint);
-void           xfs_dinode_calc_crc(struct xfs_mount *, struct xfs_dinode *);
-void           xfs_dinode_to_disk(struct xfs_dinode *,
-                                  struct xfs_icdinode *);
+int    xfs_imap_to_bp(struct xfs_mount *, struct xfs_trans *,
+                      struct xfs_imap *, struct xfs_dinode **,
+                      struct xfs_buf **, uint, uint);
+int    xfs_iread(struct xfs_mount *, struct xfs_trans *,
+                 struct xfs_inode *, uint);
+void   xfs_dinode_calc_crc(struct xfs_mount *, struct xfs_dinode *);
+void   xfs_dinode_to_disk(struct xfs_dinode *to, struct xfs_icdinode *from);
+void   xfs_dinode_from_disk(struct xfs_icdinode *to, struct xfs_dinode *from);
 
 #if defined(DEBUG)
-void           xfs_inobp_check(struct xfs_mount *, struct xfs_buf *);
+void   xfs_inobp_check(struct xfs_mount *, struct xfs_buf *);
 #else
 #define        xfs_inobp_check(mp, bp)
 #endif /* DEBUG */
index aeaa7157e094415502454e1caa7da6f60a53fe3c..f0969c77bdbe1ea7d87e732ba27a8c4d3fbd54c4 100644 (file)
@@ -302,6 +302,8 @@ typedef struct xfs_inode_log_format_64 {
 #define        XFS_ILOG_ADATA  0x040   /* log i_af.if_data */
 #define        XFS_ILOG_AEXT   0x080   /* log i_af.if_extents */
 #define        XFS_ILOG_ABROOT 0x100   /* log i_af.i_broot */
+#define XFS_ILOG_DOWNER        0x200   /* change the data fork owner on replay */
+#define XFS_ILOG_AOWNER        0x400   /* change the attr fork owner on replay */
 
 
 /*
@@ -315,7 +317,8 @@ typedef struct xfs_inode_log_format_64 {
 #define        XFS_ILOG_NONCORE        (XFS_ILOG_DDATA | XFS_ILOG_DEXT | \
                                 XFS_ILOG_DBROOT | XFS_ILOG_DEV | \
                                 XFS_ILOG_UUID | XFS_ILOG_ADATA | \
-                                XFS_ILOG_AEXT | XFS_ILOG_ABROOT)
+                                XFS_ILOG_AEXT | XFS_ILOG_ABROOT | \
+                                XFS_ILOG_DOWNER | XFS_ILOG_AOWNER)
 
 #define        XFS_ILOG_DFORK          (XFS_ILOG_DDATA | XFS_ILOG_DEXT | \
                                 XFS_ILOG_DBROOT)
@@ -327,7 +330,8 @@ typedef struct xfs_inode_log_format_64 {
                                 XFS_ILOG_DEXT | XFS_ILOG_DBROOT | \
                                 XFS_ILOG_DEV | XFS_ILOG_UUID | \
                                 XFS_ILOG_ADATA | XFS_ILOG_AEXT | \
-                                XFS_ILOG_ABROOT | XFS_ILOG_TIMESTAMP)
+                                XFS_ILOG_ABROOT | XFS_ILOG_TIMESTAMP | \
+                                XFS_ILOG_DOWNER | XFS_ILOG_AOWNER)
 
 static inline int xfs_ilog_fbroot(int w)
 {
index bf214cf85675af834c58c09d1b5aa673746e4ad4..a848b71793dba4e11f0677a6768ea4841881b352 100644 (file)
@@ -999,3 +999,47 @@ xfs_bmdr_maxrecs(
                return blocklen / sizeof(xfs_bmdr_rec_t);
        return blocklen / (sizeof(xfs_bmdr_key_t) + sizeof(xfs_bmdr_ptr_t));
 }
+
+/*
+ * Change the owner of a btree format fork of the inode passed in. Change it to
+ * the owner of that is passed in so that we can change owners before or after
+ * we switch forks between inodes. The operation that the caller is doing will
+ * determine whether is needs to change owner before or after the switch.
+ *
+ * For demand paged transactional modification, the fork switch should be done
+ * after reading in all the blocks, modifying them and pinning them in the
+ * transaction. For modification when the buffers are already pinned in memory,
+ * the fork switch can be done before changing the owner as we won't need to
+ * validate the owner until the btree buffers are unpinned and writes can occur
+ * again.
+ *
+ * For recovery based ownership change, there is no transactional context and
+ * so a buffer list must be supplied so that we can record the buffers that we
+ * modified for the caller to issue IO on.
+ */
+int
+xfs_bmbt_change_owner(
+       struct xfs_trans        *tp,
+       struct xfs_inode        *ip,
+       int                     whichfork,
+       xfs_ino_t               new_owner,
+       struct list_head        *buffer_list)
+{
+       struct xfs_btree_cur    *cur;
+       int                     error;
+
+       ASSERT(tp || buffer_list);
+       ASSERT(!(tp && buffer_list));
+       if (whichfork == XFS_DATA_FORK)
+               ASSERT(ip->i_d.di_format == XFS_DINODE_FMT_BTREE);
+       else
+               ASSERT(ip->i_d.di_aformat == XFS_DINODE_FMT_BTREE);
+
+       cur = xfs_bmbt_init_cursor(ip->i_mount, tp, ip, whichfork);
+       if (!cur)
+               return ENOMEM;
+
+       error = xfs_btree_change_owner(cur, new_owner, buffer_list);
+       xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
+       return error;
+}
index ce149adb843e91e1d8303104cfb40f903e91171d..2dd6fb7646c4ffc3cb3c2fb536737e414bb458f8 100644 (file)
@@ -837,6 +837,41 @@ xfs_btree_readahead(
        return xfs_btree_readahead_sblock(cur, lr, block);
 }
 
+STATIC xfs_daddr_t
+xfs_btree_ptr_to_daddr(
+       struct xfs_btree_cur    *cur,
+       union xfs_btree_ptr     *ptr)
+{
+       if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
+               ASSERT(ptr->l != cpu_to_be64(NULLDFSBNO));
+
+               return XFS_FSB_TO_DADDR(cur->bc_mp, be64_to_cpu(ptr->l));
+       } else {
+               ASSERT(cur->bc_private.a.agno != NULLAGNUMBER);
+               ASSERT(ptr->s != cpu_to_be32(NULLAGBLOCK));
+
+               return XFS_AGB_TO_DADDR(cur->bc_mp, cur->bc_private.a.agno,
+                                       be32_to_cpu(ptr->s));
+       }
+}
+
+/*
+ * Readahead @count btree blocks at the given @ptr location.
+ *
+ * We don't need to care about long or short form btrees here as we have a
+ * method of converting the ptr directly to a daddr available to us.
+ */
+STATIC void
+xfs_btree_readahead_ptr(
+       struct xfs_btree_cur    *cur,
+       union xfs_btree_ptr     *ptr,
+       xfs_extlen_t            count)
+{
+       xfs_buf_readahead(cur->bc_mp->m_ddev_targp,
+                         xfs_btree_ptr_to_daddr(cur, ptr),
+                         cur->bc_mp->m_bsize * count, cur->bc_ops->buf_ops);
+}
+
 /*
  * Set the buffer for level "lev" in the cursor to bp, releasing
  * any previous buffer.
@@ -1055,24 +1090,6 @@ xfs_btree_buf_to_ptr(
        }
 }
 
-STATIC xfs_daddr_t
-xfs_btree_ptr_to_daddr(
-       struct xfs_btree_cur    *cur,
-       union xfs_btree_ptr     *ptr)
-{
-       if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
-               ASSERT(ptr->l != cpu_to_be64(NULLDFSBNO));
-
-               return XFS_FSB_TO_DADDR(cur->bc_mp, be64_to_cpu(ptr->l));
-       } else {
-               ASSERT(cur->bc_private.a.agno != NULLAGNUMBER);
-               ASSERT(ptr->s != cpu_to_be32(NULLAGBLOCK));
-
-               return XFS_AGB_TO_DADDR(cur->bc_mp, cur->bc_private.a.agno,
-                                       be32_to_cpu(ptr->s));
-       }
-}
-
 STATIC void
 xfs_btree_set_refs(
        struct xfs_btree_cur    *cur,
@@ -3851,3 +3868,120 @@ xfs_btree_get_rec(
        *stat = 1;
        return 0;
 }
+
+/*
+ * Change the owner of a btree.
+ *
+ * The mechanism we use here is ordered buffer logging. Because we don't know
+ * how many buffers were are going to need to modify, we don't really want to
+ * have to make transaction reservations for the worst case of every buffer in a
+ * full size btree as that may be more space that we can fit in the log....
+ *
+ * We do the btree walk in the most optimal manner possible - we have sibling
+ * pointers so we can just walk all the blocks on each level from left to right
+ * in a single pass, and then move to the next level and do the same. We can
+ * also do readahead on the sibling pointers to get IO moving more quickly,
+ * though for slow disks this is unlikely to make much difference to performance
+ * as the amount of CPU work we have to do before moving to the next block is
+ * relatively small.
+ *
+ * For each btree block that we load, modify the owner appropriately, set the
+ * buffer as an ordered buffer and log it appropriately. We need to ensure that
+ * we mark the region we change dirty so that if the buffer is relogged in
+ * a subsequent transaction the changes we make here as an ordered buffer are
+ * correctly relogged in that transaction.  If we are in recovery context, then
+ * just queue the modified buffer as delayed write buffer so the transaction
+ * recovery completion writes the changes to disk.
+ */
+static int
+xfs_btree_block_change_owner(
+       struct xfs_btree_cur    *cur,
+       int                     level,
+       __uint64_t              new_owner,
+       struct list_head        *buffer_list)
+{
+       struct xfs_btree_block  *block;
+       struct xfs_buf          *bp;
+       union xfs_btree_ptr     rptr;
+
+       /* do right sibling readahead */
+       xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA);
+
+       /* modify the owner */
+       block = xfs_btree_get_block(cur, level, &bp);
+       if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
+               block->bb_u.l.bb_owner = cpu_to_be64(new_owner);
+       else
+               block->bb_u.s.bb_owner = cpu_to_be32(new_owner);
+
+       /*
+        * If the block is a root block hosted in an inode, we might not have a
+        * buffer pointer here and we shouldn't attempt to log the change as the
+        * information is already held in the inode and discarded when the root
+        * block is formatted into the on-disk inode fork. We still change it,
+        * though, so everything is consistent in memory.
+        */
+       if (bp) {
+               if (cur->bc_tp) {
+                       xfs_trans_ordered_buf(cur->bc_tp, bp);
+                       xfs_btree_log_block(cur, bp, XFS_BB_OWNER);
+               } else {
+                       xfs_buf_delwri_queue(bp, buffer_list);
+               }
+       } else {
+               ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
+               ASSERT(level == cur->bc_nlevels - 1);
+       }
+
+       /* now read rh sibling block for next iteration */
+       xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB);
+       if (xfs_btree_ptr_is_null(cur, &rptr))
+               return ENOENT;
+
+       return xfs_btree_lookup_get_block(cur, level, &rptr, &block);
+}
+
+int
+xfs_btree_change_owner(
+       struct xfs_btree_cur    *cur,
+       __uint64_t              new_owner,
+       struct list_head        *buffer_list)
+{
+       union xfs_btree_ptr     lptr;
+       int                     level;
+       struct xfs_btree_block  *block = NULL;
+       int                     error = 0;
+
+       cur->bc_ops->init_ptr_from_cur(cur, &lptr);
+
+       /* for each level */
+       for (level = cur->bc_nlevels - 1; level >= 0; level--) {
+               /* grab the left hand block */
+               error = xfs_btree_lookup_get_block(cur, level, &lptr, &block);
+               if (error)
+                       return error;
+
+               /* readahead the left most block for the next level down */
+               if (level > 0) {
+                       union xfs_btree_ptr     *ptr;
+
+                       ptr = xfs_btree_ptr_addr(cur, 1, block);
+                       xfs_btree_readahead_ptr(cur, ptr, 1);
+
+                       /* save for the next iteration of the loop */
+                       lptr = *ptr;
+               }
+
+               /* for each buffer in the level */
+               do {
+                       error = xfs_btree_block_change_owner(cur, level,
+                                                            new_owner,
+                                                            buffer_list);
+               } while (!error);
+
+               if (error != ENOENT)
+                       return error;
+       }
+
+       return 0;
+}