libxfs: bmap btree owner swap support

author Dave Chinner <dchinner@redhat.com>

Wed, 13 Nov 2013 06:40:31 +0000 (06:40 +0000)

committer Rich Johnston <rjohnston@sgi.com>

Wed, 13 Nov 2013 17:04:09 +0000 (11:04 -0600)
author Dave Chinner <dchinner@redhat.com>
Wed, 13 Nov 2013 06:40:31 +0000 (06:40 +0000)
committer Rich Johnston <rjohnston@sgi.com>
Wed, 13 Nov 2013 17:04:09 +0000 (11:04 -0600)
diff --git a/include/xfs_bmap_btree.h b/include/xfs_bmap_btree.h

index 2379d334712b72efd3cdfea687ce11c7b474076a..6e42e1e50b89394e2c0a0a003c339ac03d127445 100644 (file)
--- a/include/xfs_bmap_btree.h
+++ b/include/xfs_bmap_btree.h
@@ -133,6 +133,10 @@ extern int xfs_bmbt_get_maxrecs(struct xfs_btree_cur *, int level);
  extern int xfs_bmdr_maxrecs(struct xfs_mount *, int blocklen, int leaf);
  extern int xfs_bmbt_maxrecs(struct xfs_mount *, int blocklen, int leaf);
  
+extern int xfs_bmbt_change_owner(struct xfs_trans *tp, struct xfs_inode *ip,
+                                int whichfork, xfs_ino_t new_owner,
+                                struct list_head *buffer_list);
+
  extern struct xfs_btree_cur *xfs_bmbt_init_cursor(struct xfs_mount *,
                 struct xfs_trans *, struct xfs_inode *, int);
  
diff --git a/include/xfs_btree.h b/include/xfs_btree.h

index 227bfa583dc8c2867cfed6b67aa54128d6e88cac..6afe0b2f8788dcbf926a01a2e0bc4c12cd743af1 100644 (file)
--- a/include/xfs_btree.h
+++ b/include/xfs_btree.h
@@ -41,15 +41,18 @@ extern kmem_zone_t  *xfs_btree_cur_zone;
  /*
   * For logging record fields.
   */
-#define        XFS_BB_MAGIC            0x01
-#define        XFS_BB_LEVEL            0x02
-#define        XFS_BB_NUMRECS          0x04
-#define        XFS_BB_LEFTSIB          0x08
-#define        XFS_BB_RIGHTSIB         0x10
-#define        XFS_BB_BLKNO            0x20
+#define        XFS_BB_MAGIC            (1 << 0)
+#define        XFS_BB_LEVEL            (1 << 1)
+#define        XFS_BB_NUMRECS          (1 << 2)
+#define        XFS_BB_LEFTSIB          (1 << 3)
+#define        XFS_BB_RIGHTSIB         (1 << 4)
+#define        XFS_BB_BLKNO            (1 << 5)
+#define        XFS_BB_LSN              (1 << 6)
+#define        XFS_BB_UUID             (1 << 7)
+#define        XFS_BB_OWNER            (1 << 8)
  #define        XFS_BB_NUM_BITS         5
  #define        XFS_BB_ALL_BITS         ((1 << XFS_BB_NUM_BITS) - 1)
-#define        XFS_BB_NUM_BITS_CRC     8
+#define        XFS_BB_NUM_BITS_CRC     9
  #define        XFS_BB_ALL_BITS_CRC     ((1 << XFS_BB_NUM_BITS_CRC) - 1)
  
  /*
@@ -381,6 +384,8 @@ int xfs_btree_new_iroot(struct xfs_btree_cur *, int *, int *);
  int xfs_btree_insert(struct xfs_btree_cur *, int *);
  int xfs_btree_delete(struct xfs_btree_cur *, int *);
  int xfs_btree_get_rec(struct xfs_btree_cur *, union xfs_btree_rec **, int *);
+int xfs_btree_change_owner(struct xfs_btree_cur *cur, __uint64_t new_owner,
+                          struct list_head *buffer_list);
  
  /*
   * btree block CRC helpers
diff --git a/include/xfs_inode_buf.h b/include/xfs_inode_buf.h

index e8fd3bd11d8e51dd73228acb708bb721e9d2f2c0..9308c47f2a527dc08b75b66de5d064e0b13e0cfe 100644 (file)
--- a/include/xfs_inode_buf.h
+++ b/include/xfs_inode_buf.h
@@ -32,17 +32,17 @@ struct xfs_imap {
         ushort          im_boffset;     /* inode offset in block in bytes */
  };
  
-int            xfs_imap_to_bp(struct xfs_mount *, struct xfs_trans *,
-                              struct xfs_imap *, struct xfs_dinode **,
-                              struct xfs_buf **, uint, uint);
-int            xfs_iread(struct xfs_mount *, struct xfs_trans *,
-                         struct xfs_inode *, uint);
-void           xfs_dinode_calc_crc(struct xfs_mount *, struct xfs_dinode *);
-void           xfs_dinode_to_disk(struct xfs_dinode *,
-                                  struct xfs_icdinode *);
+int    xfs_imap_to_bp(struct xfs_mount *, struct xfs_trans *,
+                      struct xfs_imap *, struct xfs_dinode **,
+                      struct xfs_buf **, uint, uint);
+int    xfs_iread(struct xfs_mount *, struct xfs_trans *,
+                 struct xfs_inode *, uint);
+void   xfs_dinode_calc_crc(struct xfs_mount *, struct xfs_dinode *);
+void   xfs_dinode_to_disk(struct xfs_dinode *to, struct xfs_icdinode *from);
+void   xfs_dinode_from_disk(struct xfs_icdinode *to, struct xfs_dinode *from);
  
  #if defined(DEBUG)
-void           xfs_inobp_check(struct xfs_mount *, struct xfs_buf *);
+void   xfs_inobp_check(struct xfs_mount *, struct xfs_buf *);
  #else
  #define        xfs_inobp_check(mp, bp)
  #endif /* DEBUG */
diff --git a/include/xfs_log_format.h b/include/xfs_log_format.h

index aeaa7157e094415502454e1caa7da6f60a53fe3c..f0969c77bdbe1ea7d87e732ba27a8c4d3fbd54c4 100644 (file)
--- a/include/xfs_log_format.h
+++ b/include/xfs_log_format.h
@@ -302,6 +302,8 @@ typedef struct xfs_inode_log_format_64 {
  #define        XFS_ILOG_ADATA  0x040   /* log i_af.if_data */
  #define        XFS_ILOG_AEXT   0x080   /* log i_af.if_extents */
  #define        XFS_ILOG_ABROOT 0x100   /* log i_af.i_broot */
+#define XFS_ILOG_DOWNER        0x200   /* change the data fork owner on replay */
+#define XFS_ILOG_AOWNER        0x400   /* change the attr fork owner on replay */
  
  
  /*
@@ -315,7 +317,8 @@ typedef struct xfs_inode_log_format_64 {
  #define        XFS_ILOG_NONCORE        (XFS_ILOG_DDATA | XFS_ILOG_DEXT | \
                                  XFS_ILOG_DBROOT | XFS_ILOG_DEV | \
                                  XFS_ILOG_UUID | XFS_ILOG_ADATA | \
-                                XFS_ILOG_AEXT | XFS_ILOG_ABROOT)
+                                XFS_ILOG_AEXT | XFS_ILOG_ABROOT | \
+                                XFS_ILOG_DOWNER | XFS_ILOG_AOWNER)
  
  #define        XFS_ILOG_DFORK          (XFS_ILOG_DDATA | XFS_ILOG_DEXT | \
                                  XFS_ILOG_DBROOT)
@@ -327,7 +330,8 @@ typedef struct xfs_inode_log_format_64 {
                                  XFS_ILOG_DEXT | XFS_ILOG_DBROOT | \
                                  XFS_ILOG_DEV | XFS_ILOG_UUID | \
                                  XFS_ILOG_ADATA | XFS_ILOG_AEXT | \
-                                XFS_ILOG_ABROOT | XFS_ILOG_TIMESTAMP)
+                                XFS_ILOG_ABROOT | XFS_ILOG_TIMESTAMP | \
+                                XFS_ILOG_DOWNER | XFS_ILOG_AOWNER)
  
  static inline int xfs_ilog_fbroot(int w)
  {
diff --git a/libxfs/xfs_bmap_btree.c b/libxfs/xfs_bmap_btree.c

index bf214cf85675af834c58c09d1b5aa673746e4ad4..a848b71793dba4e11f0677a6768ea4841881b352 100644 (file)
--- a/libxfs/xfs_bmap_btree.c
+++ b/libxfs/xfs_bmap_btree.c
@@ -999,3 +999,47 @@ xfs_bmdr_maxrecs(
                 return blocklen / sizeof(xfs_bmdr_rec_t);
         return blocklen / (sizeof(xfs_bmdr_key_t) + sizeof(xfs_bmdr_ptr_t));
  }
+
+/*
+ * Change the owner of a btree format fork of the inode passed in. Change it to
+ * the owner that is passed in so that we can change owners before or after
+ * we switch forks between inodes. The operation that the caller is doing will
+ * determine whether it needs to change owner before or after the switch.
+ *
+ * For demand paged transactional modification, the fork switch should be done
+ * after reading in all the blocks, modifying them and pinning them in the
+ * transaction. For modification when the buffers are already pinned in memory,
+ * the fork switch can be done before changing the owner as we won't need to
+ * validate the owner until the btree buffers are unpinned and writes can occur
+ * again.
+ *
+ * For recovery based ownership change, there is no transactional context and
+ * so a buffer list must be supplied so that we can record the buffers that we
+ * modified for the caller to issue IO on.
+ */
+int
+xfs_bmbt_change_owner(
+       struct xfs_trans        *tp,
+       struct xfs_inode        *ip,
+       int                     whichfork,
+       xfs_ino_t               new_owner,
+       struct list_head        *buffer_list)
+{
+       struct xfs_btree_cur    *cur;
+       int                     error;
+
+       ASSERT(tp || buffer_list);      /* need a transaction or a list... */
+       ASSERT(!(tp && buffer_list));   /* ...but never both at once */
+       if (whichfork == XFS_DATA_FORK)
+               ASSERT(ip->i_d.di_format == XFS_DINODE_FMT_BTREE);
+       else
+               ASSERT(ip->i_d.di_aformat == XFS_DINODE_FMT_BTREE);
+
+       cur = xfs_bmbt_init_cursor(ip->i_mount, tp, ip, whichfork);
+       if (!cur)
+               return ENOMEM;
+
+       error = xfs_btree_change_owner(cur, new_owner, buffer_list);
+       xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
+       return error;
+}
diff --git a/libxfs/xfs_btree.c b/libxfs/xfs_btree.c

index ce149adb843e91e1d8303104cfb40f903e91171d..2dd6fb7646c4ffc3cb3c2fb536737e414bb458f8 100644 (file)
--- a/libxfs/xfs_btree.c
+++ b/libxfs/xfs_btree.c
@@ -837,6 +837,41 @@ xfs_btree_readahead(
         return xfs_btree_readahead_sblock(cur, lr, block);
  }
  
+STATIC xfs_daddr_t
+xfs_btree_ptr_to_daddr(
+       struct xfs_btree_cur    *cur,
+       union xfs_btree_ptr     *ptr)
+{
+       if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {     /* 64-bit ptr->l form */
+               ASSERT(ptr->l != cpu_to_be64(NULLDFSBNO));
+
+               return XFS_FSB_TO_DADDR(cur->bc_mp, be64_to_cpu(ptr->l));
+       } else {        /* 32-bit ptr->s form, resolved against cursor's agno */
+               ASSERT(cur->bc_private.a.agno != NULLAGNUMBER);
+               ASSERT(ptr->s != cpu_to_be32(NULLAGBLOCK));
+
+               return XFS_AGB_TO_DADDR(cur->bc_mp, cur->bc_private.a.agno,
+                                       be32_to_cpu(ptr->s));
+       }
+}
+
+/*
+ * Issue readahead for @count btree blocks starting at the block at @ptr.
+ *
+ * We don't need to care about long or short form btrees here, because
+ * xfs_btree_ptr_to_daddr() converts either pointer form directly to a daddr.
+ */
+STATIC void
+xfs_btree_readahead_ptr(
+       struct xfs_btree_cur    *cur,
+       union xfs_btree_ptr     *ptr,
+       xfs_extlen_t            count)
+{
+       xfs_buf_readahead(cur->bc_mp->m_ddev_targp,
+                         xfs_btree_ptr_to_daddr(cur, ptr),
+                         cur->bc_mp->m_bsize * count, cur->bc_ops->buf_ops);
+}
+
  /*
   * Set the buffer for level "lev" in the cursor to bp, releasing
   * any previous buffer.
@@ -1055,24 +1090,6 @@ xfs_btree_buf_to_ptr(
         }
  }
  
-STATIC xfs_daddr_t
-xfs_btree_ptr_to_daddr(
-       struct xfs_btree_cur    *cur,
-       union xfs_btree_ptr     *ptr)
-{
-       if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
-               ASSERT(ptr->l != cpu_to_be64(NULLDFSBNO));
-
-               return XFS_FSB_TO_DADDR(cur->bc_mp, be64_to_cpu(ptr->l));
-       } else {
-               ASSERT(cur->bc_private.a.agno != NULLAGNUMBER);
-               ASSERT(ptr->s != cpu_to_be32(NULLAGBLOCK));
-
-               return XFS_AGB_TO_DADDR(cur->bc_mp, cur->bc_private.a.agno,
-                                       be32_to_cpu(ptr->s));
-       }
-}
-
  STATIC void
  xfs_btree_set_refs(
         struct xfs_btree_cur    *cur,
@@ -3851,3 +3868,120 @@ xfs_btree_get_rec(
         *stat = 1;
         return 0;
  }
+
+/*
+ * Change the owner of a btree.
+ *
+ * The mechanism we use here is ordered buffer logging. Because we don't know
+ * how many buffers we are going to need to modify, we don't really want to
+ * have to make transaction reservations for the worst case of every buffer in a
+ * full size btree as that may be more space than we can fit in the log....
+ *
+ * We do the btree walk in the most optimal manner possible - we have sibling
+ * pointers so we can just walk all the blocks on each level from left to right
+ * in a single pass, and then move to the next level and do the same. We can
+ * also do readahead on the sibling pointers to get IO moving more quickly,
+ * though for slow disks this is unlikely to make much difference to performance
+ * as the amount of CPU work we have to do before moving to the next block is
+ * relatively small.
+ *
+ * For each btree block that we load, modify the owner appropriately, set the
+ * buffer as an ordered buffer and log it appropriately. We need to ensure that
+ * we mark the region we change dirty so that if the buffer is relogged in
+ * a subsequent transaction the changes we make here as an ordered buffer are
+ * correctly relogged in that transaction.  If we are in recovery context, then
+ * just queue the modified buffer as delayed write buffer so the transaction
+ * recovery completion writes the changes to disk.
+ */
+static int
+xfs_btree_block_change_owner(
+       struct xfs_btree_cur    *cur,
+       int                     level,
+       __uint64_t              new_owner,
+       struct list_head        *buffer_list)
+{
+       struct xfs_btree_block  *block;
+       struct xfs_buf          *bp;
+       union xfs_btree_ptr     rptr;
+
+       /* do right sibling readahead */
+       xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA);
+
+       /* modify the owner; field width depends on long/short pointer form */
+       block = xfs_btree_get_block(cur, level, &bp);
+       if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
+               block->bb_u.l.bb_owner = cpu_to_be64(new_owner);
+       else
+               block->bb_u.s.bb_owner = cpu_to_be32(new_owner);
+
+       /*
+        * If the block is a root block hosted in an inode, we might not have a
+        * buffer pointer here and we shouldn't attempt to log the change as the
+        * information is already held in the inode and discarded when the root
+        * block is formatted into the on-disk inode fork. We still change it,
+        * though, so everything is consistent in memory.
+        */
+       if (bp) {
+               if (cur->bc_tp) {
+                       xfs_trans_ordered_buf(cur->bc_tp, bp);
+                       xfs_btree_log_block(cur, bp, XFS_BB_OWNER);
+               } else {
+                       xfs_buf_delwri_queue(bp, buffer_list);
+               }
+       } else {
+               ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
+               ASSERT(level == cur->bc_nlevels - 1);
+       }
+
+       /* now read the right-hand sibling block for the next iteration */
+       xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB);
+       if (xfs_btree_ptr_is_null(cur, &rptr))
+               return ENOENT;  /* no right sibling: this level is done */
+
+       return xfs_btree_lookup_get_block(cur, level, &rptr, &block);
+}
+
+int
+xfs_btree_change_owner(
+       struct xfs_btree_cur    *cur,
+       __uint64_t              new_owner,
+       struct list_head        *buffer_list)
+{
+       union xfs_btree_ptr     lptr;   /* first block to visit on a level */
+       int                     level;
+       struct xfs_btree_block  *block = NULL;
+       int                     error = 0;
+
+       cur->bc_ops->init_ptr_from_cur(cur, &lptr);
+
+       /* for each level, starting at bc_nlevels - 1 and working down to 0 */
+       for (level = cur->bc_nlevels - 1; level >= 0; level--) {
+               /* grab the left hand block */
+               error = xfs_btree_lookup_get_block(cur, level, &lptr, &block);
+               if (error)
+                       return error;
+
+               /* readahead the left most block for the next level down */
+               if (level > 0) {
+                       union xfs_btree_ptr     *ptr;
+
+                       ptr = xfs_btree_ptr_addr(cur, 1, block);
+                       xfs_btree_readahead_ptr(cur, ptr, 1);
+
+                       /* save for the next iteration of the loop */
+                       lptr = *ptr;
+               }
+
+               /* for each buffer in the level, until a non-zero return */
+               do {
+                       error = xfs_btree_block_change_owner(cur, level,
+                                                            new_owner,
+                                                            buffer_list);
+               } while (!error);
+
+               if (error != ENOENT)    /* ENOENT only marks end of level */
+                       return error;
+       }
+
+       return 0;
+}
author	Dave Chinner <dchinner@redhat.com>
	Wed, 13 Nov 2013 06:40:31 +0000 (06:40 +0000)
committer	Rich Johnston <rjohnston@sgi.com>
	Wed, 13 Nov 2013 17:04:09 +0000 (11:04 -0600)
include/xfs_bmap_btree.h		patch \| blob \| blame \| history
include/xfs_btree.h		patch \| blob \| blame \| history
include/xfs_inode_buf.h		patch \| blob \| blame \| history
include/xfs_log_format.h		patch \| blob \| blame \| history
libxfs/xfs_bmap_btree.c		patch \| blob \| blame \| history
libxfs/xfs_btree.c		patch \| blob \| blame \| history