Add priority tagging and buffer reuse for libxfs cache
author     Barry Naujok <bnaujok@sgi.com>    Mon, 16 Jul 2007 15:56:17 +0000 (15:56 +0000)
committer  Barry Naujok <bnaujok@sgi.com>    Mon, 16 Jul 2007 15:56:17 +0000 (15:56 +0000)
Merge of master-melb:xfs-cmds:29150a by kenmcd.

  Update to 2.9.2

VERSION
doc/CHANGES
include/cache.h
include/libxfs.h
include/list.h
libxfs/cache.c
libxfs/rdwr.c
repair/phase6.c
repair/prefetch.c

diff --git a/VERSION b/VERSION
index fc4221ec10dd8de7d08222256334e4b3c1b8d881..e6abef7de1cfeb8a63e89ddc3a8c2e280c52c1ae 100644
--- a/VERSION
+++ b/VERSION
@@ -3,5 +3,5 @@
 #
 PKG_MAJOR=2
 PKG_MINOR=9
-PKG_REVISION=1
+PKG_REVISION=2
 PKG_BUILD=1
diff --git a/doc/CHANGES b/doc/CHANGES
index 4bbc1ea3275f8e6fc8f2659fdf051755207d735d..bbaced5f17a548c6b91376ec46970f28bc2ee223 100644
--- a/doc/CHANGES
+++ b/doc/CHANGES
@@ -1,3 +1,20 @@
+xfsprogs-2.9.2 (16 July 2007)
+       - Next major round of xfs_repair performance improvements:
+               - Cache disk nlink values in Phase 3 for Phase 7.
+               - Do multithreaded prefetch/processing based on AG stride
+                 option (ie. for concats).
+               - Don't trash lost+found at the start of Phase 4, eliminates
+                 repeated "moving disconnected inode to lost+found" with
+                 successive xfs_repair runs.
+               - Do multi-threaded sequential metadata prefetch.
+                 Method based on Agami patches posted for 2.7.18 xfsprogs.
+               - Improve the libxfs cache with priority tagging to keep
+                 blocks around that have unfavourable I/O characteristics.
+       - Make mkfs.xfs -f zero the old secondary superblocks before writing
+         the new superblocks.
+       - Fix up xfs_info and xfs_quota's -c handling with global commands.
+       - Improve xfs_bmap -vp output to always show the FLAGS column.
+
 xfsprogs-2.9.1 (28 June 2007)
        - Added filestreams support to xfs_io.
        - Fix up libattr Makefile to append to LTLDFLAGS. Thanks to
diff --git a/include/cache.h b/include/cache.h
index 241796d42b88c92ed1a9d23108c5b1efa17a29c7..317ff738d41f14e13dc8795505e81e8904ec04e9 100644
--- a/include/cache.h
+++ b/include/cache.h
 
 #define        HASH_CACHE_RATIO        8
 
+#define CACHE_MAX_PRIORITY     15
+
 /*
  * Simple, generic implementation of a cache (arbitrary data).
  * Provides a hash table with a capped number of cache entries.
  */
 
 struct cache;
-struct cache_hash;
 struct cache_node;
 
 typedef void *cache_key_t;
@@ -48,6 +49,27 @@ struct cache_operations {
        cache_bulk_relse_t      bulkrelse;      /* optional */
 };
 
+struct cache_hash {
+       struct list_head        ch_list;        /* hash chain head */
+       unsigned int            ch_count;       /* hash chain length */
+       pthread_mutex_t         ch_mutex;       /* hash chain mutex */
+};
+
+struct cache_mru {
+       struct list_head        cm_list;        /* MRU head */
+       unsigned int            cm_count;       /* MRU length */
+       pthread_mutex_t         cm_mutex;       /* MRU lock */
+};
+
+struct cache_node {
+       struct list_head        cn_hash;        /* hash chain */
+       struct list_head        cn_mru;         /* MRU chain */
+       unsigned int            cn_count;       /* reference count */
+       unsigned int            cn_hashidx;     /* hash chain index */
+       int                     cn_priority;    /* priority, -1 = free list */
+       pthread_mutex_t         cn_mutex;       /* node mutex */
+};
+
 struct cache {
        unsigned int            c_maxcount;     /* max cache nodes */
        unsigned int            c_count;        /* count of nodes */
@@ -60,23 +82,12 @@ struct cache {
        cache_bulk_relse_t      bulkrelse;      /* bulk release routine */
        unsigned int            c_hashsize;     /* hash bucket count */
        struct cache_hash       *c_hash;        /* hash table buckets */
+       struct cache_mru        c_mrus[CACHE_MAX_PRIORITY + 1];
        unsigned long long      c_misses;       /* cache misses */
        unsigned long long      c_hits;         /* cache hits */
        unsigned int            c_max;          /* max nodes ever used */
 };
 
-struct cache_hash {
-       struct list_head        ch_list;        /* hash chain head */
-       unsigned int            ch_count;       /* hash chain length */
-       pthread_mutex_t         ch_mutex;       /* hash chain mutex */
-};
-
-struct cache_node {
-       struct list_head        cn_list;        /* hash chain */
-       unsigned int            cn_count;       /* reference count */
-       pthread_mutex_t         cn_mutex;       /* refcount mutex */
-};
-
 struct cache *cache_init(unsigned int, struct cache_operations *);
 void cache_destroy(struct cache *);
 void cache_walk(struct cache *, cache_walk_t);
@@ -85,6 +96,8 @@ void cache_flush(struct cache *);
 
 int cache_node_get(struct cache *, cache_key_t, struct cache_node **);
 void cache_node_put(struct cache_node *);
+void cache_node_set_priority(struct cache *, struct cache_node *, int);
+int cache_node_get_priority(struct cache_node *);
 int cache_node_purge(struct cache *, cache_key_t, struct cache_node *);
 void cache_report(FILE *fp, const char *, struct cache *);
 int cache_overflowed(struct cache *);
diff --git a/include/libxfs.h b/include/libxfs.h
index 8de631782fe09a22a7b4dd65a0f30ab55c06599c..b55c72dc3bfbe8fd55f5af48604e1cb84408e109 100644
--- a/include/libxfs.h
+++ b/include/libxfs.h
@@ -254,6 +254,13 @@ enum xfs_buf_flags_t {     /* b_flags bits */
 #define XFS_BUF_FSPRIVATE3(bp,type)    ((type)(bp)->b_fsprivate3)
 #define XFS_BUF_SET_FSPRIVATE3(bp,val) (bp)->b_fsprivate3 = (void *)(val)
 
+#define XFS_BUF_SET_PRIORITY(bp,pri)   cache_node_set_priority( \
+                                               libxfs_bcache, \
+                                               (struct cache_node *)(bp), \
+                                               (pri))
+#define XFS_BUF_PRIORITY(bp)           (cache_node_get_priority( \
+                                               (struct cache_node *)(bp)))
+
 /* Buffer Cache Interfaces */
 
 extern struct cache    *libxfs_bcache;
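
The two macros above are the buffer-level face of the new priority interface: XFS_BUF_SET_PRIORITY() wraps cache_node_set_priority() on libxfs_bcache, and XFS_BUF_PRIORITY() wraps cache_node_get_priority(). A minimal usage sketch, assuming an already-initialised libxfs buffer cache (the function name and its arguments below are placeholders, not code from this commit):

    /* Sketch: bump a block's priority so cache_shake() reclaims it later
     * than ordinary priority-0 blocks.  cache_node_set_priority() clamps
     * the value to [0, CACHE_MAX_PRIORITY].
     */
    static void
    keep_block_longer(dev_t device, xfs_daddr_t blkno, int len)
    {
            xfs_buf_t       *bp;

            bp = libxfs_getbuf(device, blkno, len); /* takes a cache reference */
            if (bp) {
                    XFS_BUF_SET_PRIORITY(bp, XFS_BUF_PRIORITY(bp) + 1);
                    libxfs_putbuf(bp);              /* drops it; node stays on its MRU */
            }
    }
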
diff --git a/include/list.h b/include/list.h
index 3ba491d323237008c8dbd1ae1f14983d65c9c8a9..2389a6c0eb6be3eec3c6ba4eef8900d1464521a5 100644
--- a/include/list.h
+++ b/include/list.h
@@ -85,4 +85,55 @@ static inline int list_empty(const struct list_head *head)
        return head->next == head;
 }
 
+static inline void __list_splice(struct list_head *list,
+                                struct list_head *head)
+{
+       struct list_head *first = list->next;
+       struct list_head *last = list->prev;
+       struct list_head *at = head->next;
+
+       first->prev = head;
+       head->next = first;
+
+       last->next = at;
+       at->prev = last;
+}
+
+static inline void list_splice(struct list_head *list, struct list_head *head)
+{
+       if (!list_empty(list))
+               __list_splice(list, head);
+}
+
+static inline void list_splice_init(struct list_head *list,
+                                   struct list_head *head)
+{
+       if (!list_empty(list)) {
+               __list_splice(list, head);
+               list_head_init(list);
+       }
+}
+
+#define list_entry(ptr, type, member) ({                       \
+       const typeof( ((type *)0)->member ) *__mptr = (ptr);    \
+       (type *)( (char *)__mptr - offsetof(type,member) );})
+
+#define list_for_each(pos, head) \
+       for (pos = (head)->next; pos != (head); pos = pos->next)
+
+#define list_for_each_safe(pos, n, head) \
+       for (pos = (head)->next, n = pos->next; pos != (head); \
+               pos = n, n = pos->next)
+
+#define list_for_each_entry(pos, head, member)                         \
+       for (pos = list_entry((head)->next, typeof(*pos), member);      \
+            &pos->member != (head);    \
+            pos = list_entry(pos->member.next, typeof(*pos), member))
+
+#define list_for_each_entry_safe(pos, n, head, member)                 \
+       for (pos = list_entry((head)->next, typeof(*pos), member),      \
+               n = list_entry(pos->member.next, typeof(*pos), member); \
+            &pos->member != (head);                                    \
+            pos = n, n = list_entry(n->member.next, typeof(*n), member))
+
 #endif /* __LIST_H__ */
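
The helpers above follow the kernel's intrusive list idiom: a struct list_head is embedded in the containing object, and list_entry()/list_for_each_entry() recover that object from the link. A small illustrative sketch (the struct and function are invented for the example, not part of this header):

    struct item {
            struct list_head        link;   /* chained through this member */
            int                     value;
    };

    /* walk a list of struct item and total their values */
    static int
    sum_items(struct list_head *head)
    {
            struct item     *ip;
            int             total = 0;

            list_for_each_entry(ip, head, link)
                    total += ip->value;
            return total;
    }
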
diff --git a/libxfs/cache.c b/libxfs/cache.c
index c37356d0dd6f27d7ad1a652cbd2c1c7c0442f34b..4e0831fd5aa721916aa5182265e91eb884783a3e 100644
--- a/libxfs/cache.c
+++ b/libxfs/cache.c
@@ -32,6 +32,8 @@
 #undef CACHE_ABORT
 /* #define CACHE_ABORT 1 */
 
+#define CACHE_SHAKE_COUNT      64
+
 static unsigned int cache_generic_bulkrelse(struct cache *, struct list_head *);
 
 struct cache *
@@ -71,6 +73,12 @@ cache_init(
                cache->c_hash[i].ch_count = 0;
                pthread_mutex_init(&cache->c_hash[i].ch_mutex, NULL);
        }
+
+       for (i = 0; i <= CACHE_MAX_PRIORITY; i++) {
+               list_head_init(&cache->c_mrus[i].cm_list);
+               cache->c_mrus[i].cm_count = 0;
+               pthread_mutex_init(&cache->c_mrus[i].cm_mutex, NULL);
+       }
        return cache;
 }
 
@@ -127,161 +135,119 @@ cache_destroy(
                list_head_destroy(&cache->c_hash[i].ch_list);
                pthread_mutex_destroy(&cache->c_hash[i].ch_mutex);
        }
+       for (i = 0; i <= CACHE_MAX_PRIORITY; i++) {
+               list_head_destroy(&cache->c_mrus[i].cm_list);
+               pthread_mutex_destroy(&cache->c_mrus[i].cm_mutex);
+       }
        pthread_mutex_destroy(&cache->c_mutex);
        free(cache->c_hash);
        free(cache);
 }
 
-static int
-cache_shake_node(
+static unsigned int
+cache_generic_bulkrelse(
        struct cache *          cache,
-       cache_key_t             key,
-       struct cache_node *     node)
+       struct list_head *      list)
 {
-       struct list_head *      head;
-       struct list_head *      pos;
-       struct list_head *      n;
-       struct cache_hash *     hash;
-       int                     count = -1;
+       struct cache_node *     node;
+       unsigned int            count = 0;
 
-       hash = cache->c_hash + cache->hash(key, cache->c_hashsize);
-       head = &hash->ch_list;
-       pthread_mutex_lock(&hash->ch_mutex);
-       for (pos = head->next, n = pos->next;
-            pos != head;
-            pos = n, n = pos->next) {
-               if ((struct cache_node *)pos != node)
-                       continue;
-               pthread_mutex_lock(&node->cn_mutex);
-               count = node->cn_count;
-               pthread_mutex_unlock(&node->cn_mutex);
-               if (count != 0)
-                       break;
+       while (!list_empty(list)) {
+               node = list_entry(list->next, struct cache_node, cn_mru);
                pthread_mutex_destroy(&node->cn_mutex);
-               list_del_init(&node->cn_list);
-               hash->ch_count--;
+               list_del_init(&node->cn_mru);
                cache->relse(node);
-               break;
+               count++;
        }
-       pthread_mutex_unlock(&hash->ch_mutex);
+
        return count;
 }
 
 /*
  * We've hit the limit on cache size, so we need to start reclaiming
- * nodes we've used.  This reclaims from the one given hash bucket
- * only.  Returns the number of freed up nodes, its left to the
- * caller to updates the global counter of used nodes for the cache.
- * The hash chain lock is held for the hash list argument, must be
- * dropped before returning.
- * We walk backwards through the hash (remembering we keep recently
- * used nodes toward the front) until we hit an in-use node.  We'll
- * stop there if its a low priority call but keep going if its not.
+ * nodes we've used. The MRU specified by the priority is shaken.
+ * Returns new priority at end of the call (in case we call again).
  */
 static unsigned int
-cache_shake_hash(
+cache_shake(
        struct cache *          cache,
-       struct cache_hash *     hash,
-       unsigned int            priority)
+       unsigned int            priority,
+       int                     all)
 {
+       struct cache_mru        *mru;
+       struct cache_hash *     hash;
        struct list_head        temp;
        struct list_head *      head;
        struct list_head *      pos;
        struct list_head *      n;
        struct cache_node *     node;
-       unsigned int            inuse = 0;
+       unsigned int            count;
 
+       ASSERT(priority <= CACHE_MAX_PRIORITY);
+       if (priority > CACHE_MAX_PRIORITY)
+               priority = 0;
+
+       mru = &cache->c_mrus[priority];
+       count = 0;
        list_head_init(&temp);
-       head = &hash->ch_list;
-       for (pos = head->prev, n = pos->prev;
-            pos != head;
-            pos = n, n = pos->prev) {
-               node = (struct cache_node *)pos;
-               pthread_mutex_lock(&node->cn_mutex);
-               if (!(inuse = (node->cn_count > 0))) {
-                       hash->ch_count--;
-                       list_move_tail(&node->cn_list, &temp);
+       head = &mru->cm_list;
+
+       pthread_mutex_lock(&mru->cm_mutex);
+       for (pos = head->prev, n = pos->prev; pos != head;
+                                               pos = n, n = pos->prev) {
+               node = list_entry(pos, struct cache_node, cn_mru);
+
+               if (pthread_mutex_trylock(&node->cn_mutex) != 0)
+                       continue;
+
+               hash = cache->c_hash + node->cn_hashidx;
+               if (node->cn_count > 0 ||
+                               pthread_mutex_trylock(&hash->ch_mutex) != 0) {
+                       pthread_mutex_unlock(&node->cn_mutex);
+                       continue;
                }
-               pthread_mutex_unlock(&node->cn_mutex);
-               if (inuse && !priority)
-                       break;
-       }
-       pthread_mutex_unlock(&hash->ch_mutex);
-       return cache->bulkrelse(cache, &temp);
-}
+               ASSERT(node->cn_priority == priority);
+               node->cn_priority = -1;
 
-/*
- * Generic implementation of bulk release, which just iterates over
- * the list calling the single node relse routine for each node.
- */
-static unsigned int
-cache_generic_bulkrelse(
-       struct cache *          cache,
-       struct list_head *      list)
-{
-       struct cache_node *     node;
-       unsigned int            count = 0;
+               list_move(&node->cn_mru, &temp);
+               list_del_init(&node->cn_hash);
+               hash->ch_count--;
+               mru->cm_count--;
+               pthread_mutex_unlock(&hash->ch_mutex);
+               pthread_mutex_unlock(&node->cn_mutex);
 
-       while (!list_empty(list)) {
-               node = (struct cache_node *)list->next;
-               pthread_mutex_destroy(&node->cn_mutex);
-               list_del_init(&node->cn_list);
-               cache->relse(node);
                count++;
+               if (!all && count == CACHE_SHAKE_COUNT)
+                       break;
        }
-       return count;
-}
+       pthread_mutex_unlock(&mru->cm_mutex);
 
-/*
- * We've hit the limit on cache size, so we need to start reclaiming
- * nodes we've used.  Start by shaking this hash chain only, unless
- * the shake priority has been increased already.
- * The hash chain lock is held for the hash list argument, must be
- * dropped before returning.
- * Returns new priority at end of the call (in case we call again).
- */
-static unsigned int
-cache_shake(
-       struct cache *          cache,
-       struct cache_hash *     hash,
-       unsigned int            priority)
-{
-       unsigned int            count;
-       unsigned int            i;
+       if (count > 0) {
+               cache->bulkrelse(cache, &temp);
 
-       if (!priority) {        /* do just one */
-               count = cache_shake_hash(cache, hash, priority);
-       } else {        /* use a bigger hammer */
-               pthread_mutex_unlock(&hash->ch_mutex);
-               for (count = 0, i = 0; i < cache->c_hashsize; i++) {
-                       hash = &cache->c_hash[i];
-                       pthread_mutex_lock(&hash->ch_mutex);
-                       count += cache_shake_hash(cache, hash, priority - 1);
-               }
-       }
-       if (count) {
                pthread_mutex_lock(&cache->c_mutex);
                cache->c_count -= count;
                pthread_mutex_unlock(&cache->c_mutex);
        }
-       return ++priority;
+
+       return (count == CACHE_SHAKE_COUNT) ? priority : ++priority;
 }
 
 /*
  * Allocate a new hash node (updating atomic counter in the process),
  * unless doing so will push us over the maximum cache size.
  */
-struct cache_node *
+static struct cache_node *
 cache_node_allocate(
        struct cache *          cache,
-       struct cache_hash *     hashlist,
        cache_key_t             key)
 {
        unsigned int            nodesfree;
        struct cache_node *     node;
 
        pthread_mutex_lock(&cache->c_mutex);
-       if ((nodesfree = (cache->c_count < cache->c_maxcount))) {
+       nodesfree = (cache->c_count < cache->c_maxcount);
+       if (nodesfree) {
                cache->c_count++;
                if (cache->c_count > cache->c_max)
                        cache->c_max = cache->c_count;
@@ -290,15 +256,16 @@ cache_node_allocate(
        pthread_mutex_unlock(&cache->c_mutex);
        if (!nodesfree)
                return NULL;
-       if (!(node = cache->alloc(key))) {      /* uh-oh */
+       node = cache->alloc(key);
+       if (node == NULL) {     /* uh-oh */
                pthread_mutex_lock(&cache->c_mutex);
                cache->c_count--;
                pthread_mutex_unlock(&cache->c_mutex);
                return NULL;
        }
        pthread_mutex_init(&node->cn_mutex, NULL);
-       list_head_init(&node->cn_list);
        node->cn_count = 1;
+       node->cn_priority = 0;
        return node;
 }
 
@@ -325,42 +292,69 @@ cache_node_get(
 {
        struct cache_node *     node = NULL;
        struct cache_hash *     hash;
+       struct cache_mru *      mru;
        struct list_head *      head;
        struct list_head *      pos;
+       unsigned int            hashidx;
        int                     priority = 0;
-       int                     allocated = 0;
 
-       hash = cache->c_hash + cache->hash(key, cache->c_hashsize);
+       hashidx = cache->hash(key, cache->c_hashsize);
+       hash = cache->c_hash + hashidx;
        head = &hash->ch_list;
 
-  restart:
-       pthread_mutex_lock(&hash->ch_mutex);
-       for (pos = head->next; pos != head; pos = pos->next) {
-               node = (struct cache_node *)pos;
-               if (cache->compare(node, key) == 0)
-                       continue;
-               pthread_mutex_lock(&node->cn_mutex);
-               node->cn_count++;
-               pthread_mutex_unlock(&node->cn_mutex);
-               pthread_mutex_lock(&cache->c_mutex);
-               cache->c_hits++;
-               pthread_mutex_unlock(&cache->c_mutex);
-               break;
-       }
-       if (pos == head) {
-               node = cache_node_allocate(cache, hash, key);
-               if (!node) {
-                       priority = cache_shake(cache, hash, priority);
-                       goto restart;
+       for (;;) {
+               pthread_mutex_lock(&hash->ch_mutex);
+               for (pos = head->next; pos != head; pos = pos->next) {
+                       node = list_entry(pos, struct cache_node, cn_hash);
+                       if (!cache->compare(node, key))
+                               continue;
+                       /*
+                       * node found, bump node's reference count, move it to the
+                       * top of its MRU list, and update stats.
+                       */
+                       pthread_mutex_lock(&node->cn_mutex);
+                       node->cn_count++;
+
+                       mru = &cache->c_mrus[node->cn_priority];
+                       pthread_mutex_lock(&mru->cm_mutex);
+                       list_move(&node->cn_mru, &mru->cm_list);
+                       pthread_mutex_unlock(&mru->cm_mutex);
+
+                       pthread_mutex_unlock(&node->cn_mutex);
+                       pthread_mutex_unlock(&hash->ch_mutex);
+
+                       pthread_mutex_lock(&cache->c_mutex);
+                       cache->c_hits++;
+                       pthread_mutex_unlock(&cache->c_mutex);
+
+                       *nodep = node;
+                       return 0;
                }
-               allocated = 1;
-               hash->ch_count++;       /* new entry */
+               pthread_mutex_unlock(&hash->ch_mutex);
+               /*
+                * not found, allocate a new entry
+                */
+               node = cache_node_allocate(cache, key);
+               if (node)
+                       break;
+               priority = cache_shake(cache, priority, 0);
        }
-       /* looked at it, move to hash list head */
-       list_move(&node->cn_list, &hash->ch_list);
+
+       node->cn_hashidx = hashidx;
+
+       /* add new node to appropriate hash and lowest priority MRU */
+       mru = &cache->c_mrus[0];
+       pthread_mutex_lock(&mru->cm_mutex);
+       pthread_mutex_lock(&hash->ch_mutex);
+       hash->ch_count++;
+       mru->cm_count++;
+       list_add(&node->cn_hash, &hash->ch_list);
+       list_add(&node->cn_mru, &mru->cm_list);
        pthread_mutex_unlock(&hash->ch_mutex);
+       pthread_mutex_unlock(&mru->cm_mutex);
+
        *nodep = node;
-       return allocated;
+       return 1;
 }
 
 void
@@ -379,6 +373,56 @@ cache_node_put(
        pthread_mutex_unlock(&node->cn_mutex);
 }
 
+void
+cache_node_set_priority(
+       struct cache *          cache,
+       struct cache_node *     node,
+       int                     priority)
+{
+       struct cache_mru *      mru;
+
+       if (priority < 0)
+               priority = 0;
+       else if (priority > CACHE_MAX_PRIORITY)
+               priority = CACHE_MAX_PRIORITY;
+
+       pthread_mutex_lock(&node->cn_mutex);
+
+       ASSERT(node->cn_count > 0);
+       if (priority == node->cn_priority) {
+               pthread_mutex_unlock(&node->cn_mutex);
+               return;
+       }
+       mru = &cache->c_mrus[node->cn_priority];
+       pthread_mutex_lock(&mru->cm_mutex);
+       list_del_init(&node->cn_mru);
+       mru->cm_count--;
+       pthread_mutex_unlock(&mru->cm_mutex);
+
+       mru = &cache->c_mrus[priority];
+       pthread_mutex_lock(&mru->cm_mutex);
+       list_add(&node->cn_mru, &mru->cm_list);
+       node->cn_priority = priority;
+       mru->cm_count++;
+       pthread_mutex_unlock(&mru->cm_mutex);
+
+       pthread_mutex_unlock(&node->cn_mutex);
+}
+
+int
+cache_node_get_priority(
+       struct cache_node *     node)
+{
+       int                     priority;
+
+       pthread_mutex_lock(&node->cn_mutex);
+       priority = node->cn_priority;
+       pthread_mutex_unlock(&node->cn_mutex);
+
+       return priority;
+}
+
+
 /*
  * Purge a specific node from the cache.  Reference count must be zero.
  */
@@ -388,27 +432,60 @@ cache_node_purge(
        cache_key_t             key,
        struct cache_node *     node)
 {
-       int                     refcount;
+       struct list_head *      head;
+       struct list_head *      pos;
+       struct list_head *      n;
+       struct cache_hash *     hash;
+       struct cache_mru *      mru;
+       int                     count = -1;
+
+       hash = cache->c_hash + cache->hash(key, cache->c_hashsize);
+       head = &hash->ch_list;
+       pthread_mutex_lock(&hash->ch_mutex);
+       for (pos = head->next, n = pos->next; pos != head;
+                                               pos = n, n = pos->next) {
+               if ((struct cache_node *)pos != node)
+                       continue;
 
-       refcount = cache_shake_node(cache, key, node);
-       if (refcount == 0) {
+               pthread_mutex_lock(&node->cn_mutex);
+               count = node->cn_count;
+               if (count != 0) {
+                       pthread_mutex_unlock(&node->cn_mutex);
+                       break;
+               }
+               mru = &cache->c_mrus[node->cn_priority];
+               pthread_mutex_lock(&mru->cm_mutex);
+               list_del_init(&node->cn_mru);
+               mru->cm_count--;
+               pthread_mutex_unlock(&mru->cm_mutex);
+
+               pthread_mutex_unlock(&node->cn_mutex);
+               pthread_mutex_destroy(&node->cn_mutex);
+               list_del_init(&node->cn_hash);
+               hash->ch_count--;
+               cache->relse(node);
+               break;
+       }
+       pthread_mutex_unlock(&hash->ch_mutex);
+
+       if (count == 0) {
                pthread_mutex_lock(&cache->c_mutex);
                cache->c_count--;
                pthread_mutex_unlock(&cache->c_mutex);
        }
 #ifdef CACHE_DEBUG
-       if (refcount >= 1) {
+       if (count >= 1) {
                fprintf(stderr, "%s: refcount was %u, not zero (node=%p)\n",
-                               __FUNCTION__, refcount, node);
+                               __FUNCTION__, count, node);
                cache_abort();
        }
-       if (refcount == -1) {
+       if (count == -1) {
                fprintf(stderr, "%s: purge node not found! (node=%p)\n",
                        __FUNCTION__, node);
                cache_abort();
        }
 #endif
-       return (refcount == 0);
+       return (count == 0);
 }
 
 /*
@@ -418,20 +495,20 @@ void
 cache_purge(
        struct cache *          cache)
 {
-       struct cache_hash *     hash;
+       int                     i;
+
+       for (i = 0; i <= CACHE_MAX_PRIORITY; i++)
+               cache_shake(cache, i, 1);
 
-       hash = &cache->c_hash[0];
-       pthread_mutex_lock(&hash->ch_mutex);
-       cache_shake(cache, hash, (unsigned int)-1);
 #ifdef CACHE_DEBUG
        if (cache->c_count != 0) {
+               /* flush referenced nodes to disk */
+               cache_flush(cache);
                fprintf(stderr, "%s: shake on cache %p left %u nodes!?\n",
                                __FUNCTION__, cache, cache->c_count);
                cache_abort();
        }
 #endif
-       /* flush any remaining nodes to disk */
-       cache_flush(cache);
 }
 
 /*
@@ -465,15 +542,18 @@ cache_flush(
        }
 }
 
-#define        HASH_REPORT     (3*HASH_CACHE_RATIO)
+#define        HASH_REPORT     (3 * HASH_CACHE_RATIO)
 void
-cache_report(FILE *fp, const char *name, struct cache * cache)
+cache_report(
+       FILE                    *fp,
+       const char              *name,
+       struct cache            *cache)
 {
-       int i;
-       unsigned long count, index, total;
-       unsigned long hash_bucket_lengths[HASH_REPORT+2];
+       int                     i;
+       unsigned long           count, index, total;
+       unsigned long           hash_bucket_lengths[HASH_REPORT + 2];
 
-       if ((cache->c_hits+cache->c_misses) == 0)
+       if ((cache->c_hits + cache->c_misses) == 0)
                return;
 
        /* report cache summary */
@@ -492,9 +572,15 @@ cache_report(FILE *fp, const char *name, struct cache * cache)
                        cache->c_hashsize,
                        cache->c_hits,
                        cache->c_misses,
-                       (double) (cache->c_hits*100/(cache->c_hits+cache->c_misses))
+                       (double)cache->c_hits * 100 /
+                               (cache->c_hits + cache->c_misses)
        );
 
+       for (i = 0; i <= CACHE_MAX_PRIORITY; i++)
+               fprintf(fp, "MRU %d entries = %6u (%3u%%)\n",
+                       i, cache->c_mrus[i].cm_count,
+                       cache->c_mrus[i].cm_count * 100 / cache->c_count);
+
        /* report hash bucket lengths */
        bzero(hash_bucket_lengths, sizeof(hash_bucket_lengths));
 
@@ -508,14 +594,16 @@ cache_report(FILE *fp, const char *name, struct cache * cache)
        }
 
        total = 0;
-       for (i = 0; i < HASH_REPORT+1; i++) {
-               total += i*hash_bucket_lengths[i];
+       for (i = 0; i < HASH_REPORT + 1; i++) {
+               total += i * hash_bucket_lengths[i];
                if (hash_bucket_lengths[i] == 0)
                        continue;
-               fprintf(fp, "Hash buckets with  %2d entries %5ld (%3ld%%)\n",
-                       i, hash_bucket_lengths[i], (i*hash_bucket_lengths[i]*100)/cache->c_count);
+               fprintf(fp, "Hash buckets with  %2d entries %6ld (%3ld%%)\n",
+                       i, hash_bucket_lengths[i],
+                       (i * hash_bucket_lengths[i] * 100) / cache->c_count);
        }
        if (hash_bucket_lengths[i])     /* last report bucket is the overflow bucket */
-               fprintf(fp, "Hash buckets with >%2d entries %5ld (%3ld%%)\n",
-                       i-1, hash_bucket_lengths[i], ((cache->c_count-total)*100)/cache->c_count);
+               fprintf(fp, "Hash buckets with >%2d entries %6ld (%3ld%%)\n",
+                       i - 1, hash_bucket_lengths[i],
+                       ((cache->c_count - total) * 100) / cache->c_count);
 }
diff --git a/libxfs/rdwr.c b/libxfs/rdwr.c
index 921ced7fa1c827b81ccb3f38b9fca39f19fb1519..830d0cf018a0be0687b4d207c8d69e51fd7a0d0f 100644
--- a/libxfs/rdwr.c
+++ b/libxfs/rdwr.c
@@ -257,7 +257,11 @@ libxfs_getsb(xfs_mount_t *mp, int flags)
                                XFS_FSS_TO_BB(mp, 1), flags);
 }
 
-xfs_zone_t     *xfs_buf_zone;
+xfs_zone_t                     *xfs_buf_zone;
+
+static struct cache_mru                xfs_buf_freelist =
+       {{&xfs_buf_freelist.cm_list, &xfs_buf_freelist.cm_list},
+        0, PTHREAD_MUTEX_INITIALIZER };
 
 typedef struct {
        dev_t           device;
@@ -308,7 +312,8 @@ libxfs_initbuf(xfs_buf_t *bp, dev_t device, xfs_daddr_t bno, unsigned int bytes)
        bp->b_blkno = bno;
        bp->b_bcount = bytes;
        bp->b_dev = device;
-       bp->b_addr = memalign(libxfs_device_alignment(), bytes);
+       if (!bp->b_addr)
+               bp->b_addr = memalign(libxfs_device_alignment(), bytes);
        if (!bp->b_addr) {
                fprintf(stderr,
                        _("%s: %s can't memalign %u bytes: %s\n"),
@@ -323,18 +328,44 @@ libxfs_initbuf(xfs_buf_t *bp, dev_t device, xfs_daddr_t bno, unsigned int bytes)
 }
 
 xfs_buf_t *
-libxfs_getbufr(dev_t device, xfs_daddr_t blkno, int len)
+libxfs_getbufr(dev_t device, xfs_daddr_t blkno, int bblen)
 {
        xfs_buf_t       *bp;
+       int             blen = BBTOB(bblen);
+
+       /*
+        * first look for a buffer that can be used as-is,
+        * if one cannot be found, see if there is a buffer,
+        * and if so, free it's buffer and set b_addr to NULL
+        * before calling libxfs_initbuf.
+        */
+       pthread_mutex_lock(&xfs_buf_freelist.cm_mutex);
+       if (!list_empty(&xfs_buf_freelist.cm_list)) {
+               list_for_each_entry(bp, &xfs_buf_freelist.cm_list, b_node.cn_mru) {
+                       if (bp->b_bcount == blen) {
+                               list_del_init(&bp->b_node.cn_mru);
+                               break;
+                       }
+               }
+               if (&bp->b_node.cn_mru == &xfs_buf_freelist.cm_list) {
+                       bp = list_entry(xfs_buf_freelist.cm_list.next,
+                                       xfs_buf_t, b_node.cn_mru);
+                       list_del_init(&bp->b_node.cn_mru);
+                       free(bp->b_addr);
+                       bp->b_addr = NULL;
+               }
+       } else
+               bp = libxfs_zone_zalloc(xfs_buf_zone);
+       pthread_mutex_unlock(&xfs_buf_freelist.cm_mutex);
 
-       bp = libxfs_zone_zalloc(xfs_buf_zone);
        if (bp != NULL)
-               libxfs_initbuf(bp, device, blkno, BBTOB(len));
+               libxfs_initbuf(bp, device, blkno, blen);
 #ifdef IO_DEBUG
        printf("%lx: %s: allocated %u bytes buffer, key=%llu(%llu), %p\n",
                pthread_self(), __FUNCTION__, BBTOB(len),
                (long long)LIBXFS_BBTOOFF64(blkno), (long long)blkno, bp);
 #endif
+
        return bp;
 }
 
@@ -358,6 +389,8 @@ libxfs_getbuf(dev_t device, xfs_daddr_t blkno, int len)
        miss = cache_node_get(libxfs_bcache, &key, (struct cache_node **)&bp);
        if (bp) {
                pthread_mutex_lock(&bp->b_lock);
+               cache_node_set_priority(libxfs_bcache, (struct cache_node *)bp,
+                       cache_node_get_priority((struct cache_node *)bp) - 4);
 #ifdef XFS_BUF_TRACING
                pthread_mutex_lock(&libxfs_bcache->c_mutex);
                lock_buf_count++;
@@ -525,34 +558,46 @@ libxfs_iomove(xfs_buf_t *bp, uint boff, int len, void *data, int flags)
 }
 
 static void
-libxfs_bflush(struct cache_node *node)
+libxfs_brelse(struct cache_node *node)
 {
        xfs_buf_t               *bp = (xfs_buf_t *)node;
 
-       if ((bp != NULL) && (bp->b_flags & LIBXFS_B_DIRTY))
-               libxfs_writebufr(bp);
+       if (bp != NULL) {
+               if (bp->b_flags & LIBXFS_B_DIRTY)
+                       libxfs_writebufr(bp);
+               pthread_mutex_lock(&xfs_buf_freelist.cm_mutex);
+               list_add(&bp->b_node.cn_mru, &xfs_buf_freelist.cm_list);
+               pthread_mutex_unlock(&xfs_buf_freelist.cm_mutex);
+       }
 }
 
 static void
-libxfs_brelse(struct cache_node *node)
+libxfs_bulkrelse(
+       struct cache            *cache,
+       struct list_head        *list)
 {
-       xfs_buf_t               *bp = (xfs_buf_t *)node;
-       xfs_buf_log_item_t      *bip;
-       extern xfs_zone_t       *xfs_buf_item_zone;
+       xfs_buf_t               *bp;
 
-       if (bp != NULL) {
+       if (list_empty(list))
+               return;
+
+       list_for_each_entry(bp, list, b_node.cn_mru) {
                if (bp->b_flags & LIBXFS_B_DIRTY)
                        libxfs_writebufr(bp);
-               bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
-               if (bip)
-                       libxfs_zone_free(xfs_buf_item_zone, bip);
-               free(bp->b_addr);
-               pthread_mutex_destroy(&bp->b_lock);
-               bp->b_addr = NULL;
-               bp->b_flags = 0;
-               free(bp);
-               bp = NULL;
        }
+
+       pthread_mutex_lock(&xfs_buf_freelist.cm_mutex);
+       __list_splice(list, &xfs_buf_freelist.cm_list);
+       pthread_mutex_unlock(&xfs_buf_freelist.cm_mutex);
+}
+
+static void
+libxfs_bflush(struct cache_node *node)
+{
+       xfs_buf_t               *bp = (xfs_buf_t *)node;
+
+       if ((bp != NULL) && (bp->b_flags & LIBXFS_B_DIRTY))
+               libxfs_writebufr(bp);
 }
 
 void
@@ -586,7 +631,7 @@ struct cache_operations libxfs_bcache_operations = {
        /* .flush */    libxfs_bflush,
        /* .relse */    libxfs_brelse,
        /* .compare */  libxfs_bcompare,
-       /* .bulkrelse */ NULL   /* TODO: lio_listio64 interface? */
+       /* .bulkrelse */libxfs_bulkrelse
 };
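
The statically initialised xfs_buf_freelist earlier in this file is simply a struct cache_mru whose list head points back at itself, with a zero count and a default mutex; a runtime equivalent would be the sketch below (the init function name is hypothetical):

    static void
    xfs_buf_freelist_init(void)
    {
            list_head_init(&xfs_buf_freelist.cm_list);
            xfs_buf_freelist.cm_count = 0;
            pthread_mutex_init(&xfs_buf_freelist.cm_mutex, NULL);
    }

Parking released buffers (and their memalign'd data) on this MRU is what lets libxfs_getbufr() hand back a same-sized buffer without another memalign()/free() round trip.
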
 
 
diff --git a/repair/phase6.c b/repair/phase6.c
index e43addabfc28ade9f91c26251439073a48d7a6b9..147b80d8661c18927ab715359b3adf644ed69962 100644
--- a/repair/phase6.c
+++ b/repair/phase6.c
@@ -745,6 +745,7 @@ mk_root_dir(xfs_mount_t *mp)
        int             i;
        int             error;
        const mode_t    mode = 0755;
+       ino_tree_node_t *irec;
 
        tp = libxfs_trans_alloc(mp, 0);
        ip = NULL;
@@ -788,6 +789,11 @@ mk_root_dir(xfs_mount_t *mp)
        dir_init(mp, tp, ip, ip);
 
        libxfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_SYNC, 0);
+
+       irec = find_inode_rec(XFS_INO_TO_AGNO(mp, mp->m_sb.sb_rootino),
+                               XFS_INO_TO_AGINO(mp, mp->m_sb.sb_rootino));
+       set_inode_isadir(irec, XFS_INO_TO_AGINO(mp, mp->m_sb.sb_rootino) -
+                               irec->ino_startnum);
 }
 
 /*
@@ -3802,29 +3808,20 @@ traverse_ags(
        xfs_mount_t             *mp)
 {
        int                     i;
-       work_queue_t            *queues;
+       work_queue_t            queue;
        prefetch_args_t         *pf_args[2];
 
-       queues = malloc(thread_count * sizeof(work_queue_t));
-       queues[0].mp = mp;
-
-       if (!libxfs_bcache_overflowed()) {
-               /*create_work_queue(&queues[0], mp, libxfs_nproc());
-               for (i = 0; i < glob_agcount; i++)
-                       queue_work(&queues[0], traverse_function, i, NULL);
-               destroy_work_queue(&queues[0]);*/
-               for (i = 0; i < glob_agcount; i++)
-                       traverse_function(&queues[0], i, NULL);
-       } else {
-               /* TODO: AG stride support */
-               pf_args[0] = start_inode_prefetch(0, 1, NULL);
-               for (i = 0; i < glob_agcount; i++) {
-                       pf_args[(~i) & 1] = start_inode_prefetch(i + 1, 1,
-                                       pf_args[i & 1]);
-                       traverse_function(&queues[0], i, pf_args[i & 1]);
-               }
+       /*
+        * we always do prefetch for phase 6 as it will fill in the gaps
+        * not read during phase 3 prefetch.
+        */
+       queue.mp = mp;
+       pf_args[0] = start_inode_prefetch(0, 1, NULL);
+       for (i = 0; i < glob_agcount; i++) {
+               pf_args[(~i) & 1] = start_inode_prefetch(i + 1, 1,
+                               pf_args[i & 1]);
+               traverse_function(&queue, i, pf_args[i & 1]);
        }
-       free(queues);
 }
 
 void
@@ -3901,7 +3898,7 @@ _("        - resetting contents of realtime bitmap and summary inodes\n"));
 
        mark_standalone_inodes(mp);
 
-       do_log(_("        - traversing filesystem ... \n"));
+       do_log(_("        - traversing filesystem ...\n"));
 
        irec = find_inode_rec(XFS_INO_TO_AGNO(mp, mp->m_sb.sb_rootino),
                                XFS_INO_TO_AGINO(mp, mp->m_sb.sb_rootino));
@@ -3919,8 +3916,8 @@ _("        - resetting contents of realtime bitmap and summary inodes\n"));
         */
        traverse_ags(mp);
 
-       do_log(_("        - traversals finished ... \n"));
-       do_log(_("        - moving disconnected inodes to %s ... \n"),
+       do_log(_("        - traversal finished ...\n"));
+       do_log(_("        - moving disconnected inodes to %s ...\n"),
                ORPHANAGE);
 
        /*
diff --git a/repair/prefetch.c b/repair/prefetch.c
index 4b7ea7f11d1c24843eaa1428f1c074545ba11a07..4a31cbaea82cad62c1745fdc7476f411c80af55d 100644
--- a/repair/prefetch.c
+++ b/repair/prefetch.c
@@ -34,14 +34,27 @@ static int          pf_max_fsbs;
 static int             pf_batch_bytes;
 static int             pf_batch_fsbs;
 
-#define B_INODE                0x1000000
-#define B_META         0x2000000
+static void            pf_read_inode_dirs(prefetch_args_t *, xfs_buf_t *);
+
+/* buffer priorities for the libxfs cache */
+
+#define B_DIR_BMAP     15
+#define B_DIR_META_2   13      /* metadata in secondary queue */
+#define B_DIR_META_H   11      /* metadata fetched for PF_META_ONLY */
+#define B_DIR_META_S   9       /* single block of metadata */
+#define B_DIR_META     7
+#define B_DIR_INODE    6
+#define B_BMAP         5
+#define B_INODE                4
+
+#define B_IS_INODE(b)  (((b) & 1) == 0)
+#define B_IS_META(b)   (((b) & 1) != 0)
 
 #define DEF_BATCH_BYTES        0x10000
 
 #define MAX_BUFS       128
 
-#define IO_THRESHOLD   (MAX_BUFS * PF_THREAD_COUNT)
+#define IO_THRESHOLD   (MAX_BUFS * 2)
 
 typedef enum pf_which {
        PF_PRIMARY,
@@ -89,16 +102,19 @@ pf_queue_io(
        bp = libxfs_getbuf(mp->m_dev, XFS_FSB_TO_DADDR(mp, fsbno),
                        XFS_FSB_TO_BB(mp, blen));
        if (bp->b_flags & LIBXFS_B_UPTODATE) {
+               if (B_IS_INODE(flag))
+                       pf_read_inode_dirs(args, bp);
+               XFS_BUF_SET_PRIORITY(bp, XFS_BUF_PRIORITY(bp) + 8);
                libxfs_putbuf(bp);
                return;
        }
-       bp->b_flags |= flag;
+       XFS_BUF_SET_PRIORITY(bp, flag);
 
        pthread_mutex_lock(&args->lock);
 
        if (fsbno > args->last_bno_read) {
                radix_tree_insert(&args->primary_io_queue, fsbno, bp);
-               if (flag == B_META)
+               if (B_IS_META(flag))
                        radix_tree_tag_set(&args->primary_io_queue, fsbno, 0);
                else {
                        args->inode_bufs_queued++;
@@ -108,7 +124,7 @@ pf_queue_io(
 #ifdef XR_PF_TRACE
                pftrace("getbuf %c %p (%llu) in AG %d (fsbno = %lu) added to "
                        "primary queue (inode_bufs_queued = %d, last_bno = %lu)",
-                       flag == B_INODE ? 'I' : 'M', bp,
+                       B_IS_INODE(flag) ? 'I' : 'M', bp,
                        (long long)XFS_BUF_ADDR(bp), args->agno, fsbno,
                        args->inode_bufs_queued, args->last_bno_read);
 #endif
@@ -116,11 +132,12 @@ pf_queue_io(
 #ifdef XR_PF_TRACE
                pftrace("getbuf %c %p (%llu) in AG %d (fsbno = %lu) added to "
                        "secondary queue (last_bno = %lu)",
-                       flag == B_INODE ? 'I' : 'M', bp,
+                       B_IS_INODE(flag) ? 'I' : 'M', bp,
                        (long long)XFS_BUF_ADDR(bp), args->agno, fsbno,
                        args->last_bno_read);
 #endif
-               ASSERT(flag == B_META);
+               ASSERT(B_IS_META(flag));
+               XFS_BUF_SET_PRIORITY(bp, B_DIR_META_2);
                radix_tree_insert(&args->secondary_io_queue, fsbno, bp);
        }
 
@@ -163,7 +180,7 @@ pf_read_bmbt_reclist(
 #ifdef XR_PF_TRACE
                        pftrace("queuing dir extent in AG %d", args->agno);
 #endif
-                       pf_queue_io(args, s, 1, B_META);
+                       pf_queue_io(args, s, 1, B_DIR_META);
                        c--;
                        s++;
                }
@@ -194,6 +211,8 @@ pf_scan_lbtree(
        if (!bp)
                return 0;
 
+       XFS_BUF_SET_PRIORITY(bp, isadir ? B_DIR_BMAP : B_BMAP);
+
        rc = (*func)((xfs_btree_lblock_t *)XFS_BUF_PTR(bp), level - 1, isadir, args);
 
        libxfs_putbuf(bp);
@@ -307,6 +326,8 @@ pf_read_inode_dirs(
 {
        xfs_dinode_t            *dino;
        int                     icnt = 0;
+       int                     hasdir = 0;
+       int                     isadir;
        xfs_dinode_core_t       *dinoc;
 
        for (icnt = 0; icnt < (XFS_BUF_COUNT(bp) >> mp->m_sb.sb_inodelog); icnt++) {
@@ -317,9 +338,14 @@ pf_read_inode_dirs(
                 * We are only prefetching directory contents in extents
                 * and btree nodes for other inodes
                 */
-               if (dinoc->di_format <= XFS_DINODE_FMT_LOCAL ||
-                               (dinoc->di_format == XFS_DINODE_FMT_EXTENTS &&
-                                (be16_to_cpu(dinoc->di_mode) & S_IFMT) != S_IFDIR))
+               isadir = (be16_to_cpu(dinoc->di_mode) & S_IFMT) == S_IFDIR;
+               hasdir |= isadir;
+
+               if (dinoc->di_format <= XFS_DINODE_FMT_LOCAL)
+                       continue;
+
+               if (!isadir && (dinoc->di_format == XFS_DINODE_FMT_EXTENTS ||
+                               args->dirs_only))
                        continue;
 
                /*
@@ -350,11 +376,12 @@ pf_read_inode_dirs(
                                pf_read_exinode(args, dino);
                                break;
                        case XFS_DINODE_FMT_BTREE:
-                               pf_read_btinode(args, dino, (be16_to_cpu(
-                                       dinoc->di_mode) & S_IFMT) == S_IFDIR);
+                               pf_read_btinode(args, dino, isadir);
                                break;
                }
        }
+       if (hasdir)
+               XFS_BUF_SET_PRIORITY(bp, B_DIR_INODE);
 }
 
 /*
@@ -434,7 +461,7 @@ pf_batch_read(
 
                if (which == PF_PRIMARY) {
                        for (inode_bufs = 0, i = 0; i < num; i++) {
-                               if (bplist[i]->b_flags & B_INODE)
+                               if (B_IS_INODE(XFS_BUF_PRIORITY(bplist[i])))
                                        inode_bufs++;
                        }
                        args->inode_bufs_queued -= inode_bufs;
@@ -470,14 +497,20 @@ pf_batch_read(
                                memcpy(XFS_BUF_PTR(bplist[i]), pbuf, size);
                                bplist[i]->b_flags |= LIBXFS_B_UPTODATE;
                                len -= size;
-                               if (bplist[i]->b_flags & B_INODE)
+                               if (B_IS_INODE(XFS_BUF_PRIORITY(bplist[i])))
                                        pf_read_inode_dirs(args, bplist[i]);
+                               else if (which == PF_META_ONLY)
+                                       XFS_BUF_SET_PRIORITY(bplist[i],
+                                                               B_DIR_META_H);
+                               else if (which == PF_PRIMARY && num == 1)
+                                       XFS_BUF_SET_PRIORITY(bplist[i],
+                                                               B_DIR_META_S);
                        }
                }
                for (i = 0; i < num; i++) {
 #ifdef XR_PF_TRACE
                        pftrace("putbuf %c %p (%llu) in AG %d",
-                               bplist[i]->b_flags & B_INODE ? 'I' : 'M',
+                               B_IS_INODE(XFS_BUF_PRIORITY(bplist[i])) ? 'I' : 'M',
                                bplist[i], (long long)XFS_BUF_ADDR(bplist[i]),
                                args->agno);
 #endif
@@ -623,7 +656,9 @@ pf_queuing_worker(
 
                do {
                        pf_queue_io(args, XFS_AGB_TO_FSB(mp, args->agno, bno),
-                                       blks_per_cluster, B_INODE);
+                                       blks_per_cluster,
+                                       (cur_irec->ino_isa_dir != 0) ?
+                                               B_DIR_INODE : B_INODE);
                        bno += blks_per_cluster;
                        num_inos += inos_per_cluster;
                } while (num_inos < XFS_IALLOC_INODES(mp));
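
The B_* constants introduced above do double duty: they are the initial cache priorities given to prefetched buffers, and their low bit encodes the buffer type (even for inode clusters, odd for metadata), which is exactly what B_IS_INODE()/B_IS_META() test. The +8 bump applied to already-uptodate buffers in pf_queue_io() and the -4 demotion applied in libxfs_getbuf() are both even offsets, so short of clamping at CACHE_MAX_PRIORITY they preserve that classification. A tiny self-contained check of the convention (constants copied from the commit):

    #include <assert.h>

    #define B_DIR_INODE     6
    #define B_INODE         4
    #define B_DIR_META      7
    #define B_IS_INODE(b)   (((b) & 1) == 0)
    #define B_IS_META(b)    (((b) & 1) != 0)

    int
    main(void)
    {
            assert(B_IS_INODE(B_INODE) && B_IS_INODE(B_DIR_INODE));
            assert(B_IS_META(B_DIR_META));
            assert(B_IS_META(B_DIR_META + 8));      /* bump keeps parity */
            assert(B_IS_INODE(B_DIR_INODE - 4));    /* demotion keeps parity */
            return 0;
    }
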