From 69ec88b58fe0180dc641637baae9a09e6648098f Mon Sep 17 00:00:00 2001 From: Barry Naujok Date: Mon, 16 Jul 2007 15:56:17 +0000 Subject: [PATCH] Add priority tagging and buffer reuse for libxfs cache Merge of master-melb:xfs-cmds:29150a by kenmcd. Update to 2.9.2 --- VERSION | 2 +- doc/CHANGES | 17 ++ include/cache.h | 39 +++-- include/libxfs.h | 7 + include/list.h | 51 ++++++ libxfs/cache.c | 402 ++++++++++++++++++++++++++++------------------ libxfs/rdwr.c | 91 ++++++++--- repair/phase6.c | 43 +++-- repair/prefetch.c | 71 +++++--- 9 files changed, 488 insertions(+), 235 deletions(-) diff --git a/VERSION b/VERSION index fc4221ec1..e6abef7de 100644 --- a/VERSION +++ b/VERSION @@ -3,5 +3,5 @@ # PKG_MAJOR=2 PKG_MINOR=9 -PKG_REVISION=1 +PKG_REVISION=2 PKG_BUILD=1 diff --git a/doc/CHANGES b/doc/CHANGES index 4bbc1ea32..bbaced5f1 100644 --- a/doc/CHANGES +++ b/doc/CHANGES @@ -1,3 +1,20 @@ +xfsprogs-2.9.2 (16 July 2007) + - Next major round of xfs_repair performance improvements: + - Cache disk nlink values in Phase 3 for Phase 7. + - Do multithreaded prefetch/processing based on AG stride + option (ie. for concats). + - Don't trash lost+found at the start of Phase 4, eliminates + repeated "moving disconnected inode to lost+found" with + successive xfs_repair runs. + - Do multi-threaded sequential metadata prefetch. + Method based on Agami patches posted for 2.7.18 xfsprogs. + - Improve the libxfs cache with priority tagging to keep + blocks around that have unfavourable I/O characteristics. + - Make mkfs.xfs -f zero the old secondary superblocks before writing + the new superblocks. + - Fix up xfs_info and xfs_quota's -c handling with global commands. + - Improve xfs_bmap -vp output to always show the FLAGS column. + xfsprogs-2.9.1 (28 June 2007) - Added filestreams support to xfs_io. - Fix up libattr Makefile to append to LTLDFLAGS. Thanks to diff --git a/include/cache.h b/include/cache.h index 241796d42..317ff738d 100644 --- a/include/cache.h +++ b/include/cache.h @@ -20,13 +20,14 @@ #define HASH_CACHE_RATIO 8 +#define CACHE_MAX_PRIORITY 15 + /* * Simple, generic implementation of a cache (arbitrary data). * Provides a hash table with a capped number of cache entries. 
*/ struct cache; -struct cache_hash; struct cache_node; typedef void *cache_key_t; @@ -48,6 +49,27 @@ struct cache_operations { cache_bulk_relse_t bulkrelse; /* optional */ }; +struct cache_hash { + struct list_head ch_list; /* hash chain head */ + unsigned int ch_count; /* hash chain length */ + pthread_mutex_t ch_mutex; /* hash chain mutex */ +}; + +struct cache_mru { + struct list_head cm_list; /* MRU head */ + unsigned int cm_count; /* MRU length */ + pthread_mutex_t cm_mutex; /* MRU lock */ +}; + +struct cache_node { + struct list_head cn_hash; /* hash chain */ + struct list_head cn_mru; /* MRU chain */ + unsigned int cn_count; /* reference count */ + unsigned int cn_hashidx; /* hash chain index */ + int cn_priority; /* priority, -1 = free list */ + pthread_mutex_t cn_mutex; /* node mutex */ +}; + struct cache { unsigned int c_maxcount; /* max cache nodes */ unsigned int c_count; /* count of nodes */ @@ -60,23 +82,12 @@ struct cache { cache_bulk_relse_t bulkrelse; /* bulk release routine */ unsigned int c_hashsize; /* hash bucket count */ struct cache_hash *c_hash; /* hash table buckets */ + struct cache_mru c_mrus[CACHE_MAX_PRIORITY + 1]; unsigned long long c_misses; /* cache misses */ unsigned long long c_hits; /* cache hits */ unsigned int c_max; /* max nodes ever used */ }; -struct cache_hash { - struct list_head ch_list; /* hash chain head */ - unsigned int ch_count; /* hash chain length */ - pthread_mutex_t ch_mutex; /* hash chain mutex */ -}; - -struct cache_node { - struct list_head cn_list; /* hash chain */ - unsigned int cn_count; /* reference count */ - pthread_mutex_t cn_mutex; /* refcount mutex */ -}; - struct cache *cache_init(unsigned int, struct cache_operations *); void cache_destroy(struct cache *); void cache_walk(struct cache *, cache_walk_t); @@ -85,6 +96,8 @@ void cache_flush(struct cache *); int cache_node_get(struct cache *, cache_key_t, struct cache_node **); void cache_node_put(struct cache_node *); +void cache_node_set_priority(struct cache *, struct cache_node *, int); +int cache_node_get_priority(struct cache_node *); int cache_node_purge(struct cache *, cache_key_t, struct cache_node *); void cache_report(FILE *fp, const char *, struct cache *); int cache_overflowed(struct cache *); diff --git a/include/libxfs.h b/include/libxfs.h index 8de631782..b55c72dc3 100644 --- a/include/libxfs.h +++ b/include/libxfs.h @@ -254,6 +254,13 @@ enum xfs_buf_flags_t { /* b_flags bits */ #define XFS_BUF_FSPRIVATE3(bp,type) ((type)(bp)->b_fsprivate3) #define XFS_BUF_SET_FSPRIVATE3(bp,val) (bp)->b_fsprivate3 = (void *)(val) +#define XFS_BUF_SET_PRIORITY(bp,pri) cache_node_set_priority( \ + libxfs_bcache, \ + (struct cache_node *)(bp), \ + (pri)) +#define XFS_BUF_PRIORITY(bp) (cache_node_get_priority( \ + (struct cache_node *)(bp))) + /* Buffer Cache Interfaces */ extern struct cache *libxfs_bcache; diff --git a/include/list.h b/include/list.h index 3ba491d32..2389a6c0e 100644 --- a/include/list.h +++ b/include/list.h @@ -85,4 +85,55 @@ static inline int list_empty(const struct list_head *head) return head->next == head; } +static inline void __list_splice(struct list_head *list, + struct list_head *head) +{ + struct list_head *first = list->next; + struct list_head *last = list->prev; + struct list_head *at = head->next; + + first->prev = head; + head->next = first; + + last->next = at; + at->prev = last; +} + +static inline void list_splice(struct list_head *list, struct list_head *head) +{ + if (!list_empty(list)) + __list_splice(list, head); +} + +static inline void 
list_splice_init(struct list_head *list, + struct list_head *head) +{ + if (!list_empty(list)) { + __list_splice(list, head); + list_head_init(list); + } +} + +#define list_entry(ptr, type, member) ({ \ + const typeof( ((type *)0)->member ) *__mptr = (ptr); \ + (type *)( (char *)__mptr - offsetof(type,member) );}) + +#define list_for_each(pos, head) \ + for (pos = (head)->next; pos != (head); pos = pos->next) + +#define list_for_each_safe(pos, n, head) \ + for (pos = (head)->next, n = pos->next; pos != (head); \ + pos = n, n = pos->next) + +#define list_for_each_entry(pos, head, member) \ + for (pos = list_entry((head)->next, typeof(*pos), member); \ + &pos->member != (head); \ + pos = list_entry(pos->member.next, typeof(*pos), member)) + +#define list_for_each_entry_safe(pos, n, head, member) \ + for (pos = list_entry((head)->next, typeof(*pos), member), \ + n = list_entry(pos->member.next, typeof(*pos), member); \ + &pos->member != (head); \ + pos = n, n = list_entry(n->member.next, typeof(*n), member)) + #endif /* __LIST_H__ */ diff --git a/libxfs/cache.c b/libxfs/cache.c index c37356d0d..4e0831fd5 100644 --- a/libxfs/cache.c +++ b/libxfs/cache.c @@ -32,6 +32,8 @@ #undef CACHE_ABORT /* #define CACHE_ABORT 1 */ +#define CACHE_SHAKE_COUNT 64 + static unsigned int cache_generic_bulkrelse(struct cache *, struct list_head *); struct cache * @@ -71,6 +73,12 @@ cache_init( cache->c_hash[i].ch_count = 0; pthread_mutex_init(&cache->c_hash[i].ch_mutex, NULL); } + + for (i = 0; i <= CACHE_MAX_PRIORITY; i++) { + list_head_init(&cache->c_mrus[i].cm_list); + cache->c_mrus[i].cm_count = 0; + pthread_mutex_init(&cache->c_mrus[i].cm_mutex, NULL); + } return cache; } @@ -127,161 +135,119 @@ cache_destroy( list_head_destroy(&cache->c_hash[i].ch_list); pthread_mutex_destroy(&cache->c_hash[i].ch_mutex); } + for (i = 0; i <= CACHE_MAX_PRIORITY; i++) { + list_head_destroy(&cache->c_mrus[i].cm_list); + pthread_mutex_destroy(&cache->c_mrus[i].cm_mutex); + } pthread_mutex_destroy(&cache->c_mutex); free(cache->c_hash); free(cache); } -static int -cache_shake_node( +static unsigned int +cache_generic_bulkrelse( struct cache * cache, - cache_key_t key, - struct cache_node * node) + struct list_head * list) { - struct list_head * head; - struct list_head * pos; - struct list_head * n; - struct cache_hash * hash; - int count = -1; + struct cache_node * node; + unsigned int count = 0; - hash = cache->c_hash + cache->hash(key, cache->c_hashsize); - head = &hash->ch_list; - pthread_mutex_lock(&hash->ch_mutex); - for (pos = head->next, n = pos->next; - pos != head; - pos = n, n = pos->next) { - if ((struct cache_node *)pos != node) - continue; - pthread_mutex_lock(&node->cn_mutex); - count = node->cn_count; - pthread_mutex_unlock(&node->cn_mutex); - if (count != 0) - break; + while (!list_empty(list)) { + node = list_entry(list->next, struct cache_node, cn_mru); pthread_mutex_destroy(&node->cn_mutex); - list_del_init(&node->cn_list); - hash->ch_count--; + list_del_init(&node->cn_mru); cache->relse(node); - break; + count++; } - pthread_mutex_unlock(&hash->ch_mutex); + return count; } /* * We've hit the limit on cache size, so we need to start reclaiming - * nodes we've used. This reclaims from the one given hash bucket - * only. Returns the number of freed up nodes, its left to the - * caller to updates the global counter of used nodes for the cache. - * The hash chain lock is held for the hash list argument, must be - * dropped before returning. 
- * We walk backwards through the hash (remembering we keep recently - * used nodes toward the front) until we hit an in-use node. We'll - * stop there if its a low priority call but keep going if its not. + * nodes we've used. The MRU specified by the priority is shaken. + * Returns new priority at end of the call (in case we call again). */ static unsigned int -cache_shake_hash( +cache_shake( struct cache * cache, - struct cache_hash * hash, - unsigned int priority) + unsigned int priority, + int all) { + struct cache_mru *mru; + struct cache_hash * hash; struct list_head temp; struct list_head * head; struct list_head * pos; struct list_head * n; struct cache_node * node; - unsigned int inuse = 0; + unsigned int count; + ASSERT(priority <= CACHE_MAX_PRIORITY); + if (priority > CACHE_MAX_PRIORITY) + priority = 0; + + mru = &cache->c_mrus[priority]; + count = 0; list_head_init(&temp); - head = &hash->ch_list; - for (pos = head->prev, n = pos->prev; - pos != head; - pos = n, n = pos->prev) { - node = (struct cache_node *)pos; - pthread_mutex_lock(&node->cn_mutex); - if (!(inuse = (node->cn_count > 0))) { - hash->ch_count--; - list_move_tail(&node->cn_list, &temp); + head = &mru->cm_list; + + pthread_mutex_lock(&mru->cm_mutex); + for (pos = head->prev, n = pos->prev; pos != head; + pos = n, n = pos->prev) { + node = list_entry(pos, struct cache_node, cn_mru); + + if (pthread_mutex_trylock(&node->cn_mutex) != 0) + continue; + + hash = cache->c_hash + node->cn_hashidx; + if (node->cn_count > 0 || + pthread_mutex_trylock(&hash->ch_mutex) != 0) { + pthread_mutex_unlock(&node->cn_mutex); + continue; } - pthread_mutex_unlock(&node->cn_mutex); - if (inuse && !priority) - break; - } - pthread_mutex_unlock(&hash->ch_mutex); - return cache->bulkrelse(cache, &temp); -} + ASSERT(node->cn_priority == priority); + node->cn_priority = -1; -/* - * Generic implementation of bulk release, which just iterates over - * the list calling the single node relse routine for each node. - */ -static unsigned int -cache_generic_bulkrelse( - struct cache * cache, - struct list_head * list) -{ - struct cache_node * node; - unsigned int count = 0; + list_move(&node->cn_mru, &temp); + list_del_init(&node->cn_hash); + hash->ch_count--; + mru->cm_count--; + pthread_mutex_unlock(&hash->ch_mutex); + pthread_mutex_unlock(&node->cn_mutex); - while (!list_empty(list)) { - node = (struct cache_node *)list->next; - pthread_mutex_destroy(&node->cn_mutex); - list_del_init(&node->cn_list); - cache->relse(node); count++; + if (!all && count == CACHE_SHAKE_COUNT) + break; } - return count; -} + pthread_mutex_unlock(&mru->cm_mutex); -/* - * We've hit the limit on cache size, so we need to start reclaiming - * nodes we've used. Start by shaking this hash chain only, unless - * the shake priority has been increased already. - * The hash chain lock is held for the hash list argument, must be - * dropped before returning. - * Returns new priority at end of the call (in case we call again). 
- */ -static unsigned int -cache_shake( - struct cache * cache, - struct cache_hash * hash, - unsigned int priority) -{ - unsigned int count; - unsigned int i; + if (count > 0) { + cache->bulkrelse(cache, &temp); - if (!priority) { /* do just one */ - count = cache_shake_hash(cache, hash, priority); - } else { /* use a bigger hammer */ - pthread_mutex_unlock(&hash->ch_mutex); - for (count = 0, i = 0; i < cache->c_hashsize; i++) { - hash = &cache->c_hash[i]; - pthread_mutex_lock(&hash->ch_mutex); - count += cache_shake_hash(cache, hash, priority - 1); - } - } - if (count) { pthread_mutex_lock(&cache->c_mutex); cache->c_count -= count; pthread_mutex_unlock(&cache->c_mutex); } - return ++priority; + + return (count == CACHE_SHAKE_COUNT) ? priority : ++priority; } /* * Allocate a new hash node (updating atomic counter in the process), * unless doing so will push us over the maximum cache size. */ -struct cache_node * +static struct cache_node * cache_node_allocate( struct cache * cache, - struct cache_hash * hashlist, cache_key_t key) { unsigned int nodesfree; struct cache_node * node; pthread_mutex_lock(&cache->c_mutex); - if ((nodesfree = (cache->c_count < cache->c_maxcount))) { + nodesfree = (cache->c_count < cache->c_maxcount); + if (nodesfree) { cache->c_count++; if (cache->c_count > cache->c_max) cache->c_max = cache->c_count; @@ -290,15 +256,16 @@ cache_node_allocate( pthread_mutex_unlock(&cache->c_mutex); if (!nodesfree) return NULL; - if (!(node = cache->alloc(key))) { /* uh-oh */ + node = cache->alloc(key); + if (node == NULL) { /* uh-oh */ pthread_mutex_lock(&cache->c_mutex); cache->c_count--; pthread_mutex_unlock(&cache->c_mutex); return NULL; } pthread_mutex_init(&node->cn_mutex, NULL); - list_head_init(&node->cn_list); node->cn_count = 1; + node->cn_priority = 0; return node; } @@ -325,42 +292,69 @@ cache_node_get( { struct cache_node * node = NULL; struct cache_hash * hash; + struct cache_mru * mru; struct list_head * head; struct list_head * pos; + unsigned int hashidx; int priority = 0; - int allocated = 0; - hash = cache->c_hash + cache->hash(key, cache->c_hashsize); + hashidx = cache->hash(key, cache->c_hashsize); + hash = cache->c_hash + hashidx; head = &hash->ch_list; - restart: - pthread_mutex_lock(&hash->ch_mutex); - for (pos = head->next; pos != head; pos = pos->next) { - node = (struct cache_node *)pos; - if (cache->compare(node, key) == 0) - continue; - pthread_mutex_lock(&node->cn_mutex); - node->cn_count++; - pthread_mutex_unlock(&node->cn_mutex); - pthread_mutex_lock(&cache->c_mutex); - cache->c_hits++; - pthread_mutex_unlock(&cache->c_mutex); - break; - } - if (pos == head) { - node = cache_node_allocate(cache, hash, key); - if (!node) { - priority = cache_shake(cache, hash, priority); - goto restart; + for (;;) { + pthread_mutex_lock(&hash->ch_mutex); + for (pos = head->next; pos != head; pos = pos->next) { + node = list_entry(pos, struct cache_node, cn_hash); + if (!cache->compare(node, key)) + continue; + /* + * node found, bump node's reference count, move it to the + * top of its MRU list, and update stats. 
+ */ + pthread_mutex_lock(&node->cn_mutex); + node->cn_count++; + + mru = &cache->c_mrus[node->cn_priority]; + pthread_mutex_lock(&mru->cm_mutex); + list_move(&node->cn_mru, &mru->cm_list); + pthread_mutex_unlock(&mru->cm_mutex); + + pthread_mutex_unlock(&node->cn_mutex); + pthread_mutex_unlock(&hash->ch_mutex); + + pthread_mutex_lock(&cache->c_mutex); + cache->c_hits++; + pthread_mutex_unlock(&cache->c_mutex); + + *nodep = node; + return 0; } - allocated = 1; - hash->ch_count++; /* new entry */ + pthread_mutex_unlock(&hash->ch_mutex); + /* + * not found, allocate a new entry + */ + node = cache_node_allocate(cache, key); + if (node) + break; + priority = cache_shake(cache, priority, 0); } - /* looked at it, move to hash list head */ - list_move(&node->cn_list, &hash->ch_list); + + node->cn_hashidx = hashidx; + + /* add new node to appropriate hash and lowest priority MRU */ + mru = &cache->c_mrus[0]; + pthread_mutex_lock(&mru->cm_mutex); + pthread_mutex_lock(&hash->ch_mutex); + hash->ch_count++; + mru->cm_count++; + list_add(&node->cn_hash, &hash->ch_list); + list_add(&node->cn_mru, &mru->cm_list); pthread_mutex_unlock(&hash->ch_mutex); + pthread_mutex_unlock(&mru->cm_mutex); + *nodep = node; - return allocated; + return 1; } void @@ -379,6 +373,56 @@ cache_node_put( pthread_mutex_unlock(&node->cn_mutex); } +void +cache_node_set_priority( + struct cache * cache, + struct cache_node * node, + int priority) +{ + struct cache_mru * mru; + + if (priority < 0) + priority = 0; + else if (priority > CACHE_MAX_PRIORITY) + priority = CACHE_MAX_PRIORITY; + + pthread_mutex_lock(&node->cn_mutex); + + ASSERT(node->cn_count > 0); + if (priority == node->cn_priority) { + pthread_mutex_unlock(&node->cn_mutex); + return; + } + mru = &cache->c_mrus[node->cn_priority]; + pthread_mutex_lock(&mru->cm_mutex); + list_del_init(&node->cn_mru); + mru->cm_count--; + pthread_mutex_unlock(&mru->cm_mutex); + + mru = &cache->c_mrus[priority]; + pthread_mutex_lock(&mru->cm_mutex); + list_add(&node->cn_mru, &mru->cm_list); + node->cn_priority = priority; + mru->cm_count++; + pthread_mutex_unlock(&mru->cm_mutex); + + pthread_mutex_unlock(&node->cn_mutex); +} + +int +cache_node_get_priority( + struct cache_node * node) +{ + int priority; + + pthread_mutex_lock(&node->cn_mutex); + priority = node->cn_priority; + pthread_mutex_unlock(&node->cn_mutex); + + return priority; +} + + /* * Purge a specific node from the cache. Reference count must be zero. 
*/ @@ -388,27 +432,60 @@ cache_node_purge( cache_key_t key, struct cache_node * node) { - int refcount; + struct list_head * head; + struct list_head * pos; + struct list_head * n; + struct cache_hash * hash; + struct cache_mru * mru; + int count = -1; + + hash = cache->c_hash + cache->hash(key, cache->c_hashsize); + head = &hash->ch_list; + pthread_mutex_lock(&hash->ch_mutex); + for (pos = head->next, n = pos->next; pos != head; + pos = n, n = pos->next) { + if ((struct cache_node *)pos != node) + continue; - refcount = cache_shake_node(cache, key, node); - if (refcount == 0) { + pthread_mutex_lock(&node->cn_mutex); + count = node->cn_count; + if (count != 0) { + pthread_mutex_unlock(&node->cn_mutex); + break; + } + mru = &cache->c_mrus[node->cn_priority]; + pthread_mutex_lock(&mru->cm_mutex); + list_del_init(&node->cn_mru); + mru->cm_count--; + pthread_mutex_unlock(&mru->cm_mutex); + + pthread_mutex_unlock(&node->cn_mutex); + pthread_mutex_destroy(&node->cn_mutex); + list_del_init(&node->cn_hash); + hash->ch_count--; + cache->relse(node); + break; + } + pthread_mutex_unlock(&hash->ch_mutex); + + if (count == 0) { pthread_mutex_lock(&cache->c_mutex); cache->c_count--; pthread_mutex_unlock(&cache->c_mutex); } #ifdef CACHE_DEBUG - if (refcount >= 1) { + if (count >= 1) { fprintf(stderr, "%s: refcount was %u, not zero (node=%p)\n", - __FUNCTION__, refcount, node); + __FUNCTION__, count, node); cache_abort(); } - if (refcount == -1) { + if (count == -1) { fprintf(stderr, "%s: purge node not found! (node=%p)\n", __FUNCTION__, node); cache_abort(); } #endif - return (refcount == 0); + return (count == 0); } /* @@ -418,20 +495,20 @@ void cache_purge( struct cache * cache) { - struct cache_hash * hash; + int i; + + for (i = 0; i <= CACHE_MAX_PRIORITY; i++) + cache_shake(cache, i, 1); - hash = &cache->c_hash[0]; - pthread_mutex_lock(&hash->ch_mutex); - cache_shake(cache, hash, (unsigned int)-1); #ifdef CACHE_DEBUG if (cache->c_count != 0) { + /* flush referenced nodes to disk */ + cache_flush(cache); fprintf(stderr, "%s: shake on cache %p left %u nodes!?\n", __FUNCTION__, cache, cache->c_count); cache_abort(); } #endif - /* flush any remaining nodes to disk */ - cache_flush(cache); } /* @@ -465,15 +542,18 @@ cache_flush( } } -#define HASH_REPORT (3*HASH_CACHE_RATIO) +#define HASH_REPORT (3 * HASH_CACHE_RATIO) void -cache_report(FILE *fp, const char *name, struct cache * cache) +cache_report( + FILE *fp, + const char *name, + struct cache *cache) { - int i; - unsigned long count, index, total; - unsigned long hash_bucket_lengths[HASH_REPORT+2]; + int i; + unsigned long count, index, total; + unsigned long hash_bucket_lengths[HASH_REPORT + 2]; - if ((cache->c_hits+cache->c_misses) == 0) + if ((cache->c_hits + cache->c_misses) == 0) return; /* report cache summary */ @@ -492,9 +572,15 @@ cache_report(FILE *fp, const char *name, struct cache * cache) cache->c_hashsize, cache->c_hits, cache->c_misses, - (double) (cache->c_hits*100/(cache->c_hits+cache->c_misses)) + (double)cache->c_hits * 100 / + (cache->c_hits + cache->c_misses) ); + for (i = 0; i <= CACHE_MAX_PRIORITY; i++) + fprintf(fp, "MRU %d entries = %6u (%3u%%)\n", + i, cache->c_mrus[i].cm_count, + cache->c_mrus[i].cm_count * 100 / cache->c_count); + /* report hash bucket lengths */ bzero(hash_bucket_lengths, sizeof(hash_bucket_lengths)); @@ -508,14 +594,16 @@ cache_report(FILE *fp, const char *name, struct cache * cache) } total = 0; - for (i = 0; i < HASH_REPORT+1; i++) { - total += i*hash_bucket_lengths[i]; + for (i = 0; i < HASH_REPORT + 1; 
i++) { + total += i * hash_bucket_lengths[i]; if (hash_bucket_lengths[i] == 0) continue; - fprintf(fp, "Hash buckets with %2d entries %5ld (%3ld%%)\n", - i, hash_bucket_lengths[i], (i*hash_bucket_lengths[i]*100)/cache->c_count); + fprintf(fp, "Hash buckets with %2d entries %6ld (%3ld%%)\n", + i, hash_bucket_lengths[i], + (i * hash_bucket_lengths[i] * 100) / cache->c_count); } if (hash_bucket_lengths[i]) /* last report bucket is the overflow bucket */ - fprintf(fp, "Hash buckets with >%2d entries %5ld (%3ld%%)\n", - i-1, hash_bucket_lengths[i], ((cache->c_count-total)*100)/cache->c_count); + fprintf(fp, "Hash buckets with >%2d entries %6ld (%3ld%%)\n", + i - 1, hash_bucket_lengths[i], + ((cache->c_count - total) * 100) / cache->c_count); } diff --git a/libxfs/rdwr.c b/libxfs/rdwr.c index 921ced7fa..830d0cf01 100644 --- a/libxfs/rdwr.c +++ b/libxfs/rdwr.c @@ -257,7 +257,11 @@ libxfs_getsb(xfs_mount_t *mp, int flags) XFS_FSS_TO_BB(mp, 1), flags); } -xfs_zone_t *xfs_buf_zone; +xfs_zone_t *xfs_buf_zone; + +static struct cache_mru xfs_buf_freelist = + {{&xfs_buf_freelist.cm_list, &xfs_buf_freelist.cm_list}, + 0, PTHREAD_MUTEX_INITIALIZER }; typedef struct { dev_t device; @@ -308,7 +312,8 @@ libxfs_initbuf(xfs_buf_t *bp, dev_t device, xfs_daddr_t bno, unsigned int bytes) bp->b_blkno = bno; bp->b_bcount = bytes; bp->b_dev = device; - bp->b_addr = memalign(libxfs_device_alignment(), bytes); + if (!bp->b_addr) + bp->b_addr = memalign(libxfs_device_alignment(), bytes); if (!bp->b_addr) { fprintf(stderr, _("%s: %s can't memalign %u bytes: %s\n"), @@ -323,18 +328,44 @@ libxfs_initbuf(xfs_buf_t *bp, dev_t device, xfs_daddr_t bno, unsigned int bytes) } xfs_buf_t * -libxfs_getbufr(dev_t device, xfs_daddr_t blkno, int len) +libxfs_getbufr(dev_t device, xfs_daddr_t blkno, int bblen) { xfs_buf_t *bp; + int blen = BBTOB(bblen); + + /* + * first look for a buffer that can be used as-is, + * if one cannot be found, see if there is a buffer, + * and if so, free it's buffer and set b_addr to NULL + * before calling libxfs_initbuf. 
+ */ + pthread_mutex_lock(&xfs_buf_freelist.cm_mutex); + if (!list_empty(&xfs_buf_freelist.cm_list)) { + list_for_each_entry(bp, &xfs_buf_freelist.cm_list, b_node.cn_mru) { + if (bp->b_bcount == blen) { + list_del_init(&bp->b_node.cn_mru); + break; + } + } + if (&bp->b_node.cn_mru == &xfs_buf_freelist.cm_list) { + bp = list_entry(xfs_buf_freelist.cm_list.next, + xfs_buf_t, b_node.cn_mru); + list_del_init(&bp->b_node.cn_mru); + free(bp->b_addr); + bp->b_addr = NULL; + } + } else + bp = libxfs_zone_zalloc(xfs_buf_zone); + pthread_mutex_unlock(&xfs_buf_freelist.cm_mutex); - bp = libxfs_zone_zalloc(xfs_buf_zone); if (bp != NULL) - libxfs_initbuf(bp, device, blkno, BBTOB(len)); + libxfs_initbuf(bp, device, blkno, blen); #ifdef IO_DEBUG printf("%lx: %s: allocated %u bytes buffer, key=%llu(%llu), %p\n", pthread_self(), __FUNCTION__, BBTOB(len), (long long)LIBXFS_BBTOOFF64(blkno), (long long)blkno, bp); #endif + return bp; } @@ -358,6 +389,8 @@ libxfs_getbuf(dev_t device, xfs_daddr_t blkno, int len) miss = cache_node_get(libxfs_bcache, &key, (struct cache_node **)&bp); if (bp) { pthread_mutex_lock(&bp->b_lock); + cache_node_set_priority(libxfs_bcache, (struct cache_node *)bp, + cache_node_get_priority((struct cache_node *)bp) - 4); #ifdef XFS_BUF_TRACING pthread_mutex_lock(&libxfs_bcache->c_mutex); lock_buf_count++; @@ -525,34 +558,46 @@ libxfs_iomove(xfs_buf_t *bp, uint boff, int len, void *data, int flags) } static void -libxfs_bflush(struct cache_node *node) +libxfs_brelse(struct cache_node *node) { xfs_buf_t *bp = (xfs_buf_t *)node; - if ((bp != NULL) && (bp->b_flags & LIBXFS_B_DIRTY)) - libxfs_writebufr(bp); + if (bp != NULL) { + if (bp->b_flags & LIBXFS_B_DIRTY) + libxfs_writebufr(bp); + pthread_mutex_lock(&xfs_buf_freelist.cm_mutex); + list_add(&bp->b_node.cn_mru, &xfs_buf_freelist.cm_list); + pthread_mutex_unlock(&xfs_buf_freelist.cm_mutex); + } } static void -libxfs_brelse(struct cache_node *node) +libxfs_bulkrelse( + struct cache *cache, + struct list_head *list) { - xfs_buf_t *bp = (xfs_buf_t *)node; - xfs_buf_log_item_t *bip; - extern xfs_zone_t *xfs_buf_item_zone; + xfs_buf_t *bp; - if (bp != NULL) { + if (list_empty(list)) + return; + + list_for_each_entry(bp, list, b_node.cn_mru) { if (bp->b_flags & LIBXFS_B_DIRTY) libxfs_writebufr(bp); - bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *); - if (bip) - libxfs_zone_free(xfs_buf_item_zone, bip); - free(bp->b_addr); - pthread_mutex_destroy(&bp->b_lock); - bp->b_addr = NULL; - bp->b_flags = 0; - free(bp); - bp = NULL; } + + pthread_mutex_lock(&xfs_buf_freelist.cm_mutex); + __list_splice(list, &xfs_buf_freelist.cm_list); + pthread_mutex_unlock(&xfs_buf_freelist.cm_mutex); +} + +static void +libxfs_bflush(struct cache_node *node) +{ + xfs_buf_t *bp = (xfs_buf_t *)node; + + if ((bp != NULL) && (bp->b_flags & LIBXFS_B_DIRTY)) + libxfs_writebufr(bp); } void @@ -586,7 +631,7 @@ struct cache_operations libxfs_bcache_operations = { /* .flush */ libxfs_bflush, /* .relse */ libxfs_brelse, /* .compare */ libxfs_bcompare, - /* .bulkrelse */ NULL /* TODO: lio_listio64 interface? 
*/ + /* .bulkrelse */libxfs_bulkrelse }; diff --git a/repair/phase6.c b/repair/phase6.c index e43addabf..147b80d86 100644 --- a/repair/phase6.c +++ b/repair/phase6.c @@ -745,6 +745,7 @@ mk_root_dir(xfs_mount_t *mp) int i; int error; const mode_t mode = 0755; + ino_tree_node_t *irec; tp = libxfs_trans_alloc(mp, 0); ip = NULL; @@ -788,6 +789,11 @@ mk_root_dir(xfs_mount_t *mp) dir_init(mp, tp, ip, ip); libxfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_SYNC, 0); + + irec = find_inode_rec(XFS_INO_TO_AGNO(mp, mp->m_sb.sb_rootino), + XFS_INO_TO_AGINO(mp, mp->m_sb.sb_rootino)); + set_inode_isadir(irec, XFS_INO_TO_AGINO(mp, mp->m_sb.sb_rootino) - + irec->ino_startnum); } /* @@ -3802,29 +3808,20 @@ traverse_ags( xfs_mount_t *mp) { int i; - work_queue_t *queues; + work_queue_t queue; prefetch_args_t *pf_args[2]; - queues = malloc(thread_count * sizeof(work_queue_t)); - queues[0].mp = mp; - - if (!libxfs_bcache_overflowed()) { - /*create_work_queue(&queues[0], mp, libxfs_nproc()); - for (i = 0; i < glob_agcount; i++) - queue_work(&queues[0], traverse_function, i, NULL); - destroy_work_queue(&queues[0]);*/ - for (i = 0; i < glob_agcount; i++) - traverse_function(&queues[0], i, NULL); - } else { - /* TODO: AG stride support */ - pf_args[0] = start_inode_prefetch(0, 1, NULL); - for (i = 0; i < glob_agcount; i++) { - pf_args[(~i) & 1] = start_inode_prefetch(i + 1, 1, - pf_args[i & 1]); - traverse_function(&queues[0], i, pf_args[i & 1]); - } + /* + * we always do prefetch for phase 6 as it will fill in the gaps + * not read during phase 3 prefetch. + */ + queue.mp = mp; + pf_args[0] = start_inode_prefetch(0, 1, NULL); + for (i = 0; i < glob_agcount; i++) { + pf_args[(~i) & 1] = start_inode_prefetch(i + 1, 1, + pf_args[i & 1]); + traverse_function(&queue, i, pf_args[i & 1]); } - free(queues); } void @@ -3901,7 +3898,7 @@ _(" - resetting contents of realtime bitmap and summary inodes\n")); mark_standalone_inodes(mp); - do_log(_(" - traversing filesystem ... \n")); + do_log(_(" - traversing filesystem ...\n")); irec = find_inode_rec(XFS_INO_TO_AGNO(mp, mp->m_sb.sb_rootino), XFS_INO_TO_AGINO(mp, mp->m_sb.sb_rootino)); @@ -3919,8 +3916,8 @@ _(" - resetting contents of realtime bitmap and summary inodes\n")); */ traverse_ags(mp); - do_log(_(" - traversals finished ... \n")); - do_log(_(" - moving disconnected inodes to %s ... 
\n"), + do_log(_(" - traversal finished ...\n")); + do_log(_(" - moving disconnected inodes to %s ...\n"), ORPHANAGE); /* diff --git a/repair/prefetch.c b/repair/prefetch.c index 4b7ea7f11..4a31cbaea 100644 --- a/repair/prefetch.c +++ b/repair/prefetch.c @@ -34,14 +34,27 @@ static int pf_max_fsbs; static int pf_batch_bytes; static int pf_batch_fsbs; -#define B_INODE 0x1000000 -#define B_META 0x2000000 +static void pf_read_inode_dirs(prefetch_args_t *, xfs_buf_t *); + +/* buffer priorities for the libxfs cache */ + +#define B_DIR_BMAP 15 +#define B_DIR_META_2 13 /* metadata in secondary queue */ +#define B_DIR_META_H 11 /* metadata fetched for PF_META_ONLY */ +#define B_DIR_META_S 9 /* single block of metadata */ +#define B_DIR_META 7 +#define B_DIR_INODE 6 +#define B_BMAP 5 +#define B_INODE 4 + +#define B_IS_INODE(b) (((b) & 1) == 0) +#define B_IS_META(b) (((b) & 1) != 0) #define DEF_BATCH_BYTES 0x10000 #define MAX_BUFS 128 -#define IO_THRESHOLD (MAX_BUFS * PF_THREAD_COUNT) +#define IO_THRESHOLD (MAX_BUFS * 2) typedef enum pf_which { PF_PRIMARY, @@ -89,16 +102,19 @@ pf_queue_io( bp = libxfs_getbuf(mp->m_dev, XFS_FSB_TO_DADDR(mp, fsbno), XFS_FSB_TO_BB(mp, blen)); if (bp->b_flags & LIBXFS_B_UPTODATE) { + if (B_IS_INODE(flag)) + pf_read_inode_dirs(args, bp); + XFS_BUF_SET_PRIORITY(bp, XFS_BUF_PRIORITY(bp) + 8); libxfs_putbuf(bp); return; } - bp->b_flags |= flag; + XFS_BUF_SET_PRIORITY(bp, flag); pthread_mutex_lock(&args->lock); if (fsbno > args->last_bno_read) { radix_tree_insert(&args->primary_io_queue, fsbno, bp); - if (flag == B_META) + if (B_IS_META(flag)) radix_tree_tag_set(&args->primary_io_queue, fsbno, 0); else { args->inode_bufs_queued++; @@ -108,7 +124,7 @@ pf_queue_io( #ifdef XR_PF_TRACE pftrace("getbuf %c %p (%llu) in AG %d (fsbno = %lu) added to " "primary queue (inode_bufs_queued = %d, last_bno = %lu)", - flag == B_INODE ? 'I' : 'M', bp, + B_IS_INODE(flag) ? 'I' : 'M', bp, (long long)XFS_BUF_ADDR(bp), args->agno, fsbno, args->inode_bufs_queued, args->last_bno_read); #endif @@ -116,11 +132,12 @@ pf_queue_io( #ifdef XR_PF_TRACE pftrace("getbuf %c %p (%llu) in AG %d (fsbno = %lu) added to " "secondary queue (last_bno = %lu)", - flag == B_INODE ? 'I' : 'M', bp, + B_IS_INODE(flag) ? 'I' : 'M', bp, (long long)XFS_BUF_ADDR(bp), args->agno, fsbno, args->last_bno_read); #endif - ASSERT(flag == B_META); + ASSERT(B_IS_META(flag)); + XFS_BUF_SET_PRIORITY(bp, B_DIR_META_2); radix_tree_insert(&args->secondary_io_queue, fsbno, bp); } @@ -163,7 +180,7 @@ pf_read_bmbt_reclist( #ifdef XR_PF_TRACE pftrace("queuing dir extent in AG %d", args->agno); #endif - pf_queue_io(args, s, 1, B_META); + pf_queue_io(args, s, 1, B_DIR_META); c--; s++; } @@ -194,6 +211,8 @@ pf_scan_lbtree( if (!bp) return 0; + XFS_BUF_SET_PRIORITY(bp, isadir ? 
B_DIR_BMAP : B_BMAP); + rc = (*func)((xfs_btree_lblock_t *)XFS_BUF_PTR(bp), level - 1, isadir, args); libxfs_putbuf(bp); @@ -307,6 +326,8 @@ pf_read_inode_dirs( { xfs_dinode_t *dino; int icnt = 0; + int hasdir = 0; + int isadir; xfs_dinode_core_t *dinoc; for (icnt = 0; icnt < (XFS_BUF_COUNT(bp) >> mp->m_sb.sb_inodelog); icnt++) { @@ -317,9 +338,14 @@ pf_read_inode_dirs( * We are only prefetching directory contents in extents * and btree nodes for other inodes */ - if (dinoc->di_format <= XFS_DINODE_FMT_LOCAL || - (dinoc->di_format == XFS_DINODE_FMT_EXTENTS && - (be16_to_cpu(dinoc->di_mode) & S_IFMT) != S_IFDIR)) + isadir = (be16_to_cpu(dinoc->di_mode) & S_IFMT) == S_IFDIR; + hasdir |= isadir; + + if (dinoc->di_format <= XFS_DINODE_FMT_LOCAL) + continue; + + if (!isadir && (dinoc->di_format == XFS_DINODE_FMT_EXTENTS || + args->dirs_only)) continue; /* @@ -350,11 +376,12 @@ pf_read_inode_dirs( pf_read_exinode(args, dino); break; case XFS_DINODE_FMT_BTREE: - pf_read_btinode(args, dino, (be16_to_cpu( - dinoc->di_mode) & S_IFMT) == S_IFDIR); + pf_read_btinode(args, dino, isadir); break; } } + if (hasdir) + XFS_BUF_SET_PRIORITY(bp, B_DIR_INODE); } /* @@ -434,7 +461,7 @@ pf_batch_read( if (which == PF_PRIMARY) { for (inode_bufs = 0, i = 0; i < num; i++) { - if (bplist[i]->b_flags & B_INODE) + if (B_IS_INODE(XFS_BUF_PRIORITY(bplist[i]))) inode_bufs++; } args->inode_bufs_queued -= inode_bufs; @@ -470,14 +497,20 @@ pf_batch_read( memcpy(XFS_BUF_PTR(bplist[i]), pbuf, size); bplist[i]->b_flags |= LIBXFS_B_UPTODATE; len -= size; - if (bplist[i]->b_flags & B_INODE) + if (B_IS_INODE(XFS_BUF_PRIORITY(bplist[i]))) pf_read_inode_dirs(args, bplist[i]); + else if (which == PF_META_ONLY) + XFS_BUF_SET_PRIORITY(bplist[i], + B_DIR_META_H); + else if (which == PF_PRIMARY && num == 1) + XFS_BUF_SET_PRIORITY(bplist[i], + B_DIR_META_S); } } for (i = 0; i < num; i++) { #ifdef XR_PF_TRACE pftrace("putbuf %c %p (%llu) in AG %d", - bplist[i]->b_flags & B_INODE ? 'I' : 'M', + B_IS_INODE(XFS_BUF_PRIORITY(bplist[i])) ? 'I' : 'M', bplist[i], (long long)XFS_BUF_ADDR(bplist[i]), args->agno); #endif @@ -623,7 +656,9 @@ pf_queuing_worker( do { pf_queue_io(args, XFS_AGB_TO_FSB(mp, args->agno, bno), - blks_per_cluster, B_INODE); + blks_per_cluster, + (cur_irec->ino_isa_dir != 0) ? + B_DIR_INODE : B_INODE); bno += blks_per_cluster; num_inos += inos_per_cluster; } while (num_inos < XFS_IALLOC_INODES(mp)); -- 2.47.2
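A usage illustration (not part of the patch): the sketch below exercises the new priority interface from include/cache.h in the same way repair/prefetch.c does through XFS_BUF_SET_PRIORITY. Nodes enter the lowest-priority MRU when first looked up, and cache_node_set_priority() moves them to a higher MRU so that cache_shake() reclaims them last. The demo_* type and callbacks are hypothetical stand-ins for a real client (libxfs supplies libxfs_bcache_operations in rdwr.c); the include paths, the designated initializers, and linking with -lpthread are assumptions, not something this patch specifies.

/*
 * Minimal sketch of a libxfs cache client using the priority-tagged MRUs.
 * The demo_* names are hypothetical; only the cache_*() calls and
 * CACHE_MAX_PRIORITY come from include/cache.h as modified above.
 */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include "list.h"
#include "cache.h"

struct demo_node {
	struct cache_node	dn_node;	/* first member: the cache casts node pointers */
	unsigned long		dn_key;
};

static unsigned int
demo_hash(cache_key_t key, unsigned int hashsize)
{
	return (unsigned long)key % hashsize;
}

static struct cache_node *
demo_alloc(cache_key_t key)
{
	struct demo_node	*dp = calloc(1, sizeof(*dp));

	if (!dp)
		return NULL;
	dp->dn_key = (unsigned long)key;
	return &dp->dn_node;
}

static void
demo_flush(struct cache_node *node)
{
	/* nothing dirty to write back in this sketch */
}

static void
demo_relse(struct cache_node *node)
{
	free(node);			/* dn_node is the first member */
}

static int
demo_compare(struct cache_node *node, cache_key_t key)
{
	return ((struct demo_node *)node)->dn_key == (unsigned long)key;
}

/* designated initializers used here to avoid relying on field order */
static struct cache_operations demo_cache_operations = {
	.hash		= demo_hash,
	.alloc		= demo_alloc,
	.flush		= demo_flush,
	.relse		= demo_relse,
	.compare	= demo_compare,
	.bulkrelse	= NULL,		/* fall back to cache_generic_bulkrelse */
};

int
main(void)
{
	struct cache		*cache;
	struct cache_node	*node;
	unsigned long		key;

	/* hash bucket count; overall capacity scales with HASH_CACHE_RATIO */
	cache = cache_init(32, &demo_cache_operations);

	for (key = 1; key <= 64; key++) {
		cache_node_get(cache, (cache_key_t)key, &node);
		/*
		 * New nodes land on the priority-0 MRU.  Raising the
		 * priority moves a node to a higher MRU, so cache_shake()
		 * frees it only after the lower MRUs have been drained,
		 * the same idea prefetch.c applies to directory metadata.
		 */
		if ((key % 8) == 0)
			cache_node_set_priority(cache, node,
						CACHE_MAX_PRIORITY);
		cache_node_put(node);
	}

	cache_report(stderr, "demo cache", cache);
	cache_purge(cache);
	cache_destroy(cache);
	return 0;
}

Note the shake policy this relies on: cache_shake() works on one MRU at a time and only advances to the next priority when it could not free a full CACHE_SHAKE_COUNT batch, so buffers tagged with the high B_DIR_* priorities in repair/prefetch.c tend to stay resident across repair phases.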