From 69ec88b58fe0180dc641637baae9a09e6648098f Mon Sep 17 00:00:00 2001 From: Barry Naujok Date: Mon, 16 Jul 2007 15:56:17 +0000 Subject: [PATCH] Add priority tagging and buffer reuse for libxfs cache Merge of master-melb:xfs-cmds:29150a by kenmcd. Update to 2.9.2 --- VERSION | 2 +- doc/CHANGES | 17 ++ include/cache.h | 39 +++-- include/libxfs.h | 7 + include/list.h | 51 ++++++ libxfs/cache.c | 402 ++++++++++++++++++++++++++++------------------ libxfs/rdwr.c | 91 ++++++++--- repair/phase6.c | 43 +++-- repair/prefetch.c | 71 +++++--- 9 files changed, 488 insertions(+), 235 deletions(-) diff --git a/VERSION b/VERSION index fc4221ec1..e6abef7de 100644 --- a/VERSION +++ b/VERSION @@ -3,5 +3,5 @@ # PKG_MAJOR=2 PKG_MINOR=9 -PKG_REVISION=1 +PKG_REVISION=2 PKG_BUILD=1 diff --git a/doc/CHANGES b/doc/CHANGES index 4bbc1ea32..bbaced5f1 100644 --- a/doc/CHANGES +++ b/doc/CHANGES @@ -1,3 +1,20 @@ +xfsprogs-2.9.2 (16 July 2007) + - Next major round of xfs_repair performance improvements: + - Cache disk nlink values in Phase 3 for Phase 7. + - Do multithreaded prefetch/processing based on AG stride + option (ie. for concats). + - Don't trash lost+found at the start of Phase 4, eliminates + repeated "moving disconnected inode to lost+found" with + successive xfs_repair runs. + - Do multi-threaded sequential metadata prefetch. + Method based on Agami patches posted for 2.7.18 xfsprogs. + - Improve the libxfs cache with priority tagging to keep + blocks around that have unfavourable I/O characteristics. + - Make mkfs.xfs -f zero the old secondary superblocks before writing + the new superblocks. + - Fix up xfs_info and xfs_quota's -c handling with global commands. + - Improve xfs_bmap -vp output to always show the FLAGS column. + xfsprogs-2.9.1 (28 June 2007) - Added filestreams support to xfs_io. - Fix up libattr Makefile to append to LTLDFLAGS. Thanks to diff --git a/include/cache.h b/include/cache.h index 241796d42..317ff738d 100644 --- a/include/cache.h +++ b/include/cache.h @@ -20,13 +20,14 @@ #define HASH_CACHE_RATIO 8 +#define CACHE_MAX_PRIORITY 15 + /* * Simple, generic implementation of a cache (arbitrary data). * Provides a hash table with a capped number of cache entries. 
*/ struct cache; -struct cache_hash; struct cache_node; typedef void *cache_key_t; @@ -48,6 +49,27 @@ struct cache_operations { cache_bulk_relse_t bulkrelse; /* optional */ }; +struct cache_hash { + struct list_head ch_list; /* hash chain head */ + unsigned int ch_count; /* hash chain length */ + pthread_mutex_t ch_mutex; /* hash chain mutex */ +}; + +struct cache_mru { + struct list_head cm_list; /* MRU head */ + unsigned int cm_count; /* MRU length */ + pthread_mutex_t cm_mutex; /* MRU lock */ +}; + +struct cache_node { + struct list_head cn_hash; /* hash chain */ + struct list_head cn_mru; /* MRU chain */ + unsigned int cn_count; /* reference count */ + unsigned int cn_hashidx; /* hash chain index */ + int cn_priority; /* priority, -1 = free list */ + pthread_mutex_t cn_mutex; /* node mutex */ +}; + struct cache { unsigned int c_maxcount; /* max cache nodes */ unsigned int c_count; /* count of nodes */ @@ -60,23 +82,12 @@ struct cache { cache_bulk_relse_t bulkrelse; /* bulk release routine */ unsigned int c_hashsize; /* hash bucket count */ struct cache_hash *c_hash; /* hash table buckets */ + struct cache_mru c_mrus[CACHE_MAX_PRIORITY + 1]; unsigned long long c_misses; /* cache misses */ unsigned long long c_hits; /* cache hits */ unsigned int c_max; /* max nodes ever used */ }; -struct cache_hash { - struct list_head ch_list; /* hash chain head */ - unsigned int ch_count; /* hash chain length */ - pthread_mutex_t ch_mutex; /* hash chain mutex */ -}; - -struct cache_node { - struct list_head cn_list; /* hash chain */ - unsigned int cn_count; /* reference count */ - pthread_mutex_t cn_mutex; /* refcount mutex */ -}; - struct cache *cache_init(unsigned int, struct cache_operations *); void cache_destroy(struct cache *); void cache_walk(struct cache *, cache_walk_t); @@ -85,6 +96,8 @@ void cache_flush(struct cache *); int cache_node_get(struct cache *, cache_key_t, struct cache_node **); void cache_node_put(struct cache_node *); +void cache_node_set_priority(struct cache *, struct cache_node *, int); +int cache_node_get_priority(struct cache_node *); int cache_node_purge(struct cache *, cache_key_t, struct cache_node *); void cache_report(FILE *fp, const char *, struct cache *); int cache_overflowed(struct cache *); diff --git a/include/libxfs.h b/include/libxfs.h index 8de631782..b55c72dc3 100644 --- a/include/libxfs.h +++ b/include/libxfs.h @@ -254,6 +254,13 @@ enum xfs_buf_flags_t { /* b_flags bits */ #define XFS_BUF_FSPRIVATE3(bp,type) ((type)(bp)->b_fsprivate3) #define XFS_BUF_SET_FSPRIVATE3(bp,val) (bp)->b_fsprivate3 = (void *)(val) +#define XFS_BUF_SET_PRIORITY(bp,pri) cache_node_set_priority( \ + libxfs_bcache, \ + (struct cache_node *)(bp), \ + (pri)) +#define XFS_BUF_PRIORITY(bp) (cache_node_get_priority( \ + (struct cache_node *)(bp))) + /* Buffer Cache Interfaces */ extern struct cache *libxfs_bcache; diff --git a/include/list.h b/include/list.h index 3ba491d32..2389a6c0e 100644 --- a/include/list.h +++ b/include/list.h @@ -85,4 +85,55 @@ static inline int list_empty(const struct list_head *head) return head->next == head; } +static inline void __list_splice(struct list_head *list, + struct list_head *head) +{ + struct list_head *first = list->next; + struct list_head *last = list->prev; + struct list_head *at = head->next; + + first->prev = head; + head->next = first; + + last->next = at; + at->prev = last; +} + +static inline void list_splice(struct list_head *list, struct list_head *head) +{ + if (!list_empty(list)) + __list_splice(list, head); +} + +static inline void 
list_splice_init(struct list_head *list, + struct list_head *head) +{ + if (!list_empty(list)) { + __list_splice(list, head); + list_head_init(list); + } +} + +#define list_entry(ptr, type, member) ({ \ + const typeof( ((type *)0)->member ) *__mptr = (ptr); \ + (type *)( (char *)__mptr - offsetof(type,member) );}) + +#define list_for_each(pos, head) \ + for (pos = (head)->next; pos != (head); pos = pos->next) + +#define list_for_each_safe(pos, n, head) \ + for (pos = (head)->next, n = pos->next; pos != (head); \ + pos = n, n = pos->next) + +#define list_for_each_entry(pos, head, member) \ + for (pos = list_entry((head)->next, typeof(*pos), member); \ + &pos->member != (head); \ + pos = list_entry(pos->member.next, typeof(*pos), member)) + +#define list_for_each_entry_safe(pos, n, head, member) \ + for (pos = list_entry((head)->next, typeof(*pos), member), \ + n = list_entry(pos->member.next, typeof(*pos), member); \ + &pos->member != (head); \ + pos = n, n = list_entry(n->member.next, typeof(*n), member)) + #endif /* __LIST_H__ */ diff --git a/libxfs/cache.c b/libxfs/cache.c index c37356d0d..4e0831fd5 100644 --- a/libxfs/cache.c +++ b/libxfs/cache.c @@ -32,6 +32,8 @@ #undef CACHE_ABORT /* #define CACHE_ABORT 1 */ +#define CACHE_SHAKE_COUNT 64 + static unsigned int cache_generic_bulkrelse(struct cache *, struct list_head *); struct cache * @@ -71,6 +73,12 @@ cache_init( cache->c_hash[i].ch_count = 0; pthread_mutex_init(&cache->c_hash[i].ch_mutex, NULL); } + + for (i = 0; i <= CACHE_MAX_PRIORITY; i++) { + list_head_init(&cache->c_mrus[i].cm_list); + cache->c_mrus[i].cm_count = 0; + pthread_mutex_init(&cache->c_mrus[i].cm_mutex, NULL); + } return cache; } @@ -127,161 +135,119 @@ cache_destroy( list_head_destroy(&cache->c_hash[i].ch_list); pthread_mutex_destroy(&cache->c_hash[i].ch_mutex); } + for (i = 0; i <= CACHE_MAX_PRIORITY; i++) { + list_head_destroy(&cache->c_mrus[i].cm_list); + pthread_mutex_destroy(&cache->c_mrus[i].cm_mutex); + } pthread_mutex_destroy(&cache->c_mutex); free(cache->c_hash); free(cache); } -static int -cache_shake_node( +static unsigned int +cache_generic_bulkrelse( struct cache * cache, - cache_key_t key, - struct cache_node * node) + struct list_head * list) { - struct list_head * head; - struct list_head * pos; - struct list_head * n; - struct cache_hash * hash; - int count = -1; + struct cache_node * node; + unsigned int count = 0; - hash = cache->c_hash + cache->hash(key, cache->c_hashsize); - head = &hash->ch_list; - pthread_mutex_lock(&hash->ch_mutex); - for (pos = head->next, n = pos->next; - pos != head; - pos = n, n = pos->next) { - if ((struct cache_node *)pos != node) - continue; - pthread_mutex_lock(&node->cn_mutex); - count = node->cn_count; - pthread_mutex_unlock(&node->cn_mutex); - if (count != 0) - break; + while (!list_empty(list)) { + node = list_entry(list->next, struct cache_node, cn_mru); pthread_mutex_destroy(&node->cn_mutex); - list_del_init(&node->cn_list); - hash->ch_count--; + list_del_init(&node->cn_mru); cache->relse(node); - break; + count++; } - pthread_mutex_unlock(&hash->ch_mutex); + return count; } /* * We've hit the limit on cache size, so we need to start reclaiming - * nodes we've used. This reclaims from the one given hash bucket - * only. Returns the number of freed up nodes, its left to the - * caller to updates the global counter of used nodes for the cache. - * The hash chain lock is held for the hash list argument, must be - * dropped before returning. 
- * We walk backwards through the hash (remembering we keep recently - * used nodes toward the front) until we hit an in-use node. We'll - * stop there if its a low priority call but keep going if its not. + * nodes we've used. The MRU specified by the priority is shaken. + * Returns new priority at end of the call (in case we call again). */ static unsigned int -cache_shake_hash( +cache_shake( struct cache * cache, - struct cache_hash * hash, - unsigned int priority) + unsigned int priority, + int all) { + struct cache_mru *mru; + struct cache_hash * hash; struct list_head temp; struct list_head * head; struct list_head * pos; struct list_head * n; struct cache_node * node; - unsigned int inuse = 0; + unsigned int count; + ASSERT(priority <= CACHE_MAX_PRIORITY); + if (priority > CACHE_MAX_PRIORITY) + priority = 0; + + mru = &cache->c_mrus[priority]; + count = 0; list_head_init(&temp); - head = &hash->ch_list; - for (pos = head->prev, n = pos->prev; - pos != head; - pos = n, n = pos->prev) { - node = (struct cache_node *)pos; - pthread_mutex_lock(&node->cn_mutex); - if (!(inuse = (node->cn_count > 0))) { - hash->ch_count--; - list_move_tail(&node->cn_list, &temp); + head = &mru->cm_list; + + pthread_mutex_lock(&mru->cm_mutex); + for (pos = head->prev, n = pos->prev; pos != head; + pos = n, n = pos->prev) { + node = list_entry(pos, struct cache_node, cn_mru); + + if (pthread_mutex_trylock(&node->cn_mutex) != 0) + continue; + + hash = cache->c_hash + node->cn_hashidx; + if (node->cn_count > 0 || + pthread_mutex_trylock(&hash->ch_mutex) != 0) { + pthread_mutex_unlock(&node->cn_mutex); + continue; } - pthread_mutex_unlock(&node->cn_mutex); - if (inuse && !priority) - break; - } - pthread_mutex_unlock(&hash->ch_mutex); - return cache->bulkrelse(cache, &temp); -} + ASSERT(node->cn_priority == priority); + node->cn_priority = -1; -/* - * Generic implementation of bulk release, which just iterates over - * the list calling the single node relse routine for each node. - */ -static unsigned int -cache_generic_bulkrelse( - struct cache * cache, - struct list_head * list) -{ - struct cache_node * node; - unsigned int count = 0; + list_move(&node->cn_mru, &temp); + list_del_init(&node->cn_hash); + hash->ch_count--; + mru->cm_count--; + pthread_mutex_unlock(&hash->ch_mutex); + pthread_mutex_unlock(&node->cn_mutex); - while (!list_empty(list)) { - node = (struct cache_node *)list->next; - pthread_mutex_destroy(&node->cn_mutex); - list_del_init(&node->cn_list); - cache->relse(node); count++; + if (!all && count == CACHE_SHAKE_COUNT) + break; } - return count; -} + pthread_mutex_unlock(&mru->cm_mutex); -/* - * We've hit the limit on cache size, so we need to start reclaiming - * nodes we've used. Start by shaking this hash chain only, unless - * the shake priority has been increased already. - * The hash chain lock is held for the hash list argument, must be - * dropped before returning. - * Returns new priority at end of the call (in case we call again). 
- */ -static unsigned int -cache_shake( - struct cache * cache, - struct cache_hash * hash, - unsigned int priority) -{ - unsigned int count; - unsigned int i; + if (count > 0) { + cache->bulkrelse(cache, &temp); - if (!priority) { /* do just one */ - count = cache_shake_hash(cache, hash, priority); - } else { /* use a bigger hammer */ - pthread_mutex_unlock(&hash->ch_mutex); - for (count = 0, i = 0; i < cache->c_hashsize; i++) { - hash = &cache->c_hash[i]; - pthread_mutex_lock(&hash->ch_mutex); - count += cache_shake_hash(cache, hash, priority - 1); - } - } - if (count) { pthread_mutex_lock(&cache->c_mutex); cache->c_count -= count; pthread_mutex_unlock(&cache->c_mutex); } - return ++priority; + + return (count == CACHE_SHAKE_COUNT) ? priority : ++priority; } /* * Allocate a new hash node (updating atomic counter in the process), * unless doing so will push us over the maximum cache size. */ -struct cache_node * +static struct cache_node * cache_node_allocate( struct cache * cache, - struct cache_hash * hashlist, cache_key_t key) { unsigned int nodesfree; struct cache_node * node; pthread_mutex_lock(&cache->c_mutex); - if ((nodesfree = (cache->c_count < cache->c_maxcount))) { + nodesfree = (cache->c_count < cache->c_maxcount); + if (nodesfree) { cache->c_count++; if (cache->c_count > cache->c_max) cache->c_max = cache->c_count; @@ -290,15 +256,16 @@ cache_node_allocate( pthread_mutex_unlock(&cache->c_mutex); if (!nodesfree) return NULL; - if (!(node = cache->alloc(key))) { /* uh-oh */ + node = cache->alloc(key); + if (node == NULL) { /* uh-oh */ pthread_mutex_lock(&cache->c_mutex); cache->c_count--; pthread_mutex_unlock(&cache->c_mutex); return NULL; } pthread_mutex_init(&node->cn_mutex, NULL); - list_head_init(&node->cn_list); node->cn_count = 1; + node->cn_priority = 0; return node; } @@ -325,42 +292,69 @@ cache_node_get( { struct cache_node * node = NULL; struct cache_hash * hash; + struct cache_mru * mru; struct list_head * head; struct list_head * pos; + unsigned int hashidx; int priority = 0; - int allocated = 0; - hash = cache->c_hash + cache->hash(key, cache->c_hashsize); + hashidx = cache->hash(key, cache->c_hashsize); + hash = cache->c_hash + hashidx; head = &hash->ch_list; - restart: - pthread_mutex_lock(&hash->ch_mutex); - for (pos = head->next; pos != head; pos = pos->next) { - node = (struct cache_node *)pos; - if (cache->compare(node, key) == 0) - continue; - pthread_mutex_lock(&node->cn_mutex); - node->cn_count++; - pthread_mutex_unlock(&node->cn_mutex); - pthread_mutex_lock(&cache->c_mutex); - cache->c_hits++; - pthread_mutex_unlock(&cache->c_mutex); - break; - } - if (pos == head) { - node = cache_node_allocate(cache, hash, key); - if (!node) { - priority = cache_shake(cache, hash, priority); - goto restart; + for (;;) { + pthread_mutex_lock(&hash->ch_mutex); + for (pos = head->next; pos != head; pos = pos->next) { + node = list_entry(pos, struct cache_node, cn_hash); + if (!cache->compare(node, key)) + continue; + /* + * node found, bump node's reference count, move it to the + * top of its MRU list, and update stats. 
+ */ + pthread_mutex_lock(&node->cn_mutex); + node->cn_count++; + + mru = &cache->c_mrus[node->cn_priority]; + pthread_mutex_lock(&mru->cm_mutex); + list_move(&node->cn_mru, &mru->cm_list); + pthread_mutex_unlock(&mru->cm_mutex); + + pthread_mutex_unlock(&node->cn_mutex); + pthread_mutex_unlock(&hash->ch_mutex); + + pthread_mutex_lock(&cache->c_mutex); + cache->c_hits++; + pthread_mutex_unlock(&cache->c_mutex); + + *nodep = node; + return 0; } - allocated = 1; - hash->ch_count++; /* new entry */ + pthread_mutex_unlock(&hash->ch_mutex); + /* + * not found, allocate a new entry + */ + node = cache_node_allocate(cache, key); + if (node) + break; + priority = cache_shake(cache, priority, 0); } - /* looked at it, move to hash list head */ - list_move(&node->cn_list, &hash->ch_list); + + node->cn_hashidx = hashidx; + + /* add new node to appropriate hash and lowest priority MRU */ + mru = &cache->c_mrus[0]; + pthread_mutex_lock(&mru->cm_mutex); + pthread_mutex_lock(&hash->ch_mutex); + hash->ch_count++; + mru->cm_count++; + list_add(&node->cn_hash, &hash->ch_list); + list_add(&node->cn_mru, &mru->cm_list); pthread_mutex_unlock(&hash->ch_mutex); + pthread_mutex_unlock(&mru->cm_mutex); + *nodep = node; - return allocated; + return 1; } void @@ -379,6 +373,56 @@ cache_node_put( pthread_mutex_unlock(&node->cn_mutex); } +void +cache_node_set_priority( + struct cache * cache, + struct cache_node * node, + int priority) +{ + struct cache_mru * mru; + + if (priority < 0) + priority = 0; + else if (priority > CACHE_MAX_PRIORITY) + priority = CACHE_MAX_PRIORITY; + + pthread_mutex_lock(&node->cn_mutex); + + ASSERT(node->cn_count > 0); + if (priority == node->cn_priority) { + pthread_mutex_unlock(&node->cn_mutex); + return; + } + mru = &cache->c_mrus[node->cn_priority]; + pthread_mutex_lock(&mru->cm_mutex); + list_del_init(&node->cn_mru); + mru->cm_count--; + pthread_mutex_unlock(&mru->cm_mutex); + + mru = &cache->c_mrus[priority]; + pthread_mutex_lock(&mru->cm_mutex); + list_add(&node->cn_mru, &mru->cm_list); + node->cn_priority = priority; + mru->cm_count++; + pthread_mutex_unlock(&mru->cm_mutex); + + pthread_mutex_unlock(&node->cn_mutex); +} + +int +cache_node_get_priority( + struct cache_node * node) +{ + int priority; + + pthread_mutex_lock(&node->cn_mutex); + priority = node->cn_priority; + pthread_mutex_unlock(&node->cn_mutex); + + return priority; +} + + /* * Purge a specific node from the cache. Reference count must be zero. 
*/ @@ -388,27 +432,60 @@ cache_node_purge( cache_key_t key, struct cache_node * node) { - int refcount; + struct list_head * head; + struct list_head * pos; + struct list_head * n; + struct cache_hash * hash; + struct cache_mru * mru; + int count = -1; + + hash = cache->c_hash + cache->hash(key, cache->c_hashsize); + head = &hash->ch_list; + pthread_mutex_lock(&hash->ch_mutex); + for (pos = head->next, n = pos->next; pos != head; + pos = n, n = pos->next) { + if ((struct cache_node *)pos != node) + continue; - refcount = cache_shake_node(cache, key, node); - if (refcount == 0) { + pthread_mutex_lock(&node->cn_mutex); + count = node->cn_count; + if (count != 0) { + pthread_mutex_unlock(&node->cn_mutex); + break; + } + mru = &cache->c_mrus[node->cn_priority]; + pthread_mutex_lock(&mru->cm_mutex); + list_del_init(&node->cn_mru); + mru->cm_count--; + pthread_mutex_unlock(&mru->cm_mutex); + + pthread_mutex_unlock(&node->cn_mutex); + pthread_mutex_destroy(&node->cn_mutex); + list_del_init(&node->cn_hash); + hash->ch_count--; + cache->relse(node); + break; + } + pthread_mutex_unlock(&hash->ch_mutex); + + if (count == 0) { pthread_mutex_lock(&cache->c_mutex); cache->c_count--; pthread_mutex_unlock(&cache->c_mutex); } #ifdef CACHE_DEBUG - if (refcount >= 1) { + if (count >= 1) { fprintf(stderr, "%s: refcount was %u, not zero (node=%p)\n", - __FUNCTION__, refcount, node); + __FUNCTION__, count, node); cache_abort(); } - if (refcount == -1) { + if (count == -1) { fprintf(stderr, "%s: purge node not found! (node=%p)\n", __FUNCTION__, node); cache_abort(); } #endif - return (refcount == 0); + return (count == 0); } /* @@ -418,20 +495,20 @@ void cache_purge( struct cache * cache) { - struct cache_hash * hash; + int i; + + for (i = 0; i <= CACHE_MAX_PRIORITY; i++) + cache_shake(cache, i, 1); - hash = &cache->c_hash[0]; - pthread_mutex_lock(&hash->ch_mutex); - cache_shake(cache, hash, (unsigned int)-1); #ifdef CACHE_DEBUG if (cache->c_count != 0) { + /* flush referenced nodes to disk */ + cache_flush(cache); fprintf(stderr, "%s: shake on cache %p left %u nodes!?\n", __FUNCTION__, cache, cache->c_count); cache_abort(); } #endif - /* flush any remaining nodes to disk */ - cache_flush(cache); } /* @@ -465,15 +542,18 @@ cache_flush( } } -#define HASH_REPORT (3*HASH_CACHE_RATIO) +#define HASH_REPORT (3 * HASH_CACHE_RATIO) void -cache_report(FILE *fp, const char *name, struct cache * cache) +cache_report( + FILE *fp, + const char *name, + struct cache *cache) { - int i; - unsigned long count, index, total; - unsigned long hash_bucket_lengths[HASH_REPORT+2]; + int i; + unsigned long count, index, total; + unsigned long hash_bucket_lengths[HASH_REPORT + 2]; - if ((cache->c_hits+cache->c_misses) == 0) + if ((cache->c_hits + cache->c_misses) == 0) return; /* report cache summary */ @@ -492,9 +572,15 @@ cache_report(FILE *fp, const char *name, struct cache * cache) cache->c_hashsize, cache->c_hits, cache->c_misses, - (double) (cache->c_hits*100/(cache->c_hits+cache->c_misses)) + (double)cache->c_hits * 100 / + (cache->c_hits + cache->c_misses) ); + for (i = 0; i <= CACHE_MAX_PRIORITY; i++) + fprintf(fp, "MRU %d entries = %6u (%3u%%)\n", + i, cache->c_mrus[i].cm_count, + cache->c_mrus[i].cm_count * 100 / cache->c_count); + /* report hash bucket lengths */ bzero(hash_bucket_lengths, sizeof(hash_bucket_lengths)); @@ -508,14 +594,16 @@ cache_report(FILE *fp, const char *name, struct cache * cache) } total = 0; - for (i = 0; i < HASH_REPORT+1; i++) { - total += i*hash_bucket_lengths[i]; + for (i = 0; i < HASH_REPORT + 1; 
i++) { + total += i * hash_bucket_lengths[i]; if (hash_bucket_lengths[i] == 0) continue; - fprintf(fp, "Hash buckets with %2d entries %5ld (%3ld%%)\n", - i, hash_bucket_lengths[i], (i*hash_bucket_lengths[i]*100)/cache->c_count); + fprintf(fp, "Hash buckets with %2d entries %6ld (%3ld%%)\n", + i, hash_bucket_lengths[i], + (i * hash_bucket_lengths[i] * 100) / cache->c_count); } if (hash_bucket_lengths[i]) /* last report bucket is the overflow bucket */ - fprintf(fp, "Hash buckets with >%2d entries %5ld (%3ld%%)\n", - i-1, hash_bucket_lengths[i], ((cache->c_count-total)*100)/cache->c_count); + fprintf(fp, "Hash buckets with >%2d entries %6ld (%3ld%%)\n", + i - 1, hash_bucket_lengths[i], + ((cache->c_count - total) * 100) / cache->c_count); } diff --git a/libxfs/rdwr.c b/libxfs/rdwr.c index 921ced7fa..830d0cf01 100644 --- a/libxfs/rdwr.c +++ b/libxfs/rdwr.c @@ -257,7 +257,11 @@ libxfs_getsb(xfs_mount_t *mp, int flags) XFS_FSS_TO_BB(mp, 1), flags); } -xfs_zone_t *xfs_buf_zone; +xfs_zone_t *xfs_buf_zone; + +static struct cache_mru xfs_buf_freelist = + {{&xfs_buf_freelist.cm_list, &xfs_buf_freelist.cm_list}, + 0, PTHREAD_MUTEX_INITIALIZER }; typedef struct { dev_t device; @@ -308,7 +312,8 @@ libxfs_initbuf(xfs_buf_t *bp, dev_t device, xfs_daddr_t bno, unsigned int bytes) bp->b_blkno = bno; bp->b_bcount = bytes; bp->b_dev = device; - bp->b_addr = memalign(libxfs_device_alignment(), bytes); + if (!bp->b_addr) + bp->b_addr = memalign(libxfs_device_alignment(), bytes); if (!bp->b_addr) { fprintf(stderr, _("%s: %s can't memalign %u bytes: %s\n"), @@ -323,18 +328,44 @@ libxfs_initbuf(xfs_buf_t *bp, dev_t device, xfs_daddr_t bno, unsigned int bytes) } xfs_buf_t * -libxfs_getbufr(dev_t device, xfs_daddr_t blkno, int len) +libxfs_getbufr(dev_t device, xfs_daddr_t blkno, int bblen) { xfs_buf_t *bp; + int blen = BBTOB(bblen); + + /* + * first look for a buffer that can be used as-is, + * if one cannot be found, see if there is a buffer, + * and if so, free it's buffer and set b_addr to NULL + * before calling libxfs_initbuf. 
+ */ + pthread_mutex_lock(&xfs_buf_freelist.cm_mutex); + if (!list_empty(&xfs_buf_freelist.cm_list)) { + list_for_each_entry(bp, &xfs_buf_freelist.cm_list, b_node.cn_mru) { + if (bp->b_bcount == blen) { + list_del_init(&bp->b_node.cn_mru); + break; + } + } + if (&bp->b_node.cn_mru == &xfs_buf_freelist.cm_list) { + bp = list_entry(xfs_buf_freelist.cm_list.next, + xfs_buf_t, b_node.cn_mru); + list_del_init(&bp->b_node.cn_mru); + free(bp->b_addr); + bp->b_addr = NULL; + } + } else + bp = libxfs_zone_zalloc(xfs_buf_zone); + pthread_mutex_unlock(&xfs_buf_freelist.cm_mutex); - bp = libxfs_zone_zalloc(xfs_buf_zone); if (bp != NULL) - libxfs_initbuf(bp, device, blkno, BBTOB(len)); + libxfs_initbuf(bp, device, blkno, blen); #ifdef IO_DEBUG printf("%lx: %s: allocated %u bytes buffer, key=%llu(%llu), %p\n", pthread_self(), __FUNCTION__, BBTOB(len), (long long)LIBXFS_BBTOOFF64(blkno), (long long)blkno, bp); #endif + return bp; } @@ -358,6 +389,8 @@ libxfs_getbuf(dev_t device, xfs_daddr_t blkno, int len) miss = cache_node_get(libxfs_bcache, &key, (struct cache_node **)&bp); if (bp) { pthread_mutex_lock(&bp->b_lock); + cache_node_set_priority(libxfs_bcache, (struct cache_node *)bp, + cache_node_get_priority((struct cache_node *)bp) - 4); #ifdef XFS_BUF_TRACING pthread_mutex_lock(&libxfs_bcache->c_mutex); lock_buf_count++; @@ -525,34 +558,46 @@ libxfs_iomove(xfs_buf_t *bp, uint boff, int len, void *data, int flags) } static void -libxfs_bflush(struct cache_node *node) +libxfs_brelse(struct cache_node *node) { xfs_buf_t *bp = (xfs_buf_t *)node; - if ((bp != NULL) && (bp->b_flags & LIBXFS_B_DIRTY)) - libxfs_writebufr(bp); + if (bp != NULL) { + if (bp->b_flags & LIBXFS_B_DIRTY) + libxfs_writebufr(bp); + pthread_mutex_lock(&xfs_buf_freelist.cm_mutex); + list_add(&bp->b_node.cn_mru, &xfs_buf_freelist.cm_list); + pthread_mutex_unlock(&xfs_buf_freelist.cm_mutex); + } } static void -libxfs_brelse(struct cache_node *node) +libxfs_bulkrelse( + struct cache *cache, + struct list_head *list) { - xfs_buf_t *bp = (xfs_buf_t *)node; - xfs_buf_log_item_t *bip; - extern xfs_zone_t *xfs_buf_item_zone; + xfs_buf_t *bp; - if (bp != NULL) { + if (list_empty(list)) + return; + + list_for_each_entry(bp, list, b_node.cn_mru) { if (bp->b_flags & LIBXFS_B_DIRTY) libxfs_writebufr(bp); - bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *); - if (bip) - libxfs_zone_free(xfs_buf_item_zone, bip); - free(bp->b_addr); - pthread_mutex_destroy(&bp->b_lock); - bp->b_addr = NULL; - bp->b_flags = 0; - free(bp); - bp = NULL; } + + pthread_mutex_lock(&xfs_buf_freelist.cm_mutex); + __list_splice(list, &xfs_buf_freelist.cm_list); + pthread_mutex_unlock(&xfs_buf_freelist.cm_mutex); +} + +static void +libxfs_bflush(struct cache_node *node) +{ + xfs_buf_t *bp = (xfs_buf_t *)node; + + if ((bp != NULL) && (bp->b_flags & LIBXFS_B_DIRTY)) + libxfs_writebufr(bp); } void @@ -586,7 +631,7 @@ struct cache_operations libxfs_bcache_operations = { /* .flush */ libxfs_bflush, /* .relse */ libxfs_brelse, /* .compare */ libxfs_bcompare, - /* .bulkrelse */ NULL /* TODO: lio_listio64 interface? 
*/ + /* .bulkrelse */libxfs_bulkrelse }; diff --git a/repair/phase6.c b/repair/phase6.c index e43addabf..147b80d86 100644 --- a/repair/phase6.c +++ b/repair/phase6.c @@ -745,6 +745,7 @@ mk_root_dir(xfs_mount_t *mp) int i; int error; const mode_t mode = 0755; + ino_tree_node_t *irec; tp = libxfs_trans_alloc(mp, 0); ip = NULL; @@ -788,6 +789,11 @@ mk_root_dir(xfs_mount_t *mp) dir_init(mp, tp, ip, ip); libxfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_SYNC, 0); + + irec = find_inode_rec(XFS_INO_TO_AGNO(mp, mp->m_sb.sb_rootino), + XFS_INO_TO_AGINO(mp, mp->m_sb.sb_rootino)); + set_inode_isadir(irec, XFS_INO_TO_AGINO(mp, mp->m_sb.sb_rootino) - + irec->ino_startnum); } /* @@ -3802,29 +3808,20 @@ traverse_ags( xfs_mount_t *mp) { int i; - work_queue_t *queues; + work_queue_t queue; prefetch_args_t *pf_args[2]; - queues = malloc(thread_count * sizeof(work_queue_t)); - queues[0].mp = mp; - - if (!libxfs_bcache_overflowed()) { - /*create_work_queue(&queues[0], mp, libxfs_nproc()); - for (i = 0; i < glob_agcount; i++) - queue_work(&queues[0], traverse_function, i, NULL); - destroy_work_queue(&queues[0]);*/ - for (i = 0; i < glob_agcount; i++) - traverse_function(&queues[0], i, NULL); - } else { - /* TODO: AG stride support */ - pf_args[0] = start_inode_prefetch(0, 1, NULL); - for (i = 0; i < glob_agcount; i++) { - pf_args[(~i) & 1] = start_inode_prefetch(i + 1, 1, - pf_args[i & 1]); - traverse_function(&queues[0], i, pf_args[i & 1]); - } + /* + * we always do prefetch for phase 6 as it will fill in the gaps + * not read during phase 3 prefetch. + */ + queue.mp = mp; + pf_args[0] = start_inode_prefetch(0, 1, NULL); + for (i = 0; i < glob_agcount; i++) { + pf_args[(~i) & 1] = start_inode_prefetch(i + 1, 1, + pf_args[i & 1]); + traverse_function(&queue, i, pf_args[i & 1]); } - free(queues); } void @@ -3901,7 +3898,7 @@ _(" - resetting contents of realtime bitmap and summary inodes\n")); mark_standalone_inodes(mp); - do_log(_(" - traversing filesystem ... \n")); + do_log(_(" - traversing filesystem ...\n")); irec = find_inode_rec(XFS_INO_TO_AGNO(mp, mp->m_sb.sb_rootino), XFS_INO_TO_AGINO(mp, mp->m_sb.sb_rootino)); @@ -3919,8 +3916,8 @@ _(" - resetting contents of realtime bitmap and summary inodes\n")); */ traverse_ags(mp); - do_log(_(" - traversals finished ... \n")); - do_log(_(" - moving disconnected inodes to %s ... 
\n"), + do_log(_(" - traversal finished ...\n")); + do_log(_(" - moving disconnected inodes to %s ...\n"), ORPHANAGE); /* diff --git a/repair/prefetch.c b/repair/prefetch.c index 4b7ea7f11..4a31cbaea 100644 --- a/repair/prefetch.c +++ b/repair/prefetch.c @@ -34,14 +34,27 @@ static int pf_max_fsbs; static int pf_batch_bytes; static int pf_batch_fsbs; -#define B_INODE 0x1000000 -#define B_META 0x2000000 +static void pf_read_inode_dirs(prefetch_args_t *, xfs_buf_t *); + +/* buffer priorities for the libxfs cache */ + +#define B_DIR_BMAP 15 +#define B_DIR_META_2 13 /* metadata in secondary queue */ +#define B_DIR_META_H 11 /* metadata fetched for PF_META_ONLY */ +#define B_DIR_META_S 9 /* single block of metadata */ +#define B_DIR_META 7 +#define B_DIR_INODE 6 +#define B_BMAP 5 +#define B_INODE 4 + +#define B_IS_INODE(b) (((b) & 1) == 0) +#define B_IS_META(b) (((b) & 1) != 0) #define DEF_BATCH_BYTES 0x10000 #define MAX_BUFS 128 -#define IO_THRESHOLD (MAX_BUFS * PF_THREAD_COUNT) +#define IO_THRESHOLD (MAX_BUFS * 2) typedef enum pf_which { PF_PRIMARY, @@ -89,16 +102,19 @@ pf_queue_io( bp = libxfs_getbuf(mp->m_dev, XFS_FSB_TO_DADDR(mp, fsbno), XFS_FSB_TO_BB(mp, blen)); if (bp->b_flags & LIBXFS_B_UPTODATE) { + if (B_IS_INODE(flag)) + pf_read_inode_dirs(args, bp); + XFS_BUF_SET_PRIORITY(bp, XFS_BUF_PRIORITY(bp) + 8); libxfs_putbuf(bp); return; } - bp->b_flags |= flag; + XFS_BUF_SET_PRIORITY(bp, flag); pthread_mutex_lock(&args->lock); if (fsbno > args->last_bno_read) { radix_tree_insert(&args->primary_io_queue, fsbno, bp); - if (flag == B_META) + if (B_IS_META(flag)) radix_tree_tag_set(&args->primary_io_queue, fsbno, 0); else { args->inode_bufs_queued++; @@ -108,7 +124,7 @@ pf_queue_io( #ifdef XR_PF_TRACE pftrace("getbuf %c %p (%llu) in AG %d (fsbno = %lu) added to " "primary queue (inode_bufs_queued = %d, last_bno = %lu)", - flag == B_INODE ? 'I' : 'M', bp, + B_IS_INODE(flag) ? 'I' : 'M', bp, (long long)XFS_BUF_ADDR(bp), args->agno, fsbno, args->inode_bufs_queued, args->last_bno_read); #endif @@ -116,11 +132,12 @@ pf_queue_io( #ifdef XR_PF_TRACE pftrace("getbuf %c %p (%llu) in AG %d (fsbno = %lu) added to " "secondary queue (last_bno = %lu)", - flag == B_INODE ? 'I' : 'M', bp, + B_IS_INODE(flag) ? 'I' : 'M', bp, (long long)XFS_BUF_ADDR(bp), args->agno, fsbno, args->last_bno_read); #endif - ASSERT(flag == B_META); + ASSERT(B_IS_META(flag)); + XFS_BUF_SET_PRIORITY(bp, B_DIR_META_2); radix_tree_insert(&args->secondary_io_queue, fsbno, bp); } @@ -163,7 +180,7 @@ pf_read_bmbt_reclist( #ifdef XR_PF_TRACE pftrace("queuing dir extent in AG %d", args->agno); #endif - pf_queue_io(args, s, 1, B_META); + pf_queue_io(args, s, 1, B_DIR_META); c--; s++; } @@ -194,6 +211,8 @@ pf_scan_lbtree( if (!bp) return 0; + XFS_BUF_SET_PRIORITY(bp, isadir ? 
B_DIR_BMAP : B_BMAP); + rc = (*func)((xfs_btree_lblock_t *)XFS_BUF_PTR(bp), level - 1, isadir, args); libxfs_putbuf(bp); @@ -307,6 +326,8 @@ pf_read_inode_dirs( { xfs_dinode_t *dino; int icnt = 0; + int hasdir = 0; + int isadir; xfs_dinode_core_t *dinoc; for (icnt = 0; icnt < (XFS_BUF_COUNT(bp) >> mp->m_sb.sb_inodelog); icnt++) { @@ -317,9 +338,14 @@ pf_read_inode_dirs( * We are only prefetching directory contents in extents * and btree nodes for other inodes */ - if (dinoc->di_format <= XFS_DINODE_FMT_LOCAL || - (dinoc->di_format == XFS_DINODE_FMT_EXTENTS && - (be16_to_cpu(dinoc->di_mode) & S_IFMT) != S_IFDIR)) + isadir = (be16_to_cpu(dinoc->di_mode) & S_IFMT) == S_IFDIR; + hasdir |= isadir; + + if (dinoc->di_format <= XFS_DINODE_FMT_LOCAL) + continue; + + if (!isadir && (dinoc->di_format == XFS_DINODE_FMT_EXTENTS || + args->dirs_only)) continue; /* @@ -350,11 +376,12 @@ pf_read_inode_dirs( pf_read_exinode(args, dino); break; case XFS_DINODE_FMT_BTREE: - pf_read_btinode(args, dino, (be16_to_cpu( - dinoc->di_mode) & S_IFMT) == S_IFDIR); + pf_read_btinode(args, dino, isadir); break; } } + if (hasdir) + XFS_BUF_SET_PRIORITY(bp, B_DIR_INODE); } /* @@ -434,7 +461,7 @@ pf_batch_read( if (which == PF_PRIMARY) { for (inode_bufs = 0, i = 0; i < num; i++) { - if (bplist[i]->b_flags & B_INODE) + if (B_IS_INODE(XFS_BUF_PRIORITY(bplist[i]))) inode_bufs++; } args->inode_bufs_queued -= inode_bufs; @@ -470,14 +497,20 @@ pf_batch_read( memcpy(XFS_BUF_PTR(bplist[i]), pbuf, size); bplist[i]->b_flags |= LIBXFS_B_UPTODATE; len -= size; - if (bplist[i]->b_flags & B_INODE) + if (B_IS_INODE(XFS_BUF_PRIORITY(bplist[i]))) pf_read_inode_dirs(args, bplist[i]); + else if (which == PF_META_ONLY) + XFS_BUF_SET_PRIORITY(bplist[i], + B_DIR_META_H); + else if (which == PF_PRIMARY && num == 1) + XFS_BUF_SET_PRIORITY(bplist[i], + B_DIR_META_S); } } for (i = 0; i < num; i++) { #ifdef XR_PF_TRACE pftrace("putbuf %c %p (%llu) in AG %d", - bplist[i]->b_flags & B_INODE ? 'I' : 'M', + B_IS_INODE(XFS_BUF_PRIORITY(bplist[i])) ? 'I' : 'M', bplist[i], (long long)XFS_BUF_ADDR(bplist[i]), args->agno); #endif @@ -623,7 +656,9 @@ pf_queuing_worker( do { pf_queue_io(args, XFS_AGB_TO_FSB(mp, args->agno, bno), - blks_per_cluster, B_INODE); + blks_per_cluster, + (cur_irec->ino_isa_dir != 0) ? + B_DIR_INODE : B_INODE); bno += blks_per_cluster; num_inos += inos_per_cluster; } while (num_inos < XFS_IALLOC_INODES(mp)); -- 2.47.2
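A usage illustration (not part of the patch): the sketch below exercises the new priority interface from include/cache.h in the same way repair/prefetch.c does through XFS_BUF_SET_PRIORITY. Nodes enter the lowest-priority MRU when first looked up, and cache_node_set_priority() moves them to a higher MRU so that cache_shake() reclaims them last. The demo_* type and callbacks are hypothetical stand-ins for a real client (libxfs supplies libxfs_bcache_operations in rdwr.c); the include paths, the designated initializers, and linking with -lpthread are assumptions, not something this patch specifies.

/*
 * Minimal sketch of a libxfs cache client using the priority-tagged MRUs.
 * The demo_* names are hypothetical; only the cache_*() calls and
 * CACHE_MAX_PRIORITY come from include/cache.h as modified above.
 */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include "list.h"
#include "cache.h"

struct demo_node {
	struct cache_node	dn_node;	/* first member: the cache casts node pointers */
	unsigned long		dn_key;
};

static unsigned int
demo_hash(cache_key_t key, unsigned int hashsize)
{
	return (unsigned long)key % hashsize;
}

static struct cache_node *
demo_alloc(cache_key_t key)
{
	struct demo_node	*dp = calloc(1, sizeof(*dp));

	if (!dp)
		return NULL;
	dp->dn_key = (unsigned long)key;
	return &dp->dn_node;
}

static void
demo_flush(struct cache_node *node)
{
	/* nothing dirty to write back in this sketch */
}

static void
demo_relse(struct cache_node *node)
{
	free(node);			/* dn_node is the first member */
}

static int
demo_compare(struct cache_node *node, cache_key_t key)
{
	return ((struct demo_node *)node)->dn_key == (unsigned long)key;
}

/* designated initializers used here to avoid relying on field order */
static struct cache_operations demo_cache_operations = {
	.hash		= demo_hash,
	.alloc		= demo_alloc,
	.flush		= demo_flush,
	.relse		= demo_relse,
	.compare	= demo_compare,
	.bulkrelse	= NULL,		/* fall back to cache_generic_bulkrelse */
};

int
main(void)
{
	struct cache		*cache;
	struct cache_node	*node;
	unsigned long		key;

	/* hash bucket count; overall capacity scales with HASH_CACHE_RATIO */
	cache = cache_init(32, &demo_cache_operations);

	for (key = 1; key <= 64; key++) {
		cache_node_get(cache, (cache_key_t)key, &node);
		/*
		 * New nodes land on the priority-0 MRU.  Raising the
		 * priority moves a node to a higher MRU, so cache_shake()
		 * frees it only after the lower MRUs have been drained,
		 * the same idea prefetch.c applies to directory metadata.
		 */
		if ((key % 8) == 0)
			cache_node_set_priority(cache, node,
						CACHE_MAX_PRIORITY);
		cache_node_put(node);
	}

	cache_report(stderr, "demo cache", cache);
	cache_purge(cache);
	cache_destroy(cache);
	return 0;
}

Note the shake policy this relies on: cache_shake() works on one MRU at a time and only advances to the next priority when it could not free a full CACHE_SHAKE_COUNT batch, so buffers tagged with the high B_DIR_* priorities in repair/prefetch.c tend to stay resident across repair phases.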