From ed609d9554227ccc3969b704273c8c1da29eab43 Mon Sep 17 00:00:00 2001 From: Alberto Ruiz Date: Wed, 18 Mar 2026 07:23:09 +0000 Subject: [PATCH] Add hash table to dirpool for O(1) directory lookup Profiling repo2solv with perf on a full primary+filelists workload (Fedora 42, ~75K packages) revealed that dirpool_add_dir consumed 48.6% of all cycles and caused 92% of last-level cache misses. The root cause: directories sharing a parent are stored across multiple non-contiguous blocks in the dirs[] array, linked through the dirtraverse[] auxiliary array. Looking up whether a (parent, component) pair already exists required scanning every block for that parent -- O(B*C) where B is the number of blocks and C the average block size. Popular parents like /usr accumulate hundreds of blocks from filelists data, causing cache-hostile pointer chasing across scattered memory. Replace the dirtraverse-based lookup with an open-addressed hash table mapping (parent, comp) pairs to directory ids. The hash is computed via relhash() (same scheme used for reldeps), and parent is verified by a short backward walk to the block header -- always within a few entries of the matched slot, so it stays in the same cache line. The hash table is built on demand and resized at 50% load factor. The dirtraverse array is kept for dirpool_child/dirpool_sibling traversal APIs but is no longer needed for the add_dir fast path. Benchmark results (Fedora 42 metadata, 5 runs, median wall-clock): primary + filelists: 10,256 ms vs 20,028 ms baseline (49% faster) primary only: 2,629 ms vs 2,546 ms baseline (within noise) perf profile after the change shows dirpool_add_dir dropped from 48.6% to 1.78% of cycles. Output is byte-identical to baseline. --- src/dirpool.c | 85 ++++++++++++++++++++++++++++++++++++++++----------- src/dirpool.h | 6 ++++ 2 files changed, 74 insertions(+), 17 deletions(-) diff --git a/src/dirpool.c b/src/dirpool.c index bed9435e..42ae5e46 100644 --- a/src/dirpool.c +++ b/src/dirpool.c @@ -75,6 +75,7 @@ dirpool_free(Dirpool *dp) { solv_free(dp->dirs); solv_free(dp->dirtraverse); + solv_free(dp->dirhashtbl); } void @@ -96,10 +97,44 @@ dirpool_make_dirtraverse(Dirpool *dp) dp->dirtraverse = dirtraverse; } +/* Build or rebuild the (parent, comp) -> dirid hash table. + * This replaces the old O(B*C) dirtraverse linked-list scan + * with O(1) amortized lookup in dirpool_add_dir. */ +static void +dirpool_resize_hash(Dirpool *dp, int numnew) +{ + Hashval hashmask, h, hh; + Hashtable ht; + Id i, d, parent; + + hashmask = mkmask(dp->ndirs + numnew); + if (dp->dirhashtbl && hashmask <= dp->dirhashmask) + return; + dp->dirhashmask = hashmask; + solv_free(dp->dirhashtbl); + ht = dp->dirhashtbl = (Hashtable)solv_calloc(hashmask + 1, sizeof(Id)); + for (i = 2; i < dp->ndirs; i++) + { + if (dp->dirs[i] <= 0) + continue; + /* walk back to block header to find parent */ + for (d = i; dp->dirs[--d] > 0; ) + ; + parent = -dp->dirs[d]; + h = relhash(parent, dp->dirs[i], 0) & hashmask; + hh = HASHCHAIN_START; + while (ht[h]) + h = HASHCHAIN_NEXT(h, hh, hashmask); + ht[h] = i; + } +} + Id dirpool_add_dir(Dirpool *dp, Id parent, Id comp, int create) { - Id did, d, ds; + Id did, d; + Hashval h, hh, hashmask; + Hashtable ht; if (!dp->ndirs) { @@ -114,28 +149,39 @@ dirpool_add_dir(Dirpool *dp, Id parent, Id comp, int create) return 0; if (parent == 0 && comp == 1) return 1; - if (!dp->dirtraverse) - dirpool_make_dirtraverse(dp); - /* check all entries with this parent if we - * already have this component */ - ds = dp->dirtraverse[parent]; - while (ds) + + /* grow hash table if load factor exceeds 50% */ + if ((Hashval)dp->ndirs * 2 > dp->dirhashmask) + dirpool_resize_hash(dp, DIR_BLOCK); + + ht = dp->dirhashtbl; + hashmask = dp->dirhashmask; + + /* probe for existing (parent, comp) entry */ + h = relhash(parent, comp, 0) & hashmask; + hh = HASHCHAIN_START; + while ((did = ht[h]) != 0) { - /* ds: first component in this block - * ds-1: parent link */ - for (d = ds--; d < dp->ndirs; d++) + if (dp->dirs[did] == comp) { - if (dp->dirs[d] == comp) - return d; - if (dp->dirs[d] <= 0) /* reached end of this block */ - break; + /* comp matches, verify parent by walking back to + * the block header (short sequential scan) */ + d = did; + while (dp->dirs[--d] > 0) + ; + if (-dp->dirs[d] == parent) + return did; } - if (ds) - ds = dp->dirtraverse[ds]; + h = HASHCHAIN_NEXT(h, hh, hashmask); } + if (!create) return 0; - /* a new one, find last parent */ + + if (!dp->dirtraverse) + dirpool_make_dirtraverse(dp); + + /* find last parent block */ for (did = dp->ndirs - 1; did > 0; did--) if (dp->dirs[did] <= 0) break; @@ -154,5 +200,10 @@ dirpool_add_dir(Dirpool *dp, Id parent, Id comp, int create) dp->dirtraverse = solv_extend(dp->dirtraverse, dp->ndirs, 1, sizeof(Id), DIR_BLOCK); dp->dirs[dp->ndirs] = comp; dp->dirtraverse[dp->ndirs] = 0; + + /* insert new entry into hash table (h still points at + * the empty slot from the failed probe above) */ + ht[h] = dp->ndirs; + return dp->ndirs++; } diff --git a/src/dirpool.h b/src/dirpool.h index ca929548..cd8c311b 100644 --- a/src/dirpool.h +++ b/src/dirpool.h @@ -9,6 +9,7 @@ #include "pooltypes.h" +#include "hash.h" #include "util.h" #ifdef __cplusplus @@ -19,6 +20,8 @@ typedef struct s_Dirpool { Id *dirs; int ndirs; Id *dirtraverse; + Hashtable dirhashtbl; /* (parent, comp) -> dirid hash, built on demand */ + Hashval dirhashmask; } Dirpool; void dirpool_init(Dirpool *dp); @@ -70,6 +73,9 @@ dirpool_free_dirtraverse(Dirpool *dp) { solv_free(dp->dirtraverse); dp->dirtraverse = 0; + solv_free(dp->dirhashtbl); + dp->dirhashtbl = 0; + dp->dirhashmask = 0; } static inline Id -- 2.47.3