From: Michael Schroeder Date: Mon, 22 Jul 2024 11:47:47 +0000 (+0200) Subject: Refactor string pool merging in repo_solv X-Git-Tag: 0.7.31~7 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=8e390d8f8efd89bd5707fcc5cc32dd089b439a79;p=thirdparty%2Flibsolv.git Refactor string pool merging in repo_solv Move merging code into strpool.c where it belongs. Also clean up hashmask handling a bit. --- diff --git a/src/poolid.c b/src/poolid.c index 52d98c38..c73d5b44 100644 --- a/src/poolid.c +++ b/src/poolid.c @@ -92,7 +92,7 @@ pool_rel2id(Pool *pool, Id name, Id evr, int flags, int create) /* extend hashtable if needed */ hashmask = pool->relhashmask; - if ((Hashval)pool->nrels * 2 > hashmask) + if ((Hashval)pool->nrels * 2 >= hashmask) { pool_resize_rels_hash(pool, REL_BLOCK); hashmask = pool->relhashmask; diff --git a/src/repo_solv.c b/src/repo_solv.c index 629ac683..d98c973f 100644 --- a/src/repo_solv.c +++ b/src/repo_solv.c @@ -546,9 +546,7 @@ repo_add_solv(Repo *repo, FILE *fp, int flags) int numkeys, numschemata; Offset sizeid; - Offset *str; /* map Id -> Offset into string space */ char *strsp; /* repo string space */ - char *sp; /* pointer into string space */ Id *idmap; /* map of repo Ids to pool Ids */ Id id, type; Hashval hashmask, h, hh; @@ -668,6 +666,8 @@ repo_add_solv(Repo *repo, FILE *fp, int flags) /******* Part 1: string IDs *****************************************/ sizeid = read_u32(&data); /* size of string space */ + if (sizeid >= 0xf0000000) + return pool_error(pool, SOLV_ERROR_CORRUPT, "bad string size"); /* * read strings and Ids @@ -682,22 +682,16 @@ repo_add_solv(Repo *repo, FILE *fp, int flags) if (!(flags & REPO_LOCALPOOL)) { spool = &pool->ss; - /* alloc max needed string buffer and string pointers, will shrink again later */ -#if 0 - spool->stringspace = solv_realloc(spool->stringspace, spool->sstrings + sizeid + 1); - spool->strings = solv_realloc2(spool->strings, spool->nstrings + numid, sizeof(Offset)); -#else - spool->sstrings += sizeid + 1; - spool->nstrings += numid; - stringpool_shrink(spool); /* we misuse stringpool_shrink so that the correct BLOCK factor is used */ - spool->sstrings -= sizeid + 1; - spool->nstrings -= numid; -#endif + /* reserve max needed string buffer and offsets, will shrink again later */ + /* we use sizeid + 1 because we add a terminating zero */ + stringpool_reserve(spool, numid, sizeid + 1); } else { data.localpool = 1; spool = &data.spool; + /* the local pool is unlikely to get changed, so we do not use blocking and + * do not create a hash (see comment in stringpool_strn2id()) */ spool->stringspace = solv_malloc(7 + sizeid + 1); spool->strings = solv_malloc2(numid < 2 ? 2 : numid, sizeof(Offset)); strcpy(spool->stringspace, ""); @@ -761,12 +755,12 @@ repo_add_solv(Repo *repo, FILE *fp, int flags) } } strsp[sizeid] = 0; /* make string space \0 terminated */ - sp = strsp; /* now merge */ - str = spool->strings; /* array of offsets into strsp, indexed by Id */ if ((flags & REPO_LOCALPOOL) != 0) { + Offset *str = spool->strings; /* array of offsets into strsp, indexed by Id */ + char *sp = strsp; /* no shared pool, thus no idmap and no unification needed */ idmap = 0; spool->nstrings = numid < 2 ? 2 : numid; /* make sure we have at least id 0 and 1 */ @@ -790,69 +784,15 @@ repo_add_solv(Repo *repo, FILE *fp, int flags) } else { - Offset oldsstrings = spool->sstrings; - - /* alloc id map for name and rel Ids. this maps ids in the solv files + /* alloc id map for name and rel Ids. this maps ids in the solv file * to the ids in our pool */ idmap = solv_calloc(numid + numrel, sizeof(Id)); - stringpool_resize_hash(spool, numid); - hashtbl = spool->stringhashtbl; - hashmask = spool->stringhashmask; -#if 0 - POOL_DEBUG(SOLV_DEBUG_STATS, "read %d strings\n", numid); - POOL_DEBUG(SOLV_DEBUG_STATS, "string hash buckets: %d\n", hashmask + 1); -#endif - /* - * run over strings and merge with pool. - * we could use stringpool_str2id, but this is faster. - * also populate id map (maps solv Id -> pool Id) - */ - for (i = 1; i < numid; i++) + if (!stringpool_integrate(spool, numid, sizeid, idmap)) { - if (sp >= strsp + sizeid) - { - solv_free(idmap); - spool->nstrings = oldnstrings; - spool->sstrings = oldsstrings; - stringpool_freehash(spool); - repodata_freedata(&data); - return pool_error(pool, SOLV_ERROR_OVERFLOW, "not enough strings %d %d", i, numid); - } - if (!*sp) /* empty string */ - { - idmap[i] = ID_EMPTY; - sp++; - continue; - } - - /* find hash slot */ - h = strhash(sp) & hashmask; - hh = HASHCHAIN_START; - for (;;) - { - id = hashtbl[h]; - if (!id) - break; - if (!strcmp(spool->stringspace + spool->strings[id], sp)) - break; /* already in pool */ - h = HASHCHAIN_NEXT(h, hh, hashmask); - } - - /* length == offset to next string */ - l = strlen(sp) + 1; - if (!id) /* end of hash chain -> new string */ - { - id = spool->nstrings++; - hashtbl[h] = id; - str[id] = spool->sstrings; /* save offset */ - if (sp != spool->stringspace + spool->sstrings) - memmove(spool->stringspace + spool->sstrings, sp, l); - spool->sstrings += l; - } - idmap[i] = id; /* repo relative -> pool relative */ - sp += l; /* next string */ + solv_free(idmap); + repodata_freedata(&data); + return pool_error(pool, SOLV_ERROR_OVERFLOW, "not enough strings"); } - stringpool_shrink(spool); /* vacuum */ } diff --git a/src/strpool.c b/src/strpool.c index b4a09a51..0b95f0f7 100644 --- a/src/strpool.c +++ b/src/strpool.c @@ -72,7 +72,7 @@ stringpool_clone(Stringpool *ss, Stringpool *from) ss->sstrings = from->sstrings; } -void +static void stringpool_resize_hash(Stringpool *ss, int numnew) { Hashval h, hh, hashmask; @@ -104,7 +104,7 @@ stringpool_resize_hash(Stringpool *ss, int numnew) Id stringpool_strn2id(Stringpool *ss, const char *str, unsigned int len, int create) { - Hashval h, hh, hashmask, oldhashmask; + Hashval h, hh, hashmask; Id id; Hashtable hashtbl; @@ -113,10 +113,17 @@ stringpool_strn2id(Stringpool *ss, const char *str, unsigned int len, int create if (!len) return STRID_EMPTY; - hashmask = oldhashmask = ss->stringhashmask; + hashmask = ss->stringhashmask; + /* expand hashtable if needed */ - if ((Hashval)ss->nstrings * 2 > hashmask) + if ((Hashval)ss->nstrings * 2 >= hashmask) { + /* this should be a test for a flag that tells us if the + * correct blocking is used, but adding a flag would break + * the ABI. So we use the existance of the hash area as + * indication instead */ + if (!hashmask) + stringpool_reserve(ss, 1, len + 1); stringpool_resize_hash(ss, STRING_BLOCK); hashmask = ss->stringhashmask; } @@ -135,16 +142,6 @@ stringpool_strn2id(Stringpool *ss, const char *str, unsigned int len, int create if (id || !create) /* exit here if string found */ return id; - /* this should be a test for a flag that tells us if the - * correct blocking is used, but adding a flag would break - * the ABI. So we use the existance of the hash area as - * indication instead */ - if (!oldhashmask) - { - ss->stringspace = solv_extend_resize(ss->stringspace, ss->sstrings + len + 1, 1, STRINGSPACE_BLOCK); - ss->strings = solv_extend_resize(ss->strings, ss->nstrings + 1, sizeof(Offset), STRING_BLOCK); - } - /* generate next id and save in table */ id = ss->nstrings++; hashtbl[h] = id; @@ -176,3 +173,82 @@ stringpool_shrink(Stringpool *ss) ss->stringspace = solv_extend_resize(ss->stringspace, ss->sstrings, 1, STRINGSPACE_BLOCK); ss->strings = solv_extend_resize(ss->strings, ss->nstrings, sizeof(Offset), STRING_BLOCK); } + +void +stringpool_reserve(Stringpool *ss, int numid, Offset sizeid) +{ + ss->stringspace = solv_extend_resize(ss->stringspace, ss->sstrings + sizeid, 1, STRINGSPACE_BLOCK); + ss->strings = solv_extend_resize(ss->strings, ss->nstrings + numid, sizeof(Offset), STRING_BLOCK); +} + +int +stringpool_integrate(Stringpool *ss, int numid, Offset sizeid, Id *idmap) +{ + int oldnstrings = ss->nstrings; + Offset oldsstrings = ss->sstrings; + Offset *str; + Id id; + int i, l; + char *strsp, *sp; + Hashval hashmask, h, hh; + Hashtable hashtbl; + + stringpool_resize_hash(ss, numid); + hashtbl = ss->stringhashtbl; + hashmask = ss->stringhashmask; + + /* + * run over new strings and merge with pool. + * we could use stringpool_str2id, but this is faster. + * also populate id map (maps solv Id -> pool Id) + */ + str = ss->strings; + sp = strsp = ss->stringspace + ss->sstrings; + for (i = 1; i < numid; i++) + { + if (sp >= strsp + sizeid) + { + ss->nstrings = oldnstrings; + ss->sstrings = oldsstrings; + stringpool_freehash(ss); + stringpool_shrink(ss); /* vacuum */ + return 0; + } + if (!*sp) /* shortcut for empty strings */ + { + idmap[i] = STRID_EMPTY; + sp++; + continue; + } + + /* find hash slot */ + h = strhash(sp) & hashmask; + hh = HASHCHAIN_START; + for (;;) + { + id = hashtbl[h]; + if (!id) + break; + if (!strcmp(ss->stringspace + ss->strings[id], sp)) + break; /* already in pool */ + h = HASHCHAIN_NEXT(h, hh, hashmask); + } + + /* length == offset to next string */ + l = strlen(sp) + 1; + if (!id) /* end of hash chain -> new string */ + { + id = ss->nstrings++; + hashtbl[h] = id; + str[id] = ss->sstrings; /* save offset */ + if (sp != ss->stringspace + ss->sstrings) + memmove(ss->stringspace + ss->sstrings, sp, l); + ss->sstrings += l; + } + idmap[i] = id; /* repo relative -> pool relative */ + sp += l; /* next string */ + } + stringpool_shrink(ss); /* vacuum */ + return 1; +} + diff --git a/src/strpool.h b/src/strpool.h index fd13bdb7..b8182b23 100644 --- a/src/strpool.h +++ b/src/strpool.h @@ -33,13 +33,15 @@ void stringpool_init_empty(Stringpool *ss); void stringpool_clone(Stringpool *ss, Stringpool *from); void stringpool_free(Stringpool *ss); void stringpool_freehash(Stringpool *ss); -void stringpool_resize_hash(Stringpool *ss, int numnew); Id stringpool_str2id(Stringpool *ss, const char *str, int create); Id stringpool_strn2id(Stringpool *ss, const char *str, unsigned int len, int create); void stringpool_shrink(Stringpool *ss); +void stringpool_reserve(Stringpool *ss, int numid, Offset sizeid); +int stringpool_integrate(Stringpool *ss, int numid, Offset sizeid, Id *idmap); + static inline const char * stringpool_id2str(Stringpool *ss, Id id)