From: Andrew Gregory Date: Tue, 1 Nov 2016 16:42:14 +0000 (-0400) Subject: mtree: use hash table for duplicate entry search X-Git-Tag: v3.3.0~108^2 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=refs%2Fpull%2F819%2Fhead;p=thirdparty%2Flibarchive.git mtree: use hash table for duplicate entry search Because mtree combines multiple non-adjacent entries for the same file, parse_file() has to compare every entry to every other entry. For large mtree files, this results in extremely long parse times. By breaking the entries down into a hash table, the number of comparisons required can be drastically reduced. The hash function was taken directly from the standard user/group lookup code. --- diff --git a/libarchive/archive_read_support_format_mtree.c b/libarchive/archive_read_support_format_mtree.c index ae58e8750..85c655fbb 100644 --- a/libarchive/archive_read_support_format_mtree.c +++ b/libarchive/archive_read_support_format_mtree.c @@ -75,6 +75,8 @@ __FBSDID("$FreeBSD: head/lib/libarchive/archive_read_support_format_mtree.c 2011 #define MTREE_HAS_OPTIONAL 0x0800 #define MTREE_HAS_NOCHANGE 0x1000 /* FreeBSD specific */ +#define MTREE_HASHTABLE_SIZE 1024 + struct mtree_option { struct mtree_option *next; char *value; @@ -86,6 +88,8 @@ struct mtree_entry { char *name; char full; char used; + unsigned int name_hash; + struct mtree_entry *hashtable_next; }; struct mtree { @@ -98,6 +102,7 @@ struct mtree { const char *archive_format_name; struct mtree_entry *entries; struct mtree_entry *this_entry; + struct mtree_entry *entry_hashtable[MTREE_HASHTABLE_SIZE]; struct archive_string current_dir; struct archive_string contents_name; @@ -110,6 +115,7 @@ struct mtree { static int bid_keycmp(const char *, const char *, ssize_t); static int cleanup(struct archive_read *); static int detect_form(struct archive_read *, int *); +static unsigned int hash(const char *); static int mtree_bid(struct archive_read *, int); static int parse_file(struct archive_read *, struct archive_entry *, struct mtree *, struct mtree_entry *, int *); @@ -862,11 +868,12 @@ process_add_entry(struct archive_read *a, struct mtree *mtree, struct mtree_option **global, const char *line, ssize_t line_len, struct mtree_entry **last_entry, int is_form_d) { - struct mtree_entry *entry; + struct mtree_entry *entry, *ht_iter; struct mtree_option *iter; const char *next, *eq, *name, *end; size_t name_len, len; int r, i; + unsigned int ht_idx; if ((entry = malloc(sizeof(*entry))) == NULL) { archive_set_error(&a->archive, errno, "Can't allocate memory"); @@ -877,6 +884,8 @@ process_add_entry(struct archive_read *a, struct mtree *mtree, entry->name = NULL; entry->used = 0; entry->full = 0; + entry->name_hash = 0; + entry->hashtable_next = NULL; /* Add this entry to list. */ if (*last_entry == NULL) @@ -929,6 +938,16 @@ process_add_entry(struct archive_read *a, struct mtree *mtree, memcpy(entry->name, name, name_len); entry->name[name_len] = '\0'; parse_escapes(entry->name, entry); + entry->name_hash = hash(entry->name); + + ht_idx = entry->name_hash % MTREE_HASHTABLE_SIZE; + if ((ht_iter = mtree->entry_hashtable[ht_idx]) != NULL) { + while (ht_iter->hashtable_next) + ht_iter = ht_iter->hashtable_next; + ht_iter->hashtable_next = entry; + } else { + mtree->entry_hashtable[ht_idx] = entry; + } for (iter = *global; iter != NULL; iter = iter->next) { r = add_option(a, &entry->options, iter->value, @@ -1122,9 +1141,10 @@ parse_file(struct archive_read *a, struct archive_entry *entry, * with pathname canonicalization, which is a very * tricky subject.) */ - for (mp = mentry->next; mp != NULL; mp = mp->next) { + for (mp = mentry->hashtable_next; mp != NULL; mp = mp->hashtable_next) { if (mp->full && !mp->used - && strcmp(mentry->name, mp->name) == 0) { + && mentry->name_hash == mp->name_hash + && strcmp(mentry->name, mp->name) == 0) { /* Later lines override earlier ones. */ mp->used = 1; r1 = parse_line(a, entry, mtree, mp, @@ -2000,3 +2020,19 @@ readline(struct archive_read *a, struct mtree *mtree, char **start, find_off = u - mtree->line.s; } } + +static unsigned int +hash(const char *p) +{ + /* A 32-bit version of Peter Weinberger's (PJW) hash algorithm, + as used by ELF for hashing function names. */ + unsigned g, h = 0; + while (*p != '\0') { + h = (h << 4) + *p++; + if ((g = h & 0xF0000000) != 0) { + h ^= g >> 24; + h &= 0x0FFFFFFF; + } + } + return h; +}