From: Andrew Gregory <andrew.gregory.8@gmail.com>
Date: Tue, 1 Nov 2016 16:42:14 +0000 (-0400)
Subject: mtree: use hash table for duplicate entry search
X-Git-Tag: v3.3.0~108^2
X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=refs%2Fpull%2F819%2Fhead;p=thirdparty%2Flibarchive.git

mtree: use hash table for duplicate entry search

Because mtree combines multiple non-adjacent entries for the same file,
parse_file() has to compare every entry to every other entry.  For large
mtree files, this results in extremely long parse times.  By breaking
the entries down into a hash table, the number of comparisons required
can be drastically reduced.  The hash function was taken directly from
the standard user/group lookup code.
---

diff --git a/libarchive/archive_read_support_format_mtree.c b/libarchive/archive_read_support_format_mtree.c
index ae58e8750..85c655fbb 100644
--- a/libarchive/archive_read_support_format_mtree.c
+++ b/libarchive/archive_read_support_format_mtree.c
@@ -75,6 +75,8 @@ __FBSDID("$FreeBSD: head/lib/libarchive/archive_read_support_format_mtree.c 2011
 #define	MTREE_HAS_OPTIONAL	0x0800
 #define	MTREE_HAS_NOCHANGE	0x1000 /* FreeBSD specific */
 
+#define	MTREE_HASHTABLE_SIZE 1024
+
 struct mtree_option {
 	struct mtree_option *next;
 	char *value;
@@ -86,6 +88,8 @@ struct mtree_entry {
 	char *name;
 	char full;
 	char used;
+	unsigned int name_hash;
+	struct mtree_entry *hashtable_next;
 };
 
 struct mtree {
@@ -98,6 +102,7 @@ struct mtree {
 	const char		*archive_format_name;
 	struct mtree_entry	*entries;
 	struct mtree_entry	*this_entry;
+	struct mtree_entry	*entry_hashtable[MTREE_HASHTABLE_SIZE];
 	struct archive_string	 current_dir;
 	struct archive_string	 contents_name;
 
@@ -110,6 +115,7 @@ struct mtree {
 static int	bid_keycmp(const char *, const char *, ssize_t);
 static int	cleanup(struct archive_read *);
 static int	detect_form(struct archive_read *, int *);
+static unsigned int	hash(const char *);
 static int	mtree_bid(struct archive_read *, int);
 static int	parse_file(struct archive_read *, struct archive_entry *,
 		    struct mtree *, struct mtree_entry *, int *);
@@ -862,11 +868,12 @@ process_add_entry(struct archive_read *a, struct mtree *mtree,
     struct mtree_option **global, const char *line, ssize_t line_len,
     struct mtree_entry **last_entry, int is_form_d)
 {
-	struct mtree_entry *entry;
+	struct mtree_entry *entry, *ht_iter;
 	struct mtree_option *iter;
 	const char *next, *eq, *name, *end;
 	size_t name_len, len;
 	int r, i;
+	unsigned int ht_idx;
 
 	if ((entry = malloc(sizeof(*entry))) == NULL) {
 		archive_set_error(&a->archive, errno, "Can't allocate memory");
@@ -877,6 +884,8 @@ process_add_entry(struct archive_read *a, struct mtree *mtree,
 	entry->name = NULL;
 	entry->used = 0;
 	entry->full = 0;
+	entry->name_hash = 0;
+	entry->hashtable_next = NULL;
 
 	/* Add this entry to list. */
 	if (*last_entry == NULL)
@@ -929,6 +938,16 @@ process_add_entry(struct archive_read *a, struct mtree *mtree,
 	memcpy(entry->name, name, name_len);
 	entry->name[name_len] = '\0';
 	parse_escapes(entry->name, entry);
+	entry->name_hash = hash(entry->name);
+
+	ht_idx = entry->name_hash % MTREE_HASHTABLE_SIZE;
+	if ((ht_iter = mtree->entry_hashtable[ht_idx]) != NULL) {
+		while (ht_iter->hashtable_next)
+			ht_iter = ht_iter->hashtable_next;
+		ht_iter->hashtable_next = entry;
+	} else {
+		mtree->entry_hashtable[ht_idx] = entry;
+	}
 
 	for (iter = *global; iter != NULL; iter = iter->next) {
 		r = add_option(a, &entry->options, iter->value,
@@ -1122,9 +1141,10 @@ parse_file(struct archive_read *a, struct archive_entry *entry,
 		 * with pathname canonicalization, which is a very
 		 * tricky subject.)
 		 */
-		for (mp = mentry->next; mp != NULL; mp = mp->next) {
+		for (mp = mentry->hashtable_next; mp != NULL; mp = mp->hashtable_next) {
 			if (mp->full && !mp->used
-			    && strcmp(mentry->name, mp->name) == 0) {
+					&& mentry->name_hash == mp->name_hash
+					&& strcmp(mentry->name, mp->name) == 0) {
 				/* Later lines override earlier ones. */
 				mp->used = 1;
 				r1 = parse_line(a, entry, mtree, mp,
@@ -2000,3 +2020,19 @@ readline(struct archive_read *a, struct mtree *mtree, char **start,
 		find_off = u - mtree->line.s;
 	}
 }
+
+static unsigned int
+hash(const char *p)
+{
+	/* A 32-bit version of Peter Weinberger's (PJW) hash algorithm,
+	   as used by ELF for hashing function names. */
+	unsigned g, h = 0;
+	while (*p != '\0') {
+		h = (h << 4) + *p++;
+		if ((g = h & 0xF0000000) != 0) {
+			h ^= g >> 24;
+			h &= 0x0FFFFFFF;
+		}
+	}
+	return h;
+}