[thirdparty/git.git] / diff-delta.c

/*
 * diff-delta.c: generate a delta between two buffers
 *
 *  Many parts of this file have been lifted from LibXDiff version 0.10.
 *  http://www.xmailserver.org/xdiff-lib.html
 *
 *  LibXDiff was written by Davide Libenzi <davidel@xmailserver.org>
 *  Copyright (C) 2003	Davide Libenzi
 *
 *  Many mods for GIT usage by Nicolas Pitre <nico@cam.org>, (C) 2005.
 *
 *  This file is free software; you can redistribute it and/or
 *  modify it under the terms of the GNU Lesser General Public
 *  License as published by the Free Software Foundation; either
 *  version 2.1 of the License, or (at your option) any later version.
 *
 *  Use of this within git automatically means that the LGPL
 *  licensing gets turned into GPLv2 within this project.
 */

#include <stdlib.h>
#include <string.h>
#include <zlib.h>
#include "delta.h"


/* block size: min = 16, max = 64k, power of 2 */
#define BLK_SIZE 16

#define MIN(a, b) ((a) < (b) ? (a) : (b))

#define GR_PRIME 0x9e370001
#define HASH(v, shift) (((unsigned int)(v) * GR_PRIME) >> (shift))

struct index {
	const unsigned char *ptr;
	unsigned int val;
	struct index *next;
};

static struct index ** delta_index(const unsigned char *buf,
				   unsigned long bufsize,
				   unsigned long trg_bufsize,
				   unsigned int *hash_shift)
{
	unsigned int i, hsize, hshift, hlimit, entries, *hash_count;
	const unsigned char *data;
	struct index *entry, **hash;
	void *mem;

	/* determine index hash size */
	entries = bufsize  / BLK_SIZE;
	hsize = entries / 4;
	for (i = 4; (1 << i) < hsize && i < 31; i++);
	hsize = 1 << i;
	hshift = 32 - i;
	*hash_shift = hshift;

	/* allocate lookup index */
	mem = malloc(hsize * sizeof(*hash) + entries * sizeof(*entry));
	if (!mem)
		return NULL;
	hash = mem;
	entry = mem + hsize * sizeof(*hash);
	memset(hash, 0, hsize * sizeof(*hash));

	/* allocate an array to count hash entries */
	hash_count = calloc(hsize, sizeof(*hash_count));
	if (!hash_count) {
		free(hash);
		return NULL;
	}

	/* then populate the index */
	data = buf + entries * BLK_SIZE - BLK_SIZE;
	while (data >= buf) {
		unsigned int val = adler32(0, data, BLK_SIZE);
		i = HASH(val, hshift);
		entry->ptr = data;
		entry->val = val;
		entry->next = hash[i];
		hash[i] = entry++;
		hash_count[i]++;
		data -= BLK_SIZE;
 	}

	/*
	 * Determine a limit on the number of entries in the same hash
	 * bucket.  This guard us against patological data sets causing
	 * really bad hash distribution with most entries in the same hash
	 * bucket that would bring us to O(m*n) computing costs (m and n
	 * corresponding to reference and target buffer sizes).
	 *
	 * The more the target buffer is large, the more it is important to
	 * have small entry lists for each hash buckets.  With such a limit
	 * the cost is bounded to something more like O(m+n).
	 */
	hlimit = (1 << 26) / trg_bufsize;
	if (hlimit < 4*BLK_SIZE)
		hlimit = 4*BLK_SIZE;

	/*
	 * Now make sure none of the hash buckets has more entries than
	 * we're willing to test.  Otherwise we cull the entry list
	 * uniformly to still preserve a good repartition across
	 * the reference buffer.
	 */
	for (i = 0; i < hsize; i++) {
		if (hash_count[i] < hlimit)
			continue;
		entry = hash[i];
		do {
			struct index *keep = entry;
			int skip = hash_count[i] / hlimit / 2;
			do {
				entry = entry->next;
			} while(--skip && entry);
			keep->next = entry;
		} while(entry);
	}
	free(hash_count);

	return hash;
}

/* provide the size of the copy opcode given the block offset and size */
#define COPYOP_SIZE(o, s) \
    (!!(o & 0xff) + !!(o & 0xff00) + !!(o & 0xff0000) + !!(o & 0xff000000) + \
     !!(s & 0xff) + !!(s & 0xff00) + 1)

/* the maximum size for any opcode */
#define MAX_OP_SIZE COPYOP_SIZE(0xffffffff, 0xffffffff)

void *diff_delta(void *from_buf, unsigned long from_size,
		 void *to_buf, unsigned long to_size,
		 unsigned long *delta_size,
		 unsigned long max_size)
{
	unsigned int i, outpos, outsize, hash_shift;
	int inscnt;
	const unsigned char *ref_data, *ref_top, *data, *top;
	unsigned char *out;
	struct index *entry, **hash;

	if (!from_size || !to_size)
		return NULL;
	hash = delta_index(from_buf, from_size, to_size, &hash_shift);
	if (!hash)
		return NULL;

	outpos = 0;
	outsize = 8192;
	if (max_size && outsize >= max_size)
		outsize = max_size + MAX_OP_SIZE + 1;
	out = malloc(outsize);
	if (!out) {
		free(hash);
		return NULL;
	}

	ref_data = from_buf;
	ref_top = from_buf + from_size;
	data = to_buf;
	top = to_buf + to_size;

	/* store reference buffer size */
	out[outpos++] = from_size;
	from_size >>= 7;
	while (from_size) {
		out[outpos - 1] |= 0x80;
		out[outpos++] = from_size;
		from_size >>= 7;
	}

	/* store target buffer size */
	out[outpos++] = to_size;
	to_size >>= 7;
	while (to_size) {
		out[outpos - 1] |= 0x80;
		out[outpos++] = to_size;
		to_size >>= 7;
	}

	inscnt = 0;

	while (data < top) {
		unsigned int moff = 0, msize = 0;
		if (data + BLK_SIZE <= top) {
			unsigned int val = adler32(0, data, BLK_SIZE);
			i = HASH(val, hash_shift);
			for (entry = hash[i]; entry; entry = entry->next) {
				const unsigned char *ref = entry->ptr;
				const unsigned char *src = data;
				unsigned int ref_size = ref_top - ref;
				if (entry->val != val)
					continue;
				if (ref_size > top - src)
					ref_size = top - src;
				if (ref_size > 0x10000)
					ref_size = 0x10000;
				if (ref_size <= msize)
					break;
				while (ref_size-- && *src++ == *ref)
					ref++;
				if (msize < ref - entry->ptr) {
					/* this is our best match so far */
					msize = ref - entry->ptr;
					moff = entry->ptr - ref_data;
				}
			}
		}

		if (!msize || msize < COPYOP_SIZE(moff, msize)) {
			if (!inscnt)
				outpos++;
			out[outpos++] = *data++;
			inscnt++;
			if (inscnt == 0x7f) {
				out[outpos - inscnt - 1] = inscnt;
				inscnt = 0;
			}
		} else {
			unsigned char *op;

			if (inscnt) {
				while (moff && ref_data[moff-1] == data[-1]) {
					if (msize == 0x10000)
						break;
					/* we can match one byte back */
					msize++;
					moff--;
					data--;
					outpos--;
					if (--inscnt)
						continue;
					outpos--;  /* remove count slot */
					inscnt--;  /* make it -1 */
					break;
				}
				out[outpos - inscnt - 1] = inscnt;
				inscnt = 0;
			}

			data += msize;
			op = out + outpos++;
			i = 0x80;

			if (moff & 0xff) { out[outpos++] = moff; i |= 0x01; }
			moff >>= 8;
			if (moff & 0xff) { out[outpos++] = moff; i |= 0x02; }
			moff >>= 8;
			if (moff & 0xff) { out[outpos++] = moff; i |= 0x04; }
			moff >>= 8;
			if (moff & 0xff) { out[outpos++] = moff; i |= 0x08; }

			if (msize & 0xff) { out[outpos++] = msize; i |= 0x10; }
			msize >>= 8;
			if (msize & 0xff) { out[outpos++] = msize; i |= 0x20; }

			*op = i;
		}

		if (outpos >= outsize - MAX_OP_SIZE) {
			void *tmp = out;
			outsize = outsize * 3 / 2;
			if (max_size && outsize >= max_size)
				outsize = max_size + MAX_OP_SIZE + 1;
			if (max_size && outpos > max_size)
				out = NULL;
			else
				out = realloc(out, outsize);
			if (!out) {
				free(tmp);
				free(hash);
				return NULL;
			}
		}
	}

	if (inscnt)
		out[outpos - inscnt - 1] = inscnt;

	free(hash);
	*delta_size = outpos;
	return out;
}
Commit	Line	Data
a310d434 NP	1	/*
	2	* diff-delta.c: generate a delta between two buffers
	3	*
	4	* Many parts of this file have been lifted from LibXDiff version 0.10.
	5	* http://www.xmailserver.org/xdiff-lib.html
	6	*
	7	* LibXDiff was written by Davide Libenzi <davidel@xmailserver.org>
	8	* Copyright (C) 2003 Davide Libenzi
	9	*
	10	* Many mods for GIT usage by Nicolas Pitre <nico@cam.org>, (C) 2005.
	11	*
	12	* This file is free software; you can redistribute it and/or
	13	* modify it under the terms of the GNU Lesser General Public
	14	* License as published by the Free Software Foundation; either
	15	* version 2.1 of the License, or (at your option) any later version.
	16	*
	17	* Use of this within git automatically means that the LGPL
	18	* licensing gets turned into GPLv2 within this project.
	19	*/
	20
	21	#include <stdlib.h>
8e1454b5 NP	22	#include <string.h>
8e1454b5 NP	23	#include <zlib.h>
a310d434 NP	24	#include "delta.h"
	25
	26
	27	/* block size: min = 16, max = 64k, power of 2 */
	28	#define BLK_SIZE 16
	29
	30	#define MIN(a, b) ((a) < (b) ? (a) : (b))
	31
	32	#define GR_PRIME 0x9e370001
8e1454b5	33	#define HASH(v, shift) (((unsigned int)(v) * GR_PRIME) >> (shift))
a310d434	34
8e1454b5	35	struct index {
a310d434	36	const unsigned char *ptr;
8e1454b5 NP	37	unsigned int val;
	38	struct index *next;
	39	};
a310d434	40
8e1454b5 NP	41	static struct index ** delta_index(const unsigned char *buf,
8e1454b5 NP	42	unsigned long bufsize,
c13c6bf7	43	unsigned long trg_bufsize,
8e1454b5	44	unsigned int *hash_shift)
a310d434	45	{
c13c6bf7	46	unsigned int i, hsize, hshift, hlimit, entries, *hash_count;
8e1454b5 NP	47	const unsigned char *data;
	48	struct index entry, *hash;
	49	void *mem;
	50
	51	/* determine index hash size */
c13c6bf7	52	entries = bufsize / BLK_SIZE;
8e1454b5	53	hsize = entries / 4;
c13c6bf7	54	for (i = 4; (1 << i) < hsize && i < 31; i++);
8e1454b5 NP	55	hsize = 1 << i;
	56	hshift = 32 - i;
	57	*hash_shift = hshift;
	58
	59	/* allocate lookup index */
	60	mem = malloc(hsize * sizeof(hash) + entries sizeof(*entry));
	61	if (!mem)
	62	return NULL;
	63	hash = mem;
	64	entry = mem + hsize * sizeof(*hash);
	65	memset(hash, 0, hsize * sizeof(*hash));
	66
c13c6bf7 NP	67	/* allocate an array to count hash entries */
	68	hash_count = calloc(hsize, sizeof(*hash_count));
	69	if (!hash_count) {
	70	free(hash);
	71	return NULL;
	72	}
	73
	74	/* then populate the index */
8e1454b5	75	data = buf + entries * BLK_SIZE - BLK_SIZE;
8e1454b5	76	while (data >= buf) {
c13c6bf7	77	unsigned int val = adler32(0, data, BLK_SIZE);
8e1454b5 NP	78	i = HASH(val, hshift);
	79	entry->ptr = data;
	80	entry->val = val;
	81	entry->next = hash[i];
	82	hash[i] = entry++;
c13c6bf7	83	hash_count[i]++;
a310d434	84	data -= BLK_SIZE;
8e1454b5	85	}
a310d434	86
c13c6bf7 NP	87	/*
	88	* Determine a limit on the number of entries in the same hash
	89	* bucket. This guard us against patological data sets causing
	90	* really bad hash distribution with most entries in the same hash
	91	* bucket that would bring us to O(m*n) computing costs (m and n
	92	* corresponding to reference and target buffer sizes).
	93	*
	94	* The more the target buffer is large, the more it is important to
	95	* have small entry lists for each hash buckets. With such a limit
	96	* the cost is bounded to something more like O(m+n).
	97	*/
	98	hlimit = (1 << 26) / trg_bufsize;
	99	if (hlimit < 4*BLK_SIZE)
	100	hlimit = 4*BLK_SIZE;
	101
	102	/*
	103	* Now make sure none of the hash buckets has more entries than
	104	* we're willing to test. Otherwise we cull the entry list
	105	* uniformly to still preserve a good repartition across
	106	* the reference buffer.
	107	*/
	108	for (i = 0; i < hsize; i++) {
	109	if (hash_count[i] < hlimit)
	110	continue;
	111	entry = hash[i];
	112	do {
	113	struct index *keep = entry;
	114	int skip = hash_count[i] / hlimit / 2;
	115	do {
	116	entry = entry->next;
	117	} while(--skip && entry);
	118	keep->next = entry;
	119	} while(entry);
	120	}
	121	free(hash_count);
	122
8e1454b5	123	return hash;
a310d434 NP	124	}
a310d434 NP	125
fe474b58	126	/* provide the size of the copy opcode given the block offset and size */
a310d434 NP	127	#define COPYOP_SIZE(o, s) \
	128	(!!(o & 0xff) + !!(o & 0xff00) + !!(o & 0xff0000) + !!(o & 0xff000000) + \
	129	!!(s & 0xff) + !!(s & 0xff00) + 1)
	130
fe474b58 NP	131	/* the maximum size for any opcode */
	132	#define MAX_OP_SIZE COPYOP_SIZE(0xffffffff, 0xffffffff)
	133
a310d434 NP	134	void diff_delta(void from_buf, unsigned long from_size,
a310d434 NP	135	void *to_buf, unsigned long to_size,
75c42d8c LT	136	unsigned long *delta_size,
75c42d8c LT	137	unsigned long max_size)
a310d434	138	{
5a1fb2ca NP	139	unsigned int i, outpos, outsize, hash_shift;
5a1fb2ca NP	140	int inscnt;
8e1454b5 NP	141	const unsigned char ref_data, ref_top, data, top;
	142	unsigned char *out;
	143	struct index entry, *hash;
a310d434	144
8e1454b5	145	if (!from_size \|\| !to_size)
a310d434	146	return NULL;
c13c6bf7	147	hash = delta_index(from_buf, from_size, to_size, &hash_shift);
8e1454b5 NP	148	if (!hash)
	149	return NULL;
	150
a310d434 NP	151	outpos = 0;
a310d434 NP	152	outsize = 8192;
fe474b58 NP	153	if (max_size && outsize >= max_size)
fe474b58 NP	154	outsize = max_size + MAX_OP_SIZE + 1;
a310d434 NP	155	out = malloc(outsize);
a310d434 NP	156	if (!out) {
8e1454b5	157	free(hash);
a310d434 NP	158	return NULL;
	159	}
	160
e5e3a9d8 NP	161	ref_data = from_buf;
e5e3a9d8 NP	162	ref_top = from_buf + from_size;
a310d434 NP	163	data = to_buf;
	164	top = to_buf + to_size;
	165
	166	/* store reference buffer size */
69a2d426 NP	167	out[outpos++] = from_size;
	168	from_size >>= 7;
	169	while (from_size) {
	170	out[outpos - 1] \|= 0x80;
	171	out[outpos++] = from_size;
	172	from_size >>= 7;
	173	}
a310d434 NP	174
a310d434 NP	175	/* store target buffer size */
69a2d426 NP	176	out[outpos++] = to_size;
	177	to_size >>= 7;
	178	while (to_size) {
	179	out[outpos - 1] \|= 0x80;
	180	out[outpos++] = to_size;
	181	to_size >>= 7;
	182	}
a310d434 NP	183
a310d434 NP	184	inscnt = 0;
a310d434	185
8e1454b5 NP	186	while (data < top) {
8e1454b5 NP	187	unsigned int moff = 0, msize = 0;
c13c6bf7 NP	188	if (data + BLK_SIZE <= top) {
	189	unsigned int val = adler32(0, data, BLK_SIZE);
	190	i = HASH(val, hash_shift);
	191	for (entry = hash[i]; entry; entry = entry->next) {
	192	const unsigned char *ref = entry->ptr;
	193	const unsigned char *src = data;
	194	unsigned int ref_size = ref_top - ref;
	195	if (entry->val != val)
	196	continue;
	197	if (ref_size > top - src)
	198	ref_size = top - src;
	199	if (ref_size > 0x10000)
	200	ref_size = 0x10000;
	201	if (ref_size <= msize)
8e1454b5	202	break;
c13c6bf7 NP	203	while (ref_size-- && src++ == ref)
	204	ref++;
	205	if (msize < ref - entry->ptr) {
	206	/* this is our best match so far */
	207	msize = ref - entry->ptr;
	208	moff = entry->ptr - ref_data;
a310d434 NP	209	}
	210	}
	211	}
	212
	213	if (!msize \|\| msize < COPYOP_SIZE(moff, msize)) {
	214	if (!inscnt)
	215	outpos++;
	216	out[outpos++] = *data++;
	217	inscnt++;
	218	if (inscnt == 0x7f) {
	219	out[outpos - inscnt - 1] = inscnt;
	220	inscnt = 0;
	221	}
	222	} else {
8e1454b5 NP	223	unsigned char *op;
8e1454b5 NP	224
a310d434	225	if (inscnt) {
5a1fb2ca NP	226	while (moff && ref_data[moff-1] == data[-1]) {
	227	if (msize == 0x10000)
	228	break;
	229	/* we can match one byte back */
	230	msize++;
	231	moff--;
	232	data--;
	233	outpos--;
	234	if (--inscnt)
	235	continue;
	236	outpos--; /* remove count slot */
	237	inscnt--; /* make it -1 */
	238	break;
	239	}
a310d434 NP	240	out[outpos - inscnt - 1] = inscnt;
	241	inscnt = 0;
	242	}
	243
	244	data += msize;
8e1454b5	245	op = out + outpos++;
a310d434 NP	246	i = 0x80;
	247
	248	if (moff & 0xff) { out[outpos++] = moff; i \|= 0x01; }
	249	moff >>= 8;
	250	if (moff & 0xff) { out[outpos++] = moff; i \|= 0x02; }
	251	moff >>= 8;
	252	if (moff & 0xff) { out[outpos++] = moff; i \|= 0x04; }
	253	moff >>= 8;
	254	if (moff & 0xff) { out[outpos++] = moff; i \|= 0x08; }
	255
	256	if (msize & 0xff) { out[outpos++] = msize; i \|= 0x10; }
	257	msize >>= 8;
	258	if (msize & 0xff) { out[outpos++] = msize; i \|= 0x20; }
	259
8e1454b5	260	*op = i;
a310d434 NP	261	}
a310d434 NP	262
fe474b58	263	if (outpos >= outsize - MAX_OP_SIZE) {
a310d434 NP	264	void *tmp = out;
a310d434 NP	265	outsize = outsize * 3 / 2;
fe474b58 NP	266	if (max_size && outsize >= max_size)
	267	outsize = max_size + MAX_OP_SIZE + 1;
	268	if (max_size && outpos > max_size)
	269	out = NULL;
	270	else
	271	out = realloc(out, outsize);
a310d434 NP	272	if (!out) {
a310d434 NP	273	free(tmp);
8e1454b5	274	free(hash);
a310d434 NP	275	return NULL;
	276	}
	277	}
	278	}
	279
	280	if (inscnt)
	281	out[outpos - inscnt - 1] = inscnt;
	282
8e1454b5	283	free(hash);
a310d434 NP	284	*delta_size = outpos;
	285	return out;
	286	}