From: drh Date: Tue, 19 Feb 2019 18:39:16 +0000 (+0000) Subject: Add the fossildelta.c extension in ext/misc with implementations of the Fossil X-Git-Tag: version-3.28.0~174 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=4bec44bdfa1fea3374a8e262b6a8da6fc79f08b8;p=thirdparty%2Fsqlite.git Add the fossildelta.c extension in ext/misc with implementations of the Fossil delta functions. FossilOrigin-Name: b80cafa6f8a5c6ff1dc9efd2f670777ab131ace2df1eb431cedc8cfa901baf18 --- diff --git a/ext/misc/fossildelta.c b/ext/misc/fossildelta.c new file mode 100644 index 0000000000..3d845faa98 --- /dev/null +++ b/ext/misc/fossildelta.c @@ -0,0 +1,791 @@ +/* +** 2019-02-19 +** +** The author disclaims copyright to this source code. In place of +** a legal notice, here is a blessing: +** +** May you do good and not evil. +** May you find forgiveness for yourself and forgive others. +** May you share freely, never taking more than you give. +** +****************************************************************************** +** +** This SQLite extension implements the delta functions used by Fossil. +*/ +#include <string.h> +#include <assert.h> +#include <stdlib.h> +#include "sqlite3ext.h" +SQLITE_EXTENSION_INIT1 + +/* +** The "u32" type must be an unsigned 32-bit integer. Adjust this typedef on platforms where "unsigned int" is not 32 bits wide. +*/ +typedef unsigned int u32; + +/* +** Must be a 16-bit value +*/ +typedef short int s16; +typedef unsigned short int u16; + + +/* +** The width of a hash window in bytes. The algorithm only works if this +** is a power of 2. +*/ +#define NHASH 16 + +/* +** The current state of the rolling hash. +** +** z[] holds the values that have been hashed. z[] is a circular buffer. +** z[i] is the first entry and z[(i+NHASH-1)%NHASH] is the last entry of +** the window. +** +** Hash.a is the sum of all elements of hash.z[]. Hash.b is a weighted +** sum. Hash.b is z[i]*NHASH + z[i+1]*(NHASH-1) + ... + z[i+NHASH-1]*1. +** (Each index for z[] should be modulo NHASH, of course. 
The %NHASH operator +** is omitted in the prior expression for brevity.) +*/ +typedef struct hash hash; +struct hash { + u16 a, b; /* Hash values */ + u16 i; /* Start of the hash window */ + char z[NHASH]; /* The values that have been hashed */ +}; + +/* +** Initialize the rolling hash using the first NHASH characters of z[] +*/ +static void hash_init(hash *pHash, const char *z){ + u16 a, b, i; + a = b = z[0]; + for(i=1; iz, z, NHASH); + pHash->a = a & 0xffff; + pHash->b = b & 0xffff; + pHash->i = 0; +} + +/* +** Advance the rolling hash by a single character "c" +*/ +static void hash_next(hash *pHash, int c){ + u16 old = pHash->z[pHash->i]; + pHash->z[pHash->i] = c; + pHash->i = (pHash->i+1)&(NHASH-1); + pHash->a = pHash->a - old + c; + pHash->b = pHash->b - NHASH*old + pHash->a; +} + +/* +** Return a 32-bit hash value +*/ +static u32 hash_32bit(hash *pHash){ + return (pHash->a & 0xffff) | (((u32)(pHash->b & 0xffff))<<16); +} + +/* +** Compute a hash on NHASH bytes. +** +** This routine is intended to be equivalent to: +** hash h; +** hash_init(&h, zInput); +** return hash_32bit(&h); +*/ +static u32 hash_once(const char *z){ + u16 a, b, i; + a = b = z[0]; + for(i=1; i0; i++, v>>=6){ + zBuf[i] = zDigits[v&0x3f]; + } + for(j=i-1; j>=0; j--){ + *(*pz)++ = zBuf[j]; + } +} + +/* +** Read bytes from *pz and convert them into a positive integer. When +** finished, leave *pz pointing to the first character past the end of +** the integer. The *pLen parameter holds the length of the string +** in *pz and is decremented once for each character in the integer. 
/*
** Read bytes from *pz and convert them into a positive integer.  When
** finished, leave *pz pointing to the first character past the end of
** the integer.  The *pLen parameter holds the length of the string
** in *pz and is decremented once for each character in the integer.
**
** Digits are base-64, most significant first, using the alphabet
** 0-9, A-Z, '_', a-z, '~' for the values 0 through 63.  Only the low
** seven bits of each byte are looked up.  Scanning stops at the first
** byte that is not a digit; *pLen is not consulted while scanning, so
** the caller must ensure that a non-digit byte follows the number.
*/
static unsigned int getInt(const char **pz, int *pLen){
  static const signed char zValue[] = {
    -1, -1, -1, -1, -1, -1, -1, -1,   -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1,   -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1,   -1, -1, -1, -1, -1, -1, -1, -1,
     0,  1,  2,  3,  4,  5,  6,  7,    8,  9, -1, -1, -1, -1, -1, -1,
    -1, 10, 11, 12, 13, 14, 15, 16,   17, 18, 19, 20, 21, 22, 23, 24,
    25, 26, 27, 28, 29, 30, 31, 32,   33, 34, 35, -1, -1, -1, -1, 36,
    -1, 37, 38, 39, 40, 41, 42, 43,   44, 45, 46, 47, 48, 49, 50, 51,
    52, 53, 54, 55, 56, 57, 58, 59,   60, 61, 62, -1, -1, -1, 63, -1,
  };
  unsigned int v = 0;
  int c;
  const unsigned char *z = (const unsigned char*)*pz;
  const unsigned char *zStart = z;
  while( (c = zValue[0x7f&*(z++)])>=0 ){
    v = (v<<6) + c;
  }
  z--;                          /* Back up onto the byte that ended the scan */
  *pLen -= (int)(z - zStart);
  *pz = (const char*)z;
  return v;
}

/*
** Return the number of digits in the base-64 representation of a
** positive integer.
**
** The value is reduced six bits at a time instead of comparing it to a
** growing power of 64: a 32-bit power of 64 wraps to zero after 64^5,
** which would make the comparison true forever for v>=64^5.  The
** argument is interpreted as unsigned, so the result is always 1..6.
*/
static int digit_count(int v){
  unsigned int x = (unsigned int)v;
  int n = 1;
  while( x>=64 ){
    x >>= 6;
    n++;
  }
  return n;
}

#ifdef __GNUC__
# define GCC_VERSION (__GNUC__*1000000+__GNUC_MINOR__*1000+__GNUC_PATCHLEVEL__)
#else
# define GCC_VERSION 0
#endif

/*
** Compute a 32-bit big-endian checksum on the N-byte buffer.  If the
** buffer is not a multiple of 4 bytes length, compute the sum that would
** have occurred if the buffer was padded with zeros to the next multiple
** of four bytes.
*/
/*
** Compute a 32-bit big-endian checksum on the N-byte buffer.  If the
** buffer is not a multiple of 4 bytes length, compute the sum that would
** have occurred if the buffer was padded with zeros to the next multiple
** of four bytes.
**
** The buffer is read one byte at a time and each group of four bytes is
** assembled into a big-endian word before being added.  This gives the
** same result on every byte order and places no alignment requirement
** on zIn.  The sum wraps on overflow of "unsigned int", which the rest
** of this file assumes to be 32 bits wide.
*/
static unsigned int checksum(const char *zIn, size_t N){
  const unsigned char *z = (const unsigned char*)zIn;
  const unsigned char *zEnd;
  unsigned int sum = 0;

  if( N==0 ) return 0;          /* Also keeps a NULL zIn from being touched */
  zEnd = z + (N & ~(size_t)3);  /* End of the last complete 4-byte group */
  while( z<zEnd ){
    sum += ((unsigned int)z[0]<<24) | ((unsigned int)z[1]<<16)
         | ((unsigned int)z[2]<<8)  |  (unsigned int)z[3];
    z += 4;
  }
  /* Add the 1 to 3 trailing bytes as if they were followed by zeros */
  switch( N&3 ){
    case 3:   sum += (unsigned int)z[2] << 8;    /* fall through */
    case 2:   sum += (unsigned int)z[1] << 16;   /* fall through */
    case 1:   sum += (unsigned int)z[0] << 24;   /* fall through */
    default:  break;
  }
  return sum;
}

/*
** Create a new delta.
**
** The delta is written into a preallocated buffer, zDelta, which
** should be at least 60 bytes longer than the target file, zOut.
** The delta string will be NUL-terminated, but it might also contain
** embedded NUL characters if either the zSrc or zOut files are
** binary.  This function returns the length of the delta string
** in bytes, excluding the final NUL terminator character.
**
** Output Format:
**
** The delta begins with a base64 number followed by a newline.  This
** number is the number of bytes in the TARGET file.  Thus, given a
** delta file z, a program can compute the size of the output file
** simply by reading the first line and decoding the base-64 number
** found there.  The delta_output_size() routine does exactly this.
**
** After the initial size number, the delta consists of a series of
** literal text segments and commands to copy from the SOURCE file.
** A copy command looks like this:
**
**     NNN@MMM,
**
** where NNN is the number of bytes to be copied and MMM is the offset
** into the source file of the first byte (both base-64).   If NNN is 0
** it means copy the rest of the input file.  Literal text is like this:
**
**     NNN:TTTTT
**
** where NNN is the number of bytes of text (base-64) and TTTTT is the text.
**
** The last term is of the form
**
**     NNN;
**
** In this case, NNN is a 32-bit big-endian checksum of the output file
** that can be used to verify that the delta applied correctly.  All
** numbers are in base-64.
**
** Pure text files generate a pure text delta.  Binary files generate a
** delta that may contain some binary data.
**
** Algorithm:
**
** The encoder first builds a hash table to help it find matching
** patterns in the source file.  16-byte chunks of the source file
** sampled at evenly spaced intervals are used to populate the hash
** table.
**
** Next we begin scanning the target file using a sliding 16-byte
** window.  The hash of the 16-byte window in the target is used to
** search for a matching section in the source file.  When a match
** is found, a copy command is added to the delta.  An effort is
** made to extend the matching section to regions that come before
** and after the 16-byte hash window.  A copy command is only issued
** if the result would use less space than just quoting the text
** literally.  Literal text is added to the delta for sections that
** do not match or which can not be encoded efficiently using copy
** commands.
*/
+*/ +static int delta_create( + const char *zSrc, /* The source or pattern file */ + unsigned int lenSrc, /* Length of the source file */ + const char *zOut, /* The target file */ + unsigned int lenOut, /* Length of the target file */ + char *zDelta /* Write the delta into this buffer */ +){ + int i, base; + char *zOrigDelta = zDelta; + hash h; + int nHash; /* Number of hash table entries */ + int *landmark; /* Primary hash table */ + int *collide; /* Collision chain */ + int lastRead = -1; /* Last byte of zSrc read by a COPY command */ + + /* Add the target file size to the beginning of the delta + */ + putInt(lenOut, &zDelta); + *(zDelta++) = '\n'; + + /* If the source file is very small, it means that we have no + ** chance of ever doing a copy command. Just output a single + ** literal segment for the entire target and exit. + */ + if( lenSrc<=NHASH ){ + putInt(lenOut, &zDelta); + *(zDelta++) = ':'; + memcpy(zDelta, zOut, lenOut); + zDelta += lenOut; + putInt(checksum(zOut, lenOut), &zDelta); + *(zDelta++) = ';'; + return zDelta - zOrigDelta; + } + + /* Compute the hash table used to locate matching sections in the + ** source file. + */ + nHash = lenSrc/NHASH; + collide = sqlite3_malloc64( (sqlite3_int64)nHash*2*sizeof(int) ); + memset(collide, -1, nHash*2*sizeof(int)); + landmark = &collide[nHash]; + for(i=0; i=0 && (limit--)>0 ){ + /* + ** The hash window has identified a potential match against + ** landmark block iBlock. But we need to investigate further. + ** + ** Look for a region in zOut that matches zSrc. Anchor the search + ** at zSrc[iSrc] and zOut[base+i]. Do not include anything prior to + ** zOut[base] or after zOut[outLen] nor anything after zSrc[srcLen]. + ** + ** Set cnt equal to the length of the match and set ofst so that + ** zSrc[ofst] is the first element of the match. litsz is the number + ** of characters between zOut[base] and the beginning of the match. + ** sz will be the overhead (in bytes) needed to encode the copy + ** command. 
Only generate copy command if the overhead of the + ** copy command is less than the amount of literal text to be copied. + */ + int cnt, ofst, litsz; + int j, k, x, y; + int sz; + int limitX; + + /* Beginning at iSrc, match forwards as far as we can. j counts + ** the number of characters that match */ + iSrc = iBlock*NHASH; + y = base+i; + limitX = ( lenSrc-iSrc <= lenOut-y ) ? lenSrc : iSrc + lenOut - y; + for(x=iSrc; x=sz && cnt>bestCnt ){ + /* Remember this match only if it is the best so far and it + ** does not increase the file size */ + bestCnt = cnt; + bestOfst = iSrc-k; + bestLitsz = litsz; + } + + /* Check the next matching block */ + iBlock = collide[iBlock]; + } + + /* We have a copy command that does not cause the delta to be larger + ** than a literal insert. So add the copy command to the delta. + */ + if( bestCnt>0 ){ + if( bestLitsz>0 ){ + /* Add an insert command before the copy */ + putInt(bestLitsz,&zDelta); + *(zDelta++) = ':'; + memcpy(zDelta, &zOut[base], bestLitsz); + zDelta += bestLitsz; + base += bestLitsz; + } + base += bestCnt; + putInt(bestCnt, &zDelta); + *(zDelta++) = '@'; + putInt(bestOfst, &zDelta); + *(zDelta++) = ','; + if( bestOfst + bestCnt -1 > lastRead ){ + lastRead = bestOfst + bestCnt - 1; + } + bestCnt = 0; + break; + } + + /* If we reach this point, it means no match is found so far */ + if( base+i+NHASH>=lenOut ){ + /* We have reached the end of the file and have not found any + ** matches. Do an "insert" for everything that does not match */ + putInt(lenOut-base, &zDelta); + *(zDelta++) = ':'; + memcpy(zDelta, &zOut[base], lenOut-base); + zDelta += lenOut-base; + base = lenOut; + break; + } + + /* Advance the hash by one character. Keep looking for a match */ + hash_next(&h, zOut[base+i+NHASH]); + i++; + } + } + /* Output a final "insert" record to get all the text at the end of + ** the file that does not match anything in the source file. 
+ */ + if( base0 ){ + unsigned int cnt, ofst; + cnt = getInt(&zDelta, &lenDelta); + switch( zDelta[0] ){ + case '@': { + zDelta++; lenDelta--; + ofst = getInt(&zDelta, &lenDelta); + if( lenDelta>0 && zDelta[0]!=',' ){ + /* ERROR: copy command not terminated by ',' */ + return -1; + } + zDelta++; lenDelta--; + total += cnt; + if( total>limit ){ + /* ERROR: copy exceeds output file size */ + return -1; + } + if( ofst+cnt > lenSrc ){ + /* ERROR: copy extends past end of input */ + return -1; + } + memcpy(zOut, &zSrc[ofst], cnt); + zOut += cnt; + break; + } + case ':': { + zDelta++; lenDelta--; + total += cnt; + if( total>limit ){ + /* ERROR: insert command gives an output larger than predicted */ + return -1; + } + if( cnt>lenDelta ){ + /* ERROR: insert count exceeds size of delta */ + return -1; + } + memcpy(zOut, zDelta, cnt); + zOut += cnt; + zDelta += cnt; + lenDelta -= cnt; + break; + } + case ';': { + zDelta++; lenDelta--; + zOut[0] = 0; +#ifdef FOSSIL_ENABLE_DELTA_CKSUM_TEST + if( cnt!=checksum(zOrigOut, total) ){ + /* ERROR: bad checksum */ + return -1; + } +#endif + if( total!=limit ){ + /* ERROR: generated size does not match predicted size */ + return -1; + } + return total; + } + default: { + /* ERROR: unknown delta operator */ + return -1; + } + } + } + /* ERROR: unterminated delta */ + return -1; +} + +/* +** Analyze a delta. Figure out the total number of bytes copied from +** source to target, and the total number of bytes inserted by the delta, +** and return both numbers. 
/*
** Analyze a delta.  Figure out the total number of bytes copied from
** source to target, and the total number of bytes inserted by the delta,
** and return both numbers.
**
** Returns 0 and fills in *pnCopy and *pnInsert when the delta is well
** formed up to and including its final "NNN;" term.  Returns -1, and
** leaves the outputs untouched, if the delta is empty, malformed or
** truncated.
**
** Every read of zDelta[] made here is preceded by a check that bytes
** remain.  getInt() does not take a length limit, so the remaining
** length is re-checked after each number.
*/
static int delta_analyze(
  const char *zDelta,    /* Delta to analyze */
  int lenDelta,          /* Length of the delta */
  int *pnCopy,           /* OUT: Number of bytes copied */
  int *pnInsert          /* OUT: Number of bytes inserted */
){
  unsigned int nInsert = 0;
  unsigned int nCopy = 0;

  if( zDelta==0 || lenDelta<=0 ){
    /* ERROR: empty delta */
    return -1;
  }
  (void)getInt(&zDelta, &lenDelta);
  if( lenDelta<=0 || *zDelta!='\n' ){
    /* ERROR: size integer not terminated by "\n" */
    return -1;
  }
  zDelta++; lenDelta--;
  while( lenDelta>0 && *zDelta ){
    unsigned int cnt;
    cnt = getInt(&zDelta, &lenDelta);
    if( lenDelta<=0 ){
      /* ERROR: delta ends before the operator character */
      return -1;
    }
    switch( zDelta[0] ){
      case '@': {
        zDelta++; lenDelta--;
        if( lenDelta<=0 ){
          /* ERROR: copy command has no offset */
          return -1;
        }
        (void)getInt(&zDelta, &lenDelta);
        if( lenDelta<=0 || zDelta[0]!=',' ){
          /* ERROR: copy command not terminated by ',' */
          return -1;
        }
        zDelta++; lenDelta--;
        nCopy += cnt;
        break;
      }
      case ':': {
        zDelta++; lenDelta--;
        /* lenDelta>=0 here, so the conversion to unsigned is exact */
        if( cnt>(unsigned int)lenDelta ){
          /* ERROR: insert count exceeds size of delta */
          return -1;
        }
        nInsert += cnt;
        zDelta += cnt;
        lenDelta -= (int)cnt;
        break;
      }
      case ';': {
        *pnCopy = nCopy;
        *pnInsert = nInsert;
        return 0;
      }
      default: {
        /* ERROR: unknown delta operator */
        return -1;
      }
    }
  }
  /* ERROR: unterminated delta */
  return -1;
}

/*
** SQL function:  delta_create(X,Y)
**
** Return a delta for carrying X into Y.
*/
+*/ +static void deltaCreateFunc( + sqlite3_context *context, + int argc, + sqlite3_value **argv +){ + const char *aOrig; int nOrig; /* old blob */ + const char *aNew; int nNew; /* new blob */ + char *aOut; int nOut; /* output delta */ + + assert( argc==2 ); + if( sqlite3_value_type(argv[0])==SQLITE_NULL ) return; + if( sqlite3_value_type(argv[1])==SQLITE_NULL ) return; + nOrig = sqlite3_value_bytes(argv[0]); + aOrig = (const char*)sqlite3_value_blob(argv[0]); + nNew = sqlite3_value_bytes(argv[1]); + aNew = (const char*)sqlite3_value_blob(argv[1]); + aOut = sqlite3_malloc64(nNew+70); + if( aOut==0 ){ + sqlite3_result_error_nomem(context); + }else{ + nOut = delta_create(aOrig, nOrig, aNew, nNew, aOut); + if( nOut<0 ){ + sqlite3_free(aOut); + sqlite3_result_error(context, "cannot create fossil delta", -1); + }else{ + sqlite3_result_blob(context, aOut, nOut, sqlite3_free); + } + } +} + +/* +** SQL functions: fossildelta_apply(X,D) +** +** Return the result of applying delta D to input X. +*/ +static void deltaApplyFunc( + sqlite3_context *context, + int argc, + sqlite3_value **argv +){ + const char *aOrig; int nOrig; /* The X input */ + const char *aDelta; int nDelta; /* The input delta (D) */ + char *aOut; int nOut, nOut2; /* The output */ + + assert( argc==2 ); + if( sqlite3_value_type(argv[0])==SQLITE_NULL ) return; + if( sqlite3_value_type(argv[1])==SQLITE_NULL ) return; + nOrig = sqlite3_value_bytes(argv[0]); + aOrig = (const char*)sqlite3_value_blob(argv[0]); + nDelta = sqlite3_value_bytes(argv[1]); + aDelta = (const char*)sqlite3_value_blob(argv[1]); + + /* Figure out the size of the output */ + nOut = delta_output_size(aDelta, nDelta); + if( nOut<0 ){ + sqlite3_result_error(context, "corrupt fossil delta", -1); + return; + } + aOut = sqlite3_malloc64((sqlite3_int64)nOut+1); + if( aOut==0 ){ + sqlite3_result_error_nomem(context); + }else{ + nOut2 = delta_apply(aOrig, nOrig, aDelta, nDelta, aOut); + if( nOut2!=nOut ){ + sqlite3_free(aOut); + 
sqlite3_result_error(context, "corrupt fossil delta", -1); + }else{ + sqlite3_result_blob(context, aOut, nOut, sqlite3_free); + } + } +} + + +/* +** SQL functions: fossildelta_output_size(D) +** +** Return the size of the output that results from applying delta D. +*/ +static void deltaOutputSizeFunc( + sqlite3_context *context, + int argc, + sqlite3_value **argv +){ + const char *aDelta; int nDelta; /* The input delta (D) */ + int nOut; /* Size of output */ + assert( argc==1 ); + if( sqlite3_value_type(argv[0])==SQLITE_NULL ) return; + nDelta = sqlite3_value_bytes(argv[0]); + aDelta = (const char*)sqlite3_value_blob(argv[0]); + + /* Figure out the size of the output */ + nOut = delta_output_size(aDelta, nDelta); + if( nOut<0 ){ + sqlite3_result_error(context, "corrupt fossil delta", -1); + return; + }else{ + sqlite3_result_int(context, nOut); + } +} + + +#ifdef _WIN32 +__declspec(dllexport) +#endif +int sqlite3_fossildelta_init( + sqlite3 *db, + char **pzErrMsg, + const sqlite3_api_routines *pApi +){ + int rc = SQLITE_OK; + SQLITE_EXTENSION_INIT2(pApi); + (void)pzErrMsg; /* Unused parameter */ + rc = sqlite3_create_function(db, "delta_create", 2, SQLITE_UTF8, 0, + deltaCreateFunc, 0, 0); + if( rc==SQLITE_OK ){ + rc = sqlite3_create_function(db, "delta_apply", 2, SQLITE_UTF8, 0, + deltaApplyFunc, 0, 0); + } + if( rc==SQLITE_OK ){ + rc = sqlite3_create_function(db, "delta_output_size", 1, SQLITE_UTF8, 0, + deltaOutputSizeFunc, 0, 0); + } + return rc; +} diff --git a/manifest b/manifest index c4848176e4..dc80c37b38 100644 --- a/manifest +++ b/manifest @@ -1,5 +1,5 @@ -C Fix\sa\spotential\smemory\sleak\sin\sRBU\sif\sthe\srbu_fossil_delta()\sSQL\sfunction\sis\nmisused.\s\sMisuse\snever\shappens\sin\sa\sworking\sRBU\ssystem,\sso\sthis\sis\snot\sa\s\nparticularly\simportant\sfix. -D 2019-02-19T17:45:31.317 +C Add\sthe\sfossildelta.c\sextension\sin\sext/misc\swith\simplementations\sof\sthe\sFossil\ndelta\sfunctions. 
+D 2019-02-19T18:39:16.475 F .fossil-settings/empty-dirs dbb81e8fc0401ac46a1491ab34a7f2c7c0452f2f06b54ebb845d024ca8283ef1 F .fossil-settings/ignore-glob 35175cdfcf539b2318cb04a9901442804be81cd677d8b889fcc9149c21f239ea F Makefile.in 178d8eb6840771149cee40b322d1b3be30d330198c522c903c1b66fb5a1bfca4 @@ -286,6 +286,7 @@ F ext/misc/dbdump.c baf6e37447c9d6968417b1cd34cbedb0b0ab3f91b5329501d8a8d5be3287 F ext/misc/eval.c 4b4757592d00fd32e44c7a067e6a0e4839c81a4d57abc4131ee7806d1be3104e F ext/misc/explain.c d5c12962d79913ef774b297006872af1fccda388f61a11d37758f9179a09551f F ext/misc/fileio.c 592d6531d8413d81b25f5a47a45d7e310e455d33e03a64c6ae85724c6524a5d5 +F ext/misc/fossildelta.c 64619ac4ff0d865e01f25436fd1b82c3dd7f6bc6184c9a06e002b16a121cd652 F ext/misc/fuzzer.c 9e79c337faffdd4c5fe4485467537438359b43e0858a40038d4300b894ff553f F ext/misc/ieee754.c f190d0cc5182529acb15babd177781be1ac1718c F ext/misc/json1.c 8af4672f43634257dbcfdb4515b4070325463d67c6968b4be1bd414de28d4d58 @@ -1804,7 +1805,7 @@ F vsixtest/vsixtest.tcl 6a9a6ab600c25a91a7acc6293828957a386a8a93 F vsixtest/vsixtest.vcxproj.data 2ed517e100c66dc455b492e1a33350c1b20fbcdc F vsixtest/vsixtest.vcxproj.filters 37e51ffedcdb064aad6ff33b6148725226cd608e F vsixtest/vsixtest_TemporaryKey.pfx e5b1b036facdb453873e7084e1cae9102ccc67a0 -P f2d400db4dbfb05e2540178ed3662f97f8c57a95f8129886c7081c35e53adf0d -R 8d9f63c20f18bc4035bf1d3926ca6101 +P 12517d1b15da46bc90bd95bb9c161d7f2ecdd7f28b1b3a5ed4397939ef986061 +R e38db5ccc4de4d9df319413343ad8fcf U drh -Z 3f3200a2e92dbd4c9a6a6ed07ff60873 +Z 993efe1b585ff08956441be0e9d0910b diff --git a/manifest.uuid b/manifest.uuid index 34fb63dc13..2ff6534024 100644 --- a/manifest.uuid +++ b/manifest.uuid @@ -1 +1 @@ -12517d1b15da46bc90bd95bb9c161d7f2ecdd7f28b1b3a5ed4397939ef986061 \ No newline at end of file +b80cafa6f8a5c6ff1dc9efd2f670777ab131ace2df1eb431cedc8cfa901baf18 \ No newline at end of file