From: drh <> Date: Sat, 31 Aug 2024 16:55:14 +0000 (+0000) Subject: Avoid unnecessary sort operations when running one of the percentile X-Git-Tag: version-3.47.0~163^2~2 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=ad8ec9db63f0da294625211d6f261d99685fb971;p=thirdparty%2Fsqlite.git Avoid unnecessary sort operations when running one of the percentile aggregates as a window function. FossilOrigin-Name: 5d311536211eb1e3c887ceb7e6516d3900e6eebbccc8c445dd43cdd556182728 --- diff --git a/ext/misc/percentile.c b/ext/misc/percentile.c index 99b1430171..e427497878 100644 --- a/ext/misc/percentile.c +++ b/ext/misc/percentile.c @@ -62,6 +62,29 @@ ** ** (14) A separate percentile_cond(Y,X) function is the equivalent of ** percentile(Y,X*100.0). +** +** (15) All three SQL functions implemented by this module can also be +** used as window-functions. +** +** Implementation notes as of 2024-08-31: +** +** * The regular aggregate-function versions of the merge(), percentile(), +** and percentile_cond() routines work by accumulating all values in +** an array of doubles, then sorting that array using a quicksort +** before computing the answer. Thus the runtime is O(NlogN) where +** N is the number of rows of input. +** +** * For the window-function versions of these routines, the array of +** inputs is sorted as soon as the first value is computed. Thereafter, +** the array is kept in sorted order using an insert-sort. This +** results in O(N*K) performance where K is the size of the window. +** One can devise alternative implementations that give O(N*logN*logK) +** performance, but they require more complex logic and data structures. +** The developers have elected to keep the asymptotically slower +** algorithm for now, for simplicity, under the theory that window +** functions are seldom used and when they are, the window size K is +** often small. The developers might revisit that decision later, +** should the need arise. 
*/ #include "sqlite3ext.h" SQLITE_EXTENSION_INIT1 @@ -78,6 +101,7 @@ struct Percentile { unsigned nAlloc; /* Number of slots allocated for a[] */ unsigned nUsed; /* Number of slots actually used in a[] */ char bSorted; /* True if a[] is already in sorted order */ + char bKeepSorted; /* True if advantageous to keep a[] sorted */ double rPct; /* 1.0 more than the value for P */ double *a; /* Array of Y values */ }; @@ -85,7 +109,7 @@ struct Percentile { /* ** Return TRUE if the input floating-point number is an infinity. */ -static int isInfinity(double r){ +static int percentIsInfinity(double r){ sqlite3_uint64 u; assert( sizeof(u)==sizeof(r) ); memcpy(&u, &r, sizeof(u)); @@ -93,13 +117,55 @@ static int isInfinity(double r){ } /* -** Return TRUE if two doubles differ by 0.001 or less +** Return TRUE if two doubles differ by 0.001 or less. */ -static int sameValue(double a, double b){ +static int percentSameValue(double a, double b){ a -= b; return a>=-0.001 && a<=0.001; } +#if 0 +/* Verify that the elements of the Percentile p are in fact sorted. +** Used for testing and debugging only. +*/ +static void percentAssertSorted(Percentile *p){ + int i; + for(i=p->nUsed-2; i>=0 && p->a[i]<=p->a[i+1]; i--){} + assert( i<0 ); +} +#else +# define percentAssertSorted(X) +#endif + +/* +** Search p (which must have p->bSorted) looking for an entry with +** value y. Return the index of that entry. +** +** If bExact is true, return -1 if the entry is not found. +** +** If bExact is false, return the index at which a new entry with +** value y should be insert in order to keep the values in sorted +** order. The smallest return value in this case will be 0, and +** the largest return value will be p->nUsed. 
+*/ +static int percentBinarySearch(Percentile *p, double y, int bExact){ + int iFirst = 0; /* First element of search range */ + int iLast = p->nUsed - 1; /* Last element of search range */ + while( iLast>=iFirst ){ + int iMid = (iFirst+iLast)/2; + double x = p->a[iMid]; + if( x<y ){ + iFirst = iMid + 1; + }else if( x>y ){ + iLast = iMid - 1; + }else{ + return iMid; + } + } + if( bExact ) return -1; + return iFirst; +} + /* ** The "step" function for percentile(Y,P) is called once for each ** input row. @@ -145,7 +211,7 @@ static void percentStep(sqlite3_context *pCtx, int argc, sqlite3_value **argv){ ** from any prior row, per Requirement (2). */ if( p->rPct==0.0 ){ p->rPct = rPct+1.0; - }else if( !sameValue(p->rPct,rPct+1.0) ){ + }else if( !percentSameValue(p->rPct,rPct+1.0) ){ sqlite3_result_error(pCtx, "2nd argument to percentile() is not the " "same for all input rows", -1); return; @@ -165,7 +231,7 @@ static void percentStep(sqlite3_context *pCtx, int argc, sqlite3_value **argv){ /* Throw an error if the Y value is infinity or NaN */ y = sqlite3_value_double(argv[0]); - if( isInfinity(y) ){ + if( percentIsInfinity(y) ){ sqlite3_result_error(pCtx, "Inf input to percentile()", -1); return; } @@ -183,55 +249,26 @@ static void percentStep(sqlite3_context *pCtx, int argc, sqlite3_value **argv){ p->nAlloc = n; p->a = a; } - p->a[p->nUsed++] = y; - assert( p->nUsed>=1 ); - if( p->nUsed==1 ){ + if( p->nUsed==0 ){ + p->a[p->nUsed++] = y; p->bSorted = 1; - }else if( p->bSorted && p->a[p->nUsed-2]>y ){ + }else if( !p->bSorted || y>=p->a[p->nUsed-1] ){ + p->a[p->nUsed++] = y; + }else if( p->bKeepSorted ){ + int i; + percentAssertSorted(p); + i = percentBinarySearch(p, y, 0); + if( i<p->nUsed ){ + memmove(&p->a[i+1], &p->a[i], (p->nUsed-i)*sizeof(p->a[0])); + } + p->a[i] = y; + p->nUsed++; + }else{ + p->a[p->nUsed++] = y; p->bSorted = 0; } } -/* -** The "inverse" function for percentile(Y,P) is called to remove a -** row that was previously inserted by "step".
-*/ -static void percentInverse(sqlite3_context *pCtx,int argc,sqlite3_value **argv){ - Percentile *p; - int eType; - double y; - int i; - assert( argc==2 || argc==1 ); - - /* Allocate the session context. */ - p = (Percentile*)sqlite3_aggregate_context(pCtx, sizeof(*p)); - assert( p!=0 ); - - /* Ignore rows for which Y is NULL */ - eType = sqlite3_value_type(argv[0]); - if( eType==SQLITE_NULL ) return; - - /* If not NULL, then Y must be numeric. Otherwise throw an error. - ** Requirement 4 */ - if( eType!=SQLITE_INTEGER && eType!=SQLITE_FLOAT ){ - return; - } - - /* Ignore the Y value if it is infinity or NaN */ - y = sqlite3_value_double(argv[0]); - if( isInfinity(y) ){ - return; - } - - /* Find and remove the row */ - for(i=0; i<p->nUsed && p->a[i]!=y; i++){} - if( i<p->nUsed ){ - p->a[i] = p->a[p->nUsed-1]; - p->nUsed--; - } - p->bSorted = p->nUsed<=1; -} - /* ** Sort an array of doubles. ** @@ -290,9 +327,59 @@ static void sortDoubles(double *a, int n){ #endif } + +/* +** The "inverse" function for percentile(Y,P) is called to remove a +** row that was previously inserted by "step". +*/ +static void percentInverse(sqlite3_context *pCtx,int argc,sqlite3_value **argv){ + Percentile *p; + int eType; + double y; + int i; + assert( argc==2 || argc==1 ); + + /* Allocate the session context. */ + p = (Percentile*)sqlite3_aggregate_context(pCtx, sizeof(*p)); + assert( p!=0 ); + + /* Ignore rows for which Y is NULL */ + eType = sqlite3_value_type(argv[0]); + if( eType==SQLITE_NULL ) return; + + /* If not NULL, then Y must be numeric. Otherwise throw an error.
+ ** Requirement 4 */ + if( eType!=SQLITE_INTEGER && eType!=SQLITE_FLOAT ){ + return; + } + + /* Ignore the Y value if it is infinity or NaN */ + y = sqlite3_value_double(argv[0]); + if( percentIsInfinity(y) ){ + return; + } + if( p->bSorted==0 ){ + sortDoubles(p->a, p->nUsed); + p->bSorted = 1; + }else{ + percentAssertSorted(p); + } + p->bKeepSorted = 1; + + /* Find and remove the row */ + i = percentBinarySearch(p, y, 1); + if( i>=0 ){ + p->nUsed--; + if( i<p->nUsed ){ + memmove(&p->a[i], &p->a[i+1], (p->nUsed - i)*sizeof(p->a[0])); + } + } + percentAssertSorted(p); +} + /* -** Called to compute the final output of percentile() and to clean -** up all allocated memory. +** Compute the final output of percentile(). Clean up all allocated +** memory if and only if bIsFinal is true. */ static void percentCompute(sqlite3_context *pCtx, int bIsFinal){ Percentile *p; @@ -306,6 +393,8 @@ static void percentCompute(sqlite3_context *pCtx, int bIsFinal){ if( p->bSorted==0 ){ sortDoubles(p->a, p->nUsed); p->bSorted = 1; + }else{ + percentAssertSorted(p); } ix = (p->rPct-1.0)*(p->nUsed-1)*0.01; i1 = (unsigned)ix; @@ -318,6 +407,8 @@ static void percentCompute(sqlite3_context *pCtx, int bIsFinal){ if( bIsFinal ){ sqlite3_free(p->a); memset(p, 0, sizeof(*p)); + }else{ + p->bKeepSorted = 1; } } static void percentFinal(sqlite3_context *pCtx){ @@ -327,8 +418,6 @@ static void percentValue(sqlite3_context *pCtx){ percentCompute(pCtx, 0); } - - #ifdef _WIN32 __declspec(dllexport) #endif @@ -349,7 +438,6 @@ int sqlite3_percentile_init( SQLITE_UTF8|SQLITE_INNOCUOUS, 0, percentStep, percentFinal, percentValue, percentInverse, 0); - } if( rc==SQLITE_OK ){ rc = sqlite3_create_window_function(db, "percentile_cont", 2, diff --git a/manifest b/manifest index e61e0275b4..e4debfefcd 100644 --- a/manifest +++ b/manifest @@ -1,5 +1,5 @@ -C Test\scases\sadded.
-D 2024-08-31T15:02:07.805 +C Avoid\sunnecessary\ssort\soperations\swhen\srunning\sone\sof\sthe\spercentile\naggregates\sas\sa\swindow\sfunction. +D 2024-08-31T16:55:14.747 F .fossil-settings/empty-dirs dbb81e8fc0401ac46a1491ab34a7f2c7c0452f2f06b54ebb845d024ca8283ef1 F .fossil-settings/ignore-glob 35175cdfcf539b2318cb04a9901442804be81cd677d8b889fcc9149c21f239ea F LICENSE.md df5091916dbb40e6e9686186587125e1b2ff51f022cc334e886c19a0e9982724 @@ -410,7 +410,7 @@ F ext/misc/nextchar.c 7877914c2a80c2f181dd04c3dbef550dfb54c93495dc03da2403b5dd58 F ext/misc/noop.c f1a21cc9b7a4e667e5c8458d80ba680b8bd4315a003f256006046879f679c5a0 F ext/misc/normalize.c bd84355c118e297522aba74de34a4fd286fc775524e0499b14473918d09ea61f F ext/misc/pcachetrace.c f4227ce03fb16aa8d6f321b72dd051097419d7a028a9853af048bee7645cb405 -F ext/misc/percentile.c 89416b108569171be1d8dda4fa2687ad116ea969b4d129d02cf3dc1fd67fc87e +F ext/misc/percentile.c 46627b7495c69344d384f667bb6c80ba2c4aeb779997a4e22fea1a39cd20beb9 F ext/misc/prefixes.c 82645f79229877afab08c8b08ca1e7fa31921280906b90a61c294e4f540cd2a6 F ext/misc/qpvtab.c fc189e127f68f791af90a487f4460ec91539a716daf45a0c357e963fd47cc06c F ext/misc/randomjson.c ef835fc64289e76ac4873b85fe12f9463a036168d7683cf2b773e36e6262c4ed @@ -2211,8 +2211,8 @@ F vsixtest/vsixtest.tcl 6195aba1f12a5e10efc2b8c0009532167be5e301abe5b31385638080 F vsixtest/vsixtest.vcxproj.data 2ed517e100c66dc455b492e1a33350c1b20fbcdc F vsixtest/vsixtest.vcxproj.filters 37e51ffedcdb064aad6ff33b6148725226cd608e F vsixtest/vsixtest_TemporaryKey.pfx e5b1b036facdb453873e7084e1cae9102ccc67a0 -P 4d0e3df4b9c609755977b8a462126242d2be1310c0122a8d4ba76d98d32a7230 -R 06b03a21c6b126e146ea802b7b43139b +P 25e68229843cc84978955817285550085d1306ba4ce3b0517dd00e5d05b9ae0a +R 1b58d4d1990506055e066a3b56b6ff9a U drh -Z a8a75faed57aa0d8766eb4f3377323de +Z 10ad6a710202821993e7562cc60a54b3 # Remove this line to create a well-formed Fossil manifest. 
diff --git a/manifest.uuid b/manifest.uuid index 301d4eacfd..1413d7e77e 100644 --- a/manifest.uuid +++ b/manifest.uuid @@ -1 +1 @@ -25e68229843cc84978955817285550085d1306ba4ce3b0517dd00e5d05b9ae0a +5d311536211eb1e3c887ceb7e6516d3900e6eebbccc8c445dd43cdd556182728