From: dan Date: Fri, 13 Jul 2012 19:26:34 +0000 (+0000) Subject: Add the "matchlen" column to the spellfix1 virtual table. X-Git-Tag: version-3.7.14~64^2~3 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=8512752407da984870c8ced7644d94cf4b9a27d8;p=thirdparty%2Fsqlite.git Add the "matchlen" column to the spellfix1 virtual table. FossilOrigin-Name: f24b9d87f6b0e8b4d26669d5c1191f9280ba14a3 --- diff --git a/manifest b/manifest index 4901be8cae..c081ba92f0 100644 --- a/manifest +++ b/manifest @@ -1,5 +1,5 @@ -C Update\stest_spellfix.c\swith\slatest\schanges. -D 2012-07-13T16:15:20.128 +C Add\sthe\s"matchlen"\scolumn\sto\sthe\sspellfix1\svirtual\stable. +D 2012-07-13T19:26:34.617 F Makefile.arm-wince-mingw32ce-gcc d6df77f1f48d690bd73162294bbba7f59507c72f F Makefile.in 8f6d858bf3df9978ba43df19985146a1173025e4 F Makefile.linux-gcc 91d710bdc4998cb015f39edf3cb314ec4f4d7e23 @@ -221,7 +221,7 @@ F src/test_quota.h 8761e463b25e75ebc078bd67d70e39b9c817a0cb F src/test_rtree.c aba603c949766c4193f1068b91c787f57274e0d9 F src/test_schema.c 8c06ef9ddb240c7a0fcd31bc221a6a2aade58bf0 F src/test_server.c 2f99eb2837dfa06a4aacf24af24c6affdf66a84f -F src/test_spellfix.c 1de8d8c086efa50bb6660ea5988e8630ef9144aa +F src/test_spellfix.c 1c900928dad9b71c0fdcbdda9e2f52234f283660 F src/test_stat.c d1569c7a4839f13e80187e2c26b2ab4da2d03935 F src/test_superlock.c 2b97936ca127d13962c3605dbc9a4ef269c424cd F src/test_syscall.c a992d8c80ea91fbf21fb2dd570db40e77dd7e6ae @@ -716,6 +716,7 @@ F test/speed3.test d32043614c08c53eafdc80f33191d5bd9b920523 F test/speed4.test abc0ad3399dcf9703abed2fff8705e4f8e416715 F test/speed4p.explain 6b5f104ebeb34a038b2f714150f51d01143e59aa F test/speed4p.test 0e51908951677de5a969b723e03a27a1c45db38b +F test/spellfix.test 936be6f7ba1c4d096adb280c68b32f4848af8d2e F test/sqllimits1.test b1aae27cc98eceb845e7f7adf918561256e31298 F test/stat.test 08e8185b3fd5b010c90d7ad82b9dd4ea1cbf14b0 F test/stmt.test 25d64e3dbf9a3ce89558667d7f39d966fe2a71b9 @@ -1004,7 +1005,10 @@ F tool/tostr.awk e75472c2f98dd76e06b8c9c1367f4ab07e122d06 F tool/vdbe-compress.tcl d70ea6d8a19e3571d7ab8c9b75cba86d1173ff0f F tool/warnings-clang.sh f6aa929dc20ef1f856af04a730772f59283631d4 F tool/warnings.sh fbc018d67fd7395f440c28f33ef0f94420226381 -P 7fac56ed9feda819e66070bd5e06db8cad77e8bd -R 8322d35d4441c91cd733d64b44bd2f2e +P cba2a65870481df213e006b07e74f0ca19d2d57c +R 79ffd07a9c878240f73dd72d96e3ef36 +T *branch * spellfix-matchlen +T *sym-spellfix-matchlen * +T -sym-trunk * U dan -Z 4817f8644451c5f84c464c55e7d56257 +Z 642149d4a86bbb287ab54c68c79dc818 diff --git a/manifest.uuid b/manifest.uuid index 45760e0dc2..ebe0ec2458 100644 --- a/manifest.uuid +++ b/manifest.uuid @@ -1 +1 @@ -cba2a65870481df213e006b07e74f0ca19d2d57c \ No newline at end of file +f24b9d87f6b0e8b4d26669d5c1191f9280ba14a3 \ No newline at end of file diff --git a/src/test_spellfix.c b/src/test_spellfix.c index 67a9920932..d80ce8f013 100644 --- a/src/test_spellfix.c +++ b/src/test_spellfix.c @@ -101,6 +101,11 @@ ** by default (unless overridden by ORDER BY) returns ** results in order of increasing score. ** +** matchlen For prefix queries, the number of characters in the prefix +** of the returned value (word) that matched the query term. +** For non-prefix queries, the number of characters in the +** returned value. +** ** top (HIDDEN) For any query, this value is the same on all ** rows. It is an integer which is the maximum number of ** rows that will be output. The actually number of rows @@ -605,8 +610,14 @@ static int substituteCost(char cPrev, char cFrom, char cTo){ ** -1 One of the inputs is NULL ** -2 Non-ASCII characters on input ** -3 Unable to allocate memory +** +** If pnMatch is not NULL, then *pnMatch is set to the number of bytes +** of zB that matched the pattern in zA. If zA does not end with a '*', +** then this value is always the number of bytes in zB (i.e. strlen(zB)). +** If zA does end in a '*', then it is the number of bytes in the prefix +** of zB that was deemed to match zA. */ -static int editdist1(const char *zA, const char *zB, int iLangId){ +static int editdist1(const char *zA, const char *zB, int iLangId, int *pnMatch){ int nA, nB; /* Number of characters in zA[] and zB[] */ int xA, xB; /* Loop counters for zA[] and zB[] */ char cA, cB; /* Current character of zA and zB */ @@ -619,12 +630,14 @@ static int editdist1(const char *zA, const char *zB, int iLangId){ char *cx; /* Corresponding character values */ int *toFree = 0; /* Malloced space */ int mStack[60+15]; /* Stack space to use if not too much is needed */ + int nMatch = 0; /* Early out if either input is NULL */ if( zA==0 || zB==0 ) return -1; /* Skip any common prefix */ - while( zA[0] && zA[0]==zB[0] ){ dc = zA[0]; zA++; zB++; } + while( zA[0] && zA[0]==zB[0] ){ dc = zA[0]; zA++; zB++; nMatch++; } + if( pnMatch ) *pnMatch = nMatch; if( zA[0]==0 && zB[0]==0 ) return 0; #if 0 @@ -737,10 +750,14 @@ static int editdist1(const char *zA, const char *zB, int iLangId){ if( cA=='*' ){ res = m[1]; for(xB=1; xB<=nB; xB++){ - if( m[xB]=128 ){ + int xTop, xBtm, x; + xTop = sizeof(translit)/sizeof(translit[0]) - 1; + xBtm = 0; + while( xTop>=xBtm ){ + x = (xTop + xBtm)/2; + if( translit[x].cFrom==c ){ + if( translit[x].cTo1 ) nOut++; + if( c==0x0429 || c== 0x0449 ) nOut += 2; + break; + }else if( translit[x].cFrom>c ){ + xTop = x-1; + }else{ + xBtm = x+1; + } + } + } + } + + return nChar; +} + + /* ** spellfix1_translit(X) ** @@ -2092,6 +2177,7 @@ struct spellfix1_vtab { struct spellfix1_cursor { sqlite3_vtab_cursor base; /* Base class - must be first */ spellfix1_vtab *pVTab; /* The table to which this cursor belongs */ + char *zPattern; /* rhs of MATCH clause */ int nRow; /* Number of rows of content */ int nAlloc; /* Number of allocated rows */ int iRow; /* Current row of content */ @@ -2105,6 +2191,7 @@ struct spellfix1_cursor { int iRank; /* Rank for this row */ int iDistance; /* Distance from pattern for this row */ int iScore; /* Score for sorting */ + int iMatchlen; /* Value of matchlen column (or -1) */ char zHash[SPELLFIX_MX_HASH]; /* the phonehash used for this match */ } *a; }; @@ -2200,7 +2287,7 @@ static char *spellfix1Dequote(const char *zIn){ ** argv[0] -> module name ("spellfix1") ** argv[1] -> database name ** argv[2] -> table name -** argv[3].. -> optional arguments (currently ignored) +** argv[3].. -> optional arguments (i.e. "edit_cost_table" parameter) */ static int spellfix1Init( int isCreate, @@ -2238,21 +2325,23 @@ static int spellfix1Init( rc = SQLITE_NOMEM; }else{ rc = sqlite3_declare_vtab(db, - "CREATE TABLE x(word,rank,distance,langid," - "score, phonehash,top HIDDEN,scope HIDDEN,srchcnt HIDDEN," - "soundslike HIDDEN,command HIDDEN)" + "CREATE TABLE x(word,rank,distance,langid, " + "score, matchlen, phonehash, " + "top HIDDEN, scope HIDDEN, srchcnt HIDDEN, " + "soundslike HIDDEN, command HIDDEN)" ); #define SPELLFIX_COL_WORD 0 #define SPELLFIX_COL_RANK 1 #define SPELLFIX_COL_DISTANCE 2 #define SPELLFIX_COL_LANGID 3 #define SPELLFIX_COL_SCORE 4 -#define SPELLFIX_COL_PHONEHASH 5 -#define SPELLFIX_COL_TOP 6 -#define SPELLFIX_COL_SCOPE 7 -#define SPELLFIX_COL_SRCHCNT 8 -#define SPELLFIX_COL_SOUNDSLIKE 9 -#define SPELLFIX_COL_COMMAND 10 +#define SPELLFIX_COL_MATCHLEN 5 +#define SPELLFIX_COL_PHONEHASH 6 +#define SPELLFIX_COL_TOP 7 +#define SPELLFIX_COL_SCOPE 8 +#define SPELLFIX_COL_SRCHCNT 9 +#define SPELLFIX_COL_SOUNDSLIKE 10 +#define SPELLFIX_COL_COMMAND 11 } if( rc==SQLITE_OK && isCreate ){ sqlite3_uint64 r; @@ -2350,6 +2439,7 @@ static int spellfix1Close(sqlite3_vtab_cursor *cur){ spellfix1_cursor *pCur = (spellfix1_cursor *)cur; spellfix1ResetCursor(pCur); spellfix1ResizeCursor(pCur, 0); + sqlite3_free(pCur->zPattern); sqlite3_free(pCur); return SQLITE_OK; } @@ -2583,15 +2673,16 @@ static void spellfix1RunQuery(MatchQuery *p, const char *zQuery, int nQuery){ } } while( sqlite3_step(pStmt)==SQLITE_ROW ){ + int iMatchlen = -1; iRank = sqlite3_column_int(pStmt, 2); if( p->pMatchStr3 ){ int nWord = sqlite3_column_bytes(pStmt, 1); zWord = (const char*)sqlite3_column_text(pStmt, 1); - iDist = editDist3Core(p->pMatchStr3, zWord, nWord, p->pLang); + iDist = editDist3Core(p->pMatchStr3, zWord, nWord, p->pLang, &iMatchlen); }else{ zK1 = (const char*)sqlite3_column_text(pStmt, 3); if( zK1==0 ) continue; - iDist = editdist1(p->zPattern, zK1, pCur->iLang); + iDist = editdist1(p->zPattern, zK1, pCur->iLang, 0); } pCur->nSearch++; iScore = spellfix1Score(iDist,iRank); @@ -2615,6 +2706,7 @@ static void spellfix1RunQuery(MatchQuery *p, const char *zQuery, int nQuery){ pCur->a[idx].iRank = iRank; pCur->a[idx].iDistance = iDist; pCur->a[idx].iScore = iScore; + pCur->a[idx].iMatchlen = iMatchlen; memcpy(pCur->a[idx].zHash, zHash1, iScope+1); if( pCur->nRownAlloc ) pCur->nRow++; if( pCur->nRow==pCur->nAlloc ){ @@ -2696,6 +2788,8 @@ static int spellfix1FilterForMatch( x.pLang = 0; } zPattern = (char*)transliterate(zMatchThis, sqlite3_value_bytes(argv[0])); + sqlite3_free(pCur->zPattern); + pCur->zPattern = zPattern; if( zPattern==0 ) return SQLITE_NOMEM; nPattern = strlen(zPattern); if( zPattern[nPattern-1]=='*' ) nPattern--; @@ -2746,7 +2840,6 @@ static int spellfix1FilterForMatch( pCur->iScope = iScope; } sqlite3_finalize(pStmt); - sqlite3_free(zPattern); editDist3FromStringDelete(pMatchStr3); return pCur->a ? x.rc : SQLITE_NOMEM; } @@ -2830,6 +2923,30 @@ static int spellfix1Column(sqlite3_vtab_cursor *cur, sqlite3_context *ctx, int i sqlite3_result_int(ctx, pCur->a[pCur->iRow].iScore); break; } + case SPELLFIX_COL_MATCHLEN: { + int iMatchlen = pCur->a[pCur->iRow].iMatchlen; + if( iMatchlen<0 ){ + int nPattern = strlen(pCur->zPattern); + char *zWord = pCur->a[pCur->iRow].zWord; + int nWord = strlen(zWord); + + if( nPattern>0 && pCur->zPattern[nPattern-1]=='*' ){ + char *zTranslit; + int res; + zTranslit = (char *)transliterate((unsigned char *)zWord, nWord); + if( !zTranslit ) return SQLITE_NOMEM; + res = editdist1(pCur->zPattern, zTranslit, pCur->iLang, &iMatchlen); + sqlite3_free(zTranslit); + if( res<0 ) return SQLITE_NOMEM; + iMatchlen = translen_to_charlen(zWord, nWord, iMatchlen); + }else{ + iMatchlen = utf8Charlen(zWord, nWord); + } + } + + sqlite3_result_int(ctx, iMatchlen); + break; + } case SPELLFIX_COL_PHONEHASH: { sqlite3_result_text(ctx, pCur->a[pCur->iRow].zHash, -1, SQLITE_STATIC); break; diff --git a/test/spellfix.test b/test/spellfix.test new file mode 100644 index 0000000000..7459ca2cd6 --- /dev/null +++ b/test/spellfix.test @@ -0,0 +1,147 @@ +# 2012 July 12 +# +# The author disclaims copyright to this source code. In place of +# a legal notice, here is a blessing: +# +# May you do good and not evil. +# May you find forgiveness for yourself and forgive others. +# May you share freely, never taking more than you give. +# +#*********************************************************************** +# + +set testdir [file dirname $argv0] +source $testdir/tester.tcl +set testprefix spellfix + +register_spellfix_module db + +set vocab { +rabbi rabbit rabbits rabble rabid rabies raccoon raccoons race raced racer +racers races racetrack racial racially racing rack racked racket racketeer +racketeering racketeers rackets racking racks radar radars radial radially +radian radiance radiant radiantly radiate radiated radiates radiating radiation +radiations radiator radiators radical radically radicals radices radii radio +radioactive radioastronomy radioed radiography radioing radiology radios radish +radishes radium radius radix radon raft rafter rafters rafts rag rage raged +rages ragged raggedly raggedness raging rags ragweed raid raided raider raiders +raiding raids rail railed railer railers railing railroad railroaded railroader +railroaders railroading railroads rails railway railways raiment rain rainbow +raincoat raincoats raindrop raindrops rained rainfall rainier rainiest raining +rains rainstorm rainy raise raised raiser raisers raises raisin raising rake +raked rakes raking rallied rallies rally rallying ram ramble rambler rambles +rambling ramblings ramification ramifications ramp rampage rampant rampart +ramps ramrod rams ran ranch ranched rancher ranchers ranches ranching rancid +random randomization randomize randomized randomizes randomly randomness randy +rang range ranged rangeland ranger rangers ranges ranging rangy rank ranked +ranker rankers rankest ranking rankings rankle rankly rankness ranks ransack +ransacked ransacking ransacks ransom ransomer ransoming ransoms rant ranted +ranter ranters ranting rants rap rapacious rape raped raper rapes rapid +rapidity rapidly rapids rapier raping rapport rapprochement raps rapt raptly +rapture raptures rapturous rare rarely rareness rarer rarest rarity rascal +rascally rascals rash rasher rashly rashness rasp raspberry rasped rasping +rasps raster rat rate rated rater raters rates rather ratification ratified +ratifies ratify ratifying rating ratings ratio ration rational rationale +rationales rationalities rationality rationalization rationalizations +rationalize rationalized rationalizes rationalizing rationally rationals +rationing rations ratios rats rattle rattled rattler rattlers rattles +rattlesnake rattlesnakes rattling raucous ravage ravaged ravager ravagers +ravages ravaging rave raved raven ravening ravenous ravenously ravens raves +ravine ravines raving ravings raw rawer rawest rawly rawness ray rays raze +razor razors re reabbreviate reabbreviated reabbreviates reabbreviating reach +reachability reachable reachably reached reacher reaches reaching reacquired +react reacted reacting reaction reactionaries reactionary reactions reactivate +reactivated reactivates reactivating reactivation reactive reactively +reactivity reactor reactors reacts read readability readable reader readers +readied readier readies readiest readily readiness reading readings readjusted +readout readouts reads ready readying real realest realign realigned realigning +realigns realism realist realistic realistically realists realities reality +} + +do_test 1.1 { + execsql { CREATE VIRTUAL TABLE t1 USING spellfix1 } + foreach word $vocab { + execsql { INSERT INTO t1(word) VALUES($word) } + } +} {} + +foreach {tn word res} { + 1 raxpi* {rasping 5 rasped 5 raspberry 6 rasp 4 rasps 4} + 2 ril* {rail 4 railway 4 railing 4 rails 4 railways 4} + 3 rilis* {realist 6 realistic 6 realistically 6 realists 6 realism 6} + 4 reail* {realities 3 reality 3 real 3 realest 3 realist 3} + 5 ras* {rasp 3 rash 3 rasped 3 rasping 3 rasps 3} + 6 realistss* {realists 8 realigns 8 realistic 9 realistically 9 realest 7} + 7 realistss {realists 8 realist 7 realigns 8 realistic 9 realest 7} + 8 rllation* {realities 9 reality 7 rallied 7 railed 4} + 9 renstom* {rainstorm 8 ransomer 6 ransom 6 ransoming 6 ransoms 6} +} { + do_execsql_test 1.2.$tn { + SELECT word, matchlen FROM t1 WHERE word MATCH $word LIMIT 5 + } $res +} + + +do_execsql_test 2.1 { + CREATE VIRTUAL TABLE t2 USING spellfix1; + INSERT INTO t2 (word, soundslike) VALUES('school', 'skuul'); + INSERT INTO t2 (word, soundslike) VALUES('psalm', 'sarm'); + SELECT word, matchlen FROM t2 WHERE word MATCH 'sar*' LIMIT 5; +} {psalm 4} + +do_execsql_test 2.2 { + SELECT word, matchlen FROM t2 WHERE word MATCH 'skol*' LIMIT 5; +} {school 6} + +set vocab { +kangaroo kanji kappa karate keel keeled keeling keels keen keener keenest +keenly keenness keep keeper keepers keeping keeps ken kennel kennels kept +kerchief kerchiefs kern kernel kernels kerosene ketchup kettle +kettles key keyboard keyboards keyed keyhole keying keynote keypad keypads keys +keystroke keystrokes keyword keywords kick kicked kicker kickers kicking +kickoff kicks kid kidded kiddie kidding kidnap kidnapper kidnappers kidnapping +kidnappings kidnaps kidney kidneys kids kill killed killer killers killing +killingly killings killjoy kills kilobit kilobits kiloblock kilobyte kilobytes +kilogram kilograms kilohertz kilohm kilojoule kilometer kilometers kiloton +kilovolt kilowatt kiloword kimono kin kind kinder kindergarten kindest +kindhearted kindle kindled kindles kindling kindly kindness kindred kinds +kinetic king kingdom kingdoms kingly kingpin kings kink kinky kinship kinsman +kiosk kiss kissed kisser kissers kisses kissing kit kitchen kitchenette +kitchens kite kited kites kiting kits kitten kittenish kittens kitty klaxon +kludge kludges klystron knack knapsack knapsacks knave knaves knead kneads knee +kneecap kneed kneeing kneel kneeled kneeling kneels knees knell knells knelt +knew knife knifed knifes knifing knight knighted knighthood knighting knightly +knights knit knits knives knob knobs knock knockdown knocked knocker knockers +knocking knockout knocks knoll knolls knot knots knotted knotting know knowable +knower knowhow knowing knowingly knowledge knowledgeable known knows knuckle +knuckled knuckles koala kosher kudo +} + +do_execsql_test 3.1 { + CREATE TABLE costs(iLang, cFrom, cTo, iCost); + INSERT INTO costs VALUES(0, 'a', 'e', 1); + INSERT INTO costs VALUES(0, 'e', 'i', 1); + INSERT INTO costs VALUES(0, 'i', 'o', 1); + INSERT INTO costs VALUES(0, 'o', 'u', 1); + INSERT INTO costs VALUES(0, 'u', 'a', 1); + CREATE VIRTUAL TABLE t3 USING spellfix1(edit_cost_table=costs); +} + +do_test 3.2 { + foreach w $vocab { + execsql { INSERT INTO t3(word) VALUES($w) } + } +} {} + +breakpoint +foreach {tn word res} { + 1 kos* {kosher 3 kiosk 4 kudo 2 kappa 1 keypad 1} + 2 kellj* {killjoy 5 killed 4 killingly 4 kill 4 killer 4} + 3 kellj {kill 4 kills 5 killjoy 7 keel 4 killed 6} +} { + do_execsql_test 1.2.$tn { + SELECT word, matchlen FROM t3 WHERE word MATCH $word LIMIT 5 + } $res +} + +finish_test