** by default (unless overridden by ORDER BY) returns
** results in order of increasing score.
**
+** matchlen For prefix queries, the number of characters in the prefix
+** of the returned value (word) that matched the query term.
+** For non-prefix queries, the number of characters in the
+** returned value.
+**
** top (HIDDEN) For any query, this value is the same on all
** rows. It is an integer which is the maximum number of
** rows that will be output. The actual number of rows
** -1 One of the inputs is NULL
** -2 Non-ASCII characters on input
** -3 Unable to allocate memory
+**
+** If pnMatch is not NULL, then *pnMatch is set to the number of bytes
+** of zB that matched the pattern in zA. If zA does not end with a '*',
+** then this value is always the number of bytes in zB (i.e. strlen(zB)).
+** If zA does end in a '*', then it is the number of bytes in the prefix
+** of zB that was deemed to match zA.
*/
-static int editdist1(const char *zA, const char *zB, int iLangId){
+static int editdist1(const char *zA, const char *zB, int iLangId, int *pnMatch){
int nA, nB; /* Number of characters in zA[] and zB[] */
int xA, xB; /* Loop counters for zA[] and zB[] */
char cA, cB; /* Current character of zA and zB */
char *cx; /* Corresponding character values */
int *toFree = 0; /* Malloced space */
int mStack[60+15]; /* Stack space to use if not too much is needed */
+ int nMatch = 0;
/* Early out if either input is NULL */
if( zA==0 || zB==0 ) return -1;
/* Skip any common prefix */
- while( zA[0] && zA[0]==zB[0] ){ dc = zA[0]; zA++; zB++; }
+ while( zA[0] && zA[0]==zB[0] ){ dc = zA[0]; zA++; zB++; nMatch++; }
+ if( pnMatch ) *pnMatch = nMatch;
if( zA[0]==0 && zB[0]==0 ) return 0;
#if 0
if( cA=='*' ){
res = m[1];
for(xB=1; xB<=nB; xB++){
- if( m[xB]<res ) res = m[xB];
+ if( m[xB]<res ){
+ res = m[xB];
+ if( pnMatch ) *pnMatch = xB+nMatch;
+ }
}
}else{
res = m[nB];
+ if( pnMatch ) *pnMatch = -1;
}
sqlite3_free(toFree);
return res;
int res = editdist1(
(const char*)sqlite3_value_text(argv[0]),
(const char*)sqlite3_value_text(argv[1]),
- langid);
+ langid, 0);
if( res<0 ){
if( res==(-3) ){
sqlite3_result_error_nomem(context);
/* Compute the edit distance between two strings.
**
** If an error occurs, return a negative number which is the error code.
+**
+** If pnMatch is not NULL, then *pnMatch is set to the number of characters
+** (not bytes) in z2 that matched the search pattern in *pFrom. If pFrom does
+** not contain the pattern for a prefix-search, then this is always the number
+** of characters in z2. If pFrom does contain a prefix search pattern, then
+** it is the number of characters in the prefix of z2 that was deemed to
+** match pFrom.
*/
static int editDist3Core(
EditDist3FromString *pFrom, /* The FROM string */
const char *z2, /* The TO string */
int n2, /* Length of the TO string */
- const EditDist3Lang *pLang /* Edit weights for a particular language ID */
+ const EditDist3Lang *pLang, /* Edit weights for a particular language ID */
+ int *pnMatch /* OUT: Characters in matched prefix */
){
int k, n;
int i1, b1;
/* Free memory allocations and return the result */
res = (int)m[szRow*(n2+1)-1];
if( f.isPrefix ){
- for(i2=f.n; i2<n2; i2++){
+ /* pnMatch is an optional output: callers that only want the distance
+ ** pass NULL, so every store through it must be guarded. */
+ if( pnMatch ) *pnMatch = n2;
+ for(i2=1; i2<=n2; i2++){
int b = m[szRow*i2-1];
- if( b<res ) res = b;
+ if( b<=res ){
+ res = b;
+ if( pnMatch ) *pnMatch = i2-1;
+ }
}
+ }else if( pnMatch ){
+ *pnMatch = n2;
}
editDist3Abort:
sqlite3_result_error_nomem(context);
return;
}
- dist = editDist3Core(pFrom, zB, nB, pLang);
+ dist = editDist3Core(pFrom, zB, nB, pLang, 0);
editDist3FromStringDelete(pFrom);
sqlite3_result_int(context, dist);
}
return c;
}
+/*
+** Count the UTF-8 characters stored in the first nIn bytes of zIn.
+**
+** The buffer is walked one character at a time using utf8Read(), which
+** reports how many bytes each character occupies.  Scanning stops as
+** soon as nIn or more bytes have been consumed.
+*/
+static int utf8Charlen(const char *zIn, int nIn){
+  int nChar = 0;                 /* Characters counted so far */
+  int iOff = 0;                  /* Byte offset of the next character */
+  while( iOff<nIn ){
+    int nByte;                   /* Size in bytes of the current character */
+    utf8Read((const unsigned char *)&zIn[iOff], nIn-iOff, &nByte);
+    iOff += nByte;
+    nChar++;
+  }
+  return nChar;
+}
+
/*
** Table of translations from unicode characters into ASCII.
*/
return zOut;
}
+/*
+** Map a length measured in bytes of transliterated (ASCII) text back to
+** a length measured in characters of the original UTF-8 input.
+**
+** The return value is the number of characters in the shortest prefix
+** of zIn (nIn bytes) whose transliteration is at least nTrans bytes
+** long.  If the whole input transliterates to fewer than nTrans bytes,
+** the number of characters in the whole input is returned.
+**
+** Output size is estimated per input character: every character counts
+** for one byte; a non-ASCII character found in translit[] with a
+** non-zero cTo1 counts for one more; and U+0429 / U+0449 count for a
+** further two.
+*/
+static int translen_to_charlen(const char *zIn, int nIn, int nTrans){
+  int iOff = 0;                  /* Byte offset of the next character */
+  int nAscii = 0;                /* Transliterated bytes accounted for */
+  int nChar = 0;                 /* Input characters consumed */
+
+  while( iOff<nIn && nAscii<nTrans ){
+    int nByte;                   /* Size in bytes of the current character */
+    int c = utf8Read((const unsigned char *)&zIn[iOff], nIn-iOff, &nByte);
+    iOff += nByte;
+    nChar++;
+    nAscii++;
+    if( c<128 ) continue;
+
+    /* Binary search of translit[] for the entry describing c.  A
+    ** character with no entry contributes just the single byte already
+    ** counted above. */
+    {
+      int iLo = 0;
+      int iHi = sizeof(translit)/sizeof(translit[0]) - 1;
+      while( iLo<=iHi ){
+        int iMid = (iHi + iLo)/2;
+        if( translit[iMid].cFrom>c ){
+          iHi = iMid-1;
+        }else if( translit[iMid].cFrom==c ){
+          if( translit[iMid].cTo1 ) nAscii++;
+          if( c==0x0429 || c==0x0449 ) nAscii += 2;
+          break;
+        }else{
+          iLo = iMid+1;
+        }
+      }
+    }
+  }
+
+  return nChar;
+}
+
+
/*
** spellfix1_translit(X)
**
struct spellfix1_cursor {
sqlite3_vtab_cursor base; /* Base class - must be first */
spellfix1_vtab *pVTab; /* The table to which this cursor belongs */
+ char *zPattern; /* rhs of MATCH clause */
int nRow; /* Number of rows of content */
int nAlloc; /* Number of allocated rows */
int iRow; /* Current row of content */
int iRank; /* Rank for this row */
int iDistance; /* Distance from pattern for this row */
int iScore; /* Score for sorting */
+ int iMatchlen; /* Value of matchlen column (or -1) */
char zHash[SPELLFIX_MX_HASH]; /* the phonehash used for this match */
} *a;
};
** argv[0] -> module name ("spellfix1")
** argv[1] -> database name
** argv[2] -> table name
-** argv[3].. -> optional arguments (currently ignored)
+** argv[3].. -> optional arguments (i.e. "edit_cost_table" parameter)
*/
static int spellfix1Init(
int isCreate,
rc = SQLITE_NOMEM;
}else{
rc = sqlite3_declare_vtab(db,
- "CREATE TABLE x(word,rank,distance,langid,"
- "score, phonehash,top HIDDEN,scope HIDDEN,srchcnt HIDDEN,"
- "soundslike HIDDEN,command HIDDEN)"
+ "CREATE TABLE x(word,rank,distance,langid, "
+ "score, matchlen, phonehash, "
+ "top HIDDEN, scope HIDDEN, srchcnt HIDDEN, "
+ "soundslike HIDDEN, command HIDDEN)"
);
#define SPELLFIX_COL_WORD 0
#define SPELLFIX_COL_RANK 1
#define SPELLFIX_COL_DISTANCE 2
#define SPELLFIX_COL_LANGID 3
#define SPELLFIX_COL_SCORE 4
-#define SPELLFIX_COL_PHONEHASH 5
-#define SPELLFIX_COL_TOP 6
-#define SPELLFIX_COL_SCOPE 7
-#define SPELLFIX_COL_SRCHCNT 8
-#define SPELLFIX_COL_SOUNDSLIKE 9
-#define SPELLFIX_COL_COMMAND 10
+#define SPELLFIX_COL_MATCHLEN 5
+#define SPELLFIX_COL_PHONEHASH 6
+#define SPELLFIX_COL_TOP 7
+#define SPELLFIX_COL_SCOPE 8
+#define SPELLFIX_COL_SRCHCNT 9
+#define SPELLFIX_COL_SOUNDSLIKE 10
+#define SPELLFIX_COL_COMMAND 11
}
if( rc==SQLITE_OK && isCreate ){
sqlite3_uint64 r;
spellfix1_cursor *pCur = (spellfix1_cursor *)cur;
spellfix1ResetCursor(pCur);
spellfix1ResizeCursor(pCur, 0);
+ sqlite3_free(pCur->zPattern);
sqlite3_free(pCur);
return SQLITE_OK;
}
}
}
while( sqlite3_step(pStmt)==SQLITE_ROW ){
+ int iMatchlen = -1;
iRank = sqlite3_column_int(pStmt, 2);
if( p->pMatchStr3 ){
int nWord = sqlite3_column_bytes(pStmt, 1);
zWord = (const char*)sqlite3_column_text(pStmt, 1);
- iDist = editDist3Core(p->pMatchStr3, zWord, nWord, p->pLang);
+ iDist = editDist3Core(p->pMatchStr3, zWord, nWord, p->pLang, &iMatchlen);
}else{
zK1 = (const char*)sqlite3_column_text(pStmt, 3);
if( zK1==0 ) continue;
- iDist = editdist1(p->zPattern, zK1, pCur->iLang);
+ iDist = editdist1(p->zPattern, zK1, pCur->iLang, 0);
}
pCur->nSearch++;
iScore = spellfix1Score(iDist,iRank);
pCur->a[idx].iRank = iRank;
pCur->a[idx].iDistance = iDist;
pCur->a[idx].iScore = iScore;
+ pCur->a[idx].iMatchlen = iMatchlen;
memcpy(pCur->a[idx].zHash, zHash1, iScope+1);
if( pCur->nRow<pCur->nAlloc ) pCur->nRow++;
if( pCur->nRow==pCur->nAlloc ){
x.pLang = 0;
}
zPattern = (char*)transliterate(zMatchThis, sqlite3_value_bytes(argv[0]));
+ sqlite3_free(pCur->zPattern);
+ pCur->zPattern = zPattern;
if( zPattern==0 ) return SQLITE_NOMEM;
nPattern = strlen(zPattern);
if( zPattern[nPattern-1]=='*' ) nPattern--;
pCur->iScope = iScope;
}
sqlite3_finalize(pStmt);
- sqlite3_free(zPattern);
editDist3FromStringDelete(pMatchStr3);
return pCur->a ? x.rc : SQLITE_NOMEM;
}
sqlite3_result_int(ctx, pCur->a[pCur->iRow].iScore);
break;
}
+ /* "matchlen" column: number of characters of the returned word that
+ ** matched the query pattern. */
+ case SPELLFIX_COL_MATCHLEN: {
+ int iMatchlen = pCur->a[pCur->iRow].iMatchlen;
+ /* A negative stored value means no match length was recorded for this
+ ** row when the search ran, so it is computed here on demand. */
+ if( iMatchlen<0 ){
+ int nPattern = strlen(pCur->zPattern);
+ char *zWord = pCur->a[pCur->iRow].zWord;
+ int nWord = strlen(zWord);
+
+ if( nPattern>0 && pCur->zPattern[nPattern-1]=='*' ){
+ /* Prefix pattern: compare the pattern with the transliteration of
+ ** the word to learn how many transliterated bytes matched, then
+ ** convert that byte count to characters of the original word. */
+ char *zTranslit;
+ int res;
+ zTranslit = (char *)transliterate((unsigned char *)zWord, nWord);
+ if( !zTranslit ) return SQLITE_NOMEM;
+ res = editdist1(pCur->zPattern, zTranslit, pCur->iLang, &iMatchlen);
+ sqlite3_free(zTranslit);
+ /* NOTE(review): every negative editdist1() result is reported as
+ ** SQLITE_NOMEM here, not only the allocation failure code. */
+ if( res<0 ) return SQLITE_NOMEM;
+ iMatchlen = translen_to_charlen(zWord, nWord, iMatchlen);
+ }else{
+ /* Non-prefix pattern: matchlen is the character count of the word. */
+ iMatchlen = utf8Charlen(zWord, nWord);
+ }
+ }
+
+ sqlite3_result_int(ctx, iMatchlen);
+ break;
+ }
case SPELLFIX_COL_PHONEHASH: {
sqlite3_result_text(ctx, pCur->a[pCur->iRow].zHash, -1, SQLITE_STATIC);
break;
--- /dev/null
+# 2012 July 12
+#
+# The author disclaims copyright to this source code. In place of
+# a legal notice, here is a blessing:
+#
+# May you do good and not evil.
+# May you find forgiveness for yourself and forgive others.
+# May you share freely, never taking more than you give.
+#
+#***********************************************************************
+#
+
+set testdir [file dirname $argv0]
+source $testdir/tester.tcl
+set testprefix spellfix
+
+register_spellfix_module db
+
+set vocab {
+rabbi rabbit rabbits rabble rabid rabies raccoon raccoons race raced racer
+racers races racetrack racial racially racing rack racked racket racketeer
+racketeering racketeers rackets racking racks radar radars radial radially
+radian radiance radiant radiantly radiate radiated radiates radiating radiation
+radiations radiator radiators radical radically radicals radices radii radio
+radioactive radioastronomy radioed radiography radioing radiology radios radish
+radishes radium radius radix radon raft rafter rafters rafts rag rage raged
+rages ragged raggedly raggedness raging rags ragweed raid raided raider raiders
+raiding raids rail railed railer railers railing railroad railroaded railroader
+railroaders railroading railroads rails railway railways raiment rain rainbow
+raincoat raincoats raindrop raindrops rained rainfall rainier rainiest raining
+rains rainstorm rainy raise raised raiser raisers raises raisin raising rake
+raked rakes raking rallied rallies rally rallying ram ramble rambler rambles
+rambling ramblings ramification ramifications ramp rampage rampant rampart
+ramps ramrod rams ran ranch ranched rancher ranchers ranches ranching rancid
+random randomization randomize randomized randomizes randomly randomness randy
+rang range ranged rangeland ranger rangers ranges ranging rangy rank ranked
+ranker rankers rankest ranking rankings rankle rankly rankness ranks ransack
+ransacked ransacking ransacks ransom ransomer ransoming ransoms rant ranted
+ranter ranters ranting rants rap rapacious rape raped raper rapes rapid
+rapidity rapidly rapids rapier raping rapport rapprochement raps rapt raptly
+rapture raptures rapturous rare rarely rareness rarer rarest rarity rascal
+rascally rascals rash rasher rashly rashness rasp raspberry rasped rasping
+rasps raster rat rate rated rater raters rates rather ratification ratified
+ratifies ratify ratifying rating ratings ratio ration rational rationale
+rationales rationalities rationality rationalization rationalizations
+rationalize rationalized rationalizes rationalizing rationally rationals
+rationing rations ratios rats rattle rattled rattler rattlers rattles
+rattlesnake rattlesnakes rattling raucous ravage ravaged ravager ravagers
+ravages ravaging rave raved raven ravening ravenous ravenously ravens raves
+ravine ravines raving ravings raw rawer rawest rawly rawness ray rays raze
+razor razors re reabbreviate reabbreviated reabbreviates reabbreviating reach
+reachability reachable reachably reached reacher reaches reaching reacquired
+react reacted reacting reaction reactionaries reactionary reactions reactivate
+reactivated reactivates reactivating reactivation reactive reactively
+reactivity reactor reactors reacts read readability readable reader readers
+readied readier readies readiest readily readiness reading readings readjusted
+readout readouts reads ready readying real realest realign realigned realigning
+realigns realism realist realistic realistically realists realities reality
+}
+
+# 1.1: create a spellfix1 table with default settings and insert every
+# word of $vocab into it.
+do_test 1.1 {
+ execsql { CREATE VIRTUAL TABLE t1 USING spellfix1 }
+ foreach word $vocab {
+ execsql { INSERT INTO t1(word) VALUES($word) }
+ }
+} {}
+
+# 1.2.*: each case is {test-number pattern expected}. The expected value
+# is the flattened list of (word, matchlen) pairs for the first five rows
+# returned. Patterns ending in '*' are prefix queries; case 7 is the
+# only non-prefix query.
+foreach {tn word res} {
+ 1 raxpi* {rasping 5 rasped 5 raspberry 6 rasp 4 rasps 4}
+ 2 ril* {rail 4 railway 4 railing 4 rails 4 railways 4}
+ 3 rilis* {realist 6 realistic 6 realistically 6 realists 6 realism 6}
+ 4 reail* {realities 3 reality 3 real 3 realest 3 realist 3}
+ 5 ras* {rasp 3 rash 3 rasped 3 rasping 3 rasps 3}
+ 6 realistss* {realists 8 realigns 8 realistic 9 realistically 9 realest 7}
+ 7 realistss {realists 8 realist 7 realigns 8 realistic 9 realest 7}
+ 8 rllation* {realities 9 reality 7 rallied 7 railed 4}
+ 9 renstom* {rainstorm 8 ransomer 6 ransom 6 ransoming 6 ransoms 6}
+} {
+ do_execsql_test 1.2.$tn {
+ SELECT word, matchlen FROM t1 WHERE word MATCH $word LIMIT 5
+ } $res
+}
+
+
+# 2.1: rows inserted with an explicit soundslike value that differs from
+# the word. The prefix pattern 'sar*' is expected to return 'psalm' with
+# a matchlen of 4.
+do_execsql_test 2.1 {
+ CREATE VIRTUAL TABLE t2 USING spellfix1;
+ INSERT INTO t2 (word, soundslike) VALUES('school', 'skuul');
+ INSERT INTO t2 (word, soundslike) VALUES('psalm', 'sarm');
+ SELECT word, matchlen FROM t2 WHERE word MATCH 'sar*' LIMIT 5;
+} {psalm 4}
+
+# 2.2: same table; the prefix pattern 'skol*' is expected to return
+# 'school' with a matchlen of 6.
+do_execsql_test 2.2 {
+ SELECT word, matchlen FROM t2 WHERE word MATCH 'skol*' LIMIT 5;
+} {school 6}
+
+set vocab {
+kangaroo kanji kappa karate keel keeled keeling keels keen keener keenest
+keenly keenness keep keeper keepers keeping keeps ken kennel kennels kept
+kerchief kerchiefs kern kernel kernels kerosene ketchup kettle
+kettles key keyboard keyboards keyed keyhole keying keynote keypad keypads keys
+keystroke keystrokes keyword keywords kick kicked kicker kickers kicking
+kickoff kicks kid kidded kiddie kidding kidnap kidnapper kidnappers kidnapping
+kidnappings kidnaps kidney kidneys kids kill killed killer killers killing
+killingly killings killjoy kills kilobit kilobits kiloblock kilobyte kilobytes
+kilogram kilograms kilohertz kilohm kilojoule kilometer kilometers kiloton
+kilovolt kilowatt kiloword kimono kin kind kinder kindergarten kindest
+kindhearted kindle kindled kindles kindling kindly kindness kindred kinds
+kinetic king kingdom kingdoms kingly kingpin kings kink kinky kinship kinsman
+kiosk kiss kissed kisser kissers kisses kissing kit kitchen kitchenette
+kitchens kite kited kites kiting kits kitten kittenish kittens kitty klaxon
+kludge kludges klystron knack knapsack knapsacks knave knaves knead kneads knee
+kneecap kneed kneeing kneel kneeled kneeling kneels knees knell knells knelt
+knew knife knifed knifes knifing knight knighted knighthood knighting knightly
+knights knit knits knives knob knobs knock knockdown knocked knocker knockers
+knocking knockout knocks knoll knolls knot knots knotted knotting know knowable
+knower knowhow knowing knowingly knowledge knowledgeable known knows knuckle
+knuckled knuckles koala kosher kudo
+}
+
+# 3.1: build a cost table that makes the vowel substitutions a->e, e->i,
+# i->o, o->u and u->a cost 1 for language 0, then create a spellfix1
+# table configured to use it through the edit_cost_table parameter.
+do_execsql_test 3.1 {
+ CREATE TABLE costs(iLang, cFrom, cTo, iCost);
+ INSERT INTO costs VALUES(0, 'a', 'e', 1);
+ INSERT INTO costs VALUES(0, 'e', 'i', 1);
+ INSERT INTO costs VALUES(0, 'i', 'o', 1);
+ INSERT INTO costs VALUES(0, 'o', 'u', 1);
+ INSERT INTO costs VALUES(0, 'u', 'a', 1);
+ CREATE VIRTUAL TABLE t3 USING spellfix1(edit_cost_table=costs);
+}
+
+# 3.2: load the current $vocab word list into the cost-table-backed table.
+do_test 3.2 {
+ foreach w $vocab {
+ execsql { INSERT INTO t3(word) VALUES($w) }
+ }
+} {}
+
+# 3.3.*: prefix (cases 1 and 2) and non-prefix (case 3) queries against
+# t3, the table created with edit_cost_table=costs. The expected value
+# is the flattened list of (word, matchlen) pairs for the first five
+# rows. The test names carry the 3.x prefix so they do not collide with
+# the 1.2.* cases that query t1.
+foreach {tn word res} {
+ 1 kos* {kosher 3 kiosk 4 kudo 2 kappa 1 keypad 1}
+ 2 kellj* {killjoy 5 killed 4 killingly 4 kill 4 killer 4}
+ 3 kellj {kill 4 kills 5 killjoy 7 keel 4 killed 6}
+} {
+ do_execsql_test 3.3.$tn {
+ SELECT word, matchlen FROM t3 WHERE word MATCH $word LIMIT 5
+ } $res
+}
+
+finish_test