From: dan Date: Sat, 17 Jan 2015 17:48:10 +0000 (+0000) Subject: Improve the performance of the fts5 porter tokenizer implementation. X-Git-Tag: version-3.8.11~114^2~105 X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=2656167f6e6f71463c99de0474e64517f9491b3e;p=thirdparty%2Fsqlite.git Improve the performance of the fts5 porter tokenizer implementation. FossilOrigin-Name: 96ea600440de05ee663e71c3f0d0de2c64108bf9 --- diff --git a/ext/fts5/fts5_tokenize.c b/ext/fts5/fts5_tokenize.c index feb3513a46..b62f50bf82 100644 --- a/ext/fts5/fts5_tokenize.c +++ b/ext/fts5/fts5_tokenize.c @@ -443,6 +443,7 @@ static int fts5UnicodeTokenize( rc = SQLITE_NOMEM; goto tokenize_done; } + zOut = &aFold[zOut - p->aFold]; memcpy(aFold, p->aFold, nFold); sqlite3_free(p->aFold); p->aFold = aFold; @@ -528,7 +529,7 @@ static int fts5PorterCreate( pRet = (PorterTokenizer*)sqlite3_malloc(sizeof(PorterTokenizer)); if( pRet ){ memset(pRet, 0, sizeof(PorterTokenizer)); - rc = pApi->xFindTokenizer(pApi, "ascii", &pUserdata, &pRet->tokenizer); + rc = pApi->xFindTokenizer(pApi, "unicode61", &pUserdata, &pRet->tokenizer); }else{ rc = SQLITE_NOMEM; } @@ -666,105 +667,459 @@ static int fts5Porter_Vowel(char *zStem, int nStem){ return 0; } -static int fts5PorterCb( - void *pCtx, - const char *pToken, - int nToken, - int iStart, - int iEnd -){ - PorterContext *p = (PorterContext*)pCtx; - PorterRule aStep1A[] = { - { "sses", 4, 0, "ss", 2 }, - { "ies", 3, 0, "i", 1 }, - { "ss", 2, 0, "ss", 2 }, - { "s", 1, 0, "", 0 }, - { 0, 0, 0, 0 } - }; +/************************************************************************** +*************************************************************************** +** GENERATED CODE STARTS HERE (mkportersteps.tcl) +*/ - PorterRule aStep1B[] = { - { "eed", 3, fts5Porter_MGt0, "ee", 2 }, - { "ed", 2, fts5Porter_Vowel, "", 0 }, - { "ing", 3, fts5Porter_Vowel, "", 0 }, - { 0, 0, 0, 0 } - }; +static int fts5PorterStep4(char *aBuf, int *pnBuf){ + int ret = 0; + int nBuf = *pnBuf; + switch( aBuf[nBuf-2] ){ + + case 'a': + if( nBuf>2 && 0==memcmp("al", &aBuf[nBuf-2], 2) ){ + if( fts5Porter_MGt1(aBuf, nBuf-2) ){ + *pnBuf = nBuf - 2; + } + } + break; + + case 'c': + if( nBuf>4 && 0==memcmp("ance", &aBuf[nBuf-4], 4) ){ + if( fts5Porter_MGt1(aBuf, nBuf-4) ){ + *pnBuf = nBuf - 4; + } + }else if( nBuf>4 && 0==memcmp("ence", &aBuf[nBuf-4], 4) ){ + if( fts5Porter_MGt1(aBuf, nBuf-4) ){ + *pnBuf = nBuf - 4; + } + } + break; + + case 'e': + if( nBuf>2 && 0==memcmp("er", &aBuf[nBuf-2], 2) ){ + if( fts5Porter_MGt1(aBuf, nBuf-2) ){ + *pnBuf = nBuf - 2; + } + } + break; + + case 'i': + if( nBuf>2 && 0==memcmp("ic", &aBuf[nBuf-2], 2) ){ + if( fts5Porter_MGt1(aBuf, nBuf-2) ){ + *pnBuf = nBuf - 2; + } + } + break; + + case 'l': + if( nBuf>4 && 0==memcmp("able", &aBuf[nBuf-4], 4) ){ + if( fts5Porter_MGt1(aBuf, nBuf-4) ){ + *pnBuf = nBuf - 4; + } + }else if( nBuf>4 && 0==memcmp("ible", &aBuf[nBuf-4], 4) ){ + if( fts5Porter_MGt1(aBuf, nBuf-4) ){ + *pnBuf = nBuf - 4; + } + } + break; + + case 'n': + if( nBuf>3 && 0==memcmp("ant", &aBuf[nBuf-3], 3) ){ + if( fts5Porter_MGt1(aBuf, nBuf-3) ){ + *pnBuf = nBuf - 3; + } + }else if( nBuf>5 && 0==memcmp("ement", &aBuf[nBuf-5], 5) ){ + if( fts5Porter_MGt1(aBuf, nBuf-5) ){ + *pnBuf = nBuf - 5; + } + }else if( nBuf>4 && 0==memcmp("ment", &aBuf[nBuf-4], 4) ){ + if( fts5Porter_MGt1(aBuf, nBuf-4) ){ + *pnBuf = nBuf - 4; + } + }else if( nBuf>3 && 0==memcmp("ent", &aBuf[nBuf-3], 3) ){ + if( fts5Porter_MGt1(aBuf, nBuf-3) ){ + *pnBuf = nBuf - 3; + } + } + break; + + case 'o': + if( nBuf>3 && 0==memcmp("ion", &aBuf[nBuf-3], 3) ){ + if( fts5Porter_MGt1_and_S_or_T(aBuf, nBuf-3) ){ + *pnBuf = nBuf - 3; + } + }else if( nBuf>2 && 0==memcmp("ou", &aBuf[nBuf-2], 2) ){ + if( fts5Porter_MGt1(aBuf, nBuf-2) ){ + *pnBuf = nBuf - 2; + } + } + break; + + case 's': + if( nBuf>3 && 0==memcmp("ism", &aBuf[nBuf-3], 3) ){ + if( fts5Porter_MGt1(aBuf, nBuf-3) ){ + *pnBuf = nBuf - 3; + } + } + break; + + case 't': + if( nBuf>3 && 0==memcmp("ate", &aBuf[nBuf-3], 3) ){ + if( fts5Porter_MGt1(aBuf, nBuf-3) ){ + *pnBuf = nBuf - 3; + } + }else if( nBuf>3 && 0==memcmp("iti", &aBuf[nBuf-3], 3) ){ + if( fts5Porter_MGt1(aBuf, nBuf-3) ){ + *pnBuf = nBuf - 3; + } + } + break; + + case 'u': + if( nBuf>3 && 0==memcmp("ous", &aBuf[nBuf-3], 3) ){ + if( fts5Porter_MGt1(aBuf, nBuf-3) ){ + *pnBuf = nBuf - 3; + } + } + break; + + case 'v': + if( nBuf>3 && 0==memcmp("ive", &aBuf[nBuf-3], 3) ){ + if( fts5Porter_MGt1(aBuf, nBuf-3) ){ + *pnBuf = nBuf - 3; + } + } + break; + + case 'z': + if( nBuf>3 && 0==memcmp("ize", &aBuf[nBuf-3], 3) ){ + if( fts5Porter_MGt1(aBuf, nBuf-3) ){ + *pnBuf = nBuf - 3; + } + } + break; + + } + return ret; +} + - PorterRule aStep1B2[] = { - { "at", 2, 0, "ate", 3 }, - { "bl", 2, 0, "ble", 3 }, - { "iz", 2, 0, "ize", 3 }, - { 0, 0, 0, 0 } - }; +static int fts5PorterStep1B2(char *aBuf, int *pnBuf){ + int ret = 0; + int nBuf = *pnBuf; + switch( aBuf[nBuf-2] ){ + + case 'a': + if( nBuf>2 && 0==memcmp("at", &aBuf[nBuf-2], 2) ){ + memcpy(&aBuf[nBuf-2], "ate", 3); + *pnBuf = nBuf - 2 + 3; + ret = 1; + } + break; + + case 'b': + if( nBuf>2 && 0==memcmp("bl", &aBuf[nBuf-2], 2) ){ + memcpy(&aBuf[nBuf-2], "ble", 3); + *pnBuf = nBuf - 2 + 3; + ret = 1; + } + break; + + case 'i': + if( nBuf>2 && 0==memcmp("iz", &aBuf[nBuf-2], 2) ){ + memcpy(&aBuf[nBuf-2], "ize", 3); + *pnBuf = nBuf - 2 + 3; + ret = 1; + } + break; + + } + return ret; +} + - PorterRule aStep1C[] = { - { "y", 1, fts5Porter_Vowel, "i", 1 }, - { 0, 0, 0, 0 } - }; +static int fts5PorterStep2(char *aBuf, int *pnBuf){ + int ret = 0; + int nBuf = *pnBuf; + switch( aBuf[nBuf-2] ){ + + case 'a': + if( nBuf>7 && 0==memcmp("ational", &aBuf[nBuf-7], 7) ){ + if( fts5Porter_MGt0(aBuf, nBuf-7) ){ + memcpy(&aBuf[nBuf-7], "ate", 3); + *pnBuf = nBuf - 7 + 3; + } + }else if( nBuf>6 && 0==memcmp("tional", &aBuf[nBuf-6], 6) ){ + if( fts5Porter_MGt0(aBuf, nBuf-6) ){ + memcpy(&aBuf[nBuf-6], "tion", 4); + *pnBuf = nBuf - 6 + 4; + } + } + break; + + case 'c': + if( nBuf>4 && 0==memcmp("enci", &aBuf[nBuf-4], 4) ){ + if( fts5Porter_MGt0(aBuf, nBuf-4) ){ + memcpy(&aBuf[nBuf-4], "ence", 4); + *pnBuf = nBuf - 4 + 4; + } + }else if( nBuf>4 && 0==memcmp("anci", &aBuf[nBuf-4], 4) ){ + if( fts5Porter_MGt0(aBuf, nBuf-4) ){ + memcpy(&aBuf[nBuf-4], "ance", 4); + *pnBuf = nBuf - 4 + 4; + } + } + break; + + case 'e': + if( nBuf>4 && 0==memcmp("izer", &aBuf[nBuf-4], 4) ){ + if( fts5Porter_MGt0(aBuf, nBuf-4) ){ + memcpy(&aBuf[nBuf-4], "ize", 3); + *pnBuf = nBuf - 4 + 3; + } + } + break; + + case 'g': + if( nBuf>4 && 0==memcmp("logi", &aBuf[nBuf-4], 4) ){ + if( fts5Porter_MGt0(aBuf, nBuf-4) ){ + memcpy(&aBuf[nBuf-4], "log", 3); + *pnBuf = nBuf - 4 + 3; + } + } + break; + + case 'l': + if( nBuf>3 && 0==memcmp("bli", &aBuf[nBuf-3], 3) ){ + if( fts5Porter_MGt0(aBuf, nBuf-3) ){ + memcpy(&aBuf[nBuf-3], "ble", 3); + *pnBuf = nBuf - 3 + 3; + } + }else if( nBuf>4 && 0==memcmp("alli", &aBuf[nBuf-4], 4) ){ + if( fts5Porter_MGt0(aBuf, nBuf-4) ){ + memcpy(&aBuf[nBuf-4], "al", 2); + *pnBuf = nBuf - 4 + 2; + } + }else if( nBuf>5 && 0==memcmp("entli", &aBuf[nBuf-5], 5) ){ + if( fts5Porter_MGt0(aBuf, nBuf-5) ){ + memcpy(&aBuf[nBuf-5], "ent", 3); + *pnBuf = nBuf - 5 + 3; + } + }else if( nBuf>3 && 0==memcmp("eli", &aBuf[nBuf-3], 3) ){ + if( fts5Porter_MGt0(aBuf, nBuf-3) ){ + memcpy(&aBuf[nBuf-3], "e", 1); + *pnBuf = nBuf - 3 + 1; + } + }else if( nBuf>5 && 0==memcmp("ousli", &aBuf[nBuf-5], 5) ){ + if( fts5Porter_MGt0(aBuf, nBuf-5) ){ + memcpy(&aBuf[nBuf-5], "ous", 3); + *pnBuf = nBuf - 5 + 3; + } + } + break; + + case 'o': + if( nBuf>7 && 0==memcmp("ization", &aBuf[nBuf-7], 7) ){ + if( fts5Porter_MGt0(aBuf, nBuf-7) ){ + memcpy(&aBuf[nBuf-7], "ize", 3); + *pnBuf = nBuf - 7 + 3; + } + }else if( nBuf>5 && 0==memcmp("ation", &aBuf[nBuf-5], 5) ){ + if( fts5Porter_MGt0(aBuf, nBuf-5) ){ + memcpy(&aBuf[nBuf-5], "ate", 3); + *pnBuf = nBuf - 5 + 3; + } + }else if( nBuf>4 && 0==memcmp("ator", &aBuf[nBuf-4], 4) ){ + if( fts5Porter_MGt0(aBuf, nBuf-4) ){ + memcpy(&aBuf[nBuf-4], "ate", 3); + *pnBuf = nBuf - 4 + 3; + } + } + break; + + case 's': + if( nBuf>5 && 0==memcmp("alism", &aBuf[nBuf-5], 5) ){ + if( fts5Porter_MGt0(aBuf, nBuf-5) ){ + memcpy(&aBuf[nBuf-5], "al", 2); + *pnBuf = nBuf - 5 + 2; + } + }else if( nBuf>7 && 0==memcmp("iveness", &aBuf[nBuf-7], 7) ){ + if( fts5Porter_MGt0(aBuf, nBuf-7) ){ + memcpy(&aBuf[nBuf-7], "ive", 3); + *pnBuf = nBuf - 7 + 3; + } + }else if( nBuf>7 && 0==memcmp("fulness", &aBuf[nBuf-7], 7) ){ + if( fts5Porter_MGt0(aBuf, nBuf-7) ){ + memcpy(&aBuf[nBuf-7], "ful", 3); + *pnBuf = nBuf - 7 + 3; + } + }else if( nBuf>7 && 0==memcmp("ousness", &aBuf[nBuf-7], 7) ){ + if( fts5Porter_MGt0(aBuf, nBuf-7) ){ + memcpy(&aBuf[nBuf-7], "ous", 3); + *pnBuf = nBuf - 7 + 3; + } + } + break; + + case 't': + if( nBuf>5 && 0==memcmp("aliti", &aBuf[nBuf-5], 5) ){ + if( fts5Porter_MGt0(aBuf, nBuf-5) ){ + memcpy(&aBuf[nBuf-5], "al", 2); + *pnBuf = nBuf - 5 + 2; + } + }else if( nBuf>5 && 0==memcmp("iviti", &aBuf[nBuf-5], 5) ){ + if( fts5Porter_MGt0(aBuf, nBuf-5) ){ + memcpy(&aBuf[nBuf-5], "ive", 3); + *pnBuf = nBuf - 5 + 3; + } + }else if( nBuf>6 && 0==memcmp("biliti", &aBuf[nBuf-6], 6) ){ + if( fts5Porter_MGt0(aBuf, nBuf-6) ){ + memcpy(&aBuf[nBuf-6], "ble", 3); + *pnBuf = nBuf - 6 + 3; + } + } + break; + + } + return ret; +} + - PorterRule aStep2[] = { - { "ational", 7, fts5Porter_MGt0, "ate", 3}, - { "tional", 6, fts5Porter_MGt0, "tion", 4}, - { "enci", 4, fts5Porter_MGt0, "ence", 4}, - { "anci", 4, fts5Porter_MGt0, "ance", 4}, - { "izer", 4, fts5Porter_MGt0, "ize", 3}, - { "logi", 4, fts5Porter_MGt0, "log", 3}, /* added post 1979 */ - { "bli", 3, fts5Porter_MGt0, "ble", 3}, /* modified post 1979 */ - { "alli", 4, fts5Porter_MGt0, "al", 2}, - { "entli", 5, fts5Porter_MGt0, "ent", 3}, - { "eli", 3, fts5Porter_MGt0, "e", 1}, - { "ousli", 5, fts5Porter_MGt0, "ous", 3}, - { "ization", 7, fts5Porter_MGt0, "ize", 3}, - { "ation", 5, fts5Porter_MGt0, "ate", 3}, - { "ator", 4, fts5Porter_MGt0, "ate", 3}, - { "alism", 5, fts5Porter_MGt0, "al", 2}, - { "iveness", 7, fts5Porter_MGt0, "ive", 3}, - { "fulness", 7, fts5Porter_MGt0, "ful", 3}, - { "ousness", 7, fts5Porter_MGt0, "ous", 3}, - { "aliti", 5, fts5Porter_MGt0, "al", 2}, - { "iviti", 5, fts5Porter_MGt0, "ive", 3}, - { "biliti", 6, fts5Porter_MGt0, "ble", 3}, - { 0, 0, 0, 0 } - }; +static int fts5PorterStep3(char *aBuf, int *pnBuf){ + int ret = 0; + int nBuf = *pnBuf; + switch( aBuf[nBuf-2] ){ + + case 'a': + if( nBuf>4 && 0==memcmp("ical", &aBuf[nBuf-4], 4) ){ + if( fts5Porter_MGt0(aBuf, nBuf-4) ){ + memcpy(&aBuf[nBuf-4], "ic", 2); + *pnBuf = nBuf - 4 + 2; + } + } + break; + + case 's': + if( nBuf>4 && 0==memcmp("ness", &aBuf[nBuf-4], 4) ){ + if( fts5Porter_MGt0(aBuf, nBuf-4) ){ + *pnBuf = nBuf - 4; + } + } + break; + + case 't': + if( nBuf>5 && 0==memcmp("icate", &aBuf[nBuf-5], 5) ){ + if( fts5Porter_MGt0(aBuf, nBuf-5) ){ + memcpy(&aBuf[nBuf-5], "ic", 2); + *pnBuf = nBuf - 5 + 2; + } + }else if( nBuf>5 && 0==memcmp("iciti", &aBuf[nBuf-5], 5) ){ + if( fts5Porter_MGt0(aBuf, nBuf-5) ){ + memcpy(&aBuf[nBuf-5], "ic", 2); + *pnBuf = nBuf - 5 + 2; + } + } + break; + + case 'u': + if( nBuf>3 && 0==memcmp("ful", &aBuf[nBuf-3], 3) ){ + if( fts5Porter_MGt0(aBuf, nBuf-3) ){ + *pnBuf = nBuf - 3; + } + } + break; + + case 'v': + if( nBuf>5 && 0==memcmp("ative", &aBuf[nBuf-5], 5) ){ + if( fts5Porter_MGt0(aBuf, nBuf-5) ){ + *pnBuf = nBuf - 5; + } + } + break; + + case 'z': + if( nBuf>5 && 0==memcmp("alize", &aBuf[nBuf-5], 5) ){ + if( fts5Porter_MGt0(aBuf, nBuf-5) ){ + memcpy(&aBuf[nBuf-5], "al", 2); + *pnBuf = nBuf - 5 + 2; + } + } + break; + + } + return ret; +} + - PorterRule aStep3[] = { - { "icate", 5, fts5Porter_MGt0, "ic", 2}, - { "ative", 5, fts5Porter_MGt0, "", 0}, - { "alize", 5, fts5Porter_MGt0, "al", 2}, - { "iciti", 5, fts5Porter_MGt0, "ic", 2}, - { "ical", 4, fts5Porter_MGt0, "ic", 2}, - { "ful", 3, fts5Porter_MGt0, "", 0}, - { "ness", 4, fts5Porter_MGt0, "", 0}, - { 0, 0, 0, 0 } - }; +static int fts5PorterStep1B(char *aBuf, int *pnBuf){ + int ret = 0; + int nBuf = *pnBuf; + switch( aBuf[nBuf-2] ){ + + case 'e': + if( nBuf>3 && 0==memcmp("eed", &aBuf[nBuf-3], 3) ){ + if( fts5Porter_MGt0(aBuf, nBuf-3) ){ + memcpy(&aBuf[nBuf-3], "ee", 2); + *pnBuf = nBuf - 3 + 2; + } + }else if( nBuf>2 && 0==memcmp("ed", &aBuf[nBuf-2], 2) ){ + if( fts5Porter_Vowel(aBuf, nBuf-2) ){ + *pnBuf = nBuf - 2; + ret = 1; + } + } + break; + + case 'n': + if( nBuf>3 && 0==memcmp("ing", &aBuf[nBuf-3], 3) ){ + if( fts5Porter_Vowel(aBuf, nBuf-3) ){ + *pnBuf = nBuf - 3; + ret = 1; + } + } + break; + + } + return ret; +} + +/* +** GENERATED CODE ENDS HERE (mkportersteps.tcl) +*************************************************************************** +**************************************************************************/ - PorterRule aStep4[] = { - { "al", 2, fts5Porter_MGt1, "", 0}, - { "ance", 4, fts5Porter_MGt1, "", 0}, - { "ence", 4, fts5Porter_MGt1, "", 0}, - { "er", 2, fts5Porter_MGt1, "", 0}, - { "ic", 2, fts5Porter_MGt1, "", 0}, - { "able", 4, fts5Porter_MGt1, "", 0}, - { "ible", 4, fts5Porter_MGt1, "", 0}, - { "ant", 3, fts5Porter_MGt1, "", 0}, - { "ement", 5, fts5Porter_MGt1, "", 0}, - { "ment", 4, fts5Porter_MGt1, "", 0}, - { "ent", 3, fts5Porter_MGt1, "", 0}, - { "ion", 3, fts5Porter_MGt1_and_S_or_T, "", 0}, - { "ou", 2, fts5Porter_MGt1, "", 0}, - { "ism", 3, fts5Porter_MGt1, "", 0}, - { "ate", 3, fts5Porter_MGt1, "", 0}, - { "iti", 3, fts5Porter_MGt1, "", 0}, - { "ous", 3, fts5Porter_MGt1, "", 0}, - { "ive", 3, fts5Porter_MGt1, "", 0}, - { "ize", 3, fts5Porter_MGt1, "", 0}, - { 0, 0, 0, 0 } - }; +static void fts5PorterStep1A(char *aBuf, int *pnBuf){ + int nBuf = *pnBuf; + if( aBuf[nBuf-1]=='s' ){ + if( aBuf[nBuf-2]=='e' ){ + if( (nBuf>4 && aBuf[nBuf-4]=='s' && aBuf[nBuf-3]=='s') + || (nBuf>3 && aBuf[nBuf-3]=='i' ) + ){ + *pnBuf = nBuf-2; + }else{ + *pnBuf = nBuf-1; + } + } + else if( aBuf[nBuf-2]!='s' ){ + *pnBuf = nBuf-1; + } + } +} +static int fts5PorterCb( + void *pCtx, + const char *pToken, + int nToken, + int iStart, + int iEnd +){ + PorterContext *p = (PorterContext*)pCtx; char *aBuf; int nBuf; - int n; if( nToken>FTS5_PORTER_MAX_TOKEN || nToken<3 ) goto pass_through; aBuf = p->aBuf; @@ -772,10 +1127,9 @@ static int fts5PorterCb( memcpy(aBuf, pToken, nBuf); /* Step 1. */ - fts5PorterApply(aBuf, &nBuf, aStep1A); - n = fts5PorterApply(aBuf, &nBuf, aStep1B); - if( n==1 || n==2 ){ - if( fts5PorterApply(aBuf, &nBuf, aStep1B2)<0 ){ + fts5PorterStep1A(aBuf, &nBuf); + if( fts5PorterStep1B(aBuf, &nBuf) ){ + if( fts5PorterStep1B2(aBuf, &nBuf)==0 ){ char c = aBuf[nBuf-1]; if( fts5PorterIsVowel(c, 0)==0 && c!='l' && c!='s' && c!='z' && c==aBuf[nBuf-2] @@ -786,12 +1140,16 @@ static int fts5PorterCb( } } } - fts5PorterApply(aBuf, &nBuf, aStep1C); + + /* Step 1C. */ + if( aBuf[nBuf-1]=='y' && fts5Porter_Vowel(aBuf, nBuf-1) ){ + aBuf[nBuf-1] = 'i'; + } /* Steps 2 through 4. */ - fts5PorterApply(aBuf, &nBuf, aStep2); - fts5PorterApply(aBuf, &nBuf, aStep3); - fts5PorterApply(aBuf, &nBuf, aStep4); + fts5PorterStep2(aBuf, &nBuf); + fts5PorterStep3(aBuf, &nBuf); + fts5PorterStep4(aBuf, &nBuf); /* Step 5a. */ if( nBuf>0 && aBuf[nBuf-1]=='e' ){ diff --git a/ext/fts5/mkportersteps.tcl b/ext/fts5/mkportersteps.tcl new file mode 100644 index 0000000000..b6214c6bf7 --- /dev/null +++ b/ext/fts5/mkportersteps.tcl @@ -0,0 +1,222 @@ +# +# 2014 Jun 09 +# +# The author disclaims copyright to this source code. In place of +# a legal notice, here is a blessing: +# +# May you do good and not evil. +# May you find forgiveness for yourself and forgive others. +# May you share freely, never taking more than you give. +# +#------------------------------------------------------------------------- +# +# This script generates the implementations of the following C functions, +# which are part of the porter tokenizer implementation: +# +# static int fts5PorterStep1B(char *aBuf, int *pnBuf); +# static int fts5PorterStep1B2(char *aBuf, int *pnBuf); +# static int fts5PorterStep2(char *aBuf, int *pnBuf); +# static int fts5PorterStep3(char *aBuf, int *pnBuf); +# static int fts5PorterStep4(char *aBuf, int *pnBuf); +# + +set O(Step1B2) { + { at {} ate 1 } + { bl {} ble 1 } + { iz {} ize 1 } +} + +set O(Step1B) { + { "eed" fts5Porter_MGt0 "ee" 0 } + { "ed" fts5Porter_Vowel "" 1 } + { "ing" fts5Porter_Vowel "" 1 } +} + +set O(Step2) { + { "ational" fts5Porter_MGt0 "ate" } + { "tional" fts5Porter_MGt0 "tion" } + { "enci" fts5Porter_MGt0 "ence" } + { "anci" fts5Porter_MGt0 "ance" } + { "izer" fts5Porter_MGt0 "ize" } + { "logi" fts5Porter_MGt0 "log" } + { "bli" fts5Porter_MGt0 "ble" } + { "alli" fts5Porter_MGt0 "al" } + { "entli" fts5Porter_MGt0 "ent" } + { "eli" fts5Porter_MGt0 "e" } + { "ousli" fts5Porter_MGt0 "ous" } + { "ization" fts5Porter_MGt0 "ize" } + { "ation" fts5Porter_MGt0 "ate" } + { "ator" fts5Porter_MGt0 "ate" } + { "alism" fts5Porter_MGt0 "al" } + { "iveness" fts5Porter_MGt0 "ive" } + { "fulness" fts5Porter_MGt0 "ful" } + { "ousness" fts5Porter_MGt0 "ous" } + { "aliti" fts5Porter_MGt0 "al" } + { "iviti" fts5Porter_MGt0 "ive" } + { "biliti" fts5Porter_MGt0 "ble" } +} + +set O(Step3) { + { "icate" fts5Porter_MGt0 "ic" } + { "ative" fts5Porter_MGt0 "" } + { "alize" fts5Porter_MGt0 "al" } + { "iciti" fts5Porter_MGt0 "ic" } + { "ical" fts5Porter_MGt0 "ic" } + { "ful" fts5Porter_MGt0 "" } + { "ness" fts5Porter_MGt0 "" } +} + +set O(Step4) { + { "al" fts5Porter_MGt1 "" } + { "ance" fts5Porter_MGt1 "" } + { "ence" fts5Porter_MGt1 "" } + { "er" fts5Porter_MGt1 "" } + { "ic" fts5Porter_MGt1 "" } + { "able" fts5Porter_MGt1 "" } + { "ible" fts5Porter_MGt1 "" } + { "ant" fts5Porter_MGt1 "" } + { "ement" fts5Porter_MGt1 "" } + { "ment" fts5Porter_MGt1 "" } + { "ent" fts5Porter_MGt1 "" } + { "ion" fts5Porter_MGt1_and_S_or_T "" } + { "ou" fts5Porter_MGt1 "" } + { "ism" fts5Porter_MGt1 "" } + { "ate" fts5Porter_MGt1 "" } + { "iti" fts5Porter_MGt1 "" } + { "ous" fts5Porter_MGt1 "" } + { "ive" fts5Porter_MGt1 "" } + { "ize" fts5Porter_MGt1 "" } +} + +proc sort_cb {lhs rhs} { + set L [string range [lindex $lhs 0] end-1 end-1] + set R [string range [lindex $rhs 0] end-1 end-1] + string compare $L $R +} + +proc create_step_function {name data} { + + set T(function) { +static int fts5Porter${name}(char *aBuf, int *pnBuf){ + int ret = 0; + int nBuf = *pnBuf; + switch( aBuf[nBuf-2] ){ + ${switchbody} + } + return ret; +} + } + + set T(case) { + case '${k}': + ${ifstmts} + break; + } + + set T(if_0_0_0) { + if( ${match} ){ + *pnBuf = nBuf - $n; + } + } + set T(if_1_0_0) { + if( ${match} ){ + if( ${cond} ){ + *pnBuf = nBuf - $n; + } + } + } + set T(if_0_1_0) { + if( ${match} ){ + ${memcpy} + *pnBuf = nBuf - $n + $nRep; + } + } + set T(if_1_1_0) { + if( ${match} ){ + if( ${cond} ){ + ${memcpy} + *pnBuf = nBuf - $n + $nRep; + } + } + } + set T(if_1_0_1) { + if( ${match} ){ + if( ${cond} ){ + *pnBuf = nBuf - $n; + ret = 1; + } + } + } + set T(if_0_1_1) { + if( ${match} ){ + ${memcpy} + *pnBuf = nBuf - $n + $nRep; + ret = 1; + } + } + set T(if_1_1_1) { + if( ${match} ){ + if( ${cond} ){ + ${memcpy} + *pnBuf = nBuf - $n + $nRep; + ret = 1; + } + } + } + + set switchbody "" + + foreach I $data { + set k [string range [lindex $I 0] end-1 end-1] + lappend aCase($k) $I + } + foreach k [lsort [array names aCase]] { + set ifstmts "" + foreach I $aCase($k) { + set zSuffix [lindex $I 0] ;# Suffix text for this rule + set zRep [lindex $I 2] ;# Replacement text for rule + set xCond [lindex $I 1] ;# Condition callback (or "") + + set n [string length $zSuffix] + set nRep [string length $zRep] + + set match "nBuf>$n && 0==memcmp(\"$zSuffix\", &aBuf\[nBuf-$n\], $n)" + set memcpy "memcpy(&aBuf\[nBuf-$n\], \"$zRep\", $nRep);" + set cond "${xCond}(aBuf, nBuf-$n)" + + set bMemcpy [expr {$nRep>0}] + set bCond [expr {$xCond!=""}] + set bRet [expr {[llength $I]>3 && [lindex $I 3]}] + + set t $T(if_${bCond}_${bMemcpy}_${bRet}) + lappend ifstmts [string trim [subst -nocommands $t]] + } + + set ifstmts [join $ifstmts "else "] + + append switchbody [subst -nocommands $T(case)] + } + + + puts [subst -nocommands $T(function)] +} + + +puts [string trim { +/************************************************************************** +*************************************************************************** +** GENERATED CODE STARTS HERE (mkportersteps.tcl) +*/ +}] +foreach step [array names O] { + create_step_function $step $O($step) +} +puts [string trim { +/* +** GENERATED CODE ENDS HERE (mkportersteps.tcl) +*************************************************************************** +**************************************************************************/ +}] + + + diff --git a/ext/fts5/tool/loadfts5.tcl b/ext/fts5/tool/loadfts5.tcl new file mode 100644 index 0000000000..2572e38aea --- /dev/null +++ b/ext/fts5/tool/loadfts5.tcl @@ -0,0 +1,71 @@ + + +proc loadfile {f} { + set fd [open $f] + set data [read $fd] + close $fd + return $data +} + +set ::nRow 0 +proc load_hierachy {dir} { + foreach f [glob -nocomplain -dir $dir *] { + if {$::O(limit) && $::nRow>=$::O(limit)} break + if {[file isdir $f]} { + load_hierachy $f + } else { + db eval { INSERT INTO t1 VALUES($f, loadfile($f)) } + incr ::nRow + } + } +} + +proc usage {} { + puts stderr "Usage: $::argv0 ?SWITCHES? DATABASE PATH" + puts stderr "" + puts stderr "Switches are:" + puts stderr " -fts4 (use fts4 instead of fts5)" + exit 1 +} + +set O(vtab) fts5 +set O(tok) "" +set O(limit) 0 + +if {[llength $argv]<2} usage +for {set i 0} {$i < [llength $argv]-2} {incr i} { + set arg [lindex $argv $i] + switch -- [lindex $argv $i] { + -fts4 { + set O(vtab) fts4 + } + + -fts5 { + set O(vtab) fts5 + } + + -porter { + set O(tok) ", tokenize=porter" + } + + -limit { + incr i + set O(limit) [lindex $argv $i] + } + + default { + usage + } + } +} + +sqlite3 db [lindex $argv end-1] +db func loadfile loadfile + +db transaction { + db eval "CREATE VIRTUAL TABLE t1 USING $O(vtab) (path, content$O(tok))" + load_hierachy [lindex $argv end] +} + + + diff --git a/manifest b/manifest index 8bfbf04553..7615e891da 100644 --- a/manifest +++ b/manifest @@ -1,5 +1,5 @@ -C Fix\sprefix\sindexes\sso\sthat\sthey\swork\sin\scharacters,\snot\sbytes. -D 2015-01-13T17:25:08.235 +C Improve\sthe\sperformance\sof\sthe\sfts5\sporter\stokenizer\simplementation. +D 2015-01-17T17:48:10.103 F Makefile.arm-wince-mingw32ce-gcc d6df77f1f48d690bd73162294bbba7f59507c72f F Makefile.in 7cd23e4fc91004a6bd081623e1bc6932e44828c0 F Makefile.linux-gcc 91d710bdc4998cb015f39edf3cb314ec4f4d7e23 @@ -115,9 +115,10 @@ F ext/fts5/fts5_hash.c 63fa8379c5f2ac107d47c2b7d9ac04c95ef8a279 F ext/fts5/fts5_index.c 6f9f98875b2ee5a16255911e1dc1b0b32cb1c350 F ext/fts5/fts5_storage.c 8bc9e5b6654e1545e9513def277ef3f025921664 F ext/fts5/fts5_tcl.c 1293fac2bb26903fd3d5cdee59c5885ba7e620d5 -F ext/fts5/fts5_tokenize.c bdb6a1f599a94ec6e9c1cad037d1071e823dcb5d +F ext/fts5/fts5_tokenize.c 7c61d5c35c3449597bdeaa54dd48afe26852c7b0 F ext/fts5/fts5_unicode2.c 9c7dd640d1f014bf5c3ee029759adfbb4d7e95a9 F ext/fts5/fts5parse.y 777da8e5819f75c217982c79c29d014c293acac9 +F ext/fts5/mkportersteps.tcl 5acf962d2e0074f701620bb5308155fa1e4a63ba F ext/fts5/test/fts5_common.tcl 08e939096a07eb77a7a986613e960f31d3cab2cc F ext/fts5/test/fts5aa.test 3941b54d7585153be0c5cf0026f7dd8cfef13ea9 F ext/fts5/test/fts5ab.test 91a3faac09ad9fab5f71494db6e4071963281536 @@ -143,6 +144,7 @@ F ext/fts5/test/fts5rebuild.test 2a5e98205393487b4a732c8290999af7c0b907b4 F ext/fts5/test/fts5tokenizer.test b34ae592db66f6e89546d791ce1f905ba0b3395c F ext/fts5/test/fts5unicode.test 79b3e34eb29ce4929628aa514a40cb467fdabe4d F ext/fts5/test/fts5unicode2.test 64a5267fd6082fcb46439892ebd0cbaa5c38acee +F ext/fts5/tool/loadfts5.tcl 55c1f3ebf3f4b4f54be5bbdc823e36d59fc5e2dd F ext/icu/README.txt d9fbbad0c2f647c3fdf715fc9fd64af53aedfc43 F ext/icu/icu.c d415ccf984defeb9df2c0e1afcfaa2f6dc05eacb F ext/icu/sqliteicu.h 728867a802baa5a96de7495e9689a8e01715ef37 @@ -1275,7 +1277,7 @@ F tool/vdbe_profile.tcl 67746953071a9f8f2f668b73fe899074e2c6d8c1 F tool/warnings-clang.sh f6aa929dc20ef1f856af04a730772f59283631d4 F tool/warnings.sh 0abfd78ceb09b7f7c27c688c8e3fe93268a13b32 F tool/win/sqlite.vsix deb315d026cc8400325c5863eef847784a219a2f -P f22dbccad9499624880ddd48df1b07fb42b1ad66 -R 8d592e678c3bea0440cf749de24705b7 +P af8d43a4a08528bbae25ee38fe25de8a86f8a21c +R bbc2aaea254f25294ae3538c1336787c U dan -Z 3408fdf2714814208d88a4779f5de9eb +Z 3ca0ddccabcad41dd9682a0c32f2940d diff --git a/manifest.uuid b/manifest.uuid index f2f6111639..5130bde3f4 100644 --- a/manifest.uuid +++ b/manifest.uuid @@ -1 +1 @@ -af8d43a4a08528bbae25ee38fe25de8a86f8a21c \ No newline at end of file +96ea600440de05ee663e71c3f0d0de2c64108bf9 \ No newline at end of file