#include <assert.h>
/**************************************************************************
-** Start of simple tokenizer implementation.
+** Start of ascii tokenizer implementation.
*/
/*
** For tokenizers with no "unicode" modifier, the set of token characters
** is the same as the set of ASCII range alphanumeric characters.
*/
-static unsigned char aSimpleTokenChar[128] = {
+static unsigned char aAsciiTokenChar[128] = {
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x00..0x0F */
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x10..0x1F */
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x20..0x2F */
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, /* 0x70..0x7F */
};
-typedef struct SimpleTokenizer SimpleTokenizer;
-struct SimpleTokenizer {
+typedef struct AsciiTokenizer AsciiTokenizer;
+struct AsciiTokenizer {
unsigned char aTokenChar[128];
};
-static void fts5SimpleAddExceptions(
- SimpleTokenizer *p,
+static void fts5AsciiAddExceptions(
+ AsciiTokenizer *p,
const char *zArg,
int bTokenChars
){
}
/*
-** Create a "simple" tokenizer.
+** Create an "ascii" tokenizer.
*/
-static int fts5SimpleCreate(
+static int fts5AsciiCreate(
void *pCtx,
const char **azArg, int nArg,
Fts5Tokenizer **ppOut
){
int rc = SQLITE_OK;
- SimpleTokenizer *p = 0;
+ AsciiTokenizer *p = 0;
if( nArg%2 ){
rc = SQLITE_ERROR;
}else{
- p = sqlite3_malloc(sizeof(SimpleTokenizer));
+ p = sqlite3_malloc(sizeof(AsciiTokenizer));
if( p==0 ){
rc = SQLITE_NOMEM;
}else{
int i;
- memset(p, 0, sizeof(SimpleTokenizer));
- memcpy(p->aTokenChar, aSimpleTokenChar, sizeof(aSimpleTokenChar));
+ memset(p, 0, sizeof(AsciiTokenizer));
+ memcpy(p->aTokenChar, aAsciiTokenChar, sizeof(aAsciiTokenChar));
for(i=0; rc==SQLITE_OK && i<nArg; i+=2){
const char *zArg = azArg[i+1];
if( 0==sqlite3_stricmp(azArg[i], "tokenchars") ){
- fts5SimpleAddExceptions(p, zArg, 1);
+ fts5AsciiAddExceptions(p, zArg, 1);
}else
if( 0==sqlite3_stricmp(azArg[i], "separators") ){
- fts5SimpleAddExceptions(p, zArg, 0);
+ fts5AsciiAddExceptions(p, zArg, 0);
}else{
rc = SQLITE_ERROR;
}
}
/*
-** Delete a "simple" tokenizer.
+** Delete an "ascii" tokenizer.
*/
-static void fts5SimpleDelete(Fts5Tokenizer *p){
+static void fts5AsciiDelete(Fts5Tokenizer *p){
sqlite3_free(p);
}
-static void simpleFold(char *aOut, const char *aIn, int nByte){
+static void asciiFold(char *aOut, const char *aIn, int nByte){
int i;
for(i=0; i<nByte; i++){
char c = aIn[i];
}
/*
-** Tokenize some text using the simple tokenizer.
+** Tokenize some text using the ascii tokenizer.
*/
-static int fts5SimpleTokenize(
+static int fts5AsciiTokenize(
Fts5Tokenizer *pTokenizer,
void *pCtx,
const char *pText, int nText,
int (*xToken)(void*, const char*, int nToken, int iStart, int iEnd)
){
- SimpleTokenizer *p = (SimpleTokenizer*)pTokenizer;
+ AsciiTokenizer *p = (AsciiTokenizer*)pTokenizer;
int rc = SQLITE_OK;
int ie;
int is = 0;
int nByte;
/* Skip any leading divider characters. */
- while( is<nText && ((pText[is]&0x80) || a[(int)pText[is]]==0) ){
+ while( is<nText && ((pText[is]&0x80)==0 && a[(int)pText[is]]==0) ){
is++;
}
if( is==nText ) break;
/* Count the token characters */
ie = is+1;
- while( ie<nText && ((pText[ie]&0x80)==0 && a[(int)pText[ie]] ) ){
+ while( ie<nText && ((pText[ie]&0x80) || a[(int)pText[ie]] ) ){
ie++;
}
}
nFold = nByte*2;
}
- simpleFold(pFold, &pText[is], nByte);
+ asciiFold(pFold, &pText[is], nByte);
/* Invoke the token callback */
rc = xToken(pCtx, pFold, nByte, is, ie);
|| (c&0xFFFFFFFE)==0xFFFE ){ c = 0xFFFD; } \
}
+
#define WRITE_UTF8(zOut, c) { \
if( c<0x00080 ){ \
*zOut++ = (unsigned char)(c&0xFF); \
typedef struct Unicode61Tokenizer Unicode61Tokenizer;
struct Unicode61Tokenizer {
+ unsigned char aTokenChar[128]; /* ASCII range token characters */
+ char *aFold; /* Buffer to fold text into */
+ int nFold; /* Size of aFold[] in bytes */
int bRemoveDiacritic; /* True if remove_diacritics=1 is set */
int nException;
int *aiException;
int iCode;
int bToken;
READ_UTF8(zCsr, zTerm, iCode);
- bToken = sqlite3Fts5UnicodeIsalnum(iCode);
- assert( (bToken==0 || bToken==1) );
- assert( (bTokenChars==0 || bTokenChars==1) );
- if( bToken!=bTokenChars && sqlite3Fts5UnicodeIsdiacritic(iCode)==0 ){
- int i;
- for(i=0; i<nNew; i++){
- if( aNew[i]>iCode ) break;
+ if( iCode<128 ){
+ p->aTokenChar[iCode] = bTokenChars;
+ }else{
+ bToken = sqlite3Fts5UnicodeIsalnum(iCode);
+ assert( (bToken==0 || bToken==1) );
+ assert( (bTokenChars==0 || bTokenChars==1) );
+ if( bToken!=bTokenChars && sqlite3Fts5UnicodeIsdiacritic(iCode)==0 ){
+ int i;
+ for(i=0; i<nNew; i++){
+ if( aNew[i]>iCode ) break;
+ }
+ memmove(&aNew[i+1], &aNew[i], (nNew-i)*sizeof(int));
+ aNew[i] = iCode;
+ nNew++;
}
- memmove(&aNew[i+1], &aNew[i], (nNew-i)*sizeof(int));
- aNew[i] = iCode;
- nNew++;
}
}
p->aiException = aNew;
return 0;
}
+/*
+** Delete a "unicode61" tokenizer.
+**
+** Releases the aiException[] array, the aFold[] buffer and finally the
+** tokenizer object itself. A NULL pTok is accepted and ignored.
+*/
+static void fts5UnicodeDelete(Fts5Tokenizer *pTok){
+ if( pTok ){
+ Unicode61Tokenizer *p = (Unicode61Tokenizer*)pTok;
+ sqlite3_free(p->aiException);
+ sqlite3_free(p->aFold);
+ sqlite3_free(p);
+ }
+ return;
+}
+
/*
** Create a "unicode61" tokenizer.
*/
if( p ){
int i;
memset(p, 0, sizeof(Unicode61Tokenizer));
+ memcpy(p->aTokenChar, aAsciiTokenChar, sizeof(aAsciiTokenChar));
p->bRemoveDiacritic = 1;
+ p->nFold = 64;
+ p->aFold = sqlite3_malloc(p->nFold * sizeof(char));
+ if( p->aFold==0 ){
+ rc = SQLITE_NOMEM;
+ }
for(i=0; rc==SQLITE_OK && i<nArg; i+=2){
const char *zArg = azArg[i+1];
if( 0==sqlite3_stricmp(azArg[i], "remove_diacritics") ){
}else{
rc = SQLITE_NOMEM;
}
+ if( rc!=SQLITE_OK ){
+ fts5UnicodeDelete((Fts5Tokenizer*)p);
+ p = 0;
+ }
*ppOut = (Fts5Tokenizer*)p;
}
return rc;
}
-/*
-** Delete a "unicode61" tokenizer.
-*/
-static void fts5UnicodeDelete(Fts5Tokenizer *pTok){
- Unicode61Tokenizer *p = (Unicode61Tokenizer*)pTok;
- sqlite3_free(p->aiException);
- sqlite3_free(p);
- return;
-}
-
/*
** Return true if, for the purposes of tokenizing with the tokenizer
** passed as the first argument, codepoint iCode is considered a token
return sqlite3Fts5UnicodeIsalnum(iCode) ^ fts5UnicodeIsException(p, iCode);
}
-/*
-** Tokenize some text using a unicode61 tokenizer.
-*/
static int fts5UnicodeTokenize(
Fts5Tokenizer *pTokenizer,
void *pCtx,
int (*xToken)(void*, const char*, int nToken, int iStart, int iEnd)
){
Unicode61Tokenizer *p = (Unicode61Tokenizer*)pTokenizer;
- const unsigned char *zInput = (const unsigned char*)pText;
- const unsigned char *zTerm = &zInput[nText];
- const unsigned char *z = zInput;
int rc = SQLITE_OK;
- int nBuf = 0;
- unsigned char *zBuf = 0;
- unsigned char *zOut = 0;
-
- while( rc==SQLITE_OK && z<zTerm ){
- int iCode;
- int bAlnum;
- const unsigned char *zStart;
- const unsigned char *zCode;
-
- if( zOut==zBuf ) zStart = z;
- zCode = z;
- READ_UTF8(z, zTerm, iCode);
- bAlnum = fts5UnicodeIsAlnum(p, iCode);
- if( bAlnum==0 && zOut>zBuf ){
- bAlnum = sqlite3Fts5UnicodeIsdiacritic(iCode);
+ unsigned char *a = p->aTokenChar;
+
+ unsigned char *zTerm = (unsigned char*)&pText[nText];
+ unsigned char *zCsr = (unsigned char *)pText;
+
+ /* Output buffer */
+ char *aFold = p->aFold;
+ int nFold = p->nFold;
+
+ /* Each iteration of this loop gobbles up a contiguous run of separators,
+ ** then the next token. */
+ while( rc==SQLITE_OK ){
+ int iCode; /* non-ASCII codepoint read from input */
+ char *zOut = aFold;
+ int is;
+ int ie;
+
+ /* Skip any separator characters. */
+ while( 1 ){
+ if( zCsr>=zTerm ) goto tokenize_done;
+ if( *zCsr & 0x80 ) {
+ /* A character outside of the ascii range. Skip past it if it is
+ ** a separator character. Or break out of the loop if it is not. */
+ is = zCsr - (unsigned char*)pText;
+ READ_UTF8(zCsr, zTerm, iCode);
+ if( fts5UnicodeIsAlnum(p, iCode) ){
+ goto non_ascii_tokenchar;
+ }
+ }else{
+ if( a[*zCsr] ){
+ is = zCsr - (unsigned char*)pText;
+ goto ascii_tokenchar;
+ }
+ zCsr++;
+ }
}
- if( bAlnum ){
- int iOut;
+ /* Run through the tokenchars. Fold them into the output buffer along
+ ** the way. */
+ while( zCsr<zTerm ){
- /* Grow the output buffer if required */
- while( (zOut-zBuf)+4>=nBuf ){
- unsigned char *zNew;
- nBuf = (nBuf ? nBuf*2 : 128);
- zNew = sqlite3_realloc(zBuf, nBuf);
- if( zNew==0 ){
+ /* Grow the output buffer so that there is sufficient space to fit the
+ ** largest possible utf-8 character. */
+ if( (zOut-aFold)+6>nFold ){
+ aFold = sqlite3_malloc(nFold*2);
+ if( aFold==0 ){
rc = SQLITE_NOMEM;
- goto tokenize_finished;
- }else{
- zOut = &zNew[zOut-zBuf];
- zBuf = zNew;
+ goto tokenize_done;
}
+ memcpy(aFold, p->aFold, nFold);
+ sqlite3_free(p->aFold);
+ p->aFold = aFold;
+ p->nFold = nFold = nFold*2;
}
- /* Write the new character to it */
- iOut = sqlite3Fts5UnicodeFold(iCode, p->bRemoveDiacritic);
- if( iOut ) WRITE_UTF8(zOut, iOut);
+ if( *zCsr & 0x80 ){
+ /* An non-ascii-range character. Fold it into the output buffer if
+ ** it is a token character, or break out of the loop if it is not. */
+ READ_UTF8(zCsr, zTerm, iCode);
+ if( fts5UnicodeIsAlnum(p,iCode)||sqlite3Fts5UnicodeIsdiacritic(iCode) ){
+ non_ascii_tokenchar:
+ iCode = sqlite3Fts5UnicodeFold(iCode, p->bRemoveDiacritic);
+ if( iCode ) WRITE_UTF8(zOut, iCode);
+ }else{
+ break;
+ }
+ }else if( a[*zCsr]==0 ){
+ /* An ascii-range separator character. End of token. */
+ break;
+ }else{
+ ascii_tokenchar:
+ if( *zCsr>='A' && *zCsr<='Z' ){
+ *zOut++ = *zCsr + 32;
+ }else{
+ *zOut++ = *zCsr;
+ }
+ zCsr++;
+ }
+ ie = zCsr - (unsigned char*)pText;
}
- if( zOut>zBuf && (bAlnum==0 || z>=zTerm) ){
- int ie = (bAlnum ? z : zCode) - zInput;
- rc = xToken(pCtx, (const char*)zBuf, zOut-zBuf, zStart-zInput, ie);
- zOut = zBuf;
- }
+ /* Invoke the token callback */
+ rc = xToken(pCtx, aFold, zOut-aFold, is, ie);
}
-
- tokenize_finished:
- sqlite3_free(zBuf);
+
+ tokenize_done:
+ if( rc==SQLITE_DONE ) rc = SQLITE_OK;
return rc;
}
pRet = (PorterTokenizer*)sqlite3_malloc(sizeof(PorterTokenizer));
if( pRet ){
memset(pRet, 0, sizeof(PorterTokenizer));
- rc = pApi->xFindTokenizer(pApi, "simple", &pUserdata, &pRet->tokenizer);
+ rc = pApi->xFindTokenizer(pApi, "ascii", &pUserdata, &pRet->tokenizer);
}else{
rc = SQLITE_NOMEM;
}
const char *zName;
fts5_tokenizer x;
} aBuiltin[] = {
- { "porter", {fts5PorterCreate, fts5PorterDelete, fts5PorterTokenize }},
{ "unicode61", {fts5UnicodeCreate, fts5UnicodeDelete, fts5UnicodeTokenize}},
- { "simple", {fts5SimpleCreate, fts5SimpleDelete, fts5SimpleTokenize }}
+ { "ascii", {fts5AsciiCreate, fts5AsciiDelete, fts5AsciiTokenize }},
+ { "porter", {fts5PorterCreate, fts5PorterDelete, fts5PorterTokenize }},
};
int rc = SQLITE_OK; /* Return code */
-C Fix\ssome\sdocumentation\sissues\sin\sfts5.
-D 2015-01-10T20:34:27.199
+C Optimize\sthe\sunicode61\stokenizer\sso\sthat\sit\shandles\sascii\stext\sfaster.\sMake\sit\sthe\sdefault\stokenizer.\sChange\sthe\sname\sof\sthe\ssimple\stokenizer\sto\s"ascii".
+D 2015-01-12T17:58:04.627
F Makefile.arm-wince-mingw32ce-gcc d6df77f1f48d690bd73162294bbba7f59507c72f
F Makefile.in 7cd23e4fc91004a6bd081623e1bc6932e44828c0
F Makefile.linux-gcc 91d710bdc4998cb015f39edf3cb314ec4f4d7e23
F ext/fts3/unicode/UnicodeData.txt cd07314edb62d49fde34debdaf92fa2aa69011e7
F ext/fts3/unicode/mkunicode.tcl 4199cb887040ee3c3cd59a5171ddb0566904586e
F ext/fts5/extract_api_docs.tcl 55a6d648d516f35d9a1e580ac00de27154e1904a
-F ext/fts5/fts5.c c90004f4a91ce4f4dfad2fc980ade0d9314ebb10
+F ext/fts5/fts5.c 790880afffb249c79f9a36b38f9d774515f5cf7b
F ext/fts5/fts5.h f931954065693898d26c51f23f1d27200184a69a
F ext/fts5/fts5Int.h 0142ba4c3c70e1976578604c0e738670f7689726
F ext/fts5/fts5_aux.c 549aef152b0fd46020f5595d861b1fd60b3f9b4f
F ext/fts5/fts5_index.c ea36c1e42aaf8038b6139be95575eb7fe01f34e4
F ext/fts5/fts5_storage.c 8bc9e5b6654e1545e9513def277ef3f025921664
F ext/fts5/fts5_tcl.c 1293fac2bb26903fd3d5cdee59c5885ba7e620d5
-F ext/fts5/fts5_tokenize.c 4c30cf32c63e59bec5b38533e0a65987df262851
+F ext/fts5/fts5_tokenize.c bdb6a1f599a94ec6e9c1cad037d1071e823dcb5d
F ext/fts5/fts5_unicode2.c 9c7dd640d1f014bf5c3ee029759adfbb4d7e95a9
F ext/fts5/fts5parse.y 777da8e5819f75c217982c79c29d014c293acac9
F ext/fts5/test/fts5_common.tcl 08e939096a07eb77a7a986613e960f31d3cab2cc
F ext/fts5/test/fts5content.test 4234e0b11e003fe1e80472aa637f70464396fdd0
F ext/fts5/test/fts5ea.test 04695560a444fcc00c3c4f27783bdcfbf71f030c
F ext/fts5/test/fts5fault1.test f3f4c6ed15cc7a4dc8d517c0d1969d8e5a35a65c
-F ext/fts5/test/fts5near.test 70a568a1211a5b6d5a17282790d5f8cbbe086ce0
+F ext/fts5/test/fts5near.test 3f9f64e16cac82725d03d4e04c661090f0b3b947
F ext/fts5/test/fts5optimize.test 0028c90a7817d3e576d1148fc8dff17d89054e54
F ext/fts5/test/fts5porter.test 50322599823cb8080a99f0ec0c39f7d0c12bcb5e
F ext/fts5/test/fts5rebuild.test 2a5e98205393487b4a732c8290999af7c0b907b4
-F ext/fts5/test/fts5tokenizer.test f951bb9be29232bd057b0ac4d535b879d9cd9a89
-F ext/fts5/test/fts5unicode.test 9ae93296e59917c1210336388f6d3b98051b50c9
+F ext/fts5/test/fts5tokenizer.test b34ae592db66f6e89546d791ce1f905ba0b3395c
+F ext/fts5/test/fts5unicode.test 79b3e34eb29ce4929628aa514a40cb467fdabe4d
F ext/fts5/test/fts5unicode2.test 64a5267fd6082fcb46439892ebd0cbaa5c38acee
F ext/icu/README.txt d9fbbad0c2f647c3fdf715fc9fd64af53aedfc43
F ext/icu/icu.c d415ccf984defeb9df2c0e1afcfaa2f6dc05eacb
F tool/warnings-clang.sh f6aa929dc20ef1f856af04a730772f59283631d4
F tool/warnings.sh 0abfd78ceb09b7f7c27c688c8e3fe93268a13b32
F tool/win/sqlite.vsix deb315d026cc8400325c5863eef847784a219a2f
-P e749be563d8e738af113bd301770e2f22763ab77
-R 5c59d3558d2a230e6048c600760933d7
+P 512e1bdb4093b59d1494dfc63391476eadd52aea
+R 30a0c3c40d1701cf92ddf5b1410b6af9
U dan
-Z 6c17e3ae4cf92b8841424ff4d00c314d
+Z 9b7b348d489cfd6e15d4a8bf3e2c22e9