From: drh Date: Mon, 8 Feb 2016 02:30:50 +0000 (+0000) Subject: Demonstrate a much faster sqlite3GetToken() routine by using a lookup table X-Git-Tag: version-3.11.0~39^2~2 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=8974331fc35a22b48ccf1ca2c709890646f10a3d;p=thirdparty%2Fsqlite.git Demonstrate a much faster sqlite3GetToken() routine by using a lookup table to map initial token characters into a character class. This check-in does not work for EBCDIC. More optimization needed. FossilOrigin-Name: 9115baa1919584dc8ca25bbff54d3b65748a9631 --- diff --git a/manifest b/manifest index 3a81b43628..3badb22032 100644 --- a/manifest +++ b/manifest @@ -1,5 +1,5 @@ -C Add\sthe\ssourcetest\starget\sto\sMakefile.msc. -D 2016-02-07T00:08:08.480 +C Demonstrate\sa\smuch\sfaster\ssqlite3GetToken()\sroutine\sby\susing\sa\slookup\stable\nto\smap\sinitial\stoken\scharacters\sinto\sa\scharacter\sclass.\s\sThis\scheck-in\sdoes\nnot\swork\sfor\sEBCDIC.\s\sMore\soptimization\sneeded. +D 2016-02-08T02:30:50.194 F Makefile.in 0a957a57243a3d55e96b1514e22ffae5db9ea116 F Makefile.linux-gcc 7bc79876b875010e8c8f9502eb935ca92aa3c434 F Makefile.msc a3f8092763bb5d0057f0f4feb6b7fcc19713e107 @@ -406,7 +406,7 @@ F src/test_windirent.c 8f5fada630348558d5745b334702f301da1ffc61 F src/test_windirent.h b12055cab6227f7be10f5c19296f67c60cc5e2a5 F src/test_wsd.c 41cadfd9d97fe8e3e4e44f61a4a8ccd6f7ca8fe9 F src/threads.c bbfb74450643cb5372a43ad4f6cffd7e9dfcecb0 -F src/tokenize.c 214b783d6138e9f9fbb6b225ce9a376db3b03d42 +F src/tokenize.c b3cfc123d65a5bf7ba615f74f28737ae2135620a F src/treeview.c dc39ccf04e9331237388b9cb73289c9d87ea050b F src/trigger.c e14840ee0c3e549e758ec9bf3e4146e166002280 F src/update.c 310ca7adb86a7d1f2afae46905b21c83580f3e17 @@ -1427,7 +1427,10 @@ F tool/vdbe_profile.tcl 246d0da094856d72d2c12efec03250d71639d19f F tool/warnings-clang.sh f6aa929dc20ef1f856af04a730772f59283631d4 F tool/warnings.sh 48bd54594752d5be3337f12c72f28d2080cb630b F tool/win/sqlite.vsix deb315d026cc8400325c5863eef847784a219a2f -P b0b4624fc5d53bb0cc9fae7dad51984837d946ac -R 08c9d9815def36a9ada11c24d7dc13cb +P ab269e720552483c5617906837e294c1be3e0a57 +R de21eeb460827c84c88aec10967c3b1d +T *branch * tokenizer-char-class +T *sym-tokenizer-char-class * +T -sym-trunk * U drh -Z 21cfff4994071e02bdb1eef8143a0ef4 +Z 8982f608f7ca1f17154501e0dba47865 diff --git a/manifest.uuid b/manifest.uuid index 16127e2758..9b08cb07e9 100644 --- a/manifest.uuid +++ b/manifest.uuid @@ -1 +1 @@ -ab269e720552483c5617906837e294c1be3e0a57 \ No newline at end of file +9115baa1919584dc8ca25bbff54d3b65748a9631 \ No newline at end of file diff --git a/src/tokenize.c b/src/tokenize.c index 5bee3d5a84..c4b36c4758 100644 --- a/src/tokenize.c +++ b/src/tokenize.c @@ -18,6 +18,56 @@ #include "sqliteInt.h" #include +/* Character classes for tokenizing */ +#define CC_X 0 /* The letter 'x' or 'X'. Start of x'01234fed' */ +#define CC_KYWD 1 /* Alphabetics or '_'. Usable in a keyword */ +#define CC_ID 2 /* unicode characters usable in IDs */ +#define CC_DIGIT 3 /* Digits */ +#define CC_DOLLAR 4 /* '$' */ +#define CC_VARALPHA 5 /* '@', '#', ':'. Alphabetic SQL variables */ +#define CC_VARNUM 6 /* '?'. Numeric SQL variables */ +#define CC_SPACE 7 /* Space characters */ +#define CC_QUOTE 8 /* '"', '\'', or '`'. String literals, quoted ids */ +#define CC_QUOTE2 9 /* '['. [...] style quoted ids */ +#define CC_PIPE 10 /* '|'. Bitwise OR or concatenate */ +#define CC_MINUS 11 /* '-'. Minus or SQL-style comment */ +#define CC_LT 12 /* '<'. Part of < or <= or <> */ +#define CC_GT 13 /* '>'. Part of > or >= */ +#define CC_EQ 14 /* '='. Part of = or == */ +#define CC_BANG 15 /* '!'. Part of != */ +#define CC_SLASH 16 /* '/'. / or c-style comment */ +#define CC_LP 17 /* '(' */ +#define CC_RP 18 /* ')' */ +#define CC_SEMI 19 /* ';' */ +#define CC_PLUS 20 /* '+' */ +#define CC_STAR 21 /* '*' */ +#define CC_PERCENT 22 /* '%' */ +#define CC_COMMA 23 /* ',' */ +#define CC_AND 24 /* '&' */ +#define CC_TILDA 25 /* '~' */ +#define CC_DOT 26 /* '.' */ +#define CC_ILLEGAL 27 /* Illegal character */ + +static const unsigned char aiClass[] = { +/* x0 x1 x2 x3 x4 x5 x6 x7 x8 x9 xa xb xc xd xe xf */ +/* 0x */ 27, 27, 27, 27, 27, 27, 27, 27, 27, 7, 7, 27, 7, 7, 27, 27, +/* 1x */ 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, +/* 2x */ 7, 15, 8, 5, 4, 22, 24, 8, 17, 18, 21, 20, 23, 11, 26, 16, +/* 3x */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 5, 19, 12, 14, 13, 6, +/* 4x */ 5, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, +/* 5x */ 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 9, 27, 27, 27, 1, +/* 6x */ 8, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, +/* 7x */ 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 27, 10, 27, 25, 27, +/* 8x */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, +/* 9x */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, +/* Ax */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, +/* Bx */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, +/* Cx */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, +/* Dx */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, +/* Ex */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, +/* Fx */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 +}; + /* ** The charMap() macro maps alphabetic characters into their ** lower-case ASCII equivalent. On ASCII machines, this is just @@ -115,8 +165,8 @@ int sqlite3IsIdChar(u8 c){ return IdChar(c); } */ int sqlite3GetToken(const unsigned char *z, int *tokenType){ int i, c; - switch( *z ){ - case ' ': case '\t': case '\n': case '\f': case '\r': { + switch( aiClass[*z] ){ + case CC_SPACE: { testcase( z[0]==' ' ); testcase( z[0]=='\t' ); testcase( z[0]=='\n' ); @@ -126,7 +176,7 @@ int sqlite3GetToken(const unsigned char *z, int *tokenType){ *tokenType = TK_SPACE; return i; } - case '-': { + case CC_MINUS: { if( z[1]=='-' ){ for(i=2; (c=z[i])!=0 && c!='\n'; i++){} *tokenType = TK_SPACE; /* IMP: R-22934-25134 */ @@ -135,27 +185,27 @@ int sqlite3GetToken(const unsigned char *z, int *tokenType){ *tokenType = TK_MINUS; return 1; } - case '(': { + case CC_LP: { *tokenType = TK_LP; return 1; } - case ')': { + case CC_RP: { *tokenType = TK_RP; return 1; } - case ';': { + case CC_SEMI: { *tokenType = TK_SEMI; return 1; } - case '+': { + case CC_PLUS: { *tokenType = TK_PLUS; return 1; } - case '*': { + case CC_STAR: { *tokenType = TK_STAR; return 1; } - case '/': { + case CC_SLASH: { if( z[1]!='*' || z[2]==0 ){ *tokenType = TK_SLASH; return 1; @@ -165,15 +215,15 @@ int sqlite3GetToken(const unsigned char *z, int *tokenType){ *tokenType = TK_SPACE; /* IMP: R-22934-25134 */ return i; } - case '%': { + case CC_PERCENT: { *tokenType = TK_REM; return 1; } - case '=': { + case CC_EQ: { *tokenType = TK_EQ; return 1 + (z[1]=='='); } - case '<': { + case CC_LT: { if( (c=z[1])=='=' ){ *tokenType = TK_LE; return 2; @@ -188,7 +238,7 @@ int sqlite3GetToken(const unsigned char *z, int *tokenType){ return 1; } } - case '>': { + case CC_GT: { if( (c=z[1])=='=' ){ *tokenType = TK_GE; return 2; @@ -200,7 +250,7 @@ int sqlite3GetToken(const unsigned char *z, int *tokenType){ return 1; } } - case '!': { + case CC_BANG: { if( z[1]!='=' ){ *tokenType = TK_ILLEGAL; return 2; @@ -209,7 +259,7 @@ int sqlite3GetToken(const unsigned char *z, int *tokenType){ return 2; } } - case '|': { + case CC_PIPE: { if( z[1]!='|' ){ *tokenType = TK_BITOR; return 1; @@ -218,21 +268,19 @@ int sqlite3GetToken(const unsigned char *z, int *tokenType){ return 2; } } - case ',': { + case CC_COMMA: { *tokenType = TK_COMMA; return 1; } - case '&': { + case CC_AND: { *tokenType = TK_BITAND; return 1; } - case '~': { + case CC_TILDA: { *tokenType = TK_BITNOT; return 1; } - case '`': - case '\'': - case '"': { + case CC_QUOTE: { int delim = z[0]; testcase( delim=='`' ); testcase( delim=='\'' ); @@ -257,7 +305,7 @@ int sqlite3GetToken(const unsigned char *z, int *tokenType){ return i; } } - case '.': { + case CC_DOT: { #ifndef SQLITE_OMIT_FLOATING_POINT if( !sqlite3Isdigit(z[1]) ) #endif @@ -268,8 +316,7 @@ int sqlite3GetToken(const unsigned char *z, int *tokenType){ /* If the next character is a digit, this is a floating point ** number that begins with ".". Fall thru into the next case */ } - case '0': case '1': case '2': case '3': case '4': - case '5': case '6': case '7': case '8': case '9': { + case CC_DIGIT: { testcase( z[0]=='0' ); testcase( z[0]=='1' ); testcase( z[0]=='2' ); testcase( z[0]=='3' ); testcase( z[0]=='4' ); testcase( z[0]=='5' ); testcase( z[0]=='6' ); testcase( z[0]=='7' ); testcase( z[0]=='8' ); @@ -304,22 +351,18 @@ int sqlite3GetToken(const unsigned char *z, int *tokenType){ } return i; } - case '[': { + case CC_QUOTE2: { for(i=1, c=z[0]; c!=']' && (c=z[i])!=0; i++){} *tokenType = c==']' ? TK_ID : TK_ILLEGAL; return i; } - case '?': { + case CC_VARNUM: { *tokenType = TK_VARIABLE; for(i=1; sqlite3Isdigit(z[i]); i++){} return i; } -#ifndef SQLITE_OMIT_TCL_VARIABLE - case '$': -#endif - case '@': /* For compatibility with MS SQL Server */ - case '#': - case ':': { + case CC_DOLLAR: + case CC_VARALPHA: { int n = 0; testcase( z[0]=='$' ); testcase( z[0]=='@' ); testcase( z[0]==':' ); testcase( z[0]=='#' ); @@ -349,7 +392,7 @@ int sqlite3GetToken(const unsigned char *z, int *tokenType){ return i; } #ifndef SQLITE_OMIT_BLOB_LITERAL - case 'x': case 'X': { + case CC_X: { testcase( z[0]=='x' ); testcase( z[0]=='X' ); if( z[1]=='\'' ){ *tokenType = TK_BLOB; @@ -361,20 +404,28 @@ int sqlite3GetToken(const unsigned char *z, int *tokenType){ if( z[i] ) i++; return i; } - /* Otherwise fall through to the next case */ + i = 1; + break; } #endif - default: { - if( !IdChar(*z) ){ - break; - } - for(i=1; IdChar(z[i]); i++){} + case CC_KYWD: { + for(i=1; aiClass[z[i]]<=CC_KYWD; i++){} + if( aiClass[z[i]]<=CC_DOLLAR ){ i++; break; } *tokenType = TK_ID; return keywordCode((char*)z, i, tokenType); } + case CC_ID: { + i = 1; + break; + } + default: { + *tokenType = TK_ILLEGAL; + return 1; + } } - *tokenType = TK_ILLEGAL; - return 1; + while( aiClass[z[i]]<=CC_DOLLAR ){ i++; } + *tokenType = TK_ID; + return i; } /*