From: drh Date: Mon, 8 Feb 2016 19:36:46 +0000 (+0000) Subject: Changes to help the tokenizer run about 33% faster. X-Git-Tag: version-3.11.0~39 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=e96f361995576922c9b1350b256693101884708e;p=thirdparty%2Fsqlite.git Changes to help the tokenizer run about 33% faster. FossilOrigin-Name: a050e6f096d40aa5b6275797b96e62c228044f5a --- e96f361995576922c9b1350b256693101884708e diff --cc manifest index dbd577f123,180d6f2232..2902207223 --- a/manifest +++ b/manifest @@@ -1,8 -1,8 +1,8 @@@ - C Fix\sharmless\scompiler\swarning\sin\s'srcck1'\stool\sand\sadd\sit\sto\sthe\sclean\stargets. - D 2016-02-07T20:39:27.144 -C Add\scode\sto\sget\sthe\stokenizer\scharacter-class\slogic\sworking\sfor\sEBCDIC. -D 2016-02-08T19:15:48.295 -F Makefile.in 0a957a57243a3d55e96b1514e22ffae5db9ea116 ++C Changes\sto\shelp\sthe\stokenizer\srun\sabout\s33%\sfaster. ++D 2016-02-08T19:36:46.659 +F Makefile.in dac2776c84e0d533b158a9af6e57e05c4a6b19f3 F Makefile.linux-gcc 7bc79876b875010e8c8f9502eb935ca92aa3c434 -F Makefile.msc a3f8092763bb5d0057f0f4feb6b7fcc19713e107 +F Makefile.msc fcf377286d910b47e072da1ac7945976337c0925 F README.md 8ecc12493ff9f820cdea6520a9016001cb2e59b7 F VERSION 866588d1edf0ccb5b0d33896974338f97564f719 F aclocal.m4 a5c22d164aff7ed549d53a90fa56d56955281f50 @@@ -406,7 -406,7 +406,7 @@@ F src/test_windirent.c 8f5fada630348558 F src/test_windirent.h b12055cab6227f7be10f5c19296f67c60cc5e2a5 F src/test_wsd.c 41cadfd9d97fe8e3e4e44f61a4a8ccd6f7ca8fe9 F src/threads.c bbfb74450643cb5372a43ad4f6cffd7e9dfcecb0 - F src/tokenize.c 214b783d6138e9f9fbb6b225ce9a376db3b03d42 -F src/tokenize.c 5019666f8705e9f7135c6f1c1ffac95a1af76fa6 ++F src/tokenize.c 813934be70597edfbb685ae08fc4c8b549cf5a1e F src/treeview.c dc39ccf04e9331237388b9cb73289c9d87ea050b F src/trigger.c e14840ee0c3e549e758ec9bf3e4146e166002280 F src/update.c 310ca7adb86a7d1f2afae46905b21c83580f3e17 @@@ -1427,7 -1427,7 +1427,8 @@@ F tool/vdbe_profile.tcl 246d0da094856d7 F 
tool/warnings-clang.sh f6aa929dc20ef1f856af04a730772f59283631d4 F tool/warnings.sh 48bd54594752d5be3337f12c72f28d2080cb630b F tool/win/sqlite.vsix deb315d026cc8400325c5863eef847784a219a2f - P ab269e720552483c5617906837e294c1be3e0a57 - R b0244c1b9f3d2cf89729e9f132d42a5e - U mistachkin - Z 9a8eb772a0a9cfffcb8487b60e9945a2 -P ff406b9701ebe3a01834837f380641c6f0c495bc -R f322e625886c02a1fcd0df28b2c76f16 ++P 852a529a8b112049f67a3126f677c06ae4a22d73 04f7da77c13925c1f1e287f4579bb85518297d81 ++R 64edf86f87d772039cf70265114474cb ++T +closed 04f7da77c13925c1f1e287f4579bb85518297d81 + U drh -Z 8351e730f91ff26fa5b93d74f8f175a3 ++Z feb6daf33d3b967be60d8c209f5e5987 diff --cc manifest.uuid index 3f741fdf8b,96aac98d14..dc1ddc7e40 --- a/manifest.uuid +++ b/manifest.uuid @@@ -1,1 -1,1 +1,1 @@@ - 852a529a8b112049f67a3126f677c06ae4a22d73 -04f7da77c13925c1f1e287f4579bb85518297d81 ++a050e6f096d40aa5b6275797b96e62c228044f5a diff --cc src/tokenize.c index 5bee3d5a84,68e7b45172..9b3444ac82 --- a/src/tokenize.c +++ b/src/tokenize.c @@@ -18,12 -18,92 +18,92 @@@ #include "sqliteInt.h" #include + /* Character classes for tokenizing + ** + ** In the sqlite3GetToken() function, a switch() on aiClass[c] is implemented + ** using a lookup table, whereas a switch() directly on c uses a binary search. + ** The lookup table is much faster. To maximize speed, and to ensure that + ** a lookup table is used, all of the classes need to be small integers and + ** all of them need to be used within the switch. + */ -#define CC_X 0 /* The letter 'x' or 'X'. Start of x'01234fed' */ ++#define CC_X 0 /* The letter 'x', or start of BLOB literal */ + #define CC_KYWD 1 /* Alphabetics or '_'. Usable in a keyword */ + #define CC_ID 2 /* unicode characters usable in IDs */ + #define CC_DIGIT 3 /* Digits */ + #define CC_DOLLAR 4 /* '$' */ + #define CC_VARALPHA 5 /* '@', '#', ':'. Alphabetic SQL variables */ + #define CC_VARNUM 6 /* '?'. 
Numeric SQL variables */ + #define CC_SPACE 7 /* Space characters */ + #define CC_QUOTE 8 /* '"', '\'', or '`'. String literals, quoted ids */ + #define CC_QUOTE2 9 /* '['. [...] style quoted ids */ + #define CC_PIPE 10 /* '|'. Bitwise OR or concatenate */ + #define CC_MINUS 11 /* '-'. Minus or SQL-style comment */ + #define CC_LT 12 /* '<'. Part of < or <= or <> */ + #define CC_GT 13 /* '>'. Part of > or >= */ + #define CC_EQ 14 /* '='. Part of = or == */ + #define CC_BANG 15 /* '!'. Part of != */ + #define CC_SLASH 16 /* '/'. / or c-style comment */ + #define CC_LP 17 /* '(' */ + #define CC_RP 18 /* ')' */ + #define CC_SEMI 19 /* ';' */ + #define CC_PLUS 20 /* '+' */ + #define CC_STAR 21 /* '*' */ + #define CC_PERCENT 22 /* '%' */ + #define CC_COMMA 23 /* ',' */ + #define CC_AND 24 /* '&' */ + #define CC_TILDA 25 /* '~' */ + #define CC_DOT 26 /* '.' */ + #define CC_ILLEGAL 27 /* Illegal character */ + + static const unsigned char aiClass[] = { + #ifdef SQLITE_ASCII + /* x0 x1 x2 x3 x4 x5 x6 x7 x8 x9 xa xb xc xd xe xf */ + /* 0x */ 27, 27, 27, 27, 27, 27, 27, 27, 27, 7, 7, 27, 7, 7, 27, 27, + /* 1x */ 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, + /* 2x */ 7, 15, 8, 5, 4, 22, 24, 8, 17, 18, 21, 20, 23, 11, 26, 16, + /* 3x */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 5, 19, 12, 14, 13, 6, + /* 4x */ 5, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + /* 5x */ 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 9, 27, 27, 27, 1, + /* 6x */ 8, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + /* 7x */ 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 27, 10, 27, 25, 27, + /* 8x */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + /* 9x */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + /* Ax */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + /* Bx */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + /* Cx */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + /* Dx */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + /* Ex */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + /* Fx */ 2, 2, 2, 2, 2, 
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 + #endif + #ifdef SQLITE_EBCDIC + /* x0 x1 x2 x3 x4 x5 x6 x7 x8 x9 xa xb xc xd xe xf */ + /* 0x */ 27, 27, 27, 27, 27, 7, 27, 27, 27, 27, 27, 27, 7, 7, 27, 27, + /* 1x */ 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, + /* 2x */ 27, 27, 27, 27, 27, 7, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, + /* 3x */ 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, + /* 4x */ 7, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 12, 17, 20, 10, + /* 5x */ 24, 27, 27, 27, 27, 27, 27, 27, 27, 27, 15, 4, 21, 18, 19, 27, + /* 6x */ 11, 16, 27, 27, 27, 27, 27, 27, 27, 27, 27, 23, 22, 1, 13, 7, + /* 7x */ 27, 27, 27, 27, 27, 27, 27, 27, 27, 8, 5, 5, 5, 8, 14, 8, + /* 8x */ 27, 1, 1, 1, 1, 1, 1, 1, 1, 1, 27, 27, 27, 27, 27, 27, + /* 9x */ 27, 1, 1, 1, 1, 1, 1, 1, 1, 1, 27, 27, 27, 27, 27, 27, + /* Ax */ 25, 1, 1, 1, 1, 1, 1, 0, 1, 1, 27, 27, 27, 27, 27, 27, + /* Bx */ 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 9, 27, 27, 27, 27, 27, + /* Cx */ 27, 1, 1, 1, 1, 1, 1, 1, 1, 1, 27, 27, 27, 27, 27, 27, + /* Dx */ 27, 1, 1, 1, 1, 1, 1, 1, 1, 1, 27, 27, 27, 27, 27, 27, + /* Ex */ 27, 27, 1, 1, 1, 1, 1, 0, 1, 1, 27, 27, 27, 27, 27, 27, + /* Fx */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 27, 27, 27, 27, 27, 27, + #endif + }; + /* - ** The charMap() macro maps alphabetic characters into their + ** The charMap() macro maps alphabetic characters (only) into their ** lower-case ASCII equivalent. On ASCII machines, this is just ** an upper-to-lower case map. On EBCDIC machines we also need - ** to adjust the encoding. Only alphabetic characters and underscores - ** need to be translated. + ** to adjust the encoding. The mapping is only valid for alphabetics + ** which are the only characters for which this feature is used. + ** + ** Used by keywordhash.h */ #ifdef SQLITE_ASCII # define charMap(X) sqlite3UpperToLower[(unsigned char)X] @@@ -57,7 -137,7 +137,7 @@@ const unsigned char ebcdicToAscii[] = ** returned.
If the input is not a keyword, TK_ID is returned. ** ** The implementation of this routine was generated by a program, --** mkkeywordhash.h, located in the tool subdirectory of the distribution. ++** mkkeywordhash.c, located in the tool subdirectory of the distribution. ** The output of the mkkeywordhash.c program is written into a file ** named keywordhash.h and then included into this source file by ** the #include below. @@@ -110,13 -190,13 +190,15 @@@ int sqlite3IsIdChar(u8 c){ return IdCha /* --** Return the length of the token that begins at z[0]. ++** Return the length (in bytes) of the token that begins at z[0]. ** Store the token type in *tokenType before returning. */ int sqlite3GetToken(const unsigned char *z, int *tokenType){ int i, c; - switch( *z ){ - case ' ': case '\t': case '\n': case '\f': case '\r': { - switch( aiClass[*z] ){ ++ switch( aiClass[*z] ){ /* Switch on the character-class of the first byte ++ ** of the token. See the comment on the CC_ defines ++ ** above. */ + case CC_SPACE: { testcase( z[0]==' ' ); testcase( z[0]=='\t' ); testcase( z[0]=='\n' ); @@@ -348,8 -421,8 +423,20 @@@ if( n==0 ) *tokenType = TK_ILLEGAL; return i; } ++ case CC_KYWD: { ++ for(i=1; aiClass[z[i]]<=CC_KYWD; i++){} ++ if( IdChar(z[i]) ){ ++ /* This token started out using characters that can appear in keywords, ++ ** but z[i] is a character not allowed within keywords, so this must ++ ** be an identifier instead */ ++ i++; ++ break; ++ } ++ *tokenType = TK_ID; ++ return keywordCode((char*)z, i, tokenType); ++ } #ifndef SQLITE_OMIT_BLOB_LITERAL - case 'x': case 'X': { + case CC_X: { testcase( z[0]=='x' ); testcase( z[0]=='X' ); if( z[1]=='\'' ){ *tokenType = TK_BLOB; @@@ -361,20 -434,28 +448,22 @@@ if( z[i] ) i++; return i; } - /* Otherwise fall through to the next case */ - i = 1; - break; ++ /* If it is not a BLOB literal, then it must be an ID, since no ++ ** SQL keywords start with the letter 'x'. 
Fall through */ } #endif - case CC_KYWD: { - for(i=1; aiClass[z[i]]<=CC_KYWD; i++){} - if( IdChar(z[i]) ){ i++; break; } - *tokenType = TK_ID; - return keywordCode((char*)z, i, tokenType); - } + case CC_ID: { + i = 1; + break; + } default: { - if( !IdChar(*z) ){ - break; - } - for(i=1; IdChar(z[i]); i++){} - *tokenType = TK_ID; - return keywordCode((char*)z, i, tokenType); + *tokenType = TK_ILLEGAL; + return 1; } } - *tokenType = TK_ILLEGAL; - return 1; + while( IdChar(z[i]) ){ i++; } + *tokenType = TK_ID; + return i; } /*