Changes to help the tokenizer run about 33% faster.

author drh <drh@noemail.net>

Mon, 8 Feb 2016 19:36:46 +0000 (19:36 +0000)

committer drh <drh@noemail.net>

Mon, 8 Feb 2016 19:36:46 +0000 (19:36 +0000)
author drh <drh@noemail.net>
Mon, 8 Feb 2016 19:36:46 +0000 (19:36 +0000)
committer drh <drh@noemail.net>
Mon, 8 Feb 2016 19:36:46 +0000 (19:36 +0000)
diff --cc manifest

index dbd577f123aac27ad9c41c6ae39e4b4a9a0661e2,180d6f2232173cff2e5263a082e206a92b86c5e6..2902207223fa1fed30b818e33f3c2ad0fe1ae202
--- 1/manifest
--- 2/manifest
+++ b/manifest
@@@ -1,8 -1,8 +1,8 @@@
- C Fix\sharmless\scompiler\swarning\sin\s'srcck1'\stool\sand\sadd\sit\sto\sthe\sclean\stargets.
- D 2016-02-07T20:39:27.144
- -C Add\scode\sto\sget\sthe\stokenizer\scharacter-class\slogic\sworking\sfor\sEBCDIC.
- -D 2016-02-08T19:15:48.295
- -F Makefile.in 0a957a57243a3d55e96b1514e22ffae5db9ea116
++C Changes\sto\shelp\sthe\stokenizer\srun\sabout\s33%\sfaster.
++D 2016-02-08T19:36:46.659
+ +F Makefile.in dac2776c84e0d533b158a9af6e57e05c4a6b19f3
   F Makefile.linux-gcc 7bc79876b875010e8c8f9502eb935ca92aa3c434
- -F Makefile.msc a3f8092763bb5d0057f0f4feb6b7fcc19713e107
+ +F Makefile.msc fcf377286d910b47e072da1ac7945976337c0925
   F README.md 8ecc12493ff9f820cdea6520a9016001cb2e59b7
   F VERSION 866588d1edf0ccb5b0d33896974338f97564f719
   F aclocal.m4 a5c22d164aff7ed549d53a90fa56d56955281f50
@@@ -406,7 -406,7 +406,7 @@@ F src/test_windirent.c 8f5fada630348558
   F src/test_windirent.h b12055cab6227f7be10f5c19296f67c60cc5e2a5
   F src/test_wsd.c 41cadfd9d97fe8e3e4e44f61a4a8ccd6f7ca8fe9
   F src/threads.c bbfb74450643cb5372a43ad4f6cffd7e9dfcecb0
- F src/tokenize.c 214b783d6138e9f9fbb6b225ce9a376db3b03d42
- -F src/tokenize.c 5019666f8705e9f7135c6f1c1ffac95a1af76fa6
++F src/tokenize.c 813934be70597edfbb685ae08fc4c8b549cf5a1e
   F src/treeview.c dc39ccf04e9331237388b9cb73289c9d87ea050b
   F src/trigger.c e14840ee0c3e549e758ec9bf3e4146e166002280
   F src/update.c 310ca7adb86a7d1f2afae46905b21c83580f3e17
@@@ -1427,7 -1427,7 +1427,8 @@@ F tool/vdbe_profile.tcl 246d0da094856d7
   F tool/warnings-clang.sh f6aa929dc20ef1f856af04a730772f59283631d4
   F tool/warnings.sh 48bd54594752d5be3337f12c72f28d2080cb630b
   F tool/win/sqlite.vsix deb315d026cc8400325c5863eef847784a219a2f
- P ab269e720552483c5617906837e294c1be3e0a57
- R b0244c1b9f3d2cf89729e9f132d42a5e
- U mistachkin
- Z 9a8eb772a0a9cfffcb8487b60e9945a2
- -P ff406b9701ebe3a01834837f380641c6f0c495bc
- -R f322e625886c02a1fcd0df28b2c76f16
++P 852a529a8b112049f67a3126f677c06ae4a22d73 04f7da77c13925c1f1e287f4579bb85518297d81
++R 64edf86f87d772039cf70265114474cb
++T +closed 04f7da77c13925c1f1e287f4579bb85518297d81
+ U drh
- -Z 8351e730f91ff26fa5b93d74f8f175a3
++Z feb6daf33d3b967be60d8c209f5e5987
diff --cc manifest.uuid

index 3f741fdf8bc184b70eaef48bb9cefaea8d3102ef,96aac98d1441ae4e3ae9fe02d68c3613f4c50c71..dc1ddc7e402e55a9162996a56195d5cb0e0f8e95
--- 1/manifest.uuid
--- 2/manifest.uuid
+++ b/manifest.uuid
@@@ -1,1 -1,1 +1,1 @@@
- 852a529a8b112049f67a3126f677c06ae4a22d73
- -04f7da77c13925c1f1e287f4579bb85518297d81
++a050e6f096d40aa5b6275797b96e62c228044f5a
diff --cc src/tokenize.c

index 5bee3d5a849331049ed751a2b2219149654b260d,68e7b4517226323c954fd2c354e0879b492f8002..9b3444ac82a8e525604e4487ce36a1ab8ee5db82
--- 1/src/tokenize.c
--- 2/src/tokenize.c
+++ b/src/tokenize.c
@@@ -18,12 -18,92 +18,92 @@@
   #include "sqliteInt.h"
   #include <stdlib.h>
   
- -#define CC_X          0    /* The letter 'x' or 'X'.  Start of x'01234fed' */
+ /* Character classes for tokenizing
+ **
+ ** In the sqlite3GetToken() function, a switch() on aiClass[c] is implemented
+ ** using a lookup table, whereas a switch() directly on c uses a binary search.
+ ** The lookup table is much faster.  To maximize speed, and to ensure that
+ ** a lookup table is used, all of the classes need to be small integers and
+ ** all of them need to be used within the switch.
+ */
++#define CC_X          0    /* The letter 'x', or start of BLOB literal */
+ #define CC_KYWD       1    /* Alphabetics or '_'.  Usable in a keyword */
+ #define CC_ID         2    /* unicode characters usable in IDs */
+ #define CC_DIGIT      3    /* Digits */
+ #define CC_DOLLAR     4    /* '$' */
+ #define CC_VARALPHA   5    /* '@', '#', ':'.  Alphabetic SQL variables */
+ #define CC_VARNUM     6    /* '?'.  Numeric SQL variables */
+ #define CC_SPACE      7    /* Space characters */
+ #define CC_QUOTE      8    /* '"', '\'', or '`'.  String literals, quoted ids */
+ #define CC_QUOTE2     9    /* '['.   [...] style quoted ids */
+ #define CC_PIPE      10    /* '|'.   Bitwise OR or concatenate */
+ #define CC_MINUS     11    /* '-'.  Minus or SQL-style comment */
+ #define CC_LT        12    /* '<'.  Part of < or <= or <> */
+ #define CC_GT        13    /* '>'.  Part of > or >= */
+ #define CC_EQ        14    /* '='.  Part of = or == */
+ #define CC_BANG      15    /* '!'.  Part of != */
+ #define CC_SLASH     16    /* '/'.  / or C-style comment */
+ #define CC_LP        17    /* '(' */
+ #define CC_RP        18    /* ')' */
+ #define CC_SEMI      19    /* ';' */
+ #define CC_PLUS      20    /* '+' */
+ #define CC_STAR      21    /* '*' */
+ #define CC_PERCENT   22    /* '%' */
+ #define CC_COMMA     23    /* ',' */
+ #define CC_AND       24    /* '&' */
+ #define CC_TILDA     25    /* '~' */
+ #define CC_DOT       26    /* '.' */
+ #define CC_ILLEGAL   27    /* Illegal character */
+ 
+ static const unsigned char aiClass[] = {
+ #ifdef SQLITE_ASCII
+ /*         x0  x1  x2  x3  x4  x5  x6  x7  x8  x9  xa  xb  xc  xd  xe  xf */
+ /* 0x */   27, 27, 27, 27, 27, 27, 27, 27, 27,  7,  7, 27,  7,  7, 27, 27,
+ /* 1x */   27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
+ /* 2x */    7, 15,  8,  5,  4, 22, 24,  8, 17, 18, 21, 20, 23, 11, 26, 16,
+ /* 3x */    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  5, 19, 12, 14, 13,  6,
+ /* 4x */    5,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
+ /* 5x */    1,  1,  1,  1,  1,  1,  1,  1,  0,  1,  1,  9, 27, 27, 27,  1,
+ /* 6x */    8,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
+ /* 7x */    1,  1,  1,  1,  1,  1,  1,  1,  0,  1,  1, 27, 10, 27, 25, 27,
+ /* 8x */    2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
+ /* 9x */    2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
+ /* Ax */    2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
+ /* Bx */    2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
+ /* Cx */    2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
+ /* Dx */    2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
+ /* Ex */    2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
+ /* Fx */    2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2
+ #endif
+ #ifdef SQLITE_EBCDIC
+ /*         x0  x1  x2  x3  x4  x5  x6  x7  x8  x9  xa  xb  xc  xd  xe  xf */
+ /* 0x */   27, 27, 27, 27, 27,  7, 27, 27, 27, 27, 27, 27,  7,  7, 27, 27,
+ /* 1x */   27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
+ /* 2x */   27, 27, 27, 27, 27,  7, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
+ /* 3x */   27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
+ /* 4x */    7, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 12, 17, 20, 10,
+ /* 5x */   24, 27, 27, 27, 27, 27, 27, 27, 27, 27, 15,  4, 21, 18, 19, 27,
+ /* 6x */   11, 16, 27, 27, 27, 27, 27, 27, 27, 27, 27, 23, 22,  1, 13,  7,
+ /* 7x */   27, 27, 27, 27, 27, 27, 27, 27, 27,  8,  5,  5,  5,  8, 14,  8,
+ /* 8x */   27,  1,  1,  1,  1,  1,  1,  1,  1,  1, 27, 27, 27, 27, 27, 27,
+ /* 9x */   27,  1,  1,  1,  1,  1,  1,  1,  1,  1, 27, 27, 27, 27, 27, 27,
+ /* Ax */   25,  1,  1,  1,  1,  1,  1,  0,  1,  1, 27, 27, 27, 27, 27, 27,
+ /* Bx */   27, 27, 27, 27, 27, 27, 27, 27, 27, 27,  9, 27, 27, 27, 27, 27,
+ /* Cx */   27,  1,  1,  1,  1,  1,  1,  1,  1,  1, 27, 27, 27, 27, 27, 27,
+ /* Dx */   27,  1,  1,  1,  1,  1,  1,  1,  1,  1, 27, 27, 27, 27, 27, 27,
+ /* Ex */   27, 27,  1,  1,  1,  1,  1,  0,  1,  1, 27, 27, 27, 27, 27, 27,
+ /* Fx */    3,  3,  3,  3,  3,  3,  3,  3,  3,  3, 27, 27, 27, 27, 27, 27,
+ #endif
+ };
+ 
   /*
- ** The charMap() macro maps alphabetic characters into their
+ ** The charMap() macro maps alphabetic characters (only) into their
   ** lower-case ASCII equivalent.  On ASCII machines, this is just
   ** an upper-to-lower case map.  On EBCDIC machines we also need
- ** to adjust the encoding.  Only alphabetic characters and underscores
- ** need to be translated.
+ ** to adjust the encoding.  The mapping is only valid for alphabetics,
+ ** which are the only characters for which this feature is used.
+ **
+ ** Used by keywordhash.h
   */
   #ifdef SQLITE_ASCII
   # define charMap(X) sqlite3UpperToLower[(unsigned char)X]
@@@ -57,7 -137,7 +137,7 @@@ const unsigned char ebcdicToAscii[] = 
   ** returned.  If the input is not a keyword, TK_ID is returned.
   **
   ** The implementation of this routine was generated by a program,
--** mkkeywordhash.h, located in the tool subdirectory of the distribution.
++** mkkeywordhash.c, located in the tool subdirectory of the distribution.
   ** The output of the mkkeywordhash.c program is written into a file
   ** named keywordhash.h and then included into this source file by
   ** the #include below.
@@@ -110,13 -190,13 +190,15 @@@ int sqlite3IsIdChar(u8 c){ return IdCha
   
   
   /*
--** Return the length of the token that begins at z[0]. 
++** Return the length (in bytes) of the token that begins at z[0]. 
   ** Store the token type in *tokenType before returning.
   */
   int sqlite3GetToken(const unsigned char *z, int *tokenType){
     int i, c;
-   switch( *z ){
-     case ' ': case '\t': case '\n': case '\f': case '\r': {
- -  switch( aiClass[*z] ){
++  switch( aiClass[*z] ){  /* Switch on the character-class of the first byte
++                          ** of the token. See the comment on the CC_ defines
++                          ** above. */
+     case CC_SPACE: {
         testcase( z[0]==' ' );
         testcase( z[0]=='\t' );
         testcase( z[0]=='\n' );
@@@ -348,8 -421,8 +423,20 @@@
         if( n==0 ) *tokenType = TK_ILLEGAL;
         return i;
       }
++    case CC_KYWD: {
++      for(i=1; aiClass[z[i]]<=CC_KYWD; i++){}
++      if( IdChar(z[i]) ){
++        /* This token started out using characters that can appear in keywords,
++        ** but z[i] is a character not allowed within keywords, so this must
++        ** be an identifier instead */
++        i++;
++        break;
++      }
++      *tokenType = TK_ID;
++      return keywordCode((char*)z, i, tokenType);
++    }
   #ifndef SQLITE_OMIT_BLOB_LITERAL
-     case 'x': case 'X': {
+     case CC_X: {
         testcase( z[0]=='x' ); testcase( z[0]=='X' );
         if( z[1]=='\'' ){
           *tokenType = TK_BLOB;
@@@ -361,20 -434,28 +448,22 @@@
           if( z[i] ) i++;
           return i;
         }
-       /* Otherwise fall through to the next case */
- -      i = 1;
- -      break;
++      /* If it is not a BLOB literal, then it must be an ID, since no
++      ** SQL keywords start with the letter 'x'.  Fall through */
       }
   #endif
- -    case CC_KYWD: {
- -      for(i=1; aiClass[z[i]]<=CC_KYWD; i++){}
- -      if( IdChar(z[i]) ){ i++; break; }
- -      *tokenType = TK_ID;
- -      return keywordCode((char*)z, i, tokenType);
- -    }
+     case CC_ID: {
+       i = 1;
+       break;
+     }
       default: {
-       if( !IdChar(*z) ){
-         break;
-       }
-       for(i=1; IdChar(z[i]); i++){}
-       *tokenType = TK_ID;
-       return keywordCode((char*)z, i, tokenType);
+       *tokenType = TK_ILLEGAL;
+       return 1;
       }
     }
-   *tokenType = TK_ILLEGAL;
-   return 1;
+   while( IdChar(z[i]) ){ i++; }
+   *tokenType = TK_ID;
+   return i;
   }
   
   /*
author	drh <drh@noemail.net>
	Mon, 8 Feb 2016 19:36:46 +0000 (19:36 +0000)
committer	drh <drh@noemail.net>
	Mon, 8 Feb 2016 19:36:46 +0000 (19:36 +0000)
		1	2
manifest	patch \|	diff1 \|	diff2 \|	blob \| history
manifest.uuid	patch \|	diff1 \|	diff2 \|	blob \| history
src/tokenize.c	patch \|	diff1 \|	diff2 \|	blob \| history