From 34dcee65442abd1f48f19ea855189f5b642da6dd Mon Sep 17 00:00:00 2001 From: drh Date: Mon, 8 Feb 2016 19:15:48 +0000 Subject: [PATCH] Add code to get the tokenizer character-class logic working for EBCDIC. FossilOrigin-Name: 04f7da77c13925c1f1e287f4579bb85518297d81 --- manifest | 14 +++++++------- manifest.uuid | 2 +- src/tokenize.c | 42 ++++++++++++++++++++++++++++++++++++------ tool/mkkeywordhash.c | 16 ++++++++++++---- 4 files changed, 56 insertions(+), 18 deletions(-) diff --git a/manifest b/manifest index a3d1184319..180d6f2232 100644 --- a/manifest +++ b/manifest @@ -1,5 +1,5 @@ -C Faster\skeywordCode()\simplementation\sby\staking\sadvantage\sof\sthe\sfact\sthat\nthe\sinput\sis\salways\spure\sASCII\salphabetic\sand\sunderscore\sand\sthat\sthe\skeyword\ntable\sis\salways\supper-case. -D 2016-02-08T03:23:46.173 +C Add\scode\sto\sget\sthe\stokenizer\scharacter-class\slogic\sworking\sfor\sEBCDIC. +D 2016-02-08T19:15:48.295 F Makefile.in 0a957a57243a3d55e96b1514e22ffae5db9ea116 F Makefile.linux-gcc 7bc79876b875010e8c8f9502eb935ca92aa3c434 F Makefile.msc a3f8092763bb5d0057f0f4feb6b7fcc19713e107 @@ -406,7 +406,7 @@ F src/test_windirent.c 8f5fada630348558d5745b334702f301da1ffc61 F src/test_windirent.h b12055cab6227f7be10f5c19296f67c60cc5e2a5 F src/test_wsd.c 41cadfd9d97fe8e3e4e44f61a4a8ccd6f7ca8fe9 F src/threads.c bbfb74450643cb5372a43ad4f6cffd7e9dfcecb0 -F src/tokenize.c b3cfc123d65a5bf7ba615f74f28737ae2135620a +F src/tokenize.c 5019666f8705e9f7135c6f1c1ffac95a1af76fa6 F src/treeview.c dc39ccf04e9331237388b9cb73289c9d87ea050b F src/trigger.c e14840ee0c3e549e758ec9bf3e4146e166002280 F src/update.c 310ca7adb86a7d1f2afae46905b21c83580f3e17 @@ -1383,7 +1383,7 @@ F tool/lempar.c 3ec1463a034b37d87d782be5f6b8b10a3b1ecbe7 F tool/loadfts.c c3c64e4d5e90e8ba41159232c2189dba4be7b862 F tool/logest.c eef612f8adf4d0993dafed0416064cf50d5d33c6 F tool/mkautoconfamal.sh a29b14d54302b33fd892958f6895582ea90e4a45 -F tool/mkkeywordhash.c 4451824f4f68f8e8d89eba080e0c1a9cf83f7b62 +F 
tool/mkkeywordhash.c f7f3b342211ac6a14258b9726d5b97cf4f548f22 F tool/mkmsvcmin.tcl d57e6efc9428605f5418d0b235721ddf7b5d9c0b F tool/mkopcodec.tcl d1b6362bd3aa80d5520d4d6f3765badf01f6c43c F tool/mkopcodeh.tcl 385c62d78c38b2d92146dcb5abd319dbbc33506d @@ -1427,7 +1427,7 @@ F tool/vdbe_profile.tcl 246d0da094856d72d2c12efec03250d71639d19f F tool/warnings-clang.sh f6aa929dc20ef1f856af04a730772f59283631d4 F tool/warnings.sh 48bd54594752d5be3337f12c72f28d2080cb630b F tool/win/sqlite.vsix deb315d026cc8400325c5863eef847784a219a2f -P 9115baa1919584dc8ca25bbff54d3b65748a9631 -R b013689cd0826a67d194ddf9700064de +P ff406b9701ebe3a01834837f380641c6f0c495bc +R f322e625886c02a1fcd0df28b2c76f16 U drh -Z b5dc027d5e497c867b371327464bbd96 +Z 8351e730f91ff26fa5b93d74f8f175a3 diff --git a/manifest.uuid b/manifest.uuid index 290fa630a8..96aac98d14 100644 --- a/manifest.uuid +++ b/manifest.uuid @@ -1 +1 @@ -ff406b9701ebe3a01834837f380641c6f0c495bc \ No newline at end of file +04f7da77c13925c1f1e287f4579bb85518297d81 \ No newline at end of file diff --git a/src/tokenize.c b/src/tokenize.c index c4b36c4758..68e7b45172 100644 --- a/src/tokenize.c +++ b/src/tokenize.c @@ -18,7 +18,14 @@ #include "sqliteInt.h" #include -/* Character classes for tokenizing */ +/* Character classes for tokenizing +** +** In the sqlite3GetToken() function, a switch() on aiClass[c] is implemented +** using a lookup table, whereas a switch() directly on c uses a binary search. +** The lookup table is much faster. To maximize speed, and to ensure that +** a lookup table is used, all of the classes need to be small integers and +** all of them need to be used within the switch. +*/ #define CC_X 0 /* The letter 'x' or 'X'. Start of x'01234fed' */ #define CC_KYWD 1 /* Alphabetics or '_'. 
Usable in a keyword */ #define CC_ID 2 /* unicode characters usable in IDs */ @@ -49,6 +56,7 @@ #define CC_ILLEGAL 27 /* Illegal character */ static const unsigned char aiClass[] = { +#ifdef SQLITE_ASCII /* x0 x1 x2 x3 x4 x5 x6 x7 x8 x9 xa xb xc xd xe xf */ /* 0x */ 27, 27, 27, 27, 27, 27, 27, 27, 27, 7, 7, 27, 7, 7, 27, 27, /* 1x */ 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, @@ -66,14 +74,36 @@ static const unsigned char aiClass[] = { /* Dx */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* Ex */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* Fx */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 +#endif +#ifdef SQLITE_EBCDIC +/* x0 x1 x2 x3 x4 x5 x6 x7 x8 x9 xa xb xc xd xe xf */ +/* 0x */ 27, 27, 27, 27, 27, 7, 27, 27, 27, 27, 27, 27, 7, 7, 27, 27, +/* 1x */ 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, +/* 2x */ 27, 27, 27, 27, 27, 7, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, +/* 3x */ 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, +/* 4x */ 7, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 12, 17, 20, 10, +/* 5x */ 24, 27, 27, 27, 27, 27, 27, 27, 27, 27, 15, 4, 21, 18, 19, 27, +/* 6x */ 11, 16, 27, 27, 27, 27, 27, 27, 27, 27, 27, 23, 22, 1, 13, 7, +/* 7x */ 27, 27, 27, 27, 27, 27, 27, 27, 27, 8, 5, 5, 5, 8, 14, 8, +/* 8x */ 27, 1, 1, 1, 1, 1, 1, 1, 1, 1, 27, 27, 27, 27, 27, 27, +/* 9x */ 27, 1, 1, 1, 1, 1, 1, 1, 1, 1, 27, 27, 27, 27, 27, 27, +/* Ax */ 25, 1, 1, 1, 1, 1, 1, 0, 1, 1, 27, 27, 27, 27, 27, 27, +/* Bx */ 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 9, 27, 27, 27, 27, 27, +/* Cx */ 27, 1, 1, 1, 1, 1, 1, 1, 1, 1, 27, 27, 27, 27, 27, 27, +/* Dx */ 27, 1, 1, 1, 1, 1, 1, 1, 1, 1, 27, 27, 27, 27, 27, 27, +/* Ex */ 27, 27, 1, 1, 1, 1, 1, 0, 1, 1, 27, 27, 27, 27, 27, 27, +/* Fx */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 27, 27, 27, 27, 27, 27, +#endif }; /* -** The charMap() macro maps alphabetic characters into their +** The charMap() macro maps alphabetic characters (only) into their ** lower-case ASCII equivalent. 
On ASCII machines, this is just ** an upper-to-lower case map. On EBCDIC machines we also need -** to adjust the encoding. Only alphabetic characters and underscores -** need to be translated. +** to adjust the encoding. The mapping is only valid for alphabetics +** which are the only characters for which this feature is used. +** +** Used by keywordhash.h */ #ifdef SQLITE_ASCII # define charMap(X) sqlite3UpperToLower[(unsigned char)X] @@ -410,7 +440,7 @@ int sqlite3GetToken(const unsigned char *z, int *tokenType){ #endif case CC_KYWD: { for(i=1; aiClass[z[i]]<=CC_KYWD; i++){} - if( aiClass[z[i]]<=CC_DOLLAR ){ i++; break; } + if( IdChar(z[i]) ){ i++; break; } *tokenType = TK_ID; return keywordCode((char*)z, i, tokenType); } @@ -423,7 +453,7 @@ int sqlite3GetToken(const unsigned char *z, int *tokenType){ return 1; } } - while( aiClass[z[i]]<=CC_DOLLAR ){ i++; } + while( IdChar(z[i]) ){ i++; } *tokenType = TK_ID; return i; } diff --git a/tool/mkkeywordhash.c b/tool/mkkeywordhash.c index 43455ef97c..7e5287ea54 100644 --- a/tool/mkkeywordhash.c +++ b/tool/mkkeywordhash.c @@ -277,7 +277,10 @@ static Keyword aKeywordTable[] = { /* Number of keywords */ static int nKeyword = (sizeof(aKeywordTable)/sizeof(aKeywordTable[0])); -/* Map all alphabetic characters into the same case */ +/* Map all alphabetic characters into lower-case for hashing. This is +** only valid for alphabetics. In particular it does not work for '_' +** and so the hash cannot be on a keyword position that might be an '_'. +*/ #define charMap(X) (0x20|(X)) /* @@ -565,16 +568,21 @@ int main(int argc, char **argv){ } printf("%s };\n", j==0 ? 
"" : "\n"); - printf(" int h, i, j;\n"); + printf(" int i, j;\n"); printf(" const char *zKW;\n"); printf(" if( n>=2 ){\n"); - printf(" h = ((charMap(z[0])*4) ^ (charMap(z[n-1])*3) ^ n) %% %d;\n", + printf(" i = ((charMap(z[0])*4) ^ (charMap(z[n-1])*3) ^ n) %% %d;\n", bestSize); - printf(" for(i=((int)aHash[h])-1; i>=0; i=((int)aNext[i])-1){\n"); + printf(" for(i=((int)aHash[i])-1; i>=0; i=((int)aNext[i])-1){\n"); printf(" if( aLen[i]!=n ) continue;\n"); printf(" j = 0;\n"); printf(" zKW = &zText[aOffset[i]];\n"); + printf("#ifdef SQLITE_ASCII\n"); printf(" while( j