* the previous token to make the estimate a tiny bit more precise.
*/
+/* It is not safe to call isspace(), tolower(), or isalnum() on
+** hi-bit-set characters. This is the same solution used in the
+** tokenizer.
+*/
+/* TODO(shess) The snippet-generation code should be using the
+** tokenizer-generated tokens rather than doing its own local
+** tokenization.
+*/
+/* TODO(shess) Is __isascii() a portable version of (c&0x80)==0? */
+static int safe_isspace(char c){  /* isspace() restricted to bytes whose 0x80 bit is clear */
+ return (c&0x80)==0 ? isspace(c) : 0;  /* hi bit set: answer "not a space" without calling isspace() */
+}
+static int safe_tolower(char c){  /* tolower() restricted to bytes whose 0x80 bit is clear */
+ return (c&0x80)==0 ? tolower(c) : c;  /* hi bit set: return c unchanged without calling tolower() */
+}
+static int safe_isalnum(char c){  /* isalnum() restricted to bytes whose 0x80 bit is clear */
+ return (c&0x80)==0 ? isalnum(c) : 0;  /* hi bit set: answer "not alphanumeric" without calling isalnum() */
+}
+
typedef enum DocListType {
DL_DOCIDS, /* docids only */
DL_POSITIONS, /* docids + positions */
return 0;
}
case ' ': case '\t': case '\n': case '\f': case '\r': {
- for(i=1; isspace(z[i]); i++){}
+ for(i=1; safe_isspace(z[i]); i++){}
*tokenType = TOKEN_SPACE;
return i;
}
int i, j;
if( azIn ){
for(i=0, j=-1; azIn[i]; i++){
- if( isalnum(azIn[i][0]) || azIn[i][1] ){
+ if( safe_isalnum(azIn[i][0]) || azIn[i][1] ){
dequoteString(azIn[i]);
if( j>=0 ){
azIn[j] = azIn[i];
** s[] is t[].
*/
static int startsWith(const char *s, const char *t){  /* nonzero if s, after leading whitespace, begins with t as a whole word */
- while( isspace(*s) ){ s++; }
+ while( safe_isspace(*s) ){ s++; }  /* skip leading whitespace in s; hi-bit bytes are never treated as space */
while( *t ){
- if( tolower(*s++)!=tolower(*t++) ) return 0;
+ if( safe_tolower(*s++)!=safe_tolower(*t++) ) return 0;  /* bytes below 0x80 are compared after tolower(); hi-bit bytes must be identical */
}
- return *s!='_' && !isalnum(*s);
+ return *s!='_' && !safe_isalnum(*s);  /* the byte following the match must be neither '_' nor alphanumeric; a hi-bit byte is accepted here */
}
/*
char *p;
pSpec->azContentColumn[i] = sqlite3_mprintf("c%d%s", i, azArg[i]);
for (p = pSpec->azContentColumn[i]; *p ; ++p) {
- if( !isalnum(*p) ) *p = '_';
+ if( !safe_isalnum(*p) ) *p = '_';
}
}
}
}
for(i=1; i<=10; i++){
- if( isspace(zDoc[iBreak-i]) ){
+ if( safe_isspace(zDoc[iBreak-i]) ){
return iBreak - i + 1;
}
- if( isspace(zDoc[iBreak+i]) ){
+ if( safe_isspace(zDoc[iBreak+i]) ){
return iBreak + i + 1;
}
}
*/
static void appendWhiteSpace(StringBuffer *p){  /* append one " " unless the buffer is empty or already ends in whitespace */
if( p->len==0 ) return;  /* empty buffer: nothing is appended */
- if( isspace(p->s[p->len-1]) ) return;
+ if( safe_isspace(p->s[p->len-1]) ) return;  /* last byte is already whitespace; a hi-bit last byte does not count as whitespace */
append(p, " ");
}
** Remove white space from the end of the StringBuffer
*/
static void trimWhiteSpace(StringBuffer *p){  /* drop trailing whitespace bytes by shrinking p->len */
- while( p->len>0 && isspace(p->s[p->len-1]) ){
+ while( p->len>0 && safe_isspace(p->s[p->len-1]) ){  /* stops when the buffer is empty or the last byte is non-space (hi-bit bytes count as non-space) */
p->len--;  /* only the length is reduced; nothing is written to p->s here */
}
}
# define TRACE(A)
#endif
+/* It is not safe to call isspace(), tolower(), or isalnum() on
+** hi-bit-set characters. This is the same solution used in the
+** tokenizer.
+*/
+/* TODO(shess) The snippet-generation code should be using the
+** tokenizer-generated tokens rather than doing its own local
+** tokenization.
+*/
+/* TODO(shess) Is __isascii() a portable version of (c&0x80)==0? */
+static int safe_isspace(char c){  /* isspace() restricted to bytes whose 0x80 bit is clear */
+ return (c&0x80)==0 ? isspace(c) : 0;  /* hi bit set: answer "not a space" without calling isspace() */
+}
+static int safe_tolower(char c){  /* tolower() restricted to bytes whose 0x80 bit is clear */
+ return (c&0x80)==0 ? tolower(c) : c;  /* hi bit set: return c unchanged without calling tolower() */
+}
+static int safe_isalnum(char c){  /* isalnum() restricted to bytes whose 0x80 bit is clear */
+ return (c&0x80)==0 ? isalnum(c) : 0;  /* hi bit set: answer "not alphanumeric" without calling isalnum() */
+}
+
typedef enum DocListType {
DL_DOCIDS, /* docids only */
DL_POSITIONS, /* docids + positions */
static int endsInWhiteSpace(StringBuffer *p){  /* nonzero if the buffer is non-empty and its final element is whitespace */
return stringBufferLength(p)>0 &&
- isspace(stringBufferData(p)[stringBufferLength(p)-1]);
+ safe_isspace(stringBufferData(p)[stringBufferLength(p)-1]);  /* evaluated only when the length is >0, so the index is never -1; hi-bit bytes count as non-space */
}
/* If the StringBuffer ends in something other than white space, add a
return 0;
}
case ' ': case '\t': case '\n': case '\f': case '\r': {
- for(i=1; isspace(z[i]); i++){}
+ for(i=1; safe_isspace(z[i]); i++){}
*tokenType = TOKEN_SPACE;
return i;
}
int i, j;
if( azIn ){
for(i=0, j=-1; azIn[i]; i++){
- if( isalnum(azIn[i][0]) || azIn[i][1] ){
+ if( safe_isalnum(azIn[i][0]) || azIn[i][1] ){
dequoteString(azIn[i]);
if( j>=0 ){
azIn[j] = azIn[i];
** s[] is t[].
*/
static int startsWith(const char *s, const char *t){  /* nonzero if s, after leading whitespace, begins with t as a whole word */
- while( isspace(*s) ){ s++; }
+ while( safe_isspace(*s) ){ s++; }  /* skip leading whitespace in s; hi-bit bytes are never treated as space */
while( *t ){
- if( tolower(*s++)!=tolower(*t++) ) return 0;
+ if( safe_tolower(*s++)!=safe_tolower(*t++) ) return 0;  /* bytes below 0x80 are compared after tolower(); hi-bit bytes must be identical */
}
- return *s!='_' && !isalnum(*s);
+ return *s!='_' && !safe_isalnum(*s);  /* the byte following the match must be neither '_' nor alphanumeric; a hi-bit byte is accepted here */
}
/*
char *p;
pSpec->azContentColumn[i] = sqlite3_mprintf("c%d%s", i, azArg[i]);
for (p = pSpec->azContentColumn[i]; *p ; ++p) {
- if( !isalnum(*p) ) *p = '_';
+ if( !safe_isalnum(*p) ) *p = '_';
}
}
}
}
for(i=1; i<=10; i++){
- if( isspace(zDoc[iBreak-i]) ){
+ if( safe_isspace(zDoc[iBreak-i]) ){
return iBreak - i + 1;
}
- if( isspace(zDoc[iBreak+i]) ){
+ if( safe_isspace(zDoc[iBreak+i]) ){
return iBreak + i + 1;
}
}
-C Assume\sthe\smalloc-failed\sflag\scannot\salready\sbe\sset\swhen\scalling\ssqlite3_errmsg(16)().\s(CVS\s3745)
-D 2007-03-29T15:00:53
+C Don't\scall\sctype\sfunctions\son\shi-bit\schars.\s\sSome\splatforms\sraise\nassertions\swhen\sthis\soccurs,\sand\sit's\salmost\scertainly\snot\sthe\sright\nthing\sto\sdo\sin\sthe\sfirst\splace.\s(CVS\s3746)
+D 2007-03-29T16:30:39
F Makefile.in 2f2c3bf69faf0ae7b8e8af4f94f1986849034530
F Makefile.linux-gcc 2d8574d1ba75f129aba2019f0b959db380a90935
F README 9c4e2d6706bdcc3efdd773ce752a8cdab4f90028
F ext/fts1/README.txt 20ac73b006a70bcfd80069bdaf59214b6cf1db5e
F ext/fts1/ft_hash.c 3927bd880e65329bdc6f506555b228b28924921b
F ext/fts1/ft_hash.h 1a35e654a235c2c662d3ca0dfc3138ad60b8b7d5
-F ext/fts1/fts1.c 0aab3cf20eefd38935c8f525494d689cb2785f1d
+F ext/fts1/fts1.c 7585d9cb7ad7bcdf162936ab1fd64868f2f55ea5
F ext/fts1/fts1.h 6060b8f62c1d925ea8356cb1a6598073eb9159a6
F ext/fts1/fts1_hash.c 3196cee866edbebb1c0521e21672e6d599965114
F ext/fts1/fts1_hash.h 957d378355ed29f672cd5add012ce8b088a5e089
F ext/fts1/simple_tokenizer.c 1844d72f7194c3fd3d7e4173053911bf0661b70d
F ext/fts1/tokenizer.h 0c53421b832366d20d720d21ea3e1f6e66a36ef9
F ext/fts2/README.txt 8c18f41574404623b76917b9da66fcb0ab38328d
-F ext/fts2/fts2.c de8321a2ad1edea1f0dd223cb86cf008451784a4
+F ext/fts2/fts2.c 2e3cb46d28b0dd17b2ad3b48409618ace73caec6
F ext/fts2/fts2.h bbdab26d34f91974d5b9ade8b7836c140a7c4ce1
F ext/fts2/fts2_hash.c b3f22116d4ef0bc8f2da6e3fdc435c86d0951a9b
F ext/fts2/fts2_hash.h e283308156018329f042816eb09334df714e105e
F test/fts1f.test 2d6cb10d8b7a4e6edc321bbdb3982f1f48774714
F test/fts1i.test 6bfe08cdfdced063a39a50c8601da65e6274d879
F test/fts1j.test e4c0ffcd0ba2adce09c6b7b43ffd0749b5fda5c7
+F test/fts1k.test fdf295cb797ba6a2ef81ec41cb98df0ceb2e572c
F test/fts1porter.test d86e9c3e0c7f8ff95add6582b4b585fb4e02b96d
F test/fts2a.test 103fc178d134c54c44c1938a4331e9e2030792d9
F test/fts2b.test 964abc0236c849c07ca1ae496bb25c268ae94816
F test/fts2h.test 223af921323b409d4b5b18ff4e51619541b174bb
F test/fts2i.test 1b22451d1f13f7c509baec620dc3a4a754885dd6
F test/fts2j.test f68d7611f76309bc8b94170f3740d9fbbc061d9b
+F test/fts2l.test 4c53c89ce3919003765ff4fd8d98ecf724d97dd3
F test/func.test 019d706b2458dfdf239c74cc31143446de1ee44a
F test/hook.test 7e7645fd9a033f79cce8fdff151e32715e7ec50a
F test/in.test 369cb2aa1eab02296b4ec470732fe8c131260b1d
F www/vdbe.tcl 87a31ace769f20d3627a64fa1fade7fed47b90d0
F www/version3.tcl 890248cf7b70e60c383b0e84d77d5132b3ead42b
F www/whentouse.tcl 97e2b5cd296f7d8057e11f44427dea8a4c2db513
-P 3714ac173289e580a0302a5a3beac05823d92c5b
-R 3cfcb502e90a93f72d96670b4207913a
-U danielk1977
-Z 7e6377bdbc94cfb816f281bbc3868b86
+P 54fa22273d551e00e1abd86992ff7c62ec4e0daf
+R 6645d4541d0d9e478c5b564689374f5f
+U shess
+Z 5e17544799ed91760b443021ffc206bc
-54fa22273d551e00e1abd86992ff7c62ec4e0daf
\ No newline at end of file
+f6c3abdc6c5e916e5366ba28fb1cd06ca3554303
\ No newline at end of file
--- /dev/null
+# 2007 March 28
+#
+# The author disclaims copyright to this source code.
+#
+#*************************************************************************
+# This file implements regression tests for SQLite library. The focus
+# of this script is testing isspace/isalnum/tolower problems with the
+# FTS1 module. Unfortunately, this code isn't a really principled set
+# of tests, because it's impossible to know where new uses of these
+# functions might appear.
+#
+# $Id: fts1k.test,v 1.1 2007/03/29 16:30:41 shess Exp $
+#
+
+set testdir [file dirname $argv0]
+source $testdir/tester.tcl
+
+# If SQLITE_ENABLE_FTS1 is not defined, omit this file.
+ifcapable !fts1 {
+ finish_test
+ return
+}
+
+# Tests that startsWith() (calls isspace, tolower, isalnum) can handle
+# hi-bit chars. parseSpec() also calls isalnum here.
+do_test fts1k-1.1 {
+ execsql "CREATE VIRTUAL TABLE t1 USING fts1(content, \x80)"
+} {}
+
+# Additionally tests isspace() call in getToken(), and isalnum() call
+# in tokenListToIdList().
+do_test fts1k-1.2 {
+ catch {
+ execsql "CREATE VIRTUAL TABLE t2 USING fts1(content, tokenize \x80)"
+ }
+ sqlite3_errmsg $DB
+} "unknown tokenizer: \x80"
+
+# Additionally test final isalnum() in startsWith().
+do_test fts1k-1.3 {
+ execsql "CREATE VIRTUAL TABLE t3 USING fts1(content, tokenize\x80)"
+} {}
+
+# The snippet-generation code has calls to isspace() which are sort of
+# hard to get to. It finds convenient breakpoints by starting ~40
+# chars before and after the matched term, and scanning ~10 chars
+# around that position for isspace() characters. The long word with
+# embedded hi-bit chars causes one of these isspace() calls to be
+# exercised. The version with a couple extra spaces should cause the
+# other isspace() call to be exercised. [Both cases have been tested
+# in the debugger, but I'm hoping to continue to catch it if simple
+# constant changes change things slightly.]
+#
+# The trailing and leading hi-bit chars help with code which tests for
+# isspace() to coalesce multiple spaces.
+#
+# NOTE: phrase2 must differ from phrase1 by the extra spaces around
+# "target"; the whitespace inside the quotes is significant, and the
+# two expected snippet lengths below (111 vs 117) depend on it.
+
+set word "\x80xxxxx\x80xxxxx\x80xxxxx\x80xxxxx\x80xxxxx\x80xxxxx\x80"
+set phrase1 "$word $word $word target $word $word $word"
+set phrase2 "$word $word $word    target    $word $word $word"
+
+db eval {CREATE VIRTUAL TABLE t4 USING fts1(content)}
+db eval "INSERT INTO t4 (content) VALUES ('$phrase1')"
+db eval "INSERT INTO t4 (content) VALUES ('$phrase2')"
+
+do_test fts1k-1.4 {
+ execsql {SELECT rowid, length(snippet(t4)) FROM t4 WHERE t4 MATCH 'target'}
+} {1 111 2 117}
+
+finish_test
--- /dev/null
+# 2007 March 28
+#
+# The author disclaims copyright to this source code.
+#
+#*************************************************************************
+# This file implements regression tests for SQLite library. The focus
+# of this script is testing isspace/isalnum/tolower problems with the
+# FTS2 module. Unfortunately, this code isn't a really principled set
+# of tests, because it's impossible to know where new uses of these
+# functions might appear.
+#
+# $Id: fts2l.test,v 1.1 2007/03/29 16:30:41 shess Exp $
+#
+
+set testdir [file dirname $argv0]
+source $testdir/tester.tcl
+
+# If SQLITE_ENABLE_FTS2 is not defined, omit this file.
+ifcapable !fts2 {
+ finish_test
+ return
+}
+
+# Tests that startsWith() (calls isspace, tolower, isalnum) can handle
+# hi-bit chars. parseSpec() also calls isalnum here.
+do_test fts2l-1.1 {
+ execsql "CREATE VIRTUAL TABLE t1 USING fts2(content, \x80)"
+} {}
+
+# Additionally tests isspace() call in getToken(), and isalnum() call
+# in tokenListToIdList().
+do_test fts2l-1.2 {
+ catch {
+ execsql "CREATE VIRTUAL TABLE t2 USING fts2(content, tokenize \x80)"
+ }
+ sqlite3_errmsg $DB
+} "unknown tokenizer: \x80"
+
+# Additionally test final isalnum() in startsWith().
+do_test fts2l-1.3 {
+ execsql "CREATE VIRTUAL TABLE t3 USING fts2(content, tokenize\x80)"
+} {}
+
+# The snippet-generation code has calls to isspace() which are sort of
+# hard to get to. It finds convenient breakpoints by starting ~40
+# chars before and after the matched term, and scanning ~10 chars
+# around that position for isspace() characters. The long word with
+# embedded hi-bit chars causes one of these isspace() calls to be
+# exercised. The version with a couple extra spaces should cause the
+# other isspace() call to be exercised. [Both cases have been tested
+# in the debugger, but I'm hoping to continue to catch it if simple
+# constant changes change things slightly.]
+#
+# The trailing and leading hi-bit chars help with code which tests for
+# isspace() to coalesce multiple spaces.
+#
+# NOTE: phrase2 must differ from phrase1 by the extra spaces around
+# "target"; the whitespace inside the quotes is significant, and the
+# two expected snippet lengths below (111 vs 117) depend on it.
+
+set word "\x80xxxxx\x80xxxxx\x80xxxxx\x80xxxxx\x80xxxxx\x80xxxxx\x80"
+set phrase1 "$word $word $word target $word $word $word"
+set phrase2 "$word $word $word    target    $word $word $word"
+
+db eval {CREATE VIRTUAL TABLE t4 USING fts2(content)}
+db eval "INSERT INTO t4 (content) VALUES ('$phrase1')"
+db eval "INSERT INTO t4 (content) VALUES ('$phrase2')"
+
+do_test fts2l-1.4 {
+ execsql {SELECT rowid, length(snippet(t4)) FROM t4 WHERE t4 MATCH 'target'}
+} {1 111 2 117}
+
+finish_test