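Don't call ctype functions on hi-bit chars. Some platforms raise assertions when this occurs, and it's almost certainly not the right thing to do in the first place. (CVS 3746)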

author shess <shess@noemail.net>

Thu, 29 Mar 2007 16:30:38 +0000 (16:30 +0000)

committer shess <shess@noemail.net>

Thu, 29 Mar 2007 16:30:38 +0000 (16:30 +0000)
author shess <shess@noemail.net>
Thu, 29 Mar 2007 16:30:38 +0000 (16:30 +0000)
committer shess <shess@noemail.net>
Thu, 29 Mar 2007 16:30:38 +0000 (16:30 +0000)
diff --git a/ext/fts1/fts1.c b/ext/fts1/fts1.c

index 286655253a7e711fc2b41cfc7dbc81c55cd232f5..ddd4529c7eea7d8c4545d7a94a36488346bcb58f 100644 (file)
--- a/ext/fts1/fts1.c
+++ b/ext/fts1/fts1.c
@@ -177,6 +177,25 @@ static int getVarint32(const char *p, int *pi){
   * the previous token to make the estimate a tiny bit more precise.
  */
  
+/* It is not safe to call isspace(), tolower(), or isalnum() on
+** hi-bit-set characters.  This is the same solution used in the
+** tokenizer.
+*/
+/* TODO(shess) The snippet-generation code should be using the
+** tokenizer-generated tokens rather than doing its own local
+** tokenization.
+*/
+/* TODO(shess) Is __isascii() a portable version of (c&0x80)==0? */
+static int safe_isspace(char c){
+  return (c&0x80)==0 ? isspace(c) : 0;
+}
+static int safe_tolower(char c){
+  return (c&0x80)==0 ? tolower(c) : c;
+}
+static int safe_isalnum(char c){
+  return (c&0x80)==0 ? isalnum(c) : 0;
+}
+
  typedef enum DocListType {
    DL_DOCIDS,              /* docids only */
    DL_POSITIONS,           /* docids + positions */
@@ -1536,7 +1555,7 @@ static int getToken(const char *z, int *tokenType){
        return 0;
      }
      case ' ': case '\t': case '\n': case '\f': case '\r': {
-      for(i=1; isspace(z[i]); i++){}
+      for(i=1; safe_isspace(z[i]); i++){}
        *tokenType = TOKEN_SPACE;
        return i;
      }
@@ -1688,7 +1707,7 @@ static void tokenListToIdList(char **azIn){
    int i, j;
    if( azIn ){
      for(i=0, j=-1; azIn[i]; i++){
-      if( isalnum(azIn[i][0]) || azIn[i][1] ){
+      if( safe_isalnum(azIn[i][0]) || azIn[i][1] ){
          dequoteString(azIn[i]);
          if( j>=0 ){
            azIn[j] = azIn[i];
@@ -1737,11 +1756,11 @@ static char *firstToken(char *zIn, char **pzTail){
  ** s[] is t[].
  */
  static int startsWith(const char *s, const char *t){
-  while( isspace(*s) ){ s++; }
+  while( safe_isspace(*s) ){ s++; }
    while( *t ){
-    if( tolower(*s++)!=tolower(*t++) ) return 0;
+    if( safe_tolower(*s++)!=safe_tolower(*t++) ) return 0;
    }
-  return *s!='_' && !isalnum(*s);
+  return *s!='_' && !safe_isalnum(*s);
  }
  
  /*
@@ -1853,7 +1872,7 @@ static int parseSpec(TableSpec *pSpec, int argc, const char *const*argv,
      char *p;
      pSpec->azContentColumn[i] = sqlite3_mprintf("c%d%s", i, azArg[i]);
      for (p = pSpec->azContentColumn[i]; *p ; ++p) {
-      if( !isalnum(*p) ) *p = '_';
+      if( !safe_isalnum(*p) ) *p = '_';
      }
    }
  
@@ -2330,10 +2349,10 @@ static int wordBoundary(
      }
    }
    for(i=1; i<=10; i++){
-    if( isspace(zDoc[iBreak-i]) ){
+    if( safe_isspace(zDoc[iBreak-i]) ){
        return iBreak - i + 1;
      }
-    if( isspace(zDoc[iBreak+i]) ){
+    if( safe_isspace(zDoc[iBreak+i]) ){
        return iBreak + i + 1;
      }
    }
@@ -2346,7 +2365,7 @@ static int wordBoundary(
  */
  static void appendWhiteSpace(StringBuffer *p){
    if( p->len==0 ) return;
-  if( isspace(p->s[p->len-1]) ) return;
+  if( safe_isspace(p->s[p->len-1]) ) return;
    append(p, " ");
  }
  
@@ -2354,7 +2373,7 @@ static void appendWhiteSpace(StringBuffer *p){
  ** Remove white space from teh end of the StringBuffer
  */
  static void trimWhiteSpace(StringBuffer *p){
-  while( p->len>0 && isspace(p->s[p->len-1]) ){
+  while( p->len>0 && safe_isspace(p->s[p->len-1]) ){
      p->len--;
    }
  }
diff --git a/ext/fts2/fts2.c b/ext/fts2/fts2.c

index 3f49a2958bd65131de8abbb07cbf83dde5f169b4..2955e731a1e154748522ed3474acc8af6b543527 100644 (file)
--- a/ext/fts2/fts2.c
+++ b/ext/fts2/fts2.c
@@ -304,6 +304,25 @@ SQLITE_EXTENSION_INIT1
  # define TRACE(A)
  #endif
  
+/* It is not safe to call isspace(), tolower(), or isalnum() on
+** hi-bit-set characters.  This is the same solution used in the
+** tokenizer.
+*/
+/* TODO(shess) The snippet-generation code should be using the
+** tokenizer-generated tokens rather than doing its own local
+** tokenization.
+*/
+/* TODO(shess) Is __isascii() a portable version of (c&0x80)==0? */
+static int safe_isspace(char c){
+  return (c&0x80)==0 ? isspace(c) : 0;
+}
+static int safe_tolower(char c){
+  return (c&0x80)==0 ? tolower(c) : c;
+}
+static int safe_isalnum(char c){
+  return (c&0x80)==0 ? isalnum(c) : 0;
+}
+
  typedef enum DocListType {
    DL_DOCIDS,              /* docids only */
    DL_POSITIONS,           /* docids + positions */
@@ -504,7 +523,7 @@ static void appendList(StringBuffer *sb, int nString, char **azString){
  
  static int endsInWhiteSpace(StringBuffer *p){
    return stringBufferLength(p)>0 &&
-    isspace(stringBufferData(p)[stringBufferLength(p)-1]);
+    safe_isspace(stringBufferData(p)[stringBufferLength(p)-1]);
  }
  
  /* If the StringBuffer ends in something other than white space, add a
@@ -2194,7 +2213,7 @@ static int getToken(const char *z, int *tokenType){
        return 0;
      }
      case ' ': case '\t': case '\n': case '\f': case '\r': {
-      for(i=1; isspace(z[i]); i++){}
+      for(i=1; safe_isspace(z[i]); i++){}
        *tokenType = TOKEN_SPACE;
        return i;
      }
@@ -2346,7 +2365,7 @@ static void tokenListToIdList(char **azIn){
    int i, j;
    if( azIn ){
      for(i=0, j=-1; azIn[i]; i++){
-      if( isalnum(azIn[i][0]) || azIn[i][1] ){
+      if( safe_isalnum(azIn[i][0]) || azIn[i][1] ){
          dequoteString(azIn[i]);
          if( j>=0 ){
            azIn[j] = azIn[i];
@@ -2395,11 +2414,11 @@ static char *firstToken(char *zIn, char **pzTail){
  ** s[] is t[].
  */
  static int startsWith(const char *s, const char *t){
-  while( isspace(*s) ){ s++; }
+  while( safe_isspace(*s) ){ s++; }
    while( *t ){
-    if( tolower(*s++)!=tolower(*t++) ) return 0;
+    if( safe_tolower(*s++)!=safe_tolower(*t++) ) return 0;
    }
-  return *s!='_' && !isalnum(*s);
+  return *s!='_' && !safe_isalnum(*s);
  }
  
  /*
@@ -2511,7 +2530,7 @@ static int parseSpec(TableSpec *pSpec, int argc, const char *const*argv,
      char *p;
      pSpec->azContentColumn[i] = sqlite3_mprintf("c%d%s", i, azArg[i]);
      for (p = pSpec->azContentColumn[i]; *p ; ++p) {
-      if( !isalnum(*p) ) *p = '_';
+      if( !safe_isalnum(*p) ) *p = '_';
      }
    }
  
@@ -2971,10 +2990,10 @@ static int wordBoundary(
      }
    }
    for(i=1; i<=10; i++){
-    if( isspace(zDoc[iBreak-i]) ){
+    if( safe_isspace(zDoc[iBreak-i]) ){
        return iBreak - i + 1;
      }
-    if( isspace(zDoc[iBreak+i]) ){
+    if( safe_isspace(zDoc[iBreak+i]) ){
        return iBreak + i + 1;
      }
    }
diff --git a/manifest b/manifest

index f34f66db28fca38f9b4e3c8f2693a17ce4de6033..f0ff011d8b70595ed867636dabdb3a64bb49ea28 100644 (file)
--- a/manifest
+++ b/manifest
@@ -1,5 +1,5 @@
-C Assume\sthe\smalloc-failed\sflag\scannot\salready\sbe\sset\swhen\scalling\ssqlite3_errmsg(16)().\s(CVS\s3745)
-D 2007-03-29T15:00:53
+C Don't\scall\sctype\sfunctions\son\shi-bit\schars.\s\sSome\splatforms\sraise\nassertions\swhen\sthis\soccurs,\sand\sit's\salmost\scertainly\snot\sthe\sright\nthing\sto\sdo\sin\sthe\sfirst\splace.\s(CVS\s3746)
+D 2007-03-29T16:30:39
  F Makefile.in 2f2c3bf69faf0ae7b8e8af4f94f1986849034530
  F Makefile.linux-gcc 2d8574d1ba75f129aba2019f0b959db380a90935
  F README 9c4e2d6706bdcc3efdd773ce752a8cdab4f90028
@@ -22,7 +22,7 @@ F ext/README.txt 913a7bd3f4837ab14d7e063304181787658b14e1
  F ext/fts1/README.txt 20ac73b006a70bcfd80069bdaf59214b6cf1db5e
  F ext/fts1/ft_hash.c 3927bd880e65329bdc6f506555b228b28924921b
  F ext/fts1/ft_hash.h 1a35e654a235c2c662d3ca0dfc3138ad60b8b7d5
-F ext/fts1/fts1.c 0aab3cf20eefd38935c8f525494d689cb2785f1d
+F ext/fts1/fts1.c 7585d9cb7ad7bcdf162936ab1fd64868f2f55ea5
  F ext/fts1/fts1.h 6060b8f62c1d925ea8356cb1a6598073eb9159a6
  F ext/fts1/fts1_hash.c 3196cee866edbebb1c0521e21672e6d599965114
  F ext/fts1/fts1_hash.h 957d378355ed29f672cd5add012ce8b088a5e089
@@ -34,7 +34,7 @@ F ext/fts1/fulltext.h 08525a47852d1d62a0be81d3fc3fe2d23b094efd
  F ext/fts1/simple_tokenizer.c 1844d72f7194c3fd3d7e4173053911bf0661b70d
  F ext/fts1/tokenizer.h 0c53421b832366d20d720d21ea3e1f6e66a36ef9
  F ext/fts2/README.txt 8c18f41574404623b76917b9da66fcb0ab38328d
-F ext/fts2/fts2.c de8321a2ad1edea1f0dd223cb86cf008451784a4
+F ext/fts2/fts2.c 2e3cb46d28b0dd17b2ad3b48409618ace73caec6
  F ext/fts2/fts2.h bbdab26d34f91974d5b9ade8b7836c140a7c4ce1
  F ext/fts2/fts2_hash.c b3f22116d4ef0bc8f2da6e3fdc435c86d0951a9b
  F ext/fts2/fts2_hash.h e283308156018329f042816eb09334df714e105e
@@ -214,6 +214,7 @@ F test/fts1e.test 77244843e925560b5a0b70069c3e7ab62f181ed2
  F test/fts1f.test 2d6cb10d8b7a4e6edc321bbdb3982f1f48774714
  F test/fts1i.test 6bfe08cdfdced063a39a50c8601da65e6274d879
  F test/fts1j.test e4c0ffcd0ba2adce09c6b7b43ffd0749b5fda5c7
+F test/fts1k.test fdf295cb797ba6a2ef81ec41cb98df0ceb2e572c
  F test/fts1porter.test d86e9c3e0c7f8ff95add6582b4b585fb4e02b96d
  F test/fts2a.test 103fc178d134c54c44c1938a4331e9e2030792d9
  F test/fts2b.test 964abc0236c849c07ca1ae496bb25c268ae94816
@@ -225,6 +226,7 @@ F test/fts2g.test c69a8ab43ec77d123976ba6cf9422d647ae63032
  F test/fts2h.test 223af921323b409d4b5b18ff4e51619541b174bb
  F test/fts2i.test 1b22451d1f13f7c509baec620dc3a4a754885dd6
  F test/fts2j.test f68d7611f76309bc8b94170f3740d9fbbc061d9b
+F test/fts2l.test 4c53c89ce3919003765ff4fd8d98ecf724d97dd3
  F test/func.test 019d706b2458dfdf239c74cc31143446de1ee44a
  F test/hook.test 7e7645fd9a033f79cce8fdff151e32715e7ec50a
  F test/in.test 369cb2aa1eab02296b4ec470732fe8c131260b1d
@@ -444,7 +446,7 @@ F www/tclsqlite.tcl bb0d1357328a42b1993d78573e587c6dcbc964b9
  F www/vdbe.tcl 87a31ace769f20d3627a64fa1fade7fed47b90d0
  F www/version3.tcl 890248cf7b70e60c383b0e84d77d5132b3ead42b
  F www/whentouse.tcl 97e2b5cd296f7d8057e11f44427dea8a4c2db513
-P 3714ac173289e580a0302a5a3beac05823d92c5b
-R 3cfcb502e90a93f72d96670b4207913a
-U danielk1977
-Z 7e6377bdbc94cfb816f281bbc3868b86
+P 54fa22273d551e00e1abd86992ff7c62ec4e0daf
+R 6645d4541d0d9e478c5b564689374f5f
+U shess
+Z 5e17544799ed91760b443021ffc206bc
diff --git a/manifest.uuid b/manifest.uuid

index e342a433b94aa9e700356a47fbec8dfbfb74f212..7d1b46cd55f14dc32abf789215b4e0d238c9e8fd 100644 (file)
--- a/manifest.uuid
+++ b/manifest.uuid
@@ -1 +1 @@
-54fa22273d551e00e1abd86992ff7c62ec4e0daf
-\ No newline at end of file
+f6c3abdc6c5e916e5366ba28fb1cd06ca3554303
+\ No newline at end of file
diff --git a/test/fts1k.test b/test/fts1k.test

new file mode 100644 (file)

index 0000000..2fffa41
--- /dev/null
+++ b/test/fts1k.test
@@ -0,0 +1,69 @@
+# 2007 March 28
+#
+# The author disclaims copyright to this source code.
+#
+#*************************************************************************
+# This file implements regression tests for SQLite library.  The focus
+# of this script is testing isspace/isalnum/tolower problems with the
+# FTS1 module.  Unfortunately, this code isn't a really principled set
+# of tests, because it's impossible to know where new uses of these
+# functions might appear.
+#
+# $Id: fts1k.test,v 1.1 2007/03/29 16:30:41 shess Exp $
+#
+
+set testdir [file dirname $argv0]
+source $testdir/tester.tcl
+
+# If SQLITE_ENABLE_FTS1 is not defined, omit this file.
+ifcapable !fts1 {
+  finish_test
+  return
+}
+
+# Tests that startsWith() (calls isspace, tolower, isalnum) can handle
+# hi-bit chars.  parseSpec() also calls isalnum here.
+do_test fts1k-1.1 {
+  execsql "CREATE VIRTUAL TABLE t1 USING fts1(content, \x80)"
+} {}
+
+# Additionally tests isspace() call in getToken(), and isalnum() call
+# in tokenListToIdList().
+do_test fts1k-1.2 {
+  catch {
+    execsql "CREATE VIRTUAL TABLE t2 USING fts1(content, tokenize \x80)"
+  }
+  sqlite3_errmsg $DB
+} "unknown tokenizer: \x80"
+
+# Additionally test final isalnum() in startsWith().
+do_test fts1k-1.3 {
+  execsql "CREATE VIRTUAL TABLE t3 USING fts1(content, tokenize\x80)"
+} {}
+
+# The snippet-generation code has calls to isspace() which are sort of
+# hard to get to.  It finds convenient breakpoints by starting ~40
+# chars before and after the matched term, and scanning ~10 chars
+# around that position for isspace() characters.  The long word with
+# embedded hi-bit chars causes one of these isspace() calls to be
+# exercised.  The version with a couple extra spaces should cause the
+# other isspace() call to be exercised.  [Both cases have been tested
+# in the debugger, but I'm hoping to continue to catch it if simple
+# constant changes change things slightly.]
+#
+# The trailing and leading hi-bit chars help with code which tests for
+# isspace() to coalesce multiple spaces.
+
+set word "\x80xxxxx\x80xxxxx\x80xxxxx\x80xxxxx\x80xxxxx\x80xxxxx\x80"
+set phrase1 "$word $word $word target $word $word $word"
+set phrase2 "$word $word $word    target    $word $word $word"
+
+db eval {CREATE VIRTUAL TABLE t4 USING fts1(content)}
+db eval "INSERT INTO t4 (content) VALUES ('$phrase1')"
+db eval "INSERT INTO t4 (content) VALUES ('$phrase2')"
+
+do_test fts1k-1.4 {
+  execsql {SELECT rowid, length(snippet(t4)) FROM t4 WHERE t4 MATCH 'target'}
+} {1 111 2 117}
+
+finish_test
diff --git a/test/fts2l.test b/test/fts2l.test

new file mode 100644 (file)

index 0000000..739eb50
--- /dev/null
+++ b/test/fts2l.test
@@ -0,0 +1,69 @@
+# 2007 March 28
+#
+# The author disclaims copyright to this source code.
+#
+#*************************************************************************
+# This file implements regression tests for SQLite library.  The focus
+# of this script is testing isspace/isalnum/tolower problems with the
+# FTS2 module.  Unfortunately, this code isn't a really principled set
+# of tests, because it's impossible to know where new uses of these
+# functions might appear.
+#
+# $Id: fts2l.test,v 1.1 2007/03/29 16:30:41 shess Exp $
+#
+
+set testdir [file dirname $argv0]
+source $testdir/tester.tcl
+
+# If SQLITE_ENABLE_FTS2 is not defined, omit this file.
+ifcapable !fts2 {
+  finish_test
+  return
+}
+
+# Tests that startsWith() (calls isspace, tolower, isalnum) can handle
+# hi-bit chars.  parseSpec() also calls isalnum here.
+do_test fts2l-1.1 {
+  execsql "CREATE VIRTUAL TABLE t1 USING fts2(content, \x80)"
+} {}
+
+# Additionally tests isspace() call in getToken(), and isalnum() call
+# in tokenListToIdList().
+do_test fts2l-1.2 {
+  catch {
+    execsql "CREATE VIRTUAL TABLE t2 USING fts2(content, tokenize \x80)"
+  }
+  sqlite3_errmsg $DB
+} "unknown tokenizer: \x80"
+
+# Additionally test final isalnum() in startsWith().
+do_test fts2l-1.3 {
+  execsql "CREATE VIRTUAL TABLE t3 USING fts2(content, tokenize\x80)"
+} {}
+
+# The snippet-generation code has calls to isspace() which are sort of
+# hard to get to.  It finds convenient breakpoints by starting ~40
+# chars before and after the matched term, and scanning ~10 chars
+# around that position for isspace() characters.  The long word with
+# embedded hi-bit chars causes one of these isspace() calls to be
+# exercised.  The version with a couple extra spaces should cause the
+# other isspace() call to be exercised.  [Both cases have been tested
+# in the debugger, but I'm hoping to continue to catch it if simple
+# constant changes change things slightly.]
+#
+# The trailing and leading hi-bit chars help with code which tests for
+# isspace() to coalesce multiple spaces.
+
+set word "\x80xxxxx\x80xxxxx\x80xxxxx\x80xxxxx\x80xxxxx\x80xxxxx\x80"
+set phrase1 "$word $word $word target $word $word $word"
+set phrase2 "$word $word $word    target    $word $word $word"
+
+db eval {CREATE VIRTUAL TABLE t4 USING fts2(content)}
+db eval "INSERT INTO t4 (content) VALUES ('$phrase1')"
+db eval "INSERT INTO t4 (content) VALUES ('$phrase2')"
+
+do_test fts2l-1.4 {
+  execsql {SELECT rowid, length(snippet(t4)) FROM t4 WHERE t4 MATCH 'target'}
+} {1 111 2 117}
+
+finish_test
author	shess <shess@noemail.net>
	Thu, 29 Mar 2007 16:30:38 +0000 (16:30 +0000)
committer	shess <shess@noemail.net>
	Thu, 29 Mar 2007 16:30:38 +0000 (16:30 +0000)
ext/fts1/fts1.c		patch \| blob \| blame \| history
ext/fts2/fts2.c		patch \| blob \| blame \| history
manifest		patch \| blob \| blame \| history
manifest.uuid		patch \| blob \| blame \| history
test/fts1k.test	[new file with mode: 0644]	patch \| blob
test/fts2l.test	[new file with mode: 0644]	patch \| blob