Add a version of the unicode61 tokenizer to fts5.

author dan <dan@noemail.net>

Thu, 1 Jan 2015 16:46:10 +0000 (16:46 +0000)

committer dan <dan@noemail.net>

Thu, 1 Jan 2015 16:46:10 +0000 (16:46 +0000)
author dan <dan@noemail.net>
Thu, 1 Jan 2015 16:46:10 +0000 (16:46 +0000)
committer dan <dan@noemail.net>
Thu, 1 Jan 2015 16:46:10 +0000 (16:46 +0000)
diff --git a/ext/fts3/unicode/mkunicode.tcl b/ext/fts3/unicode/mkunicode.tcl

index 2da17c51a53ba79b923650d44b02a4a6c4f4cbeb..f1adb5ffdecbaa03157a844e1bb9c9e7cbc5ab09 100644 (file)
--- a/ext/fts3/unicode/mkunicode.tcl
+++ b/ext/fts3/unicode/mkunicode.tcl
@@ -732,8 +732,12 @@ proc print_fileheader {} {
  */
    }]
    puts ""
-  puts "#if defined(SQLITE_ENABLE_FTS4_UNICODE61)"
-  puts "#if defined(SQLITE_ENABLE_FTS3) || defined(SQLITE_ENABLE_FTS4)"
+  if {$::generate_fts5_code} {
+    puts "#if defined(SQLITE_ENABLE_FTS5)"
+  } else {
+    puts "#if defined(SQLITE_ENABLE_FTS4_UNICODE61)"
+    puts "#if defined(SQLITE_ENABLE_FTS3) || defined(SQLITE_ENABLE_FTS4)"
+  }
    puts ""
    puts "#include <assert.h>"
    puts ""
@@ -760,22 +764,38 @@ proc print_test_main {} {
  # our liking.
  #
  proc usage {} {
-  puts -nonewline stderr "Usage: $::argv0 ?-test? "
+  puts -nonewline stderr "Usage: $::argv0 ?-test? ?-fts5? "
    puts            stderr "<CaseFolding.txt file> <UnicodeData.txt file>"
    exit 1
  }
-if {[llength $argv]!=2 && [llength $argv]!=3} usage
-if {[llength $argv]==3 && [lindex $argv 0]!="-test"} usage
+if {[llength $argv]<2} usage
  set unicodedata.txt [lindex $argv end]
  set casefolding.txt [lindex $argv end-1]
-set generate_test_code [expr {[llength $argv]==3}]
+
+set generate_test_code 0
+set generate_fts5_code 0
+set function_prefix "sqlite3Fts"
+for {set i 0} {$i < [llength $argv]-2} {incr i} {
+  switch -- [lindex $argv $i] {
+    -test {
+      set generate_test_code 1
+    }
+    -fts5 {
+      set function_prefix sqlite3Fts5
+      set generate_fts5_code 1
+    }
+    default {
+      usage
+    }
+  }
+}
  
  print_fileheader
  
  # Print the isalnum() function to stdout.
  #
  set lRange [an_load_separator_ranges]
-print_isalnum sqlite3FtsUnicodeIsalnum $lRange
+print_isalnum ${function_prefix}UnicodeIsalnum $lRange
  
  # Leave a gap between the two generated C functions.
  #
@@ -790,22 +810,26 @@ set mappings [rd_load_unicodedata_text ${unicodedata.txt}]
  print_rd $mappings
  puts ""
  puts ""
-print_isdiacritic sqlite3FtsUnicodeIsdiacritic $mappings
+print_isdiacritic ${function_prefix}UnicodeIsdiacritic $mappings
  puts ""
  puts ""
  
  # Print the fold() function to stdout.
  #
-print_fold sqlite3FtsUnicodeFold
+print_fold ${function_prefix}UnicodeFold
  
  # Print the test routines and main() function to stdout, if -test 
  # was specified.
  #
  if {$::generate_test_code} {
-  print_test_isalnum sqlite3FtsUnicodeIsalnum $lRange
-  print_fold_test sqlite3FtsUnicodeFold $mappings
+  print_test_isalnum ${function_prefix}UnicodeIsalnum $lRange
+  print_fold_test ${function_prefix}UnicodeFold $mappings
    print_test_main 
  }
  
-puts "#endif /* defined(SQLITE_ENABLE_FTS3) || defined(SQLITE_ENABLE_FTS4) */"
-puts "#endif /* !defined(SQLITE_ENABLE_FTS4_UNICODE61) */"
+if {$generate_fts5_code} {
+  puts "#endif /* defined(SQLITE_ENABLE_FTS5) */"
+} else {
+  puts "#endif /* defined(SQLITE_ENABLE_FTS3) || defined(SQLITE_ENABLE_FTS4) */"
+  puts "#endif /* !defined(SQLITE_ENABLE_FTS4_UNICODE61) */"
+}
diff --git a/ext/fts5/fts5_tcl.c b/ext/fts5/fts5_tcl.c

index 575f4f871a6815fe0bfb90b20787fce42e44a76b..1ce1bba49d6a5a87efdd90e32fc025478e54e624 100644 (file)
--- a/ext/fts5/fts5_tcl.c
+++ b/ext/fts5/fts5_tcl.c
@@ -518,16 +518,31 @@ static int f5tCreateFunction(
    return TCL_OK;
  }
  
+typedef struct F5tTokenizeCtx F5tTokenizeCtx;
+struct F5tTokenizeCtx {
+  Tcl_Obj *pRet;
+  int bSubst;
+  const char *zInput;
+};
+
  static int xTokenizeCb2(
    void *pCtx, 
    const char *zToken, int nToken, 
    int iStart, int iEnd, int iPos
  ){
-  Tcl_Obj *pRet = (Tcl_Obj*)pCtx;
-  Tcl_ListObjAppendElement(0, pRet, Tcl_NewStringObj(zToken, nToken));
-  Tcl_ListObjAppendElement(0, pRet, Tcl_NewIntObj(iStart));
-  Tcl_ListObjAppendElement(0, pRet, Tcl_NewIntObj(iEnd));
-  Tcl_ListObjAppendElement(0, pRet, Tcl_NewIntObj(iPos));
+  F5tTokenizeCtx *p = (F5tTokenizeCtx*)pCtx;
+  if( p->bSubst ){
+    Tcl_ListObjAppendElement(0, p->pRet, Tcl_NewIntObj(iPos));
+    Tcl_ListObjAppendElement(0, p->pRet, Tcl_NewStringObj(zToken, nToken));
+    Tcl_ListObjAppendElement(
+        0, p->pRet, Tcl_NewStringObj(&p->zInput[iStart], iEnd-iStart)
+    );
+  }else{
+    Tcl_ListObjAppendElement(0, p->pRet, Tcl_NewStringObj(zToken, nToken));
+    Tcl_ListObjAppendElement(0, p->pRet, Tcl_NewIntObj(iStart));
+    Tcl_ListObjAppendElement(0, p->pRet, Tcl_NewIntObj(iEnd));
+    Tcl_ListObjAppendElement(0, p->pRet, Tcl_NewIntObj(iPos));
+  }
    return SQLITE_OK;
  }
  
@@ -543,7 +558,6 @@ static int f5tTokenize(
    int objc,
    Tcl_Obj *CONST objv[]
  ){
-  char *zName;
    char *zText;
    int nText;
    sqlite3 *db = 0;
@@ -554,21 +568,39 @@ static int f5tTokenize(
    void *pUserdata;
    int rc;
  
-  if( objc!=4 ){
-    Tcl_WrongNumArgs(interp, 1, objv, "DB NAME TEXT");
+  int nArg;
+  const char **azArg;
+  F5tTokenizeCtx ctx;
+
+  if( objc!=4 && objc!=5 ){
+    Tcl_WrongNumArgs(interp, 1, objv, "?-subst? DB NAME TEXT");
      return TCL_ERROR;
    }
-  if( f5tDbAndApi(interp, objv[1], &db, &pApi) ) return TCL_ERROR;
-  zName = Tcl_GetString(objv[2]);
-  zText = Tcl_GetStringFromObj(objv[3], &nText);
+  if( objc==5 ){
+    char *zOpt = Tcl_GetString(objv[1]);
+    if( strcmp("-subst", zOpt) ){
+      Tcl_AppendResult(interp, "unrecognized option: ", zOpt, 0);
+      return TCL_ERROR;
+    }
+  }
+  if( f5tDbAndApi(interp, objv[objc-3], &db, &pApi) ) return TCL_ERROR;
+  if( Tcl_SplitList(interp, Tcl_GetString(objv[objc-2]), &nArg, &azArg) ){
+    return TCL_ERROR;
+  }
+  if( nArg==0 ){
+    Tcl_AppendResult(interp, "no such tokenizer: ", 0);
+    Tcl_Free((void*)azArg);
+    return TCL_ERROR;
+  }
+  zText = Tcl_GetStringFromObj(objv[objc-1], &nText);
  
-  rc = pApi->xFindTokenizer(pApi, zName, &pUserdata, &tokenizer);
+  rc = pApi->xFindTokenizer(pApi, azArg[0], &pUserdata, &tokenizer);
    if( rc!=SQLITE_OK ){
-    Tcl_AppendResult(interp, "no such tokenizer: ", zName, 0);
+    Tcl_AppendResult(interp, "no such tokenizer: ", azArg[0], 0);
      return TCL_ERROR;
    }
  
-  rc = tokenizer.xCreate(pUserdata, 0, 0, &pTok);
+  rc = tokenizer.xCreate(pUserdata, &azArg[1], nArg-1, &pTok);
    if( rc!=SQLITE_OK ){
      Tcl_AppendResult(interp, "error in tokenizer.xCreate()", 0);
      return TCL_ERROR;
@@ -576,7 +608,10 @@ static int f5tTokenize(
  
    pRet = Tcl_NewObj();
    Tcl_IncrRefCount(pRet);
-  rc = tokenizer.xTokenize(pTok, pRet, zText, nText, xTokenizeCb2);
+  ctx.bSubst = (objc==5);
+  ctx.pRet = pRet;
+  ctx.zInput = zText;
+  rc = tokenizer.xTokenize(pTok, (void*)&ctx, zText, nText, xTokenizeCb2);
    tokenizer.xDelete(pTok);
    if( rc!=SQLITE_OK ){
      Tcl_AppendResult(interp, "error in tokenizer.xTokenize()", 0);
@@ -585,6 +620,7 @@ static int f5tTokenize(
    }
  
  
+  Tcl_Free((void*)azArg);
    Tcl_SetObjResult(interp, pRet);
    Tcl_DecrRefCount(pRet);
    return TCL_OK;
diff --git a/ext/fts5/fts5_tokenize.c b/ext/fts5/fts5_tokenize.c

index 5352faa2c6ed5c7c875babc725e8d253b2e383b0..b23eccd97f0119d2573b8f4e1bb47a3148bbd323 100644 (file)
--- a/ext/fts5/fts5_tokenize.c
+++ b/ext/fts5/fts5_tokenize.c
@@ -15,6 +15,9 @@
  #include <string.h>
  #include <assert.h>
  
+/**************************************************************************
+** Start of simple tokenizer implementation.
+*/
  
  /*
  ** Create a "simple" tokenizer.
@@ -69,7 +72,7 @@ static int fts5SimpleTokenize(
    const char *pText, int nText,
    int (*xToken)(void*, const char*, int nToken, int iStart, int iEnd, int iPos)
  ){
-  int rc;
+  int rc = SQLITE_OK;
    int ie;
    int is = 0;
    int iPos = 0;
@@ -78,7 +81,7 @@ static int fts5SimpleTokenize(
    int nFold = sizeof(aFold);
    char *pFold = aFold;
  
-  do {
+  while( is<nText && rc==SQLITE_OK ){
      int nByte;
  
      /* Skip any leading divider characters. */
@@ -110,13 +113,282 @@ static int fts5SimpleTokenize(
      rc = xToken(pCtx, pFold, nByte, is, ie, iPos);
      iPos++;
      is = ie+1;
-  }while( is<nText && rc==SQLITE_OK );
+  }
    
    if( pFold!=aFold ) sqlite3_free(pFold);
    if( rc==SQLITE_DONE ) rc = SQLITE_OK;
    return rc;
  }
  
+/**************************************************************************
+** Start of unicode61 tokenizer implementation.
+*/
+
+/*
+** Functions in fts5_unicode2.c. 
+*/
+int sqlite3Fts5UnicodeIsalnum(int c);
+int sqlite3Fts5UnicodeIsdiacritic(int c);
+int sqlite3Fts5UnicodeFold(int c, int bRemoveDiacritic);
+
+
+/*
+** The following two macros - READ_UTF8 and WRITE_UTF8 - have been copied
+** from the sqlite3 source file utf.c. If this file is compiled as part
+** of the amalgamation, they are not required.
+*/
+#ifndef SQLITE_AMALGAMATION
+
+static const unsigned char sqlite3Utf8Trans1[] = {
+  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+  0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
+  0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
+  0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
+  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+  0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
+  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+  0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x00, 0x00,
+};
+
+#define READ_UTF8(zIn, zTerm, c)                           \
+  c = *(zIn++);                                            \
+  if( c>=0xc0 ){                                           \
+    c = sqlite3Utf8Trans1[c-0xc0];                         \
+    while( zIn!=zTerm && (*zIn & 0xc0)==0x80 ){            \
+      c = (c<<6) + (0x3f & *(zIn++));                      \
+    }                                                      \
+    if( c<0x80                                             \
+        || (c&0xFFFFF800)==0xD800                          \
+        || (c&0xFFFFFFFE)==0xFFFE ){  c = 0xFFFD; }        \
+  }
+
+#define WRITE_UTF8(zOut, c) {                          \
+  if( c<0x00080 ){                                     \
+    *zOut++ = (unsigned char)(c&0xFF);                 \
+  }                                                    \
+  else if( c<0x00800 ){                                \
+    *zOut++ = 0xC0 + (unsigned char)((c>>6)&0x1F);     \
+    *zOut++ = 0x80 + (unsigned char)(c & 0x3F);        \
+  }                                                    \
+  else if( c<0x10000 ){                                \
+    *zOut++ = 0xE0 + (unsigned char)((c>>12)&0x0F);    \
+    *zOut++ = 0x80 + (unsigned char)((c>>6) & 0x3F);   \
+    *zOut++ = 0x80 + (unsigned char)(c & 0x3F);        \
+  }else{                                               \
+    *zOut++ = 0xF0 + (unsigned char)((c>>18) & 0x07);  \
+    *zOut++ = 0x80 + (unsigned char)((c>>12) & 0x3F);  \
+    *zOut++ = 0x80 + (unsigned char)((c>>6) & 0x3F);   \
+    *zOut++ = 0x80 + (unsigned char)(c & 0x3F);        \
+  }                                                    \
+}
+
+#endif /* ifndef SQLITE_AMALGAMATION */
+
+typedef struct Unicode61Tokenizer Unicode61Tokenizer;
+struct Unicode61Tokenizer {
+  int bRemoveDiacritic;           /* True if remove_diacritics=1 is set */
+  int nException;
+  int *aiException;
+};
+
+static int fts5UnicodeAddExceptions(
+  Unicode61Tokenizer *p,          /* Tokenizer object */
+  const char *z,                  /* Characters to treat as exceptions */
+  int bTokenChars                 /* 1 for 'tokenchars', 0 for 'separators' */
+){
+  int rc = SQLITE_OK;
+  int n = strlen(z);
+  int *aNew;
+
+  if( n>0 ){
+    aNew = (int*)sqlite3_realloc(p->aiException, (n+p->nException)*sizeof(int));
+    if( aNew ){
+      int nNew = p->nException;
+      const unsigned char *zCsr = (const unsigned char*)z;
+      const unsigned char *zTerm = (const unsigned char*)&z[n];
+      while( zCsr<zTerm ){
+        int iCode;
+        int bToken;
+        READ_UTF8(zCsr, zTerm, iCode);
+        bToken = sqlite3Fts5UnicodeIsalnum(iCode);
+        assert( (bToken==0 || bToken==1) ); 
+        assert( (bTokenChars==0 || bTokenChars==1) );
+        if( bToken!=bTokenChars && sqlite3Fts5UnicodeIsdiacritic(iCode)==0 ){
+          int i;
+          for(i=0; i<nNew; i++){
+            if( aNew[i]>iCode ) break;
+          }
+          memmove(&aNew[i+1], &aNew[i], (nNew-i)*sizeof(int));
+          aNew[i] = iCode;
+          nNew++;
+        }
+      }
+      p->aiException = aNew;
+      p->nException = nNew;
+    }else{
+      rc = SQLITE_NOMEM;
+    }
+  }
+
+  return rc;
+}
+
+/*
+** Return true if the p->aiException[] array contains the value iCode.
+*/
+static int fts5UnicodeIsException(Unicode61Tokenizer *p, int iCode){
+  if( p->nException>0 ){
+    int *a = p->aiException;
+    int iLo = 0;
+    int iHi = p->nException-1;
+
+    while( iHi>=iLo ){
+      int iTest = (iHi + iLo) / 2;
+      if( iCode==a[iTest] ){
+        return 1;
+      }else if( iCode>a[iTest] ){
+        iLo = iTest+1;
+      }else{
+        iHi = iTest-1;
+      }
+    }
+  }
+
+  return 0;
+}
+
+/*
+** Create a "unicode61" tokenizer.
+*/
+static int fts5UnicodeCreate(
+  void *pCtx, 
+  const char **azArg, int nArg,
+  Fts5Tokenizer **ppOut
+){
+  int rc = SQLITE_OK;             /* Return code */
+  Unicode61Tokenizer *p = 0;      /* New tokenizer object */ 
+
+  if( nArg%2 ){
+    rc = SQLITE_ERROR;
+  }else{
+    p = (Unicode61Tokenizer*)sqlite3_malloc(sizeof(Unicode61Tokenizer));
+    if( p ){
+      int i;
+      memset(p, 0, sizeof(Unicode61Tokenizer));
+      p->bRemoveDiacritic = 1;
+      for(i=0; rc==SQLITE_OK && i<nArg; i+=2){
+        const char *zArg = azArg[i+1];
+        if( 0==sqlite3_stricmp(azArg[i], "remove_diacritics") ){
+          if( (zArg[0]!='0' && zArg[0]!='1') || zArg[1] ){
+            rc = SQLITE_ERROR;
+          }
+          p->bRemoveDiacritic = (zArg[0]=='1');
+        }else
+        if( 0==sqlite3_stricmp(azArg[i], "tokenchars") ){
+          rc = fts5UnicodeAddExceptions(p, zArg, 1);
+        }else
+        if( 0==sqlite3_stricmp(azArg[i], "separators") ){
+          rc = fts5UnicodeAddExceptions(p, zArg, 0);
+        }else{
+          rc = SQLITE_ERROR;
+        }
+      }
+    }else{
+      rc = SQLITE_NOMEM;
+    }
+    *ppOut = (Fts5Tokenizer*)p;
+  }
+  return rc;
+}
+
+/*
+** Delete a "unicode61" tokenizer.
+*/
+static void fts5UnicodeDelete(Fts5Tokenizer *pTok){
+  Unicode61Tokenizer *p = (Unicode61Tokenizer*)pTok;
+  sqlite3_free(p->aiException);
+  sqlite3_free(p);
+  return;
+}
+
+/*
+** Return true if, for the purposes of tokenizing with the tokenizer
+** passed as the first argument, codepoint iCode is considered a token 
+** character (not a separator).
+*/
+static int fts5UnicodeIsAlnum(Unicode61Tokenizer *p, int iCode){
+  assert( (sqlite3Fts5UnicodeIsalnum(iCode) & 0xFFFFFFFE)==0 );
+  return sqlite3Fts5UnicodeIsalnum(iCode) ^ fts5UnicodeIsException(p, iCode);
+}
+
+/*
+** Tokenize some text using a unicode61 tokenizer.
+*/
+static int fts5UnicodeTokenize(
+  Fts5Tokenizer *pTokenizer,
+  void *pCtx,
+  const char *pText, int nText,
+  int (*xToken)(void*, const char*, int nToken, int iStart, int iEnd, int iPos)
+){
+  Unicode61Tokenizer *p = (Unicode61Tokenizer*)pTokenizer;
+  const unsigned char *zInput = (const unsigned char*)pText;
+  const unsigned char *zTerm = &zInput[nText];
+  const unsigned char *z = zInput;
+  int rc = SQLITE_OK;
+  int nBuf = 0;
+  unsigned char *zBuf = 0;
+  unsigned char *zOut = 0;
+  int iPos = 0;
+
+  while( rc==SQLITE_OK && z<zTerm ){
+    int iCode;
+    int bAlnum;
+    const unsigned char *zStart;
+    const unsigned char *zCode;
+
+    if( zOut==zBuf ) zStart = z;
+    zCode = z;
+    READ_UTF8(z, zTerm, iCode);
+    bAlnum = fts5UnicodeIsAlnum(p, iCode);
+    if( bAlnum==0 && zOut>zBuf ){
+      bAlnum = sqlite3Fts5UnicodeIsdiacritic(iCode);
+    }
+
+    if( bAlnum ){
+      int iOut;
+
+      /* Grow the output buffer if required */
+      while( (zOut-zBuf)+4>=nBuf ){
+        unsigned char *zNew;
+        nBuf = (nBuf ? nBuf*2 : 128);
+        zNew = sqlite3_realloc(zBuf, nBuf);
+        if( zNew==0 ){
+          rc = SQLITE_NOMEM;
+          goto tokenize_finished;
+        }else{
+          zOut = &zNew[zOut-zBuf];
+          zBuf = zNew;
+        }
+      }
+
+      /* Write the new character to it */
+      iOut = sqlite3Fts5UnicodeFold(iCode, p->bRemoveDiacritic);
+      if( iOut ) WRITE_UTF8(zOut, iOut);
+    }
+
+    if( zOut>zBuf && (bAlnum==0 || z>=zTerm) ){
+      int ie = (bAlnum ? z : zCode) - zInput;
+      rc = xToken(pCtx, (const char*)zBuf, zOut-zBuf, zStart-zInput, ie, iPos);
+      zOut = zBuf;
+      iPos++;
+    }
+  }
+
+ tokenize_finished:
+  sqlite3_free(zBuf);
+  return rc;
+}
+
  /**************************************************************************
  ** Start of porter2 stemmer implementation.
  */
@@ -477,8 +749,9 @@ int sqlite3Fts5TokenizerInit(fts5_api *pApi){
      const char *zName;
      fts5_tokenizer x;
    } aBuiltin[] = {
-    { "porter",  { fts5PorterCreate, fts5PorterDelete, fts5PorterTokenize } },
-    { "simple",  { fts5SimpleCreate, fts5SimpleDelete, fts5SimpleTokenize } }
+    { "porter",    {fts5PorterCreate, fts5PorterDelete, fts5PorterTokenize }},
+    { "unicode61", {fts5UnicodeCreate, fts5UnicodeDelete, fts5UnicodeTokenize}},
+    { "simple",    {fts5SimpleCreate, fts5SimpleDelete, fts5SimpleTokenize }}
    };
    
    int rc = SQLITE_OK;             /* Return code */
diff --git a/ext/fts5/fts5_unicode2.c b/ext/fts5/fts5_unicode2.c

new file mode 100644 (file)

index 0000000..5692bf2
--- /dev/null
+++ b/ext/fts5/fts5_unicode2.c
@@ -0,0 +1,363 @@
+/*
+** 2012 May 25
+**
+** The author disclaims copyright to this source code.  In place of
+** a legal notice, here is a blessing:
+**
+**    May you do good and not evil.
+**    May you find forgiveness for yourself and forgive others.
+**    May you share freely, never taking more than you give.
+**
+******************************************************************************
+*/
+
+/*
+** DO NOT EDIT THIS MACHINE GENERATED FILE.
+*/
+
+#if defined(SQLITE_ENABLE_FTS5)
+
+#include <assert.h>
+
+/*
+** Return true if the argument corresponds to a unicode codepoint
+** classified as either a letter or a number. Otherwise false.
+**
+** The results are undefined if the value passed to this function
+** is less than zero.
+*/
+int sqlite3Fts5UnicodeIsalnum(int c){
+  /* Each unsigned integer in the following array corresponds to a contiguous
+  ** range of unicode codepoints that are not either letters or numbers (i.e.
+  ** codepoints for which this function should return 0).
+  **
+  ** The most significant 22 bits in each 32-bit value contain the first 
+  ** codepoint in the range. The least significant 10 bits are used to store
+  ** the size of the range (always at least 1). In other words, the value 
+  ** ((C<<10) + N) represents a range of N codepoints starting with codepoint 
+  ** C. It is not possible to represent a range larger than 1023 codepoints 
+  ** using this format.
+  */
+  const static unsigned int aEntry[] = {
+    0x00000030, 0x0000E807, 0x00016C06, 0x0001EC2F, 0x0002AC07,
+    0x0002D001, 0x0002D803, 0x0002EC01, 0x0002FC01, 0x00035C01,
+    0x0003DC01, 0x000B0804, 0x000B480E, 0x000B9407, 0x000BB401,
+    0x000BBC81, 0x000DD401, 0x000DF801, 0x000E1002, 0x000E1C01,
+    0x000FD801, 0x00120808, 0x00156806, 0x00162402, 0x00163C01,
+    0x00164437, 0x0017CC02, 0x00180005, 0x00181816, 0x00187802,
+    0x00192C15, 0x0019A804, 0x0019C001, 0x001B5001, 0x001B580F,
+    0x001B9C07, 0x001BF402, 0x001C000E, 0x001C3C01, 0x001C4401,
+    0x001CC01B, 0x001E980B, 0x001FAC09, 0x001FD804, 0x00205804,
+    0x00206C09, 0x00209403, 0x0020A405, 0x0020C00F, 0x00216403,
+    0x00217801, 0x0023901B, 0x00240004, 0x0024E803, 0x0024F812,
+    0x00254407, 0x00258804, 0x0025C001, 0x00260403, 0x0026F001,
+    0x0026F807, 0x00271C02, 0x00272C03, 0x00275C01, 0x00278802,
+    0x0027C802, 0x0027E802, 0x00280403, 0x0028F001, 0x0028F805,
+    0x00291C02, 0x00292C03, 0x00294401, 0x0029C002, 0x0029D401,
+    0x002A0403, 0x002AF001, 0x002AF808, 0x002B1C03, 0x002B2C03,
+    0x002B8802, 0x002BC002, 0x002C0403, 0x002CF001, 0x002CF807,
+    0x002D1C02, 0x002D2C03, 0x002D5802, 0x002D8802, 0x002DC001,
+    0x002E0801, 0x002EF805, 0x002F1803, 0x002F2804, 0x002F5C01,
+    0x002FCC08, 0x00300403, 0x0030F807, 0x00311803, 0x00312804,
+    0x00315402, 0x00318802, 0x0031FC01, 0x00320802, 0x0032F001,
+    0x0032F807, 0x00331803, 0x00332804, 0x00335402, 0x00338802,
+    0x00340802, 0x0034F807, 0x00351803, 0x00352804, 0x00355C01,
+    0x00358802, 0x0035E401, 0x00360802, 0x00372801, 0x00373C06,
+    0x00375801, 0x00376008, 0x0037C803, 0x0038C401, 0x0038D007,
+    0x0038FC01, 0x00391C09, 0x00396802, 0x003AC401, 0x003AD006,
+    0x003AEC02, 0x003B2006, 0x003C041F, 0x003CD00C, 0x003DC417,
+    0x003E340B, 0x003E6424, 0x003EF80F, 0x003F380D, 0x0040AC14,
+    0x00412806, 0x00415804, 0x00417803, 0x00418803, 0x00419C07,
+    0x0041C404, 0x0042080C, 0x00423C01, 0x00426806, 0x0043EC01,
+    0x004D740C, 0x004E400A, 0x00500001, 0x0059B402, 0x005A0001,
+    0x005A6C02, 0x005BAC03, 0x005C4803, 0x005CC805, 0x005D4802,
+    0x005DC802, 0x005ED023, 0x005F6004, 0x005F7401, 0x0060000F,
+    0x0062A401, 0x0064800C, 0x0064C00C, 0x00650001, 0x00651002,
+    0x0066C011, 0x00672002, 0x00677822, 0x00685C05, 0x00687802,
+    0x0069540A, 0x0069801D, 0x0069FC01, 0x006A8007, 0x006AA006,
+    0x006C0005, 0x006CD011, 0x006D6823, 0x006E0003, 0x006E840D,
+    0x006F980E, 0x006FF004, 0x00709014, 0x0070EC05, 0x0071F802,
+    0x00730008, 0x00734019, 0x0073B401, 0x0073C803, 0x00770027,
+    0x0077F004, 0x007EF401, 0x007EFC03, 0x007F3403, 0x007F7403,
+    0x007FB403, 0x007FF402, 0x00800065, 0x0081A806, 0x0081E805,
+    0x00822805, 0x0082801A, 0x00834021, 0x00840002, 0x00840C04,
+    0x00842002, 0x00845001, 0x00845803, 0x00847806, 0x00849401,
+    0x00849C01, 0x0084A401, 0x0084B801, 0x0084E802, 0x00850005,
+    0x00852804, 0x00853C01, 0x00864264, 0x00900027, 0x0091000B,
+    0x0092704E, 0x00940200, 0x009C0475, 0x009E53B9, 0x00AD400A,
+    0x00B39406, 0x00B3BC03, 0x00B3E404, 0x00B3F802, 0x00B5C001,
+    0x00B5FC01, 0x00B7804F, 0x00B8C00C, 0x00BA001A, 0x00BA6C59,
+    0x00BC00D6, 0x00BFC00C, 0x00C00005, 0x00C02019, 0x00C0A807,
+    0x00C0D802, 0x00C0F403, 0x00C26404, 0x00C28001, 0x00C3EC01,
+    0x00C64002, 0x00C6580A, 0x00C70024, 0x00C8001F, 0x00C8A81E,
+    0x00C94001, 0x00C98020, 0x00CA2827, 0x00CB003F, 0x00CC0100,
+    0x01370040, 0x02924037, 0x0293F802, 0x02983403, 0x0299BC10,
+    0x029A7C01, 0x029BC008, 0x029C0017, 0x029C8002, 0x029E2402,
+    0x02A00801, 0x02A01801, 0x02A02C01, 0x02A08C09, 0x02A0D804,
+    0x02A1D004, 0x02A20002, 0x02A2D011, 0x02A33802, 0x02A38012,
+    0x02A3E003, 0x02A4980A, 0x02A51C0D, 0x02A57C01, 0x02A60004,
+    0x02A6CC1B, 0x02A77802, 0x02A8A40E, 0x02A90C01, 0x02A93002,
+    0x02A97004, 0x02A9DC03, 0x02A9EC01, 0x02AAC001, 0x02AAC803,
+    0x02AADC02, 0x02AAF802, 0x02AB0401, 0x02AB7802, 0x02ABAC07,
+    0x02ABD402, 0x02AF8C0B, 0x03600001, 0x036DFC02, 0x036FFC02,
+    0x037FFC01, 0x03EC7801, 0x03ECA401, 0x03EEC810, 0x03F4F802,
+    0x03F7F002, 0x03F8001A, 0x03F88007, 0x03F8C023, 0x03F95013,
+    0x03F9A004, 0x03FBFC01, 0x03FC040F, 0x03FC6807, 0x03FCEC06,
+    0x03FD6C0B, 0x03FF8007, 0x03FFA007, 0x03FFE405, 0x04040003,
+    0x0404DC09, 0x0405E411, 0x0406400C, 0x0407402E, 0x040E7C01,
+    0x040F4001, 0x04215C01, 0x04247C01, 0x0424FC01, 0x04280403,
+    0x04281402, 0x04283004, 0x0428E003, 0x0428FC01, 0x04294009,
+    0x0429FC01, 0x042CE407, 0x04400003, 0x0440E016, 0x04420003,
+    0x0442C012, 0x04440003, 0x04449C0E, 0x04450004, 0x04460003,
+    0x0446CC0E, 0x04471404, 0x045AAC0D, 0x0491C004, 0x05BD442E,
+    0x05BE3C04, 0x074000F6, 0x07440027, 0x0744A4B5, 0x07480046,
+    0x074C0057, 0x075B0401, 0x075B6C01, 0x075BEC01, 0x075C5401,
+    0x075CD401, 0x075D3C01, 0x075DBC01, 0x075E2401, 0x075EA401,
+    0x075F0C01, 0x07BBC002, 0x07C0002C, 0x07C0C064, 0x07C2800F,
+    0x07C2C40E, 0x07C3040F, 0x07C3440F, 0x07C4401F, 0x07C4C03C,
+    0x07C5C02B, 0x07C7981D, 0x07C8402B, 0x07C90009, 0x07C94002,
+    0x07CC0021, 0x07CCC006, 0x07CCDC46, 0x07CE0014, 0x07CE8025,
+    0x07CF1805, 0x07CF8011, 0x07D0003F, 0x07D10001, 0x07D108B6,
+    0x07D3E404, 0x07D4003E, 0x07D50004, 0x07D54018, 0x07D7EC46,
+    0x07D9140B, 0x07DA0046, 0x07DC0074, 0x38000401, 0x38008060,
+    0x380400F0,
+  };
+  static const unsigned int aAscii[4] = {
+    0xFFFFFFFF, 0xFC00FFFF, 0xF8000001, 0xF8000001,
+  };
+
+  if( c<128 ){
+    return ( (aAscii[c >> 5] & (1 << (c & 0x001F)))==0 );
+  }else if( c<(1<<22) ){
+    unsigned int key = (((unsigned int)c)<<10) | 0x000003FF;
+    int iRes;
+    int iHi = sizeof(aEntry)/sizeof(aEntry[0]) - 1;
+    int iLo = 0;
+    while( iHi>=iLo ){
+      int iTest = (iHi + iLo) / 2;
+      if( key >= aEntry[iTest] ){
+        iRes = iTest;
+        iLo = iTest+1;
+      }else{
+        iHi = iTest-1;
+      }
+    }
+    assert( aEntry[0]<key );
+    assert( key>=aEntry[iRes] );
+    return (((unsigned int)c) >= ((aEntry[iRes]>>10) + (aEntry[iRes]&0x3FF)));
+  }
+  return 1;
+}
+
+
+/*
+** If the argument is a codepoint corresponding to a lowercase letter
+** in the ASCII range with a diacritic added, return the codepoint
+** of the ASCII letter only. For example, if passed 235 - "LATIN
+** SMALL LETTER E WITH DIAERESIS" - return 101 ("LATIN SMALL LETTER
+** E"). The results of passing a codepoint that corresponds to an
+** uppercase letter are undefined.
+*/
+static int remove_diacritic(int c){
+  unsigned short aDia[] = {
+        0,  1797,  1848,  1859,  1891,  1928,  1940,  1995, 
+     2024,  2040,  2060,  2110,  2168,  2206,  2264,  2286, 
+     2344,  2383,  2472,  2488,  2516,  2596,  2668,  2732, 
+     2782,  2842,  2894,  2954,  2984,  3000,  3028,  3336, 
+     3456,  3696,  3712,  3728,  3744,  3896,  3912,  3928, 
+     3968,  4008,  4040,  4106,  4138,  4170,  4202,  4234, 
+     4266,  4296,  4312,  4344,  4408,  4424,  4472,  4504, 
+     6148,  6198,  6264,  6280,  6360,  6429,  6505,  6529, 
+    61448, 61468, 61534, 61592, 61642, 61688, 61704, 61726, 
+    61784, 61800, 61836, 61880, 61914, 61948, 61998, 62122, 
+    62154, 62200, 62218, 62302, 62364, 62442, 62478, 62536, 
+    62554, 62584, 62604, 62640, 62648, 62656, 62664, 62730, 
+    62924, 63050, 63082, 63274, 63390, 
+  };
+  char aChar[] = {
+    '\0', 'a',  'c',  'e',  'i',  'n',  'o',  'u',  'y',  'y',  'a',  'c',  
+    'd',  'e',  'e',  'g',  'h',  'i',  'j',  'k',  'l',  'n',  'o',  'r',  
+    's',  't',  'u',  'u',  'w',  'y',  'z',  'o',  'u',  'a',  'i',  'o',  
+    'u',  'g',  'k',  'o',  'j',  'g',  'n',  'a',  'e',  'i',  'o',  'r',  
+    'u',  's',  't',  'h',  'a',  'e',  'o',  'y',  '\0', '\0', '\0', '\0', 
+    '\0', '\0', '\0', '\0', 'a',  'b',  'd',  'd',  'e',  'f',  'g',  'h',  
+    'h',  'i',  'k',  'l',  'l',  'm',  'n',  'p',  'r',  'r',  's',  't',  
+    'u',  'v',  'w',  'w',  'x',  'y',  'z',  'h',  't',  'w',  'y',  'a',  
+    'e',  'i',  'o',  'u',  'y',  
+  };
+
+  unsigned int key = (((unsigned int)c)<<3) | 0x00000007;
+  int iRes = 0;
+  int iHi = sizeof(aDia)/sizeof(aDia[0]) - 1;
+  int iLo = 0;
+  while( iHi>=iLo ){
+    int iTest = (iHi + iLo) / 2;
+    if( key >= aDia[iTest] ){
+      iRes = iTest;
+      iLo = iTest+1;
+    }else{
+      iHi = iTest-1;
+    }
+  }
+  assert( key>=aDia[iRes] );
+  return ((c > (aDia[iRes]>>3) + (aDia[iRes]&0x07)) ? c : (int)aChar[iRes]);
+};
+
+
+/*
+** Return true if the argument interpreted as a unicode codepoint
+** is a diacritical modifier character.
+*/
+int sqlite3Fts5UnicodeIsdiacritic(int c){
+  unsigned int mask0 = 0x08029FDF;
+  unsigned int mask1 = 0x000361F8;
+  if( c<768 || c>817 ) return 0;
+  return (c < 768+32) ?
+      (mask0 & (1 << (c-768))) :
+      (mask1 & (1 << (c-768-32)));
+}
+
+
+/*
+** Interpret the argument as a unicode codepoint. If the codepoint
+** is an upper case character that has a lower case equivalent,
+** return the codepoint corresponding to the lower case version.
+** Otherwise, return a copy of the argument.
+**
+** The results are undefined if the value passed to this function
+** is less than zero.
+*/
+int sqlite3Fts5UnicodeFold(int c, int bRemoveDiacritic){
+  /* Each entry in the following array defines a rule for folding a range
+  ** of codepoints to lower case. The rule applies to a range of nRange
+  ** codepoints starting at codepoint iCode.
+  **
+  ** If the least significant bit in flags is clear, then the rule applies
+  ** to all nRange codepoints (i.e. all nRange codepoints are upper case and
+  ** need to be folded). Or, if it is set, then the rule only applies to
+  ** every second codepoint in the range, starting with codepoint C.
+  **
+  ** The 7 most significant bits in flags are an index into the aiOff[]
+  ** array. If a specific codepoint C does require folding, then its lower
+  ** case equivalent is ((C + aiOff[flags>>1]) & 0xFFFF).
+  **
+  ** The contents of this array are generated by parsing the CaseFolding.txt
+  ** file distributed as part of the "Unicode Character Database". See
+  ** http://www.unicode.org for details.
+  */
+  static const struct TableEntry {
+    unsigned short iCode;
+    unsigned char flags;
+    unsigned char nRange;
+  } aEntry[] = {
+    {65, 14, 26},          {181, 64, 1},          {192, 14, 23},
+    {216, 14, 7},          {256, 1, 48},          {306, 1, 6},
+    {313, 1, 16},          {330, 1, 46},          {376, 116, 1},
+    {377, 1, 6},           {383, 104, 1},         {385, 50, 1},
+    {386, 1, 4},           {390, 44, 1},          {391, 0, 1},
+    {393, 42, 2},          {395, 0, 1},           {398, 32, 1},
+    {399, 38, 1},          {400, 40, 1},          {401, 0, 1},
+    {403, 42, 1},          {404, 46, 1},          {406, 52, 1},
+    {407, 48, 1},          {408, 0, 1},           {412, 52, 1},
+    {413, 54, 1},          {415, 56, 1},          {416, 1, 6},
+    {422, 60, 1},          {423, 0, 1},           {425, 60, 1},
+    {428, 0, 1},           {430, 60, 1},          {431, 0, 1},
+    {433, 58, 2},          {435, 1, 4},           {439, 62, 1},
+    {440, 0, 1},           {444, 0, 1},           {452, 2, 1},
+    {453, 0, 1},           {455, 2, 1},           {456, 0, 1},
+    {458, 2, 1},           {459, 1, 18},          {478, 1, 18},
+    {497, 2, 1},           {498, 1, 4},           {502, 122, 1},
+    {503, 134, 1},         {504, 1, 40},          {544, 110, 1},
+    {546, 1, 18},          {570, 70, 1},          {571, 0, 1},
+    {573, 108, 1},         {574, 68, 1},          {577, 0, 1},
+    {579, 106, 1},         {580, 28, 1},          {581, 30, 1},
+    {582, 1, 10},          {837, 36, 1},          {880, 1, 4},
+    {886, 0, 1},           {902, 18, 1},          {904, 16, 3},
+    {908, 26, 1},          {910, 24, 2},          {913, 14, 17},
+    {931, 14, 9},          {962, 0, 1},           {975, 4, 1},
+    {976, 140, 1},         {977, 142, 1},         {981, 146, 1},
+    {982, 144, 1},         {984, 1, 24},          {1008, 136, 1},
+    {1009, 138, 1},        {1012, 130, 1},        {1013, 128, 1},
+    {1015, 0, 1},          {1017, 152, 1},        {1018, 0, 1},
+    {1021, 110, 3},        {1024, 34, 16},        {1040, 14, 32},
+    {1120, 1, 34},         {1162, 1, 54},         {1216, 6, 1},
+    {1217, 1, 14},         {1232, 1, 88},         {1329, 22, 38},
+    {4256, 66, 38},        {4295, 66, 1},         {4301, 66, 1},
+    {7680, 1, 150},        {7835, 132, 1},        {7838, 96, 1},
+    {7840, 1, 96},         {7944, 150, 8},        {7960, 150, 6},
+    {7976, 150, 8},        {7992, 150, 8},        {8008, 150, 6},
+    {8025, 151, 8},        {8040, 150, 8},        {8072, 150, 8},
+    {8088, 150, 8},        {8104, 150, 8},        {8120, 150, 2},
+    {8122, 126, 2},        {8124, 148, 1},        {8126, 100, 1},
+    {8136, 124, 4},        {8140, 148, 1},        {8152, 150, 2},
+    {8154, 120, 2},        {8168, 150, 2},        {8170, 118, 2},
+    {8172, 152, 1},        {8184, 112, 2},        {8186, 114, 2},
+    {8188, 148, 1},        {8486, 98, 1},         {8490, 92, 1},
+    {8491, 94, 1},         {8498, 12, 1},         {8544, 8, 16},
+    {8579, 0, 1},          {9398, 10, 26},        {11264, 22, 47},
+    {11360, 0, 1},         {11362, 88, 1},        {11363, 102, 1},
+    {11364, 90, 1},        {11367, 1, 6},         {11373, 84, 1},
+    {11374, 86, 1},        {11375, 80, 1},        {11376, 82, 1},
+    {11378, 0, 1},         {11381, 0, 1},         {11390, 78, 2},
+    {11392, 1, 100},       {11499, 1, 4},         {11506, 0, 1},
+    {42560, 1, 46},        {42624, 1, 24},        {42786, 1, 14},
+    {42802, 1, 62},        {42873, 1, 4},         {42877, 76, 1},
+    {42878, 1, 10},        {42891, 0, 1},         {42893, 74, 1},
+    {42896, 1, 4},         {42912, 1, 10},        {42922, 72, 1},
+    {65313, 14, 26},       
+  };
+  static const unsigned short aiOff[] = {
+   1,     2,     8,     15,    16,    26,    28,    32,    
+   37,    38,    40,    48,    63,    64,    69,    71,    
+   79,    80,    116,   202,   203,   205,   206,   207,   
+   209,   210,   211,   213,   214,   217,   218,   219,   
+   775,   7264,  10792, 10795, 23228, 23256, 30204, 54721, 
+   54753, 54754, 54756, 54787, 54793, 54809, 57153, 57274, 
+   57921, 58019, 58363, 61722, 65268, 65341, 65373, 65406, 
+   65408, 65410, 65415, 65424, 65436, 65439, 65450, 65462, 
+   65472, 65476, 65478, 65480, 65482, 65488, 65506, 65511, 
+   65514, 65521, 65527, 65528, 65529, 
+  };
+
+  int ret = c;
+
+  assert( c>=0 );
+  assert( sizeof(unsigned short)==2 && sizeof(unsigned char)==1 );
+
+  if( c<128 ){
+    if( c>='A' && c<='Z' ) ret = c + ('a' - 'A');
+  }else if( c<65536 ){
+    int iHi = sizeof(aEntry)/sizeof(aEntry[0]) - 1;
+    int iLo = 0;
+    int iRes = -1;
+
+    while( iHi>=iLo ){
+      int iTest = (iHi + iLo) / 2;
+      int cmp = (c - aEntry[iTest].iCode);
+      if( cmp>=0 ){
+        iRes = iTest;
+        iLo = iTest+1;
+      }else{
+        iHi = iTest-1;
+      }
+    }
+    assert( iRes<0 || c>=aEntry[iRes].iCode );
+
+    if( iRes>=0 ){
+      const struct TableEntry *p = &aEntry[iRes];
+      if( c<(p->iCode + p->nRange) && 0==(0x01 & p->flags & (p->iCode ^ c)) ){
+        ret = (c + (aiOff[p->flags>>1])) & 0x0000FFFF;
+        assert( ret>0 );
+      }
+    }
+
+    if( bRemoveDiacritic ) ret = remove_diacritic(ret);
+  }
+  
+  else if( c>=66560 && c<66600 ){
+    ret = c + 40;
+  }
+
+  return ret;
+}
+#endif /* defined(SQLITE_ENABLE_FTS5) */
diff --git a/ext/fts5/test/fts5unicode.test b/ext/fts5/test/fts5unicode.test

new file mode 100644 (file)

index 0000000..22082b9
--- /dev/null
+++ b/ext/fts5/test/fts5unicode.test
@@ -0,0 +1,39 @@
+# 2014 Dec 20
+#
+# The author disclaims copyright to this source code.  In place of
+# a legal notice, here is a blessing:
+#
+#    May you do good and not evil.
+#    May you find forgiveness for yourself and forgive others.
+#    May you share freely, never taking more than you give.
+#
+#***********************************************************************
+#
+# Tests focusing on the fts5 tokenizers
+#
+
+if {![info exists testdir]} {
+  set testdir [file join [file dirname [info script]] .. .. .. test]
+}
+source $testdir/tester.tcl
+set testprefix fts5unicode
+
+# Run test case $tn: tokenize $input with $tokenizer via sqlite3_fts5_tokenize
+# and compare the collected values against $output.
+#
+# The test script is built with [subst -nocommands], so $tokenizer and $input
+# are spliced into it when tokenize_test is called, while the bracketed
+# commands are left to run inside do_test. [set z] is used in place of a
+# variable reference so that z is read when the script runs, not when it is
+# built. Only the first value (z) of every four returned by
+# sqlite3_fts5_tokenize is kept; s, e and p are ignored.
+#
+# $output is passed through [list {*}...], so its whitespace layout does not
+# matter.
+proc tokenize_test {tn tokenizer input output} {
+  uplevel [list do_test $tn [subst -nocommands {
+    set ret {}
+    foreach {z s e p} [sqlite3_fts5_tokenize db {$tokenizer} {$input}] {
+      lappend ret [set z]
+    }
+    set ret
+  }] [list {*}$output]]
+}
+
+foreach {tn t} {1 simple 2 unicode61} {
+  tokenize_test 1.$tn.0 $t {A B C D} {a b c d}
+  tokenize_test 1.$tn.1 $t {May you share freely,} {may you share freely}
+  tokenize_test 1.$tn.2 $t {..May...you.shAre.freely} {may you share freely}
+  tokenize_test 1.$tn.3 $t {} {}
+}
+
+finish_test
+
diff --git a/ext/fts5/test/fts5unicode2.test b/ext/fts5/test/fts5unicode2.test

new file mode 100644 (file)

index 0000000..b26795f
--- /dev/null
+++ b/ext/fts5/test/fts5unicode2.test
@@ -0,0 +1,567 @@
+# 2012 May 25
+#
+# The author disclaims copyright to this source code.  In place of
+# a legal notice, here is a blessing:
+#
+#    May you do good and not evil.
+#    May you find forgiveness for yourself and forgive others.
+#    May you share freely, never taking more than you give.
+#
+#*************************************************************************
+#
+# The tests in this file focus on testing the "unicode61" FTS5 tokenizer.
+#
+# This is a modified copy of FTS4 test file "fts4_unicode.test".
+#
+
+if {![info exists testdir]} {
+  set testdir [file join [file dirname [info script]] .. .. .. test]
+}
+source $testdir/tester.tcl
+set testprefix fts5unicode2
+
+# Run test case $tn in the caller's scope: tokenize $input with
+# "unicode61 remove_diacritics 0" (using the -subst form of
+# sqlite3_fts5_tokenize) and expect the list $res.
+proc do_unicode_token_test {tn input res} {
+  set script [list sqlite3_fts5_tokenize -subst db \
+      "unicode61 remove_diacritics 0" $input]
+  set expected [list {*}$res]
+  uplevel [list do_test $tn $script $expected]
+}
+
+# Run test case $tn in the caller's scope: tokenize $input with a plain
+# "unicode61" tokenizer (no options given, using the -subst form of
+# sqlite3_fts5_tokenize) and expect the list $res.
+proc do_unicode_token_test2 {tn input res} {
+  set script [list sqlite3_fts5_tokenize -subst db "unicode61" $input]
+  set expected [list {*}$res]
+  uplevel [list do_test $tn $script $expected]
+}
+
+# Usage: do_unicode_token_test3 TN ?OPTION-LIST ...? INPUT RESULT
+#
+# The last two arguments are the text to tokenize and the expected result.
+# Any arguments before them are lists of tokenizer options that are
+# concatenated after "unicode61" to form the tokenizer specification.
+proc do_unicode_token_test3 {tn args} {
+  set options  [lrange $args 0 end-2]
+  set input    [lindex $args end-1]
+  set expected [list {*}[lindex $args end]]
+  set script [list sqlite3_fts5_tokenize -subst db \
+      [concat unicode61 {*}$options] $input]
+  uplevel [list do_test $tn $script $expected]
+}
+
+do_unicode_token_test 1.0 {a B c D} {0 a a 1 b B 2 c c 3 d D}
+
+do_unicode_token_test 1.1 "\uC4 \uD6 \uDC" \
+    "0 \uE4 \uC4 1 \uF6 \uD6 2 \uFC \uDC"
+
+do_unicode_token_test 1.2 "x\uC4x x\uD6x x\uDCx" \
+    "0 x\uE4x x\uC4x 1 x\uF6x x\uD6x 2 x\uFCx x\uDCx"
+
+# 0x00DF is a small "sharp s". 0x1E9E is a capital sharp s.
+do_unicode_token_test 1.3 "\uDF" "0 \uDF \uDF"
+do_unicode_token_test 1.4 "\u1E9E" "0 \uDF \u1E9E"
+
+do_unicode_token_test 1.5 "The quick brown fox" {
+  0 the The 1 quick quick 2 brown brown 3 fox fox
+}
+do_unicode_token_test 1.6 "The\u00bfquick\u224ebrown\u2263fox" {
+  0 the The 1 quick quick 2 brown brown 3 fox fox
+}
+
+do_unicode_token_test2 1.7  {a B c D} {0 a a 1 b B 2 c c 3 d D}
+do_unicode_token_test2 1.8  "\uC4 \uD6 \uDC" "0 a \uC4 1 o \uD6 2 u \uDC"
+
+do_unicode_token_test2 1.9  "x\uC4x x\uD6x x\uDCx" \
+    "0 xax x\uC4x 1 xox x\uD6x 2 xux x\uDCx"
+
+# Check that diacritics are removed by default (do_unicode_token_test2 sets
+# no remove_diacritics option), and that they do not break tokens.
+do_unicode_token_test2 1.10 "xx\u0301xx" "0 xxxx xx\u301xx"
+
+# Title-case mappings work
+do_unicode_token_test 1.11 "\u01c5" "0 \u01c6 \u01c5"
+
+#-------------------------------------------------------------------------
+#
+set docs [list {
+  Enhance the INSERT syntax to allow multiple rows to be inserted via the
+  VALUES clause.
+} {
+  Enhance the CREATE VIRTUAL TABLE command to support the IF NOT EXISTS clause.
+} {
+  Added the sqlite3_stricmp() interface as a counterpart to sqlite3_strnicmp().
+} {
+  Added the sqlite3_db_readonly() interface.
+} {
+  Added the SQLITE_FCNTL_PRAGMA file control, giving VFS implementations the
+  ability to add new PRAGMA statements or to override built-in PRAGMAs.  
+} {
+  Queries of the form: "SELECT max(x), y FROM table" returns the value of y on
+  the same row that contains the maximum x value.
+} {
+  Added support for the FTS4 languageid option.
+} {
+  Documented support for the FTS4 content option. This feature has actually
+  been in the code since version 3.7.9 but is only now considered to be
+  officially supported.  
+} {
+  Pending statements no longer block ROLLBACK. Instead, the pending statement
+  will return SQLITE_ABORT upon next access after the ROLLBACK.  
+} {
+  Improvements to the handling of CSV inputs in the command-line shell
+} {
+  Fix a bug introduced in version 3.7.10 that might cause a LEFT JOIN to be
+  incorrectly converted into an INNER JOIN if the WHERE clause indexable terms
+  connected by OR.  
+}]
+
+set map(a) [list "\u00C4" "\u00E4"]  ; # LATIN LETTER A WITH DIAERESIS
+set map(e) [list "\u00CB" "\u00EB"]  ; # LATIN LETTER E WITH DIAERESIS
+set map(i) [list "\u00CF" "\u00EF"]  ; # LATIN LETTER I WITH DIAERESIS
+set map(o) [list "\u00D6" "\u00F6"]  ; # LATIN LETTER O WITH DIAERESIS
+set map(u) [list "\u00DC" "\u00FC"]  ; # LATIN LETTER U WITH DIAERESIS
+set map(y) [list "\u0178" "\u00FF"]  ; # LATIN LETTER Y WITH DIAERESIS
+set map(h) [list "\u1E26" "\u1E27"]  ; # LATIN LETTER H WITH DIAERESIS
+set map(w) [list "\u1E84" "\u1E85"]  ; # LATIN LETTER W WITH DIAERESIS
+set map(x) [list "\u1E8C" "\u1E8D"]  ; # LATIN LETTER X WITH DIAERESIS
+foreach k [array names map] {
+  lappend mappings [string toupper $k] [lindex $map($k) 0] 
+  lappend mappings $k [lindex $map($k) 1]
+}
+# Return $doc with every run of whitespace collapsed to a single space,
+# leading and trailing whitespace trimmed, and the substitutions in the
+# global list $::mappings applied with [string map].
+proc mapdoc {doc} {
+  set collapsed [regsub -all {[[:space:]]+} $doc " "]
+  set trimmed [string trim $collapsed]
+  return [string map $::mappings $trimmed]
+}
+
+do_test 2.0 {
+  execsql { CREATE VIRTUAL TABLE t2 USING fts5(tokenize=unicode61, x); }
+  foreach doc $docs {
+    set d [mapdoc $doc]
+    execsql { INSERT INTO t2 VALUES($d) }
+  }
+} {}
+
+do_test 2.1 {
+  set q [mapdoc "row"]
+  execsql { SELECT * FROM t2 WHERE t2 MATCH $q }
+} [list [mapdoc {
+  Queries of the form: "SELECT max(x), y FROM table" returns the value of y on
+  the same row that contains the maximum x value.
+}]]
+
+foreach {tn query snippet} {
+  2 "row" {
+     ...returns the value of y on the same [row] that contains 
+     the maximum x value.
+  }
+  3 "ROW" {
+     ...returns the value of y on the same [row] that contains 
+     the maximum x value.
+  }
+  4 "rollback" {
+     ...[ROLLBACK]. Instead, the pending statement
+     will return SQLITE_ABORT upon next access after the [ROLLBACK].
+  }
+  5 "rOllback" {
+     ...[ROLLBACK]. Instead, the pending statement
+     will return SQLITE_ABORT upon next access after the [ROLLBACK].
+  }
+  6 "lang*" {
+     Added support for the FTS4 [languageid] option.
+  }
+} {
+  do_test 2.$tn {
+    set q [mapdoc $query]
+    execsql { 
+      SELECT snippet(t2, -1, '[', ']', '...', 15) FROM t2 WHERE t2 MATCH $q 
+    }
+  } [list [mapdoc $snippet]]
+}
+
+#-------------------------------------------------------------------------
+# Make sure the unicode61 tokenizer does not crash if it is passed a 
+# NULL pointer.
+reset_db
+do_execsql_test 3.1 {
+  CREATE VIRTUAL TABLE t1 USING fts5(tokenize=unicode61, x, y);
+  INSERT INTO t1 VALUES(NULL, 'a b c');
+}
+
+do_execsql_test 3.2 {
+  SELECT snippet(t1, -1, '[', ']', '...', 15) FROM t1 WHERE t1 MATCH 'b'
+} {{a [b] c}}
+
+do_execsql_test 3.3 {
+  BEGIN;
+  DELETE FROM t1;
+  INSERT INTO t1 VALUES('b b b b b b b b b b b', 'b b b b b b b b b b b b b');
+  INSERT INTO t1 SELECT * FROM t1;
+  INSERT INTO t1 SELECT * FROM t1;
+  INSERT INTO t1 SELECT * FROM t1;
+  INSERT INTO t1 SELECT * FROM t1;
+  INSERT INTO t1 SELECT * FROM t1;
+  INSERT INTO t1 SELECT * FROM t1;
+  INSERT INTO t1 SELECT * FROM t1;
+  INSERT INTO t1 SELECT * FROM t1;
+  INSERT INTO t1 SELECT * FROM t1;
+  INSERT INTO t1 SELECT * FROM t1;
+  INSERT INTO t1 SELECT * FROM t1;
+  INSERT INTO t1 SELECT * FROM t1;
+  INSERT INTO t1 SELECT * FROM t1;
+  INSERT INTO t1 SELECT * FROM t1;
+  INSERT INTO t1 SELECT * FROM t1;
+  INSERT INTO t1 SELECT * FROM t1;
+  INSERT INTO t1 VALUES('a b c', NULL);
+  INSERT INTO t1 VALUES('a x c', NULL);
+  COMMIT;
+}
+
+do_execsql_test 3.4 {
+  SELECT * FROM t1 WHERE t1 MATCH 'a b';
+} {{a b c} {}}
+
+#-------------------------------------------------------------------------
+# Text containing U+FFFE, U+D800 or 0xF7-led byte sequences must insert cleanly.
+reset_db
+
+do_test 4.1 {
+  set a "abc\uFFFEdef"
+  set b "abc\uD800def"
+  set c "\uFFFEdef"
+  set d "\uD800def"
+  execsql {
+    CREATE VIRTUAL TABLE t1 USING fts5(tokenize=unicode61, x);
+    INSERT INTO t1 VALUES($a);
+    INSERT INTO t1 VALUES($b);
+    INSERT INTO t1 VALUES($c);
+    INSERT INTO t1 VALUES($d);
+  }
+} {}
+
+do_test 4.2 {
+  set a [binary format c* {0x61 0xF7 0xBF 0xBF 0xBF 0x62}]
+  set b [binary format c* {0x61 0xF7 0xBF 0xBF 0xBF 0xBF 0x62}]
+  set c [binary format c* {0x61 0xF7 0xBF 0xBF 0xBF 0xBF 0xBF 0x62}]
+  set d [binary format c* {0x61 0xF7 0xBF 0xBF 0xBF 0xBF 0xBF 0xBF 0x62}]
+  execsql {
+    INSERT INTO t1 VALUES($a);
+    INSERT INTO t1 VALUES($b);
+    INSERT INTO t1 VALUES($c);
+    INSERT INTO t1 VALUES($d);
+  }
+} {}
+
+do_test 4.3 {
+  set a [binary format c* {0xF7 0xBF 0xBF 0xBF}]
+  set b [binary format c* {0xF7 0xBF 0xBF 0xBF 0xBF}]
+  set c [binary format c* {0xF7 0xBF 0xBF 0xBF 0xBF 0xBF}]
+  set d [binary format c* {0xF7 0xBF 0xBF 0xBF 0xBF 0xBF 0xBF}]
+  execsql {
+    INSERT INTO t1 VALUES($a);
+    INSERT INTO t1 VALUES($b);
+    INSERT INTO t1 VALUES($c);
+    INSERT INTO t1 VALUES($d);
+  }
+} {}
+
+
+#-------------------------------------------------------------------------
+
+breakpoint
+do_unicode_token_test3 5.1 {tokenchars {}} {
+  sqlite3_reset sqlite3_column_int
+} {
+  0 sqlite3 sqlite3 
+  1 reset reset 
+  2 sqlite3 sqlite3 
+  3 column column 
+  4 int int
+}
+
+do_unicode_token_test3 5.2 {tokenchars _} {
+  sqlite3_reset sqlite3_column_int
+} {
+  0 sqlite3_reset sqlite3_reset 
+  1 sqlite3_column_int sqlite3_column_int
+}
+
+do_unicode_token_test3 5.3 {separators xyz} {
+  Laotianxhorseyrunszfast
+} {
+  0 laotian Laotian
+  1 horse horse
+  2 runs runs
+  3 fast fast
+}
+
+do_unicode_token_test3 5.4 {tokenchars xyz} {
+  Laotianxhorseyrunszfast
+} {
+  0 laotianxhorseyrunszfast Laotianxhorseyrunszfast
+}
+
+do_unicode_token_test3 5.5 {tokenchars _} {separators zyx} {
+  sqlite3_resetxsqlite3_column_intyhonda_phantom
+} {
+  0 sqlite3_reset sqlite3_reset 
+  1 sqlite3_column_int sqlite3_column_int
+  2 honda_phantom honda_phantom
+}
+
+do_unicode_token_test3 5.6 "separators \u05D1" "abc\u05D1def" {
+  0 abc abc 1 def def
+}
+
+do_unicode_token_test3 5.7                             \
+  "tokenchars \u2444\u2445"                            \
+  "separators \u05D0\u05D1\u05D2"                      \
+  "\u2444fre\u2445sh\u05D0water\u05D2fish.\u2445timer" \
+  [list                                                \
+    0 \u2444fre\u2445sh \u2444fre\u2445sh              \
+    1 water water                                      \
+    2 fish fish                                        \
+    3 \u2445timer \u2445timer                          \
+  ]
+
+# Check that it is not possible to add a standalone diacritic codepoint 
+# to either separators or tokenchars.
+do_unicode_token_test3 5.8 "separators \u0301" \
+  "hello\u0301world \u0301helloworld"          \
+  "0 helloworld hello\u0301world 1 helloworld helloworld"
+
+do_unicode_token_test3 5.9 "tokenchars \u0301" \
+  "hello\u0301world \u0301helloworld"          \
+  "0 helloworld hello\u0301world 1 helloworld helloworld"
+
+do_unicode_token_test3 5.10 "separators \u0301" \
+  "remove_diacritics 0"                        \
+  "hello\u0301world \u0301helloworld"          \
+  "0 hello\u0301world hello\u0301world 1 helloworld helloworld"
+
+do_unicode_token_test3 5.11 "tokenchars \u0301" \
+  "remove_diacritics 0"                         \
+  "hello\u0301world \u0301helloworld"           \
+  "0 hello\u0301world hello\u0301world 1 helloworld helloworld"
+
+
+#-------------------------------------------------------------------------
+
+# Tokenize $txt with $tokenizer (using the -subst form of
+# sqlite3_fts5_tokenize) and return a list holding the second value of
+# every group of three values in the tokenizer output.
+proc do_tokenize {tokenizer txt} {
+  set tokens {}
+  foreach {first second third} \
+      [sqlite3_fts5_tokenize -subst db $tokenizer $txt] {
+    lappend tokens $second
+  }
+  return $tokens
+}
+
+# Argument $lCp must be a list of codepoints (integers) that
+# correspond to whitespace characters. This command creates a string
+# $W from the codepoints, then tokenizes "${W}hello${W}world${W}"
+# using tokenizer $tokenizer. The test passes if the tokenizer successfully
+# extracts the two 5-character tokens.
+#
+proc do_isspace_test {tn tokenizer lCp} {
+  # One %c conversion per codepoint turns the integer list into a string.
+  set fmt [string repeat %c [llength $lCp]]
+  set ws  [format $fmt {*}$lCp]
+
+  # Put the whitespace string before, between and after the two words.
+  set txt $ws
+  append txt hello $ws world $ws
+
+  set script [list do_tokenize $tokenizer $txt]
+  uplevel [list do_test $tn $script {hello world}]
+}
+
+set tokenizers [list unicode61]
+ifcapable icu { lappend tokenizers icu }
+
+# Check that every tokenizer in $tokenizers treats white-space codepoints
+# as separators. All codepoints tested below are of type "Zs" in the
+# UnicodeData.txt file.
+foreach T $tokenizers {
+  do_isspace_test 6.$T.1 $T    32
+  do_isspace_test 6.$T.2 $T    160
+  do_isspace_test 6.$T.3 $T    5760
+  do_isspace_test 6.$T.4 $T    6158
+  do_isspace_test 6.$T.5 $T    8192
+  do_isspace_test 6.$T.6 $T    8193
+  do_isspace_test 6.$T.7 $T    8194
+  do_isspace_test 6.$T.8 $T    8195
+  do_isspace_test 6.$T.9 $T    8196
+  do_isspace_test 6.$T.10 $T    8197
+  do_isspace_test 6.$T.11 $T    8198
+  do_isspace_test 6.$T.12 $T    8199
+  do_isspace_test 6.$T.13 $T    8200
+  do_isspace_test 6.$T.14 $T    8201
+  do_isspace_test 6.$T.15 $T    8202
+  do_isspace_test 6.$T.16 $T    8239
+  do_isspace_test 6.$T.17 $T    8287
+  do_isspace_test 6.$T.18 $T   12288
+
+  do_isspace_test 6.$T.19 $T   {32 160 5760 6158}
+  do_isspace_test 6.$T.20 $T   {8192 8193 8194 8195}
+  do_isspace_test 6.$T.21 $T   {8196 8197 8198 8199}
+  do_isspace_test 6.$T.22 $T   {8200 8201 8202 8239}
+  do_isspace_test 6.$T.23 $T   {8287 12288}
+}
+
+#-------------------------------------------------------------------------
+# Test that the private use ranges are treated as alphanumeric.
+#
+foreach {tn1 c} {
+  1 \ue000 2 \ue001 3 \uf000 4 \uf8fe 5 \uf8ff
+} {
+  foreach {tn2 config res} {
+    1 ""             "0 hello*world hello*world"
+    2 "separators *" "0 hello hello 1 world world"
+  } {
+    set config [string map [list * $c] $config]
+    set input  [string map [list * $c] "hello*world"]
+    set output [string map [list * $c] $res]
+    do_unicode_token_test3 7.$tn1.$tn2 {*}$config $input $output
+  }
+}
+
+#-------------------------------------------------------------------------
+# Cursory test of remove_diacritics: 1 (tests 8.1.*) versus 0 (tests 8.2.*).
+#
+# 00C4;LATIN CAPITAL LETTER A WITH DIAERESIS
+# 00D6;LATIN CAPITAL LETTER O WITH DIAERESIS
+# 00E4;LATIN SMALL LETTER A WITH DIAERESIS
+# 00F6;LATIN SMALL LETTER O WITH DIAERESIS
+#
+do_execsql_test 8.1.1 "
+  CREATE VIRTUAL TABLE t3 USING fts5(
+    content, tokenize='unicode61 remove_diacritics 1'
+  );
+  INSERT INTO t3 VALUES('o');
+  INSERT INTO t3 VALUES('a');
+  INSERT INTO t3 VALUES('O');
+  INSERT INTO t3 VALUES('A');
+  INSERT INTO t3 VALUES('\xD6');
+  INSERT INTO t3 VALUES('\xC4');
+  INSERT INTO t3 VALUES('\xF6');
+  INSERT INTO t3 VALUES('\xE4');
+"
+do_execsql_test 8.1.2 {
+  SELECT rowid FROM t3 WHERE t3 MATCH 'o' ORDER BY rowid ASC;
+} {1 3 5 7}
+do_execsql_test 8.1.3 {
+  SELECT rowid FROM t3 WHERE t3 MATCH 'a' ORDER BY rowid ASC;
+} {2 4 6 8}
+do_execsql_test 8.2.1 {
+  CREATE VIRTUAL TABLE t4 USING fts5(
+    content, tokenize='unicode61 remove_diacritics 0'
+  );
+  INSERT INTO t4 SELECT * FROM t3 ORDER BY rowid ASC;
+}
+do_execsql_test 8.2.2 {
+  SELECT rowid FROM t4 WHERE t4 MATCH 'o' ORDER BY rowid ASC;
+} {1 3}
+do_execsql_test 8.2.3 {
+  SELECT rowid FROM t4 WHERE t4 MATCH 'a' ORDER BY rowid ASC;
+} {2 4}
+
+#-------------------------------------------------------------------------
+# FTS4-specific tests (fts4, fts4aux, fts3tokenize), disabled here by "if 0".
+if 0 {
+foreach {tn sql} {
+  1 {
+    CREATE VIRTUAL TABLE t5 USING fts4(tokenize=unicode61 [tokenchars= .]);
+    CREATE VIRTUAL TABLE t6 USING fts4(
+        tokenize=unicode61 [tokenchars=="] "tokenchars=[]");
+    CREATE VIRTUAL TABLE t7 USING fts4(tokenize=unicode61 [separators=x\xC4]);
+  }
+  2 {
+    CREATE VIRTUAL TABLE t5 USING fts4(tokenize=unicode61 "tokenchars= .");
+    CREATE VIRTUAL TABLE t6 USING fts4(tokenize=unicode61 "tokenchars=[=""]");
+    CREATE VIRTUAL TABLE t7 USING fts4(tokenize=unicode61 "separators=x\xC4");
+  }
+  3 {
+    CREATE VIRTUAL TABLE t5 USING fts4(tokenize=unicode61 'tokenchars= .');
+    CREATE VIRTUAL TABLE t6 USING fts4(tokenize=unicode61 'tokenchars=="[]');
+    CREATE VIRTUAL TABLE t7 USING fts4(tokenize=unicode61 'separators=x\xC4');
+  }
+  4 {
+    CREATE VIRTUAL TABLE t5 USING fts4(tokenize=unicode61 `tokenchars= .`);
+    CREATE VIRTUAL TABLE t6 USING fts4(tokenize=unicode61 `tokenchars=[="]`);
+    CREATE VIRTUAL TABLE t7 USING fts4(tokenize=unicode61 `separators=x\xC4`);
+  }
+} {
+  do_execsql_test 9.$tn.0 { 
+    DROP TABLE IF EXISTS t5;
+    DROP TABLE IF EXISTS t5aux;
+    DROP TABLE IF EXISTS t6;
+    DROP TABLE IF EXISTS t6aux;
+    DROP TABLE IF EXISTS t7;
+    DROP TABLE IF EXISTS t7aux;
+  }
+  do_execsql_test 9.$tn.1 $sql
+
+  do_execsql_test 9.$tn.2 {
+    CREATE VIRTUAL TABLE t5aux USING fts4aux(t5);
+    INSERT INTO t5 VALUES('one two three/four.five.six');
+    SELECT * FROM t5aux;
+  } {
+    four.five.six   * 1 1 four.five.six   0 1 1 
+    {one two three} * 1 1 {one two three} 0 1 1
+  }
+
+  do_execsql_test 9.$tn.3 {
+    CREATE VIRTUAL TABLE t6aux USING fts4aux(t6);
+    INSERT INTO t6 VALUES('alpha=beta"gamma/delta[epsilon]zeta');
+    SELECT * FROM t6aux;
+  } {
+    {alpha=beta"gamma}   * 1 1 {alpha=beta"gamma} 0 1 1 
+    {delta[epsilon]zeta} * 1 1 {delta[epsilon]zeta} 0 1 1
+  }
+
+  do_execsql_test 9.$tn.4 {
+    CREATE VIRTUAL TABLE t7aux USING fts4aux(t7);
+    INSERT INTO t7 VALUES('alephxbeth\xC4gimel');
+    SELECT * FROM t7aux;
+  } {
+    aleph * 1 1 aleph 0 1 1 
+    beth  * 1 1 beth  0 1 1 
+    gimel * 1 1 gimel 0 1 1
+  }
+}
+
+# Check that multiple options are handled correctly.
+#
+do_execsql_test 10.1 {
+  DROP TABLE IF EXISTS t1;
+  CREATE VIRTUAL TABLE t1 USING fts4(tokenize=unicode61
+    "tokenchars=xyz" "tokenchars=.=" "separators=.=" "separators=xy"
+    "separators=a" "separators=a" "tokenchars=a" "tokenchars=a"
+  );
+
+  INSERT INTO t1 VALUES('oneatwoxthreeyfour');
+  INSERT INTO t1 VALUES('a.single=word');
+  CREATE VIRTUAL TABLE t1aux USING fts4aux(t1);
+  SELECT * FROM t1aux;
+} {
+  .single=word * 1 1 .single=word 0 1 1 
+  four         * 1 1 four         0 1 1 
+  one          * 1 1 one          0 1 1 
+  three        * 1 1 three        0 1 1 
+  two          * 1 1 two          0 1 1
+}
+
+# Test that case folding happens after tokenization, not before.
+#
+do_execsql_test 10.2 {
+  DROP TABLE IF EXISTS t2;
+  CREATE VIRTUAL TABLE t2 USING fts4(tokenize=unicode61 "separators=aB");
+  INSERT INTO t2 VALUES('oneatwoBthree');
+  INSERT INTO t2 VALUES('onebtwoAthree');
+  CREATE VIRTUAL TABLE t2aux USING fts4aux(t2);
+  SELECT * FROM t2aux;
+} {
+  one           * 1 1 one           0 1 1 
+  onebtwoathree * 1 1 onebtwoathree 0 1 1 
+  three         * 1 1 three         0 1 1 
+  two           * 1 1 two           0 1 1
+}
+
+# Test that the tokenchars and separators options work with the 
+# fts3tokenize table.
+#
+do_execsql_test 11.1 {
+  CREATE VIRTUAL TABLE ft1 USING fts3tokenize(
+    "unicode61", "tokenchars=@.", "separators=1234567890"
+  );
+  SELECT token FROM ft1 WHERE input = 'berlin@street123sydney.road';
+} {
+  berlin@street sydney.road
+}
+
+}
+
+finish_test
diff --git a/main.mk b/main.mk

index 58044218a726371c8acd1e4ca8fb9cf25229c0ea..7a26313b12eb1f6d11da7149d780f5cc7a776cb1 100644 (file)
--- a/main.mk
+++ b/main.mk
@@ -81,6 +81,7 @@ LIBOBJ += fts5_hash.o
  LIBOBJ += fts5_index.o
  LIBOBJ += fts5_storage.o
  LIBOBJ += fts5_tokenize.o
+LIBOBJ += fts5_unicode2.o
  LIBOBJ += fts5parse.o
  
  
@@ -616,6 +617,9 @@ fts5_storage.o:     $(TOP)/ext/fts5/fts5_storage.c $(HDR) $(EXTHDR)
  fts5_tokenize.o:       $(TOP)/ext/fts5/fts5_tokenize.c $(HDR) $(EXTHDR)
         $(TCCX) -DSQLITE_CORE -c $(TOP)/ext/fts5/fts5_tokenize.c
  
+fts5_unicode2.o:       $(TOP)/ext/fts5/fts5_unicode2.c $(HDR) $(EXTHDR)
+       $(TCCX) -DSQLITE_CORE -c $(TOP)/ext/fts5/fts5_unicode2.c
+
  fts5parse.c:   $(TOP)/ext/fts5/fts5parse.y lemon 
         cp $(TOP)/ext/fts5/fts5parse.y .
         rm -f fts5parse.h
diff --git a/manifest b/manifest

index d8bf03435a28fe6bee8c46282d0ad4eb695f20cc..49fabbb73ca3a2c767eaaf39b067b1bb7b684d32 100644 (file)
--- a/manifest
+++ b/manifest
@@ -1,5 +1,5 @@
-C Move\sall\sfts5\stest\sfiles\sto\snew\sdirectory\s"ext/fts5/test".
-D 2014-12-29T15:59:36.706
+C Add\sa\sversion\sof\sthe\sunicode61\stokenizer\sto\sfts5.
+D 2015-01-01T16:46:10.851
  F Makefile.arm-wince-mingw32ce-gcc d6df77f1f48d690bd73162294bbba7f59507c72f
  F Makefile.in b03432313a3aad96c706f8164fb9f5307eaf19f5
  F Makefile.linux-gcc 91d710bdc4998cb015f39edf3cb314ec4f4d7e23
@@ -102,7 +102,7 @@ F ext/fts3/mkfts3amal.tcl 252ecb7fe6467854f2aa237bf2c390b74e71f100
  F ext/fts3/tool/fts3view.c 6cfc5b67a5f0e09c0d698f9fd012c784bfaa9197
  F ext/fts3/unicode/CaseFolding.txt 8c678ca52ecc95e16bc7afc2dbf6fc9ffa05db8c
  F ext/fts3/unicode/UnicodeData.txt cd07314edb62d49fde34debdaf92fa2aa69011e7
-F ext/fts3/unicode/mkunicode.tcl dc6f268eb526710e2c6e496c372471d773d0c368
+F ext/fts3/unicode/mkunicode.tcl 2fa92b916b17ee0fc94129d36969972d463bc016
  F ext/fts5/extract_api_docs.tcl 6320db4a1d0722a4e2069e661381ad75e9889786
  F ext/fts5/fts5.c 37e124e24e5860f9842e5f3ee22129a786c0fd74
  F ext/fts5/fts5.h 4f9d2c477c0ee1907164642471329a82cb6b203b
@@ -114,26 +114,29 @@ F ext/fts5/fts5_expr.c 27d3d2deebae277c34ae2bb3d501dd879c442ba5
  F ext/fts5/fts5_hash.c 63fa8379c5f2ac107d47c2b7d9ac04c95ef8a279
  F ext/fts5/fts5_index.c 4a8e8535b4303400ddb5f6fb08152da0d88ebf6f
  F ext/fts5/fts5_storage.c 13794781977c9a624eb8bd7b9509de241e405853
-F ext/fts5/fts5_tcl.c ce11e46589986b957b89809aabd3936d898d501b
-F ext/fts5/fts5_tokenize.c 5d6e785345b0d87d174fcc0653bfacd0d9fd7f2e
+F ext/fts5/fts5_tcl.c 664e710e2bbeed505cb91848772ca7538623a67f
+F ext/fts5/fts5_tokenize.c 5a0ad46408d09bcda2bf0addb5af42fdb75ebabb
+F ext/fts5/fts5_unicode2.c 9c7dd640d1f014bf5c3ee029759adfbb4d7e95a9
  F ext/fts5/fts5parse.y 777da8e5819f75c217982c79c29d014c293acac9
-F ext/fts5/test/fts5aa.test 01fff9cf4e75c33871dd121d6adae33b609542cf w test/fts5aa.test
-F ext/fts5/test/fts5ab.test 7a58a954cae2ae50cef3ee525c57bc8eb3eb50b3 w test/fts5ab.test
-F ext/fts5/test/fts5ac.test d3de838f48d2ac8c26386832f6d93a3a3dbb5d4b w test/fts5ac.test
-F ext/fts5/test/fts5ad.test a8311d6ce46964fa1686937793dd81d284317324 w test/fts5ad.test
-F ext/fts5/test/fts5ae.test e576e646013489ce458a5b276caa787035efb175 w test/fts5ae.test
-F ext/fts5/test/fts5af.test 7e4c679bc6337ddcde6a3c9b9d81c81d2f7e77bd w test/fts5af.test
-F ext/fts5/test/fts5ag.test c79ee7707d120b79869fa2ac1538639b9fa1b997 w test/fts5ag.test
-F ext/fts5/test/fts5ah.test e510c741e9833d6335c87bef2e7f93fecfcc7c1d w test/fts5ah.test
-F ext/fts5/test/fts5ai.test 6a22f43776e1612591392721b535ca28d2c1a19f w test/fts5ai.test
-F ext/fts5/test/fts5aj.test 1a64ab4144f54bd12a520683950bf8460dd74fb3 w test/fts5aj.test
-F ext/fts5/test/fts5ak.test df2669fb76684f03d03918dfb2cf692012251b1f w test/fts5ak.test
-F ext/fts5/test/fts5al.test c055f1d682f931b8ea6c6e6251d90925f2aa55a1 w test/fts5al.test
-F ext/fts5/test/fts5auxdata.test fec4c9113176d351e567eab65fe9917e5ea0ab05 w ext/fts5/fts5auxdata.test
-F ext/fts5/test/fts5ea.test 0ef2c89e14c6360ad3905fae44409420d6b5a5c8 w test/fts5ea.test
-F ext/fts5/test/fts5fault1.test b95ed600b88bbbce5390f9097a5a5b7b01b3b9f7 w test/fts5fault1.test
-F ext/fts5/test/fts5porter.test d8f7591b733bcc1f02ca0dd313bc891a4b289562 w ext/fts5/fts5porter.test
-F ext/fts5/test/fts5tokenizer.test a1f3128e0d42c93632122c76cbe0d07a901591ca w ext/fts5/fts5tokenizer.test
+F ext/fts5/test/fts5aa.test 01fff9cf4e75c33871dd121d6adae33b609542cf
+F ext/fts5/test/fts5ab.test 7a58a954cae2ae50cef3ee525c57bc8eb3eb50b3
+F ext/fts5/test/fts5ac.test d3de838f48d2ac8c26386832f6d93a3a3dbb5d4b
+F ext/fts5/test/fts5ad.test a8311d6ce46964fa1686937793dd81d284317324
+F ext/fts5/test/fts5ae.test e576e646013489ce458a5b276caa787035efb175
+F ext/fts5/test/fts5af.test 7e4c679bc6337ddcde6a3c9b9d81c81d2f7e77bd
+F ext/fts5/test/fts5ag.test c79ee7707d120b79869fa2ac1538639b9fa1b997
+F ext/fts5/test/fts5ah.test e510c741e9833d6335c87bef2e7f93fecfcc7c1d
+F ext/fts5/test/fts5ai.test 6a22f43776e1612591392721b535ca28d2c1a19f
+F ext/fts5/test/fts5aj.test 1a64ab4144f54bd12a520683950bf8460dd74fb3
+F ext/fts5/test/fts5ak.test df2669fb76684f03d03918dfb2cf692012251b1f
+F ext/fts5/test/fts5al.test c055f1d682f931b8ea6c6e6251d90925f2aa55a1
+F ext/fts5/test/fts5auxdata.test fec4c9113176d351e567eab65fe9917e5ea0ab05
+F ext/fts5/test/fts5ea.test 0ef2c89e14c6360ad3905fae44409420d6b5a5c8
+F ext/fts5/test/fts5fault1.test b95ed600b88bbbce5390f9097a5a5b7b01b3b9f7
+F ext/fts5/test/fts5porter.test d8f7591b733bcc1f02ca0dd313bc891a4b289562
+F ext/fts5/test/fts5tokenizer.test a1f3128e0d42c93632122c76cbe0d07a901591ca
+F ext/fts5/test/fts5unicode.test b9c7bb982e0ee242a0774e636e1888ca32947a83
+F ext/fts5/test/fts5unicode2.test 7b0d64bbb7bfb7b5080e032e068404b42432ee02
  F ext/icu/README.txt d9fbbad0c2f647c3fdf715fc9fd64af53aedfc43
  F ext/icu/icu.c d415ccf984defeb9df2c0e1afcfaa2f6dc05eacb
  F ext/icu/sqliteicu.h 728867a802baa5a96de7495e9689a8e01715ef37
@@ -177,7 +180,7 @@ F ext/rtree/viewrtree.tcl eea6224b3553599ae665b239bd827e182b466024
  F install-sh 9d4de14ab9fb0facae2f48780b874848cbf2f895 x
  F ltmain.sh 3ff0879076df340d2e23ae905484d8c15d5fdea8
  F magic.txt 8273bf49ba3b0c8559cb2774495390c31fd61c60
-F main.mk 863a6f5cdcc3a47a9dcbedc9af37d3c0d4172935
+F main.mk 602303f3596d10237f25da030ee1d96065e2e5a8
  F mkopcodec.awk c2ff431854d702cdd2d779c9c0d1f58fa16fa4ea
  F mkopcodeh.awk c6b3fa301db6ef7ac916b14c60868aeaec1337b5
  F mkso.sh fd21c06b063bb16a5d25deea1752c2da6ac3ed83
@@ -1212,7 +1215,7 @@ F tool/vdbe_profile.tcl 67746953071a9f8f2f668b73fe899074e2c6d8c1
  F tool/warnings-clang.sh f6aa929dc20ef1f856af04a730772f59283631d4
  F tool/warnings.sh 0abfd78ceb09b7f7c27c688c8e3fe93268a13b32
  F tool/win/sqlite.vsix deb315d026cc8400325c5863eef847784a219a2f
-P b33fe0dd89f3180c209fa1f9e75d0a7acab12b8e
-R c65f16b94aeceea9cda28cb8f092d4a9
+P 7f148edb30103c5f4fee20cd08e38537f9615bf2
+R d01caf1e8e04bd7c1b6e26fb465c90b6
  U dan
-Z 822a98c34fd542b912bf890d737a0e9f
+Z 5c3f4d7bf4502327dfa6eb630b5a26ec
diff --git a/manifest.uuid b/manifest.uuid

index 7b2535c49c2bc20c0f1a82234754e572355cd410..f67937770d003c4f4030d07f36c0c7d17a867442 100644 (file)
--- a/manifest.uuid
+++ b/manifest.uuid
@@ -1 +1 @@
-7f148edb30103c5f4fee20cd08e38537f9615bf2
-\ No newline at end of file
+d09f7800cf14f73ea86d037107ef80295b2c173a
+\ No newline at end of file
author	dan <dan@noemail.net>
	Thu, 1 Jan 2015 16:46:10 +0000 (16:46 +0000)
committer	dan <dan@noemail.net>
	Thu, 1 Jan 2015 16:46:10 +0000 (16:46 +0000)
ext/fts3/unicode/mkunicode.tcl		patch \| blob \| blame \| history
ext/fts5/fts5_tcl.c		patch \| blob \| blame \| history
ext/fts5/fts5_tokenize.c		patch \| blob \| blame \| history
ext/fts5/fts5_unicode2.c	[new file with mode: 0644]	patch \| blob
ext/fts5/test/fts5unicode.test	[new file with mode: 0644]	patch \| blob
ext/fts5/test/fts5unicode2.test	[new file with mode: 0644]	patch \| blob
main.mk		patch \| blob \| blame \| history
manifest		patch \| blob \| blame \| history
manifest.uuid		patch \| blob \| blame \| history