int sqlite3Fts3DeferredTokenList(Fts3DeferredToken *, char **, int *);
/* fts3_unicode2.c (functions generated by parsing unicode text files) */
-int sqlite3FtsUnicodeTolower(int);
+#ifndef SQLITE_DISABLE_FTS3_UNICODE
+int sqlite3FtsUnicodeFold(int, int);
int sqlite3FtsUnicodeIsalnum(int);
+int sqlite3FtsUnicodeIsdiacritic(int);
+#endif
#endif /* !SQLITE_CORE || SQLITE_ENABLE_FTS3 */
#endif /* _FTSINT_H */
+/* Tokenizer instance: the generic tokenizer header plus per-instance options. */
struct unicode_tokenizer {
sqlite3_tokenizer base;
+ int bRemoveDiacritic; /* Non-zero to strip diacritics; defaults to 1, set by "remove_diacritics=N" */
};
struct unicode_cursor {
sqlite3_tokenizer **pp /* OUT: New tokenizer handle */
){
unicode_tokenizer *pNew; /* New tokenizer object */
+ int i; /* Index into azArg[] */
pNew = (unicode_tokenizer *) sqlite3_malloc(sizeof(unicode_tokenizer));
if( pNew==NULL ){
return SQLITE_NOMEM;
}
memset(pNew, 0, sizeof(unicode_tokenizer));
+ /* Diacritics are stripped unless "remove_diacritics=0" is given. */
+ pNew->bRemoveDiacritic = 1;
+
+ /* Every argument must be exactly one of the recognized option strings. */
+ for(i=0; i<nArg; i++){
+ const char *z = azArg[i];
+ int n = (int)strlen(z);
+
+ if( n==19 && memcmp("remove_diacritics=1", z, 19)==0 ){
+ pNew->bRemoveDiacritic = 1;
+ }
+ else if( n==19 && memcmp("remove_diacritics=0", z, 19)==0 ){
+ pNew->bRemoveDiacritic = 0;
+ }
+ else{
+ /* Unrecognized argument. *pp has not been set yet, so the caller has
+ ** no handle through which to destroy pNew: free it here, otherwise
+ ** the allocation is leaked. */
+ sqlite3_free(pNew);
+ return SQLITE_ERROR;
+ }
+ }
+
*pp = &pNew->base;
return SQLITE_OK;
}
zOut = pCsr->zToken;
do {
+ int iOut;
+
/* Grow the output buffer if required. */
if( (zOut-pCsr->zToken)>=(pCsr->nAlloc-4) ){
char *zNew = sqlite3_realloc(pCsr->zToken, pCsr->nAlloc+64);
/* Write the folded case of the last character read to the output */
zEnd = z;
- WRITE_UTF8(zOut, sqlite3FtsUnicodeTolower(iCode));
+ iOut = sqlite3FtsUnicodeFold(iCode,
+ ((unicode_tokenizer *)pCsr->base.pTokenizer)->bRemoveDiacritic
+ );
+ if( iOut ){
+ WRITE_UTF8(zOut, iOut);
+ }
/* If the cursor is not at EOF, read the next character */
if( z>=zTerm ) break;
READ_UTF8(z, zTerm, iCode);
- }while( sqlite3FtsUnicodeIsalnum(iCode) );
+ }while( sqlite3FtsUnicodeIsalnum(iCode)
+ || sqlite3FtsUnicodeIsdiacritic(iCode)
+ );
/* Set the output variables and return. */
pCsr->iOff = (z - pCsr->aInput);
}
+/*
+** If the argument is a codepoint corresponding to a lowercase letter
+** in the ASCII range with a diacritic added, return the codepoint
+** of the ASCII letter only. For example, if passed 235 - "LATIN
+** SMALL LETTER E WITH DIAERESIS" - return 101 ("LATIN SMALL LETTER
+** E"). The results of passing a codepoint that corresponds to an
+** uppercase letter are undefined.
+**
+** Codepoints whose aChar[] entry is '\0' are mapped to 0. Any codepoint
+** that is not covered by the tables is returned unchanged.
+**
+** Each aDia[] entry packs a range of codepoints as
+** (first codepoint << 3) | (range length - 1), and aChar[] holds the
+** replacement for the range stored at the same index. aDia[] is sorted
+** in ascending order so that it can be binary searched.
+*/
+static int remove_diacritic(int c){
+  static const unsigned short aDia[] = {
+        0,  1797,  1848,  1859,  1891,  1928,  1940,  1995,
+     2024,  2040,  2060,  2110,  2168,  2206,  2264,  2286,
+     2344,  2383,  2472,  2488,  2516,  2596,  2668,  2732,
+     2782,  2842,  2894,  2954,  2984,  3000,  3028,  3336,
+     3456,  3696,  3712,  3728,  3744,  3896,  3912,  3928,
+     3968,  4008,  4040,  4106,  4138,  4170,  4202,  4234,
+     4266,  4296,  4312,  4344,  4408,  4424,  4472,  4504,
+     6148,  6198,  6264,  6280,  6360,  6429,  6505,  6529,
+    61448, 61468, 61534, 61592, 61642, 61688, 61704, 61726,
+    61784, 61800, 61836, 61880, 61914, 61948, 61998, 62122,
+    62154, 62200, 62218, 62302, 62364, 62442, 62478, 62536,
+    62554, 62584, 62604, 62640, 62648, 62656, 62664, 62730,
+    62924, 63050, 63082, 63274, 63390,
+  };
+  static const char aChar[] = {
+    '\0', 'a',  'c',  'e',  'i',  'n',  'o',  'u',  'y',  'y',  'a',  'c',
+    'd',  'e',  'e',  'g',  'h',  'i',  'j',  'k',  'l',  'n',  'o',  'r',
+    's',  't',  'u',  'u',  'w',  'y',  'z',  'o',  'u',  'a',  'i',  'o',
+    'u',  'g',  'k',  'o',  'j',  'g',  'n',  'a',  'e',  'i',  'o',  'r',
+    'u',  's',  't',  'h',  'a',  'e',  'o',  'y',  '\0', '\0', '\0', '\0',
+    '\0', '\0', '\0', '\0', 'a',  'b',  'd',  'd',  'e',  'f',  'g',  'h',
+    'h',  'i',  'k',  'l',  'l',  'm',  'n',  'p',  'r',  'r',  's',  't',
+    'u',  'v',  'w',  'w',  'x',  'y',  'z',  'h',  't',  'w',  'y',  'a',
+    'e',  'i',  'o',  'u',  'y',
+  };
+
+  /* Setting the low three bits makes key compare >= every table entry
+  ** whose first codepoint is <= c, whatever length that entry encodes. */
+  unsigned int key = (((unsigned int)c)<<3) | 0x00000007;
+  int iRes = 0;
+  int iHi = sizeof(aDia)/sizeof(aDia[0]) - 1;
+  int iLo = 0;
+
+  /* Binary search for the last entry that starts at or before c. */
+  while( iHi>=iLo ){
+    int iTest = (iHi + iLo) / 2;
+    if( key >= aDia[iTest] ){
+      iRes = iTest;
+      iLo = iTest+1;
+    }else{
+      iHi = iTest-1;
+    }
+  }
+  assert( key>=aDia[iRes] );
+
+  /* c is replaced only if it lies inside the range found above. */
+  return ((c > (aDia[iRes]>>3) + (aDia[iRes]&0x07)) ? c : (int)aChar[iRes]);
+}
+
+
+/*
+** Return true if the argument interpreted as a unicode codepoint
+** is a diacritical modifier character.
+**
+** The recognized codepoints all lie in the range 768..817. They are
+** recorded in two 32-bit masks: bit N of mask0 stands for codepoint
+** 768+N and bit N of mask1 stands for codepoint 800+N.
+*/
+int sqlite3FtsUnicodeIsdiacritic(int c){
+  unsigned int mask0 = 0x08029FDF;
+  unsigned int mask1 = 0x000361F8;
+  if( c<768 || c>817 ) return 0;
+  /* Shift an unsigned 1: (c-768) may be 31, and shifting a signed 1
+  ** into the sign bit is undefined behaviour. */
+  return (c < 768+32) ?
+      (mask0 & ((unsigned int)1 << (c-768))) :
+      (mask1 & ((unsigned int)1 << (c-768-32)));
+}
+
+
/*
** Interpret the argument as a unicode codepoint. If the codepoint
** is an upper case character that has a lower case equivalent,
** The results are undefined if the value passed to this function
** is less than zero.
*/
-int sqlite3FtsUnicodeTolower(int c){
+int sqlite3FtsUnicodeFold(int c, int bRemoveDiacritic){
/* Each entry in the following array defines a rule for folding a range
** of codepoints to lower case. The rule applies to a range of nRange
** codepoints starting at codepoint iCode.
assert( ret>0 );
}
}
+
+ if( bRemoveDiacritic ) ret = remove_diacritic(ret);
}
else if( c>=66560 && c<66600 ){
+#
+# Parameter $zName must be a path to the file UnicodeData.txt. This command
+# reads the file and returns a list of mappings required to remove all
+# diacritical marks from a unicode string. Each mapping is itself a list
+# consisting of two elements - the unicode codepoint and the single ASCII
+# character that it should be replaced with, or an empty string if the
+# codepoint should simply be removed from the input. Examples:
+#
+# { 224 a } (replace codepoint 224 with "a")
+# { 769 "" } (remove codepoint 769 from input)
+#
+# Mappings are only returned for non-upper case codepoints. It is assumed
+# that the input has already been folded to lower case.
+#
+# The returned list is sorted by codepoint. The global array tl_lookup_table
+# is only read, never modified: any codepoint that has an entry in it is
+# skipped.
+#
+proc rd_load_unicodedata_text {zName} {
+ global tl_lookup_table
+
+ set fd [open $zName]
+ # Names of the 15 semicolon-separated fields of a UnicodeData.txt line,
+ # in the order in which they appear.
+ set lField {
+ code
+ character_name
+ general_category
+ canonical_combining_classes
+ bidirectional_category
+ character_decomposition_mapping
+ decimal_digit_value
+ digit_value
+ numeric_value
+ mirrored
+ unicode_1_name
+ iso10646_comment_field
+ uppercase_mapping
+ lowercase_mapping
+ titlecase_mapping
+ }
+ set lRet [list]
+
+ while { ![eof $fd] } {
+ set line [gets $fd]
+ if {$line == ""} continue
+
+ set fields [split $line ";"]
+ if {[llength $fields] != [llength $lField]} { error "parse error: $line" }
+ # Assign each field to the local variable named after it in $lField.
+ foreach $lField $fields {}
+ # Only decompositions made of exactly two elements, the first of which
+ # is a hexadecimal codepoint, are of interest. Everything else is ignored.
+ if { [llength $character_decomposition_mapping]!=2
+ || [string is xdigit [lindex $character_decomposition_mapping 0]]==0
+ } {
+ continue
+ }
+
+ set iCode [expr "0x$code"]
+ set iAscii [expr "0x[lindex $character_decomposition_mapping 0]"]
+ set iDia [expr "0x[lindex $character_decomposition_mapping 1]"]
+
+ # Codepoints present in tl_lookup_table get no mapping of their own.
+ if {[info exists tl_lookup_table($iCode)]} continue
+
+ # Keep the mapping only if the first decomposition element is an ASCII
+ # letter (a-z or A-Z). The letter is stored in lower case and the second
+ # decomposition element is remembered as a diacritic codepoint.
+ if { ($iAscii >= 97 && $iAscii <= 122)
+ || ($iAscii >= 65 && $iAscii <= 90)
+ } {
+ lappend lRet [list $iCode [string tolower [format %c $iAscii]]]
+ set dia($iDia) 1
+ }
+ }
+
+ # Every codepoint seen as a diacritic is mapped to the empty string.
+ foreach d [array names dia] {
+ lappend lRet [list $d ""]
+ }
+ set lRet [lsort -integer -index 0 $lRet]
+
+ close $fd
+ set lRet
+}
+
+
+# Print the C function remove_diacritic() to stdout.
+#
+# Parameter $map is a list of {codepoint replacement} pairs sorted by
+# codepoint, as returned by [rd_load_unicodedata_text]. Consecutive pairs
+# that share a replacement are merged into one range of at most 8
+# codepoints, provided that every codepoint skipped inside the range has an
+# entry in the global tl_lookup_table array. Each range is emitted as the
+# integer (first codepoint << 3) + (range length - 1) in aDia[], with its
+# replacement character at the same index in aChar[].
+#
+proc print_rd {map} {
+  global tl_lookup_table
+  set aChar [list]
+  set lRange [list]
+
+  set nRange 1
+  set iFirst [lindex $map 0 0]
+  set cPrev [lindex $map 0 1]
+
+  foreach m [lrange $map 1 end] {
+    foreach {i c} $m {}
+
+    if {$cPrev == $c} {
+      # Any gap between the current range and $i must consist solely of
+      # codepoints found in tl_lookup_table, or the range cannot be extended.
+      for {set j [expr $iFirst+$nRange]} {$j<$i} {incr j} {
+        if {[info exists tl_lookup_table($j)]==0} break
+      }
+
+      if {$j==$i} {
+        # The range length is stored in 3 bits as (length - 1): 8 at most.
+        set nNew [expr {(1 + $i - $iFirst)}]
+        if {$nNew<=8} {
+          set nRange $nNew
+          continue
+        }
+      }
+    }
+
+    lappend lRange [list $iFirst $nRange]
+    lappend aChar $cPrev
+
+    set iFirst $i
+    set cPrev $c
+    set nRange 1
+  }
+  lappend lRange [list $iFirst $nRange]
+  lappend aChar $cPrev
+
+  puts "/*"
+  puts "** If the argument is a codepoint corresponding to a lowercase letter"
+  puts "** in the ASCII range with a diacritic added, return the codepoint"
+  puts "** of the ASCII letter only. For example, if passed 235 - \"LATIN"
+  puts "** SMALL LETTER E WITH DIAERESIS\" - return 101 (\"LATIN SMALL LETTER"
+  puts "** E\"). The results of passing a codepoint that corresponds to an"
+  puts "** uppercase letter are undefined."
+  puts "*/"
+  puts "static int remove_diacritic(int c)\{"
+  puts " static const unsigned short aDia\[\] = \{"
+  puts -nonewline " 0, "
+  set i 1
+  foreach r $lRange {
+    foreach {iCode nRange} $r {}
+    if {($i % 8)==0} {puts "" ; puts -nonewline " " }
+    incr i
+
+    puts -nonewline [format "%5d" [expr ($iCode<<3) + $nRange-1]]
+    puts -nonewline ", "
+  }
+  puts ""
+  puts " \};"
+  puts " static const char aChar\[\] = \{"
+  puts -nonewline " '\\0', "
+  set i 1
+  foreach c $aChar {
+    set str "'$c', "
+    if {$c == ""} { set str "'\\0', " }
+
+    if {($i % 12)==0} {puts "" ; puts -nonewline " " }
+    incr i
+    puts -nonewline "$str"
+  }
+  puts ""
+  puts " \};"
+  puts {
+ unsigned int key = (((unsigned int)c)<<3) | 0x00000007;
+ int iRes = 0;
+ int iHi = sizeof(aDia)/sizeof(aDia[0]) - 1;
+ int iLo = 0;
+ while( iHi>=iLo ){
+ int iTest = (iHi + iLo) / 2;
+ if( key >= aDia[iTest] ){
+ iRes = iTest;
+ iLo = iTest+1;
+ }else{
+ iHi = iTest-1;
+ }
+ }
+ assert( key>=aDia[iRes] );
+ return ((c > (aDia[iRes]>>3) + (aDia[iRes]&0x07)) ? c : (int)aChar[iRes]);}
+  # Close the C function body. No ';' here: a stray semicolon after a
+  # function definition is not valid ISO C at file scope.
+  puts "\}"
+}
+
+# Print to stdout a C function named $zFunc that returns non-zero if its
+# argument is one of the codepoints that $map maps to the empty string.
+# Codepoint 0 is never included.
+#
+# The codepoints are encoded as two 32-bit masks relative to the smallest
+# one, so this assumes that they span fewer than 64 consecutive values.
+#
+proc print_isdiacritic {zFunc map} {
+
+  set lCode [list]
+  foreach m $map {
+    foreach {code char} $m {}
+    if {$code && $char == ""} { lappend lCode $code }
+  }
+  set lCode [lsort -integer $lCode]
+  set iFirst [lindex $lCode 0]
+  set iLast [lindex $lCode end]
+
+  set i1 0
+  set i2 0
+
+  # Bit N of i1 stands for codepoint iFirst+N, bit N of i2 for iFirst+32+N.
+  foreach c $lCode {
+    set i [expr $c - $iFirst]
+    if {$i < 32} {
+      set i1 [expr {$i1 | (1<<$i)}]
+    } else {
+      set i2 [expr {$i2 | (1<<($i-32))}]
+    }
+  }
+
+  puts "/*"
+  puts "** Return true if the argument interpreted as a unicode codepoint"
+  puts "** is a diacritical modifier character."
+  puts "*/"
+  puts "int ${zFunc}\(int c)\{"
+  puts " unsigned int mask0 = [format "0x%08X" $i1];"
+  puts " unsigned int mask1 = [format "0x%08X" $i2];"
+
+  puts " if( c<$iFirst || c>$iLast ) return 0;"
+  puts " return (c < $iFirst+32) ?"
+  # Emit an unsigned shift: the shift count can reach 31, and in C shifting
+  # a signed 1 into the sign bit is undefined behaviour.
+  puts " (mask0 & ((unsigned int)1 << (c-$iFirst))) :"
+  puts " (mask1 & ((unsigned int)1 << (c-$iFirst-32)));"
+  puts "\}"
+}
+
+
+#-------------------------------------------------------------------------
# Parameter $zName must be a path to the file UnicodeData.txt. This command
# reads the file and returns a list of codepoints (integers). The list
}
-proc print_tolower {zFunc} {
+proc print_fold {zFunc} {
set lRecord [tl_create_records]
puts "** The results are undefined if the value passed to this function"
puts "** is less than zero."
puts "*/"
- puts "int ${zFunc}\(int c)\{"
+ puts "int ${zFunc}\(int c, int bRemoveDiacritic)\{"
set liOff [tl_generate_ioff_table $lRecord]
tl_print_table_header
assert( ret>0 );
}
}
+
+ if( bRemoveDiacritic ) ret = remove_diacritic(ret);
}
}
puts "\}"
}
-proc print_tolower_test {zFunc} {
+proc print_fold_test {zFunc mappings} {
global tl_lookup_table
- puts "static int tolower_test(int *piCode)\{"
+ foreach m $mappings {
+ set c [lindex $m 1]
+ if {$c == ""} {
+ set extra([lindex $m 0]) 0
+ } else {
+ scan $c %c i
+ set extra([lindex $m 0]) $i
+ }
+ }
+
+ puts "static int fold_test(int *piCode)\{"
puts -nonewline " static int aLookup\[\] = \{"
for {set i 0} {$i < 70000} {incr i} {
+
set expected $i
catch { set expected $tl_lookup_table($i) }
- if {($i % 8)==0} { puts "" ; puts -nonewline " " }
- puts -nonewline "$expected, "
+ set expected2 $expected
+ catch { set expected2 $extra($expected2) }
+
+ if {($i % 4)==0} { puts "" ; puts -nonewline " " }
+ puts -nonewline "$expected, $expected2, "
}
puts " \};"
puts " int i;"
puts " for(i=0; i<sizeof(aLookup)/sizeof(aLookup\[0\]); i++)\{"
- puts " if( ${zFunc}\(i)!=aLookup\[i\] )\{"
- puts " *piCode = i;"
+ puts " int iCode = (i/2);"
+ puts " int bFlag = i & 0x0001;"
+ puts " if( ${zFunc}\(iCode, bFlag)!=aLookup\[i\] )\{"
+ puts " *piCode = iCode;"
puts " return 1;"
puts " \}"
puts " \}"
puts " r1 = isalnum_test(&code);"
puts " if( r1 ) printf(\"isalnum(): Problem with code %d\\n\",code);"
puts " else printf(\"isalnum(): test passed\\n\");"
- puts " r2 = tolower_test(&code);"
- puts " if( r2 ) printf(\"tolower(): Problem with code %d\\n\",code);"
- puts " else printf(\"tolower(): test passed\\n\");"
+ puts " r2 = fold_test(&code);"
+ puts " if( r2 ) printf(\"fold(): Problem with code %d\\n\",code);"
+ puts " else printf(\"fold(): test passed\\n\");"
puts " return (r1 || r2);"
puts "\}"
}
set casefolding.txt [lindex $argv end-1]
set generate_test_code [expr {[llength $argv]==3}]
+print_fileheader
+
# Print the isalnum() function to stdout.
#
-print_fileheader
set lRange [an_load_separator_ranges]
print_isalnum sqlite3FtsUnicodeIsalnum $lRange
puts ""
puts ""
-# Print the tolower() function to stdout.
-#
+# Load the fold data. This is used by the [rd_XXX] commands
+# as well as [print_fold].
tl_load_casefolding_txt ${casefolding.txt}
-print_tolower sqlite3FtsUnicodeTolower
+
+set mappings [rd_load_unicodedata_text ${unicodedata.txt}]
+print_rd $mappings
+puts ""
+puts ""
+print_isdiacritic sqlite3FtsUnicodeIsdiacritic $mappings
+puts ""
+puts ""
+
+# Print the fold() function to stdout.
+#
+print_fold sqlite3FtsUnicodeFold
# Print the test routines and main() function to stdout, if -test
# was specified.
#
if {$::generate_test_code} {
print_test_isalnum sqlite3FtsUnicodeIsalnum $lRange
- print_tolower_test sqlite3FtsUnicodeTolower
+ print_fold_test sqlite3FtsUnicodeFold $mappings
print_test_main
}
-C Avoid\sresetting\sthe\sshared-cache\sschema\swhen\son\sof\sthe\sconnections\susing\nthe\sshared\scache\scloses.\s\sDelay\sresetting\sthe\sschema\suntil\sthe\slast\sconnection\ncloses.
-D 2012-06-06T19:01:13.928
+C Have\sthe\sFTS\sunicode61\sstrip\sout\sdiacritics\swhen\stokenizing\stext.\sThis\scan\sbe\sdisabled\sby\sspecifying\sthe\stokenizer\soption\s"remove_diacritics=0".
+D 2012-06-06T19:30:38.602
F Makefile.arm-wince-mingw32ce-gcc d6df77f1f48d690bd73162294bbba7f59507c72f
F Makefile.in 4f37eb61be9d38643cdd839a74b8e3bad724cfcf
F Makefile.linux-gcc 91d710bdc4998cb015f39edf3cb314ec4f4d7e23
F ext/fts3/README.txt 8c18f41574404623b76917b9da66fcb0ab38328d
F ext/fts3/fts3.c 41824d0db7d244ca335ce98162df1244863a05c4
F ext/fts3/fts3.h 3a10a0af180d502cecc50df77b1b22df142817fe
-F ext/fts3/fts3Int.h 7b163fa22e7a625c404c424f2779a4d7b14c14ad
+F ext/fts3/fts3Int.h 11c711068474ffe66548d21a2a8498b3dea25348
F ext/fts3/fts3_aux.c 5205182bd8f372782597888156404766edf5781e
F ext/fts3/fts3_expr.c dbc7ba4c3a6061adde0f38ed8e9b349568299551
F ext/fts3/fts3_hash.c 8dd2d06b66c72c628c2732555a32bc0943114914
F ext/fts3/fts3_tokenizer.c 3da7254a9881f7e270ab28e2004e0d22b3212bce
F ext/fts3/fts3_tokenizer.h 66dec98e365854b6cd2d54f1a96bb6d428fc5a68
F ext/fts3/fts3_tokenizer1.c 5c98225a53705e5ee34824087478cf477bdb7004
-F ext/fts3/fts3_unicode.c 76b6f6fe6e86acd75b08272502fae74a13cef310
-F ext/fts3/fts3_unicode2.c 3ddf1728a396a03b5a73ff0f11ecfd2009de117d
+F ext/fts3/fts3_unicode.c a3c1b0780f764c75844bd13afd9fba139049a121
+F ext/fts3/fts3_unicode2.c 6381bcfd621b2800df134a560737eaa1ed07cb17
F ext/fts3/fts3_write.c 6a6391d6b01114f885e24e1f66bbc11ffba0e9e2
F ext/fts3/fts3speed.tcl b54caf6a18d38174f1a6e84219950d85e98bb1e9
F ext/fts3/mkfts3amal.tcl 252ecb7fe6467854f2aa237bf2c390b74e71f100
F ext/fts3/tool/fts3view.c 6cfc5b67a5f0e09c0d698f9fd012c784bfaa9197
F ext/fts3/unicode/CaseFolding.txt 8c678ca52ecc95e16bc7afc2dbf6fc9ffa05db8c
F ext/fts3/unicode/UnicodeData.txt cd07314edb62d49fde34debdaf92fa2aa69011e7
-F ext/fts3/unicode/mkunicode.tcl 2029991cc2cd0bf71df12768578a29c852bf54d1
+F ext/fts3/unicode/mkunicode.tcl 7a9bc018e2962abb79563c5a39fe581fcbf2f675
F ext/icu/README.txt bf8461d8cdc6b8f514c080e4e10dc3b2bbdfefa9
F ext/icu/icu.c eb9ae1d79046bd7871aa97ee6da51eb770134b5a
F ext/icu/sqliteicu.h 728867a802baa5a96de7495e9689a8e01715ef37
F test/fts4merge.test c424309743fdd203f8e56a1f1cd7872cd66cc0ee
F test/fts4merge2.test 5faa558d1b672f82b847d2a337465fa745e46891
F test/fts4merge3.test aab02a09f50fe6baaddc2e159c3eabc116d45fc7
-F test/fts4unicode.test c812e9cf843e26ba633f58b36a2629f878af20fd
+F test/fts4unicode.test f394585139ff878f9af0c83791a5f612d45a5984
F test/func.test 9809b7622d721904a8cc33c1ffb87f46d506ed01
F test/func2.test 772d66227e4e6684b86053302e2d74a2500e1e0f
F test/func3.test 001021e5b88bd02a3b365a5c5fd8f6f49d39744a
F tool/vdbe-compress.tcl d70ea6d8a19e3571d7ab8c9b75cba86d1173ff0f
F tool/warnings-clang.sh a8a0a3babda96dfb1ff51adda3cbbf3dfb7266c2
F tool/warnings.sh fbc018d67fd7395f440c28f33ef0f94420226381
-P 61669c95859e187618fb2fb4249306a947ae8d26 c469850b2eb61a63150cc5fc7d2fe98f0b5abffb
-R 4e8ba0ec11cbdf49789a46888b404344
-U drh
-Z 7052addf7652de59fa1313dd54313af2
+P 635e3a762ddeb1f952f66a08c1d4d53e3f42c9eb
+R 24f67134e172119b7ccb50ae93a76cbd
+U dan
+Z 32c2f5c3d9b162b4ae41c62d929207f9
-635e3a762ddeb1f952f66a08c1d4d53e3f42c9eb
\ No newline at end of file
+790f76a5898dad1a955d40edddf11f7b0fec0ccd
\ No newline at end of file
set ::testprefix fts4unicode
+# Run test case $tn: tokenize $input using the unicode61 tokenizer with the
+# option "remove_diacritics=0" and hand the query to do_execsql_test in the
+# caller's scope, with $res (normalized to a list) as the single expected
+# value.
proc do_unicode_token_test {tn input res} {
+ # Double any single quotes so that $input can sit inside an SQL literal.
+ set input [string map {' ''} $input]
+ uplevel [list do_execsql_test $tn "
+ SELECT fts3_tokenizer_test('unicode61', 'remove_diacritics=0', '$input');
+ " [list [list {*}$res]]]
+}
+
+proc do_unicode_token_test2 {tn input res} {
set input [string map {' ''} $input]
uplevel [list do_execsql_test $tn "
SELECT fts3_tokenizer_test('unicode61', '$input');
0 the The 1 quick quick 2 brown brown 3 fox fox
}
+do_unicode_token_test2 1.8 {a B c D} {0 a a 1 b B 2 c c 3 d D}
+do_unicode_token_test2 1.9 {Ä Ö Ü} {0 a Ä 1 o Ö 2 u Ü}
+do_unicode_token_test2 1.10 {xÄx xÖx xÜx} {0 xax xÄx 1 xox xÖx 2 xux xÜx}
+
+# Check that combining diacritics are removed when the tokenizer is used
+# without a remove_diacritics option (removal is the default), and that
+# they do not break tokens.
+do_unicode_token_test2 1.11 "xx\u0301xx" "0 xxxx xx\u301xx"
+
#-------------------------------------------------------------------------
#
set docs [list {