git.ipfire.org Git - thirdparty/sqlite.git/commitdiff
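Up until now the fts4 "unicode61" tokenizer has treated all private use codepoints except the first and last of each of the three ranges as alphanumeric (eligible to be part of tokens). This commit fixes this so that all private use codepoints are considered alphanumeric. In other words, it fixes the handling of codepoints 0xE000, 0xF8FF, 0xF0000, 0xFFFFD, 0x100000 and 0x10FFFD.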
author dan <dan@noemail.net>
Wed, 5 Jun 2013 16:17:21 +0000 (16:17 +0000)
committer dan <dan@noemail.net>
Wed, 5 Jun 2013 16:17:21 +0000 (16:17 +0000)
FossilOrigin-Name: 6cfd9af5250029c0d275be027b4208c48954a8a1

ext/fts3/fts3_unicode2.c
ext/fts3/unicode/mkunicode.tcl
manifest
manifest.uuid
test/fts4unicode.test

index 3c24569026e81c5e4dca4e75258721f37024a509..fba688ff9ce5fe68098eec8bee4debb613f4b1fd 100644 (file)
@@ -101,28 +101,27 @@ int sqlite3FtsUnicodeIsalnum(int c){
     0x02A97004, 0x02A9DC03, 0x02A9EC01, 0x02AAC001, 0x02AAC803,
     0x02AADC02, 0x02AAF802, 0x02AB0401, 0x02AB7802, 0x02ABAC07,
     0x02ABD402, 0x02AF8C0B, 0x03600001, 0x036DFC02, 0x036FFC02,
-    0x037FFC02, 0x03E3FC01, 0x03EC7801, 0x03ECA401, 0x03EEC810,
-    0x03F4F802, 0x03F7F002, 0x03F8001A, 0x03F88007, 0x03F8C023,
-    0x03F95013, 0x03F9A004, 0x03FBFC01, 0x03FC040F, 0x03FC6807,
-    0x03FCEC06, 0x03FD6C0B, 0x03FF8007, 0x03FFA007, 0x03FFE405,
-    0x04040003, 0x0404DC09, 0x0405E411, 0x0406400C, 0x0407402E,
-    0x040E7C01, 0x040F4001, 0x04215C01, 0x04247C01, 0x0424FC01,
-    0x04280403, 0x04281402, 0x04283004, 0x0428E003, 0x0428FC01,
-    0x04294009, 0x0429FC01, 0x042CE407, 0x04400003, 0x0440E016,
-    0x04420003, 0x0442C012, 0x04440003, 0x04449C0E, 0x04450004,
-    0x04460003, 0x0446CC0E, 0x04471404, 0x045AAC0D, 0x0491C004,
-    0x05BD442E, 0x05BE3C04, 0x074000F6, 0x07440027, 0x0744A4B5,
-    0x07480046, 0x074C0057, 0x075B0401, 0x075B6C01, 0x075BEC01,
-    0x075C5401, 0x075CD401, 0x075D3C01, 0x075DBC01, 0x075E2401,
-    0x075EA401, 0x075F0C01, 0x07BBC002, 0x07C0002C, 0x07C0C064,
-    0x07C2800F, 0x07C2C40E, 0x07C3040F, 0x07C3440F, 0x07C4401F,
-    0x07C4C03C, 0x07C5C02B, 0x07C7981D, 0x07C8402B, 0x07C90009,
-    0x07C94002, 0x07CC0021, 0x07CCC006, 0x07CCDC46, 0x07CE0014,
-    0x07CE8025, 0x07CF1805, 0x07CF8011, 0x07D0003F, 0x07D10001,
-    0x07D108B6, 0x07D3E404, 0x07D4003E, 0x07D50004, 0x07D54018,
-    0x07D7EC46, 0x07D9140B, 0x07DA0046, 0x07DC0074, 0x38000401,
-    0x38008060, 0x380400F0, 0x3C000001, 0x3FFFF401, 0x40000001,
-    0x43FFF401,
+    0x037FFC01, 0x03EC7801, 0x03ECA401, 0x03EEC810, 0x03F4F802,
+    0x03F7F002, 0x03F8001A, 0x03F88007, 0x03F8C023, 0x03F95013,
+    0x03F9A004, 0x03FBFC01, 0x03FC040F, 0x03FC6807, 0x03FCEC06,
+    0x03FD6C0B, 0x03FF8007, 0x03FFA007, 0x03FFE405, 0x04040003,
+    0x0404DC09, 0x0405E411, 0x0406400C, 0x0407402E, 0x040E7C01,
+    0x040F4001, 0x04215C01, 0x04247C01, 0x0424FC01, 0x04280403,
+    0x04281402, 0x04283004, 0x0428E003, 0x0428FC01, 0x04294009,
+    0x0429FC01, 0x042CE407, 0x04400003, 0x0440E016, 0x04420003,
+    0x0442C012, 0x04440003, 0x04449C0E, 0x04450004, 0x04460003,
+    0x0446CC0E, 0x04471404, 0x045AAC0D, 0x0491C004, 0x05BD442E,
+    0x05BE3C04, 0x074000F6, 0x07440027, 0x0744A4B5, 0x07480046,
+    0x074C0057, 0x075B0401, 0x075B6C01, 0x075BEC01, 0x075C5401,
+    0x075CD401, 0x075D3C01, 0x075DBC01, 0x075E2401, 0x075EA401,
+    0x075F0C01, 0x07BBC002, 0x07C0002C, 0x07C0C064, 0x07C2800F,
+    0x07C2C40E, 0x07C3040F, 0x07C3440F, 0x07C4401F, 0x07C4C03C,
+    0x07C5C02B, 0x07C7981D, 0x07C8402B, 0x07C90009, 0x07C94002,
+    0x07CC0021, 0x07CCC006, 0x07CCDC46, 0x07CE0014, 0x07CE8025,
+    0x07CF1805, 0x07CF8011, 0x07D0003F, 0x07D10001, 0x07D108B6,
+    0x07D3E404, 0x07D4003E, 0x07D50004, 0x07D54018, 0x07D7EC46,
+    0x07D9140B, 0x07DA0046, 0x07DC0074, 0x38000401, 0x38008060,
+    0x380400F0,
   };
   static const unsigned int aAscii[4] = {
     0xFFFFFFFF, 0xFC00FFFF, 0xF8000001, 0xF8000001,
index 0d58e8aa5c0bc10bd631617582a7eb8dbfb30f1b..2da17c51a53ba79b923650d44b02a4a6c4f4cbeb 100644 (file)
@@ -239,7 +239,10 @@ proc an_load_unicodedata_text {zName} {
     foreach $lField $fields {}
 
     set iCode [expr "0x$code"]
-    set bAlnum [expr {[lsearch {L N} [string range $general_category 0 0]]>=0}]
+    set bAlnum [expr {
+         [lsearch {L N} [string range $general_category 0 0]] >= 0
+      || $general_category=="Co"
+    }]
 
     if { !$bAlnum } { lappend lRet $iCode }
   }
@@ -360,7 +363,7 @@ proc print_isalnum {zFunc lRange} {
     }
     assert( aEntry[0]<key );
     assert( key>=aEntry[iRes] );
-    return (c >= ((aEntry[iRes]>>10) + (aEntry[iRes]&0x3FF)));
+    return (((unsigned int)c) >= ((aEntry[iRes]>>10) + (aEntry[iRes]&0x3FF)));
   }
   return 1;}
   puts "\}"
@@ -729,7 +732,7 @@ proc print_fileheader {} {
 */
   }]
   puts ""
-  puts "#if !defined(SQLITE_DISABLE_FTS3_UNICODE)"
+  puts "#if defined(SQLITE_ENABLE_FTS4_UNICODE61)"
   puts "#if defined(SQLITE_ENABLE_FTS3) || defined(SQLITE_ENABLE_FTS4)"
   puts ""
   puts "#include <assert.h>"
@@ -805,4 +808,4 @@ if {$::generate_test_code} {
 }
 
 puts "#endif /* defined(SQLITE_ENABLE_FTS3) || defined(SQLITE_ENABLE_FTS4) */"
-puts "#endif /* !defined(SQLITE_DISABLE_FTS3_UNICODE) */"
+puts "#endif /* !defined(SQLITE_ENABLE_FTS4_UNICODE61) */"
index 4c2a451fd1aaf36d75aa0d623528ec43af6a33c1..600d211c42103a6025e28c8fcf5accb14c6399a3 100644 (file)
--- a/manifest
+++ b/manifest
@@ -1,5 +1,5 @@
-C Fix\sa\stypo\sin\sa\scollating\sfunction\sinside\sthe\se_reindex.test\sscript.
-D 2013-06-03T20:39:15.752
+C Up\suntil\snow\sthe\sfts4\s"unicode61"\stokenizer\shas\streated\sall\sprivate\suse\scodepoints\sexcept\sthe\sfirst\sand\slast\sof\seach\sof\sthe\sthree\sranges\sas\salphanumeric\s(eligible\sto\sbe\spart\sof\stokens).\sThis\scommit\sfixes\sthis\sso\sthat\sall\sprivate\suse\scodepoints\sare\sconsidered\salphanumeric.\sIn\sother\swords,\sit\sfixes\sthe\shandling\sof\scodepoints\s0xE000,\s0xF8FF,\s0xF0000,\s0xFFFFD,\s0x100000\sand\s0x10FFFD.
+D 2013-06-05T16:17:21.916
 F Makefile.arm-wince-mingw32ce-gcc d6df77f1f48d690bd73162294bbba7f59507c72f
 F Makefile.in 5e41da95d92656a5004b03d3576e8b226858a28e
 F Makefile.linux-gcc 91d710bdc4998cb015f39edf3cb314ec4f4d7e23
@@ -95,14 +95,14 @@ F ext/fts3/fts3_tokenizer.c bbdc731bc91338050675c6d1da9ab82147391e16
 F ext/fts3/fts3_tokenizer.h 64c6ef6c5272c51ebe60fc607a896e84288fcbc3
 F ext/fts3/fts3_tokenizer1.c 5c98225a53705e5ee34824087478cf477bdb7004
 F ext/fts3/fts3_unicode.c 92391b4b4fb043564c6539ea9b8661e3bcba47b9
-F ext/fts3/fts3_unicode2.c a863f05f758af36777dffc2facc898bc73fec896
+F ext/fts3/fts3_unicode2.c 0113d3acf13429e6dc38e0647d1bc71211c31a4d
 F ext/fts3/fts3_write.c 6a1fc0e922e76b68e594bf7bc33bac72af9dc47b
 F ext/fts3/fts3speed.tcl b54caf6a18d38174f1a6e84219950d85e98bb1e9
 F ext/fts3/mkfts3amal.tcl 252ecb7fe6467854f2aa237bf2c390b74e71f100
 F ext/fts3/tool/fts3view.c 6cfc5b67a5f0e09c0d698f9fd012c784bfaa9197
 F ext/fts3/unicode/CaseFolding.txt 8c678ca52ecc95e16bc7afc2dbf6fc9ffa05db8c
 F ext/fts3/unicode/UnicodeData.txt cd07314edb62d49fde34debdaf92fa2aa69011e7
-F ext/fts3/unicode/mkunicode.tcl 7a9bc018e2962abb79563c5a39fe581fcbf2f675
+F ext/fts3/unicode/mkunicode.tcl dc6f268eb526710e2c6e496c372471d773d0c368
 F ext/icu/README.txt d9fbbad0c2f647c3fdf715fc9fd64af53aedfc43
 F ext/icu/icu.c 7538f98eab2854cf17fa5f7797bffa6c76e3863b
 F ext/icu/sqliteicu.h 728867a802baa5a96de7495e9689a8e01715ef37
@@ -550,7 +550,7 @@ F test/fts4merge.test c424309743fdd203f8e56a1f1cd7872cd66cc0ee
 F test/fts4merge2.test 5faa558d1b672f82b847d2a337465fa745e46891
 F test/fts4merge3.test aab02a09f50fe6baaddc2e159c3eabc116d45fc7
 F test/fts4merge4.test c19c85ca1faa7b6d536832b49c12e1867235f584
-F test/fts4unicode.test 25ccad45896f8e50f6a694cff738a35f798cdb40
+F test/fts4unicode.test c8ac44217bf6c17812b03eaafa6c06995ad304c2
 F test/full.test 6b3c8fb43c6beab6b95438c1675374b95fab245d
 F test/func.test b0fc34fdc36897769651975a2b0a606312753643
 F test/func2.test 772d66227e4e6684b86053302e2d74a2500e1e0f
@@ -1093,7 +1093,7 @@ F tool/vdbe-compress.tcl f12c884766bd14277f4fcedcae07078011717381
 F tool/warnings-clang.sh f6aa929dc20ef1f856af04a730772f59283631d4
 F tool/warnings.sh fbc018d67fd7395f440c28f33ef0f94420226381
 F tool/win/sqlite.vsix 97894c2790eda7b5bce3cc79cb2a8ec2fde9b3ac
-P 3bd5ad095b23102dd3379cb62997cbf23cc67b7a
-R 0bab77d0f95310ae1c21cfea10915144
-U drh
-Z 1023ee14390bd42e471d5323a67fa234
+P 4d74fccf02134a998a84097b021ba9d501e34ff0
+R 659aea33cb10f326783eda2b62f9d699
+U dan
+Z 74ecc7396dceda2a9a9f04f8bd9d8ced
index e3db3db77490ca78f77eb30402345c715622c343..0ba0d6eb302eec84235d7493d074c1faf33ab654 100644 (file)
@@ -1 +1 @@
-4d74fccf02134a998a84097b021ba9d501e34ff0
\ No newline at end of file
+6cfd9af5250029c0d275be027b4208c48954a8a1
\ No newline at end of file
index 8bd83f6d9eb0780e1c87cca1b7201ab97aba6511..aee7f05b1b771bea63fb312fa9f0f4a5f2fa0672 100644 (file)
@@ -384,5 +384,23 @@ foreach T $tokenizers {
   do_isspace_test 6.$T.19 $T   {8287 12288}
 }
 
+#-------------------------------------------------------------------------
+# Test that the private use ranges are treated as alphanumeric.
+#
+breakpoint
+foreach {tn1 c} {
+  1 \ue000 2 \ue001 3 \uf000 4 \uf8fe 5 \uf8ff
+} {
+  foreach {tn2 config res} {
+    1 ""             "0 hello*world hello*world"
+    2 "separators=*" "0 hello hello 1 world world"
+  } {
+    set config [string map [list * $c] $config]
+    set input  [string map [list * $c] "hello*world"]
+    set output [string map [list * $c] $res]
+    do_unicode_token_test3 7.$tn1.$tn2 {*}$config $input $output
+  }
+}
+
 
 finish_test