Add tests to check that the "unicode61" and "icu" tokenizers both identify white...

author dan <dan@noemail.net>

Tue, 19 Jun 2012 06:35:39 +0000 (06:35 +0000)

committer dan <dan@noemail.net>

Tue, 19 Jun 2012 06:35:39 +0000 (06:35 +0000)
author dan <dan@noemail.net>
Tue, 19 Jun 2012 06:35:39 +0000 (06:35 +0000)
committer dan <dan@noemail.net>
Tue, 19 Jun 2012 06:35:39 +0000 (06:35 +0000)
diff --git a/manifest b/manifest

index 9606896e108cbfba25979a1a0ba1e435b283a30b..cdb67bec0beaddff7cd67cbd09998ff97e8a03d8 100644 (file)
--- a/manifest
+++ b/manifest
@@ -1,5 +1,5 @@
-C Improved\srounding\saccuracy\son\stest-to-float\sconversions.
-D 2012-06-19T00:45:16.776
+C Add\stests\sto\scheck\sthat\sthe\s"unicode61"\sand\s"icu"\stokenizers\sboth\sidentify\swhite-space\scodepoints\soutside\sthe\sASCII\srange.
+D 2012-06-19T06:35:39.290
  F Makefile.arm-wince-mingw32ce-gcc d6df77f1f48d690bd73162294bbba7f59507c72f
  F Makefile.in d17fddaa4e81f93a7c9c7c0808aacb3fc95f79f4
  F Makefile.linux-gcc 91d710bdc4998cb015f39edf3cb314ec4f4d7e23
@@ -502,7 +502,7 @@ F test/fts4langid.test 24a6e41063b416bbdf371ff6b4476fa41c194aa7
  F test/fts4merge.test c424309743fdd203f8e56a1f1cd7872cd66cc0ee
  F test/fts4merge2.test 5faa558d1b672f82b847d2a337465fa745e46891
  F test/fts4merge3.test aab02a09f50fe6baaddc2e159c3eabc116d45fc7
-F test/fts4unicode.test 247e6c64563b5f930aec0f89a5b01ed6b4b129cd
+F test/fts4unicode.test aad033abdcfa0f87ce5f56468f59fdf2a0acbcef
  F test/func.test 9809b7622d721904a8cc33c1ffb87f46d506ed01
  F test/func2.test 772d66227e4e6684b86053302e2d74a2500e1e0f
  F test/func3.test 001021e5b88bd02a3b365a5c5fd8f6f49d39744a
@@ -1006,7 +1006,7 @@ F tool/tostr.awk e75472c2f98dd76e06b8c9c1367f4ab07e122d06
  F tool/vdbe-compress.tcl d70ea6d8a19e3571d7ab8c9b75cba86d1173ff0f
  F tool/warnings-clang.sh f6aa929dc20ef1f856af04a730772f59283631d4
  F tool/warnings.sh fbc018d67fd7395f440c28f33ef0f94420226381
-P 892b74116a3b23268895b96433d18ef00c1433d8
-R f92cf3b5f19c2cdf0a1f4043259e76e0
-U drh
-Z 37917877b4f7a2aac8f4f47ec8031274
+P 699b792c6a0e989994549959b11ec1bfad8bbd92
+R fcc8a5f96c6c9d1a1ba2f2345cbe909b
+U dan
+Z 04e7a9f69aa11cdd2b90d7abb2bd52a9
diff --git a/manifest.uuid b/manifest.uuid

index 9a76fd8178458b09d92aa9f6760893d1cdcce44d..9412d9826adf8352fd699c759f776067286da292 100644 (file)
--- a/manifest.uuid
+++ b/manifest.uuid
@@ -1 +1 @@
-699b792c6a0e989994549959b11ec1bfad8bbd92
-\ No newline at end of file
+bfb2d4730cbbe18fb940e72f4fde9122d550734e
+\ No newline at end of file
diff --git a/test/fts4unicode.test b/test/fts4unicode.test

index 3abceb68b29f448a7b7cac1d1d30f3c28e90a6de..0ac60a6f01a6ad981d8c4f055d36ea54308bac97 100644 (file)
--- a/test/fts4unicode.test
+++ b/test/fts4unicode.test
@@ -326,5 +326,62 @@ do_unicode_token_test3 5.11 "tokenchars=\u0301" \
    "0 hello\u0301world hello\u0301world 1 helloworld helloworld"
  
  
+#-------------------------------------------------------------------------
+
+proc do_tokenize {tokenizer txt} {
+  # Keep only the middle element of each 3-element group in the result.
+  set tokens {}
+  set triples [db one {SELECT fts3_tokenizer_test($tokenizer, $txt)}]
+  foreach {x tok y} $triples { lappend tokens $tok }
+  return $tokens
+}
+
+# Argument $lCp must be a list of codepoints (integers) that
+# correspond to whitespace characters. This command creates a string
+# $W from the codepoints, then tokenizes "${W}hello${W}world${W}"
+# using tokenizer $tokenizer. The test passes if the tokenizer successfully
+# extracts the two 5-character tokens.
+#
+proc do_isspace_test {tn tokenizer lCp} {
+  set W [format [string repeat %c [llength $lCp]] {*}$lCp]
+  set script [list do_tokenize $tokenizer "${W}hello${W}world${W}"]
+  uplevel [list do_test $tn $script {hello world}]
+}
+
+set tokenizers [list unicode61]
+ifcapable icu { lappend tokenizers icu }
+
+# Check that each tokenizer identifies white-space codepoints, first one
+# at a time (1-18) and then as multi-codepoint runs (19-23). All codepoints
+# tested below are of type "Zs" in the UnicodeData.txt file.
+foreach T $tokenizers {
+  do_isspace_test 6.$T.1 $T    32
+  do_isspace_test 6.$T.2 $T    160
+  do_isspace_test 6.$T.3 $T    5760
+  do_isspace_test 6.$T.4 $T    6158
+  do_isspace_test 6.$T.5 $T    8192
+  do_isspace_test 6.$T.6 $T    8193
+  do_isspace_test 6.$T.7 $T    8194
+  do_isspace_test 6.$T.8 $T    8195
+  do_isspace_test 6.$T.9 $T    8196
+  do_isspace_test 6.$T.10 $T    8197
+  do_isspace_test 6.$T.11 $T    8198
+  do_isspace_test 6.$T.12 $T    8199
+  do_isspace_test 6.$T.13 $T    8200
+  do_isspace_test 6.$T.14 $T    8201
+  do_isspace_test 6.$T.15 $T    8202
+  do_isspace_test 6.$T.16 $T    8239
+  do_isspace_test 6.$T.17 $T    8287
+  do_isspace_test 6.$T.18 $T   12288
+
+  do_isspace_test 6.$T.19 $T   {32 160 5760 6158}
+  do_isspace_test 6.$T.20 $T   {8192 8193 8194 8195}
+  do_isspace_test 6.$T.21 $T   {8196 8197 8198 8199}
+  do_isspace_test 6.$T.22 $T   {8200 8201 8202 8239}
+  do_isspace_test 6.$T.23 $T   {8287 12288}
+}
+
+
  finish_test
  
+
author	dan <dan@noemail.net>
	Tue, 19 Jun 2012 06:35:39 +0000 (06:35 +0000)
committer	dan <dan@noemail.net>
	Tue, 19 Jun 2012 06:35:39 +0000 (06:35 +0000)
manifest		patch \| blob \| blame \| history
manifest.uuid		patch \| blob \| blame \| history
test/fts4unicode.test		patch \| blob \| blame \| history