const char *zInput;
int nInput;
+ const char *zArg = 0;
+
const char *zToken;
int nToken;
int iStart;
Tcl_Obj *pRet;
- assert( argc==2 );
+ assert( argc==2 || argc==3 );
nName = sqlite3_value_bytes(argv[0]);
zName = (const char *)sqlite3_value_text(argv[0]);
- nInput = sqlite3_value_bytes(argv[1]);
- zInput = (const char *)sqlite3_value_text(argv[1]);
+ nInput = sqlite3_value_bytes(argv[argc-1]);
+ zInput = (const char *)sqlite3_value_text(argv[argc-1]);
+
+ if( argc==3 ){
+ zArg = (const char *)sqlite3_value_text(argv[1]);
+ }
pHash = (fts2Hash *)sqlite3_user_data(context);
p = (sqlite3_tokenizer_module *)sqlite3Fts2HashFind(pHash, zName, nName+1);
pRet = Tcl_NewObj();
Tcl_IncrRefCount(pRet);
- if( SQLITE_OK!=p->xCreate(0, 0, &pTokenizer) ){
+ if( SQLITE_OK!=p->xCreate(zArg ? 1 : 0, &zArg, &pTokenizer) ){
zErr = "error in xCreate()";
goto finish;
}
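
The pointer passed as the second xCreate() argument above is the optional
middle SQL argument, so a tokenizer constructor can consume it as, for
example, a locale name. Below is a minimal sketch of such a constructor,
assuming the fts2 interface declared in fts2_tokenizer.h; the
LocaleTokenizer structure and localeCreate() are illustrative names, not
the actual fts2_icu.c code.

    #include <string.h>
    #include "sqlite3.h"
    #include "fts2_tokenizer.h"   /* sqlite3_tokenizer et al. */

    typedef struct LocaleTokenizer {
      sqlite3_tokenizer base;     /* Base class; must be first */
      char *zLocale;              /* Locale from argv[0], or NULL */
    } LocaleTokenizer;

    static int localeCreate(
      int argc, const char * const *argv,
      sqlite3_tokenizer **ppTokenizer
    ){
      LocaleTokenizer *p;
      int n = (argc>0 && argv[0]) ? (int)strlen(argv[0])+1 : 0;

      /* Allocate the tokenizer and its locale string in one block. */
      p = (LocaleTokenizer *)sqlite3_malloc(sizeof(LocaleTokenizer)+n);
      if( !p ) return SQLITE_NOMEM;
      memset(p, 0, sizeof(LocaleTokenizer));
      if( n ){
        p->zLocale = (char *)&p[1];
        memcpy(p->zLocale, argv[0], n);
      }
      *ppTokenizer = &p->base;
      return SQLITE_OK;
    }
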
|| (rc = sqlite3_create_function(db, zName, 2, any, p, scalarFunc, 0, 0))
#ifdef SQLITE_TEST
|| (rc = sqlite3_create_function(db, zTest, 2, any, p, testFunc, 0, 0))
+ || (rc = sqlite3_create_function(db, zTest, 3, any, p, testFunc, 0, 0))
#endif
);
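
SQLite dispatches scalar functions on the (name, argument count) pair, so
registering testFunc under both arities is what makes the middle argument
optional at the SQL level. The hypothetical driver below exercises both
forms; it assumes a build with SQLITE_TEST and fts2 (with ICU) compiled
in, and print_row() is an illustrative callback.

    #include <stdio.h>
    #include "sqlite3.h"

    /* Print each column of each result row on its own line. */
    static int print_row(void *pArg, int nCol, char **azVal, char **azCol){
      int i;
      (void)pArg; (void)azCol;
      for(i=0; i<nCol; i++) printf("%s\n", azVal[i] ? azVal[i] : "NULL");
      return 0;
    }

    int main(void){
      sqlite3 *db;
      if( sqlite3_open(":memory:", &db)!=SQLITE_OK ) return 1;

      /* Two-argument form: no argument is passed to xCreate(). */
      sqlite3_exec(db,
          "SELECT fts2_tokenizer_test('simple', 'some text here')",
          print_row, 0, 0);

      /* Three-argument form: argv[1] is forwarded to xCreate(). */
      sqlite3_exec(db,
          "SELECT fts2_tokenizer_test('icu', 'en_US', 'some text here')",
          print_row, 0, 0);

      sqlite3_close(db);
      return 0;
    }
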
-C Add\ssome\sdocumentation\sfor\suser-defined\sfts2\stokenizers.\s(CVS\s4116)
-D 2007-06-25T09:52:31
+C Add\ssome\stests\sfor\sthe\sfts2\sicu\stokenizer.\s(CVS\s4117)
+D 2007-06-25T11:24:39
F Makefile.in 7f7485a4cc039476a42e534b3f26ec90e2f9753e
F Makefile.linux-gcc 2d8574d1ba75f129aba2019f0b959db380a90935
F README 9c4e2d6706bdcc3efdd773ce752a8cdab4f90028
F ext/fts2/fts2_hash.h e283308156018329f042816eb09334df714e105e
F ext/fts2/fts2_icu.c 45b54d1e075020b35db20f69d829f95ca0651111
F ext/fts2/fts2_porter.c 991a45463553c7318063fe7773368a6c0f39e35d
-F ext/fts2/fts2_tokenizer.c 836373ee0fab4f8288a7815496529f25e4504881
+F ext/fts2/fts2_tokenizer.c 40aa54fab0305b53f883fe4a394552f71dda29ee
F ext/fts2/fts2_tokenizer.h 6d151c51382e8f6cf689c616bb697fe780478089
F ext/fts2/fts2_tokenizer1.c 5c979fe8815f95396beb22b627571da895a025af
F ext/fts2/mkfts2amal.tcl 2a9ec76b0760fe7f3669dca5bc0d60728bc1c977
F test/fts2m.test 4b30142ead6f3ed076e880a2a464064c5ad58c51
F test/fts2n.test a70357e72742681eaebfdbe9007b87ff3b771638
F test/fts2o.test 05ce2ac9111c29998418a584de02136a0ded471b
-F test/fts2token.test 8cfc9ee33361b93fa175197f25fefdd13dfb442e
+F test/fts2token.test 5f349000669bc10bb392c87443ca5d3a587c9df0
F test/func.test 605989453d1b42cec1d05c17aa232dc98e3e04e6
F test/fuzz.test 62fc19dd36a427777fd671b569df07166548628a
F test/fuzz2.test ea38692ce2da99ad79fe0be5eb1a452c1c4d37bb
F www/vdbe.tcl 87a31ace769f20d3627a64fa1fade7fed47b90d0
F www/version3.tcl 890248cf7b70e60c383b0e84d77d5132b3ead42b
F www/whentouse.tcl fc46eae081251c3c181bd79c5faef8195d7991a5
-P 5b0408ddd0f1c825f402d0f5a3088a61b5ecd2c3
-R ba9f214b2c9893e0ab7d0a1d3ada75bc
+P 5a9eee86587219a68655d548864d129edec969ae
+R 61c2bfb00d5da35c1ad182696b085ade
U danielk1977
-Z 128ebbe7f73a75dd3845535d3b5f71db
+Z 97c1d47f7790b3efb6130173bff1e39a
-5a9eee86587219a68655d548864d129edec969ae
\ No newline at end of file
+b79ced3e0a26b0db13613073c847c2d2ba7e174e
\ No newline at end of file
# of this script is testing the pluggable tokeniser feature of the
# FTS2 module.
#
-# $Id: fts2token.test,v 1.1 2007/06/22 15:21:16 danielk1977 Exp $
+# $Id: fts2token.test,v 1.2 2007/06/25 11:24:39 danielk1977 Exp $
#
set testdir [file dirname $argv0]
return
}
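+# escape_string STR
+#
+#   Return a copy of STR in which each character outside the ASCII
+#   range is replaced by a \xNNNN escape, so that test output stays
+#   printable.
+#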
+proc escape_string {str} {
+ set out ""
+ foreach char [split $str ""] {
+ scan $char %c i
+ if {$i<=127} {
+ append out $char
+ } else {
+ append out [format {\x%.4x} $i]
+ }
+ }
+ set out
+}
+
#--------------------------------------------------------------------------
# Test cases fts2token-1.* are the warm-body test for the SQL scalar
# function fts2_tokenizer(). The procedure is as follows:
SELECT fts2_tokenizer_test('porter', 'I don''t see how');
}
} {{0 i I 1 don don 2 t t 3 see see 4 how how}}
-
ifcapable icu {
do_test fts2token-3.3 {
execsql {
} {{0 i I 1 don't don't 2 see see 3 how how}}
}
+#--------------------------------------------------------------------------
+# Test cases fts2token-4.* test the ICU tokenizer. In practice, this
+# tokenizer has only two modes: "thai" and "everybody else". Some other
+# Asian languages (Lao, Khmer, etc.) require the same special treatment
+# as Thai, but ICU doesn't support them yet.
+#
+ifcapable icu {
+
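+ # do_icu_test NAME LOCALE INPUT OUTPUT
+ #
+ #   Tokenize INPUT with the 'icu' tokenizer configured for LOCALE
+ #   and check that the result matches OUTPUT.
+ #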
+ proc do_icu_test {name locale input output} {
+ set ::out [db eval { SELECT fts2_tokenizer_test('icu', $locale, $input) }]
+ do_test $name {
+ lindex $::out 0
+ } $output
+ }
+
+ do_icu_test fts2token-4.1 en_US {} {}
+ do_icu_test fts2token-4.2 en_US {Test cases fts2} [list \
+ 0 test Test 1 cases cases 2 fts2 fts2
+ ]
+
+ # The following test shows that ICU is smart enough to recognise
+ # Thai characters, even when the locale is set to English/United
+ # States.
+ #
+ set input "\u0e2d\u0e30\u0e44\u0e23\u0e19\u0e30\u0e04\u0e23\u0e31\u0e1a"
+ set output "0 \u0e2d\u0e30\u0e44\u0e23 \u0e2d\u0e30\u0e44\u0e23 "
+ append output "1 \u0e19\u0e30 \u0e19\u0e30 "
+ append output "2 \u0e04\u0e23\u0e31\u0e1a \u0e04\u0e23\u0e31\u0e1a"
+
+ do_icu_test fts2token-4.3 th_TH $input $output
+ do_icu_test fts2token-4.4 en_US $input $output
+
+ # ICU handles an unknown locale by falling back to the default
+ # locale, so the following is not an error.
+ do_icu_test fts2token-4.5 MiddleOfTheOcean $input $output
+
+ set longtoken "AReallyReallyLongTokenOneThatWillSurelyRequire"
+ append longtoken "AReallocInTheIcuTokenizerCode"
+
+ set input "short tokens then "
+ append input $longtoken
+ set output "0 short short "
+ append output "1 tokens tokens "
+ append output "2 then then "
+ append output "3 [string tolower $longtoken] $longtoken"
+
+ do_icu_test fts2token-4.6 MiddleOfTheOcean $input $output
+ do_icu_test fts2token-4.7 th_TH $input $output
+ do_icu_test fts2token-4.8 en_US $input $output
+}
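
The oversized token in cases 4.6 through 4.8 is constructed to force the
token buffer inside the ICU tokenizer's xNext() to grow mid-scan. The
sketch below shows that buffer-growth pattern in isolation; TokenCursor
and cursorSetToken() are illustrative names, not the actual fts2_icu.c
structures.

    #include <string.h>
    #include "sqlite3.h"      /* sqlite3_realloc(), SQLITE_NOMEM */

    /* Hypothetical cursor state: a growable buffer holding the
    ** current token. */
    typedef struct TokenCursor {
      char *zBuffer;          /* Current token, nul-terminated */
      int nAllocated;         /* Bytes allocated at zBuffer */
    } TokenCursor;

    /* Copy nToken bytes into the cursor, growing the buffer when the
    ** token (like the "AReallyReallyLong..." one above) does not fit. */
    static int cursorSetToken(TokenCursor *p, const char *zToken, int nToken){
      if( nToken+1>p->nAllocated ){
        char *zNew = sqlite3_realloc(p->zBuffer, nToken*2+1);
        if( !zNew ) return SQLITE_NOMEM;
        p->zBuffer = zNew;
        p->nAllocated = nToken*2+1;
      }
      memcpy(p->zBuffer, zToken, nToken);
      p->zBuffer[nToken] = '\0';
      return SQLITE_OK;
    }
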
+
finish_test