From: danielk1977 Date: Mon, 25 Jun 2007 11:24:38 +0000 (+0000) Subject: Add some tests for the fts2 icu tokenizer. (CVS 4117) X-Git-Tag: version-3.6.10~2057 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=f86643b32fded6039b61545b297c2ee0ae0fada1;p=thirdparty%2Fsqlite.git Add some tests for the fts2 icu tokenizer. (CVS 4117) FossilOrigin-Name: b79ced3e0a26b0db13613073c847c2d2ba7e174e --- diff --git a/ext/fts2/fts2_tokenizer.c b/ext/fts2/fts2_tokenizer.c index 95ab370fbf..bf84f583fc 100644 --- a/ext/fts2/fts2_tokenizer.c +++ b/ext/fts2/fts2_tokenizer.c @@ -117,6 +117,8 @@ static void testFunc( const char *zInput; int nInput; + const char *zArg = 0; + const char *zToken; int nToken; int iStart; @@ -125,12 +127,16 @@ static void testFunc( Tcl_Obj *pRet; - assert( argc==2 ); + assert( argc==2 || argc==3 ); nName = sqlite3_value_bytes(argv[0]); zName = (const char *)sqlite3_value_text(argv[0]); - nInput = sqlite3_value_bytes(argv[1]); - zInput = (const char *)sqlite3_value_text(argv[1]); + nInput = sqlite3_value_bytes(argv[argc-1]); + zInput = (const char *)sqlite3_value_text(argv[argc-1]); + + if( argc==3 ){ + zArg = (const char *)sqlite3_value_text(argv[1]); + } pHash = (fts2Hash *)sqlite3_user_data(context); p = (sqlite3_tokenizer_module *)sqlite3Fts2HashFind(pHash, zName, nName+1); @@ -145,7 +151,7 @@ static void testFunc( pRet = Tcl_NewObj(); Tcl_IncrRefCount(pRet); - if( SQLITE_OK!=p->xCreate(0, 0, &pTokenizer) ){ + if( SQLITE_OK!=p->xCreate(zArg ? 
1 : 0, &zArg, &pTokenizer) ){ zErr = "error in xCreate()"; goto finish; } @@ -221,6 +227,7 @@ int sqlite3Fts2InitHashTable( || (rc = sqlite3_create_function(db, zName, 2, any, p, scalarFunc, 0, 0)) #ifdef SQLITE_TEST || (rc = sqlite3_create_function(db, zTest, 2, any, p, testFunc, 0, 0)) + || (rc = sqlite3_create_function(db, zTest, 3, any, p, testFunc, 0, 0)) #endif ); diff --git a/manifest b/manifest index 2165de94c1..c0bc73c16f 100644 --- a/manifest +++ b/manifest @@ -1,5 +1,5 @@ -C Add\ssome\sdocumentation\sfor\suser-defined\sfts2\stokenizers.\s(CVS\s4116) -D 2007-06-25T09:52:31 +C Add\ssome\stests\sfor\sthe\sfts2\sicu\stokenizer.\s(CVS\s4117) +D 2007-06-25T11:24:39 F Makefile.in 7f7485a4cc039476a42e534b3f26ec90e2f9753e F Makefile.linux-gcc 2d8574d1ba75f129aba2019f0b959db380a90935 F README 9c4e2d6706bdcc3efdd773ce752a8cdab4f90028 @@ -43,7 +43,7 @@ F ext/fts2/fts2_hash.c b3f22116d4ef0bc8f2da6e3fdc435c86d0951a9b F ext/fts2/fts2_hash.h e283308156018329f042816eb09334df714e105e F ext/fts2/fts2_icu.c 45b54d1e075020b35db20f69d829f95ca0651111 F ext/fts2/fts2_porter.c 991a45463553c7318063fe7773368a6c0f39e35d -F ext/fts2/fts2_tokenizer.c 836373ee0fab4f8288a7815496529f25e4504881 +F ext/fts2/fts2_tokenizer.c 40aa54fab0305b53f883fe4a394552f71dda29ee F ext/fts2/fts2_tokenizer.h 6d151c51382e8f6cf689c616bb697fe780478089 F ext/fts2/fts2_tokenizer1.c 5c979fe8815f95396beb22b627571da895a025af F ext/fts2/mkfts2amal.tcl 2a9ec76b0760fe7f3669dca5bc0d60728bc1c977 @@ -256,7 +256,7 @@ F test/fts2l.test 4c53c89ce3919003765ff4fd8d98ecf724d97dd3 F test/fts2m.test 4b30142ead6f3ed076e880a2a464064c5ad58c51 F test/fts2n.test a70357e72742681eaebfdbe9007b87ff3b771638 F test/fts2o.test 05ce2ac9111c29998418a584de02136a0ded471b -F test/fts2token.test 8cfc9ee33361b93fa175197f25fefdd13dfb442e +F test/fts2token.test 5f349000669bc10bb392c87443ca5d3a587c9df0 F test/func.test 605989453d1b42cec1d05c17aa232dc98e3e04e6 F test/fuzz.test 62fc19dd36a427777fd671b569df07166548628a F test/fuzz2.test 
ea38692ce2da99ad79fe0be5eb1a452c1c4d37bb @@ -515,7 +515,7 @@ F www/tclsqlite.tcl 8be95ee6dba05eabcd27a9d91331c803f2ce2130 F www/vdbe.tcl 87a31ace769f20d3627a64fa1fade7fed47b90d0 F www/version3.tcl 890248cf7b70e60c383b0e84d77d5132b3ead42b F www/whentouse.tcl fc46eae081251c3c181bd79c5faef8195d7991a5 -P 5b0408ddd0f1c825f402d0f5a3088a61b5ecd2c3 -R ba9f214b2c9893e0ab7d0a1d3ada75bc +P 5a9eee86587219a68655d548864d129edec969ae +R 61c2bfb00d5da35c1ad182696b085ade U danielk1977 -Z 128ebbe7f73a75dd3845535d3b5f71db +Z 97c1d47f7790b3efb6130173bff1e39a diff --git a/manifest.uuid b/manifest.uuid index d18fb4562e..d99ed85c2a 100644 --- a/manifest.uuid +++ b/manifest.uuid @@ -1 +1 @@ -5a9eee86587219a68655d548864d129edec969ae \ No newline at end of file +b79ced3e0a26b0db13613073c847c2d2ba7e174e \ No newline at end of file diff --git a/test/fts2token.test b/test/fts2token.test index 641aedab7a..f3efc8e821 100644 --- a/test/fts2token.test +++ b/test/fts2token.test @@ -12,7 +12,7 @@ # of this script is testing the pluggable tokeniser feature of the # FTS2 module. # -# $Id: fts2token.test,v 1.1 2007/06/22 15:21:16 danielk1977 Exp $ +# $Id: fts2token.test,v 1.2 2007/06/25 11:24:39 danielk1977 Exp $ # set testdir [file dirname $argv0] @@ -24,6 +24,19 @@ ifcapable !fts2 { return } +proc escape_string {str} { + set out "" + foreach char [split $str ""] { + scan $char %c i + if {$i<=127} { + append out $char + } else { + append out [format {\x%.4x} $i] + } + } + set out +} + #-------------------------------------------------------------------------- # Test cases fts2token-1.* are the warm-body test for the SQL scalar # function fts2_tokenizer(). 
The procedure is as follows: @@ -95,7 +108,6 @@ do_test fts2token-3.2 { SELECT fts2_tokenizer_test('porter', 'I don''t see how'); } } {{0 i I 1 don don 2 t t 3 see see 4 how how}} - ifcapable icu { do_test fts2token-3.3 { execsql { @@ -104,4 +116,55 @@ ifcapable icu { } {{0 i I 1 don't don't 2 see see 3 how how}} } +#-------------------------------------------------------------------------- +# Test cases fts2token-4.* test the ICU tokenizer. In practice, this +# tokenizer only has two modes - "thai" and "everybody else". Some other +# Asian languages (Lao, Khmer etc.) require the same special treatment as +# Thai, but ICU doesn't support them yet. +# +ifcapable icu { + + proc do_icu_test {name locale input output} { + set ::out [db eval { SELECT fts2_tokenizer_test('icu', $locale, $input) }] + do_test $name { + lindex $::out 0 + } $output + } + + do_icu_test fts2token-4.1 en_US {} {} + do_icu_test fts2token-4.2 en_US {Test cases fts2} [list \ + 0 test Test 1 cases cases 2 fts2 fts2 + ] + + # The following test shows that ICU is smart enough to recognise + # Thai characters, even when the locale is set to English/United + # States. + # + set input "\u0e2d\u0e30\u0e44\u0e23\u0e19\u0e30\u0e04\u0e23\u0e31\u0e1a" + set output "0 \u0e2d\u0e30\u0e44\u0e23 \u0e2d\u0e30\u0e44\u0e23 " + append output "1 \u0e19\u0e30 \u0e19\u0e30 " + append output "2 \u0e04\u0e23\u0e31\u0e1a \u0e04\u0e23\u0e31\u0e1a" + + do_icu_test fts2token-4.3 th_TH $input $output + do_icu_test fts2token-4.4 en_US $input $output + + # ICU handles an unknown locale by falling back to the default. + # So this is not an error. 
+ do_icu_test fts2token-4.5 MiddleOfTheOcean $input $output + + set longtoken "AReallyReallyLongTokenOneThatWillSurelyRequire" + append longtoken "AReallocInTheIcuTokenizerCode" + + set input "short tokens then " + append input $longtoken + set output "0 short short " + append output "1 tokens tokens " + append output "2 then then " + append output "3 [string tolower $longtoken] $longtoken" + + do_icu_test fts2token-4.6 MiddleOfTheOcean $input $output + do_icu_test fts2token-4.7 th_TH $input $output + do_icu_test fts2token-4.8 en_US $input $output +} + finish_test