From: Teemu Huovila Date: Tue, 17 Nov 2015 09:43:28 +0000 (+0200) Subject: lib-fts: Add Swedish (sv) to supported languages. X-Git-Tag: 2.2.20.rc1~80 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=c5effa0f13da8f45991c89a9d8c9d2109db66039;p=thirdparty%2Fdovecot%2Fcore.git lib-fts: Add Swedish (sv) to supported languages. --- diff --git a/src/lib-fts/Makefile.am b/src/lib-fts/Makefile.am index b530c63e6a..3a6dfcb986 100644 --- a/src/lib-fts/Makefile.am +++ b/src/lib-fts/Makefile.am @@ -14,7 +14,8 @@ stopwordsdir = $(datadir)/${PACKAGE_TARNAME}/stopwords dist_stopwords_DATA = \ stopwords_en.txt \ stopwords_fi.txt \ - stopwords_fr.txt + stopwords_fr.txt \ + stopwords_sv.txt BUILT_SOURCES = word-boundary-data.c word-break-data.c diff --git a/src/lib-fts/fts-language.c b/src/lib-fts/fts-language.c index dcd0bb02a8..870b139238 100644 --- a/src/lib-fts/fts-language.c +++ b/src/lib-fts/fts-language.c @@ -41,6 +41,7 @@ const struct fts_language fts_languages[] = { { "pt" }, { "ro" }, { "ru" }, + { "sv" } }; const struct fts_language fts_language_data = { diff --git a/src/lib-fts/stopwords_sv.txt b/src/lib-fts/stopwords_sv.txt new file mode 100644 index 0000000000..22bddfd8cb --- /dev/null +++ b/src/lib-fts/stopwords_sv.txt @@ -0,0 +1,131 @@ + | From svn.tartarus.org/snowball/trunk/website/algorithms/swedish/stop.txt + | This file is distributed under the BSD License. + | See http://snowball.tartarus.org/license.php + | Also see http://www.opensource.org/licenses/bsd-license.html + | - Encoding was converted to UTF-8. + | - This notice was added. + + | A Swedish stop word list. Comments begin with vertical bar. Each stop + | word is at the start of a line. + + | This is a ranked list (commonest to rarest) of stopwords derived from + | a large text sample. + + | Swedish stop words occasionally exhibit homonym clashes. For example + | så = so, but also seed. These are indicated clearly below. + +och | and +det | it, this/that +att | to (with infinitive) +i | in, at +en | a +jag | I +hon | she +som | who, that +han | he +på | on +den | it, this/that +med | with +var | where, each +sig | him(self) etc +för | for +så | so (also: seed) +till | to +är | is +men | but +ett | a +om | if; around, about +hade | had +de | they, these/those +av | of +icke | not, no +mig | me +du | you +henne | her +då | then, when +sin | his +nu | now +har | have +inte | inte någon = no one +hans | his +honom | him +skulle | 'sake' +hennes | her +där | there +min | my +man | one (pronoun) +ej | nor +vid | at, by, on (also: vast) +kunde | could +något | some etc +från | from, off +ut | out +när | when +efter | after, behind +upp | up +vi | we +dem | them +vara | be +vad | what +över | over +än | than +dig | you +kan | can +sina | his +här | here +ha | have +mot | towards +alla | all +under | under (also: wonder) +någon | some etc +eller | or (else) +allt | all +mycket | much +sedan | since +ju | why +denna | this/that +själv | myself, yourself etc +detta | this/that +åt | to +utan | without +varit | was +hur | how +ingen | no +mitt | my +ni | you +bli | to be, become +blev | from bli +oss | us +din | thy +dessa | these/those +några | some etc +deras | their +blir | from bli +mina | my +samma | (the) same +vilken | who, that +er | you, your +sådan | such a +vår | our +blivit | from bli +dess | its +inom | within +mellan | between +sådant | such a +varför | why +varje | each +vilka | who, that +ditt | thy +vem | who +vilket | who, that +sitta | his +sådana | such a +vart | each +dina | thy +vars | whose +vårt | our +våra | our +ert | your +era | your +vilkas | whose + diff --git a/src/lib-fts/test-fts-filter.c b/src/lib-fts/test-fts-filter.c index ad4014dc5e..0f9c5fe783 100644 --- a/src/lib-fts/test-fts-filter.c +++ b/src/lib-fts/test-fts-filter.c @@ -13,6 +13,7 @@ static const char *const stopword_settings[] = {"stopwords_dir", TEST_STOPWORDS_DIR, NULL}; static struct fts_language english_language = { .name = "en" }; static struct fts_language french_language = { .name = "fr" }; +static struct fts_language swedish_language = { .name = "sv" }; static void test_fts_filter_find(void) { @@ -653,6 +654,57 @@ static void test_fts_filter_normalizer_stopwords_stemmer_eng(void) test_assert(normalizer == NULL); test_end(); } + +static void test_fts_filter_stopwords_normalizer_stemmer_sv(void) +{ + int ret; + struct fts_filter *normalizer; + struct fts_filter *stemmer; + struct fts_filter *filter; + const char *error; + const char *token = NULL; + const char * const tokens[] = { + "Enär", "erkännandet", "av", "det", "inneboende", "värdet", + "hos", "alla", "medlemmar", "av", "människosläktet", "och", + "av", "deras", "lika", "och", "oförytterliga", "rättigheter", + "är", "grundvalen", "för", "frihet", "rättvisa", "och", "fred", + "i", "världen", NULL}; + const char * const bases[] = { + "enar", "erkan", NULL, NULL, "inneboend", "vardet", "hos", NULL, + "medlemm", NULL, "manniskoslaktet", NULL, NULL, NULL, "lik", + NULL, "oforytter", "ratt", NULL, "grundval", NULL, "frihet", + "rattvis", NULL, "fred", NULL, "varld", NULL}; + const char * const *tpp; + const char * const *bpp; + + test_begin("fts filters with stopwords, default normalizer and stemming chained, Swedish"); + + + test_assert(fts_filter_create(fts_filter_stopwords, NULL, &swedish_language, stopword_settings, &filter, &error) == 0); + test_assert(fts_filter_create(fts_filter_normalizer_icu, filter, NULL, NULL, &normalizer, &error) == 0); + test_assert(fts_filter_create(fts_filter_stemmer_snowball, normalizer, &swedish_language, NULL, &stemmer, &error) == 0); + + bpp = bases; + for (tpp = tokens; *tpp != NULL; tpp++) { + token = *tpp; + ret = fts_filter_filter(stemmer, &token, &error); + if (ret <= 0) { + test_assert(ret == 0); + test_assert(*bpp == NULL); + } else { + test_assert(*bpp != NULL); + test_assert(strcmp(*bpp, token) == 0); + } + bpp++; + } + fts_filter_unref(&stemmer); + fts_filter_unref(&normalizer); + fts_filter_unref(&filter); + test_assert(stemmer == NULL); + test_assert(filter == NULL); + test_assert(normalizer == NULL); + test_end(); +} #endif #endif @@ -741,6 +793,7 @@ int main(void) test_fts_filter_normalizer_invalid_id, #ifdef HAVE_FTS_STEMMER test_fts_filter_normalizer_stopwords_stemmer_eng, + test_fts_filter_stopwords_normalizer_stemmer_sv, #endif #endif test_fts_filter_english_possessive, diff --git a/src/lib-fts/test-fts-language.c b/src/lib-fts/test-fts-language.c index 62a91b1516..23733b2db5 100644 --- a/src/lib-fts/test-fts-language.c +++ b/src/lib-fts/test-fts-language.c @@ -134,6 +134,32 @@ static void test_fts_language_detect_german(void) test_end(); } +/* Detect Swedish */ +static void test_fts_language_detect_swedish(void) +{ + struct fts_language_list *lp = NULL; + const struct fts_language *lang_r = NULL; + const unsigned char swedish[] = + "Artikel 1."\ + "Alla m\xC3\xA4nniskor \xC3\xA4ro f\xC3\xB6""dda fria och lika"\ + " i v\xC3\xA4rde och r\xC3\xA4ttigheter. De \xC3\xA4ro "\ + "utrustade med f\xC3\xB6rnuft och samvete och b\xC3\xB6ra "\ + "handla gentemot varandra i en anda av broderskap."; + + + + const char names[] = "fi, de, sv, fr, en"; + const char *unknown, *error; + test_begin("fts language detect Swedish"); + test_assert(fts_language_list_init(settings, &lp, &error) == 0); + test_assert(fts_language_list_add_names(lp, names, &unknown) == TRUE); + test_assert(fts_language_detect(lp, swedish, sizeof(swedish)-1, &lang_r) + == FTS_LANGUAGE_RESULT_OK); + test_assert(strcmp(lang_r->name, "sv") == 0); + fts_language_list_deinit(&lp); + test_end(); +} + /* Detect Finnish as English */ static void test_fts_language_detect_finnish_as_english(void) { @@ -212,6 +238,7 @@ int main(void) test_fts_language_detect_english, test_fts_language_detect_french, test_fts_language_detect_german, + test_fts_language_detect_swedish, test_fts_language_detect_finnish_as_english, test_fts_language_detect_na, test_fts_language_detect_unknown,