]> git.ipfire.org Git - thirdparty/dovecot/core.git/commitdiff
lib-fts: Add Swedish (sv) to supported languages.
authorTeemu Huovila <teemu.huovila@dovecot.fi>
Tue, 17 Nov 2015 09:43:28 +0000 (11:43 +0200)
committerTeemu Huovila <teemu.huovila@dovecot.fi>
Tue, 17 Nov 2015 09:43:28 +0000 (11:43 +0200)
src/lib-fts/Makefile.am
src/lib-fts/fts-language.c
src/lib-fts/stopwords_sv.txt [new file with mode: 0644]
src/lib-fts/test-fts-filter.c
src/lib-fts/test-fts-language.c

index b530c63e6aae322460613020c43b266025db5843..3a6dfcb986ace9c926fb481684c9b904edc2101f 100644 (file)
@@ -14,7 +14,8 @@ stopwordsdir = $(datadir)/${PACKAGE_TARNAME}/stopwords
 dist_stopwords_DATA = \
        stopwords_en.txt \
        stopwords_fi.txt \
-       stopwords_fr.txt
+       stopwords_fr.txt \
+       stopwords_sv.txt
 
 BUILT_SOURCES = word-boundary-data.c word-break-data.c
 
index dcd0bb02a8f926c44f5647f82ede658c557f1b44..870b139238c3db868c1635ebc3b0fc3b4a29892b 100644 (file)
@@ -41,6 +41,7 @@ const struct fts_language fts_languages[] = {
        { "pt" },
        { "ro" },
        { "ru" },
+       { "sv" }
 };
 
 const struct fts_language fts_language_data = {
diff --git a/src/lib-fts/stopwords_sv.txt b/src/lib-fts/stopwords_sv.txt
new file mode 100644 (file)
index 0000000..22bddfd
--- /dev/null
@@ -0,0 +1,131 @@
+ | From svn.tartarus.org/snowball/trunk/website/algorithms/swedish/stop.txt
+ | This file is distributed under the BSD License.
+ | See http://snowball.tartarus.org/license.php
+ | Also see http://www.opensource.org/licenses/bsd-license.html
+ |  - Encoding was converted to UTF-8.
+ |  - This notice was added.
+
+ | A Swedish stop word list. Comments begin with vertical bar. Each stop
+ | word is at the start of a line.
+
+ | This is a ranked list (commonest to rarest) of stopwords derived from
+ | a large text sample.
+
+ | Swedish stop words occasionally exhibit homonym clashes. For example
+ |  så = so, but also seed. These are indicated clearly below.
+
+och            | and
+det            | it, this/that
+att            | to (with infinitive)
+i              | in, at
+en             | a
+jag            | I
+hon            | she
+som            | who, that
+han            | he
+på             | on
+den            | it, this/that
+med            | with
+var            | where, each
+sig            | him(self) etc
+för            | for
+så             | so (also: seed)
+till           | to
+är             | is
+men            | but
+ett            | a
+om             | if; around, about
+hade           | had
+de             | they, these/those
+av             | of
+icke           | not, no
+mig            | me
+du             | you
+henne          | her
+då             | then, when
+sin            | his
+nu             | now
+har            | have
+inte           | inte någon = no one
+hans           | his
+honom          | him
+skulle         | should, would
+hennes         | her
+där            | there
+min            | my
+man            | one (pronoun)
+ej             | not
+vid            | at, by, on (also: vast)
+kunde          | could
+något          | some etc
+från           | from, off
+ut             | out
+när            | when
+efter          | after, behind
+upp            | up
+vi             | we
+dem            | them
+vara           | be
+vad            | what
+över           | over
+än             | than
+dig            | you
+kan            | can
+sina           | his
+här            | here
+ha             | have
+mot            | towards
+alla           | all
+under          | under (also: wonder)
+någon          | some etc
+eller          | or (else)
+allt           | all
+mycket         | much
+sedan          | since
+ju             | why
+denna          | this/that
+själv          | myself, yourself etc
+detta          | this/that
+åt             | to
+utan           | without
+varit          | been
+hur            | how
+ingen          | no
+mitt           | my
+ni             | you
+bli            | to be, become
+blev           | from bli
+oss            | us
+din            | thy
+dessa          | these/those
+några          | some etc
+deras          | their
+blir           | from bli
+mina           | my
+samma          | (the) same
+vilken         | who, that
+er             | you, your
+sådan          | such a
+vår            | our
+blivit         | from bli
+dess           | its
+inom           | within
+mellan         | between
+sådant         | such a
+varför         | why
+varje          | each
+vilka          | who, that
+ditt           | thy
+vem            | who
+vilket         | who, that
+sitta          | his (sic: "sitta" means "to sit"; the possessive is "sitt")
+sådana         | such a
+vart           | each
+dina           | thy
+vars           | whose
+vårt           | our
+våra           | our
+ert            | your
+era            | your
+vilkas         | whose
+
index ad4014dc5edf816aee8326064c55603677283568..0f9c5fe783aa12a8dfc715ff64b04feb1a5fdf77 100644 (file)
@@ -13,6 +13,7 @@
 static const char *const stopword_settings[] = {"stopwords_dir", TEST_STOPWORDS_DIR, NULL};
 static struct fts_language english_language = { .name = "en" };
 static struct fts_language french_language = { .name = "fr" };
+static struct fts_language swedish_language = { .name = "sv" };
 
 static void test_fts_filter_find(void)
 {
@@ -653,6 +654,57 @@ static void test_fts_filter_normalizer_stopwords_stemmer_eng(void)
        test_assert(normalizer == NULL);
        test_end();
 }
+
+/* Swedish text through the chained stopwords, ICU normalizer and Snowball
+   stemmer filters; a NULL in bases[] means the token must yield no output. */
+static void test_fts_filter_stopwords_normalizer_stemmer_sv(void)
+{
+       int ret;
+       struct fts_filter *filter, *normalizer, *stemmer;
+       const char *error;
+       const char *token = NULL;
+       const char * const tokens[] = {
+               "Enär", "erkännandet", "av", "det", "inneboende", "värdet",
+               "hos", "alla", "medlemmar", "av", "människosläktet", "och",
+               "av", "deras", "lika", "och", "oförytterliga", "rättigheter",
+               "är", "grundvalen", "för", "frihet", "rättvisa", "och", "fred",
+               "i", "världen",        NULL};
+       const char * const bases[] = {
+               "enar", "erkan", NULL, NULL, "inneboend", "vardet", "hos", NULL,
+               "medlemm", NULL, "manniskoslaktet", NULL, NULL, NULL, "lik",
+               NULL, "oforytter", "ratt", NULL, "grundval", NULL, "frihet",
+               "rattvis", NULL, "fred", NULL, "varld", NULL};
+       const char * const *tpp;
+       const char * const *bpp;
+
+       test_begin("fts filters with stopwords, default normalizer and stemming chained, Swedish");
+
+       test_assert(fts_filter_create(fts_filter_stopwords, NULL, &swedish_language, stopword_settings, &filter, &error) == 0);
+       test_assert(fts_filter_create(fts_filter_normalizer_icu, filter, NULL, NULL, &normalizer, &error) == 0);
+       test_assert(fts_filter_create(fts_filter_stemmer_snowball, normalizer, &swedish_language, NULL, &stemmer, &error) == 0);
+
+       bpp = bases;
+       for (tpp = tokens; *tpp != NULL; tpp++) {
+               token = *tpp;
+               ret = fts_filter_filter(stemmer, &token, &error);
+               if (ret <= 0) {
+                       test_assert(ret == 0);
+                       test_assert(*bpp == NULL);
+               } else {
+                       /* Keep the NULL check and strcmp() in one expression so
+                          an unexpected token can never reach strcmp(NULL, ...). */
+                       test_assert(*bpp != NULL && strcmp(*bpp, token) == 0);
+               }
+               bpp++;
+       }
+       fts_filter_unref(&stemmer);
+       fts_filter_unref(&normalizer);
+       fts_filter_unref(&filter);
+       test_assert(stemmer == NULL);
+       test_assert(filter == NULL);
+       test_assert(normalizer == NULL);
+       test_end();
+}
 #endif
 #endif
 
@@ -741,6 +793,7 @@ int main(void)
                test_fts_filter_normalizer_invalid_id,
 #ifdef HAVE_FTS_STEMMER
                test_fts_filter_normalizer_stopwords_stemmer_eng,
+               test_fts_filter_stopwords_normalizer_stemmer_sv,
 #endif
 #endif
                test_fts_filter_english_possessive,
index 62a91b1516c80b8249479dfbfa4479564ab778e3..23733b2db58fe8ef771cf897bc832e93831d8fc7 100644 (file)
@@ -134,6 +134,32 @@ static void test_fts_language_detect_german(void)
        test_end();
 }
 
+/* Detect Swedish: the sample text must be reported as "sv" when the
+   candidate list is "fi, de, sv, fr, en". */
+static void test_fts_language_detect_swedish(void)
+{
+       struct fts_language_list *lp = NULL;
+       const struct fts_language *lang_r = NULL;
+       const unsigned char swedish[] =
+               "Artikel 1."
+               "Alla m\xC3\xA4nniskor \xC3\xA4ro f\xC3\xB6""dda fria och lika"
+               " i v\xC3\xA4rde och r\xC3\xA4ttigheter. De \xC3\xA4ro "
+               "utrustade med f\xC3\xB6rnuft och samvete och b\xC3\xB6ra "
+               "handla gentemot varandra i en anda av broderskap.";
+       const char names[] = "fi, de, sv, fr, en";
+       const char *unknown, *error;
+
+       test_begin("fts language detect Swedish");
+       test_assert(fts_language_list_init(settings, &lp, &error) == 0);
+       test_assert(fts_language_list_add_names(lp, names, &unknown) == TRUE);
+       test_assert(fts_language_detect(lp, swedish, sizeof(swedish)-1, &lang_r)
+                   == FTS_LANGUAGE_RESULT_OK);
+       /* lang_r may still be NULL if detection failed; check before use. */
+       test_assert(lang_r != NULL && strcmp(lang_r->name, "sv") == 0);
+       fts_language_list_deinit(&lp);
+       test_end();
+}
+
 /* Detect Finnish as English */
 static void test_fts_language_detect_finnish_as_english(void)
 {
@@ -212,6 +238,7 @@ int main(void)
                test_fts_language_detect_english,
                test_fts_language_detect_french,
                test_fts_language_detect_german,
+               test_fts_language_detect_swedish,
                test_fts_language_detect_finnish_as_english,
                test_fts_language_detect_na,
                test_fts_language_detect_unknown,