]> git.ipfire.org Git - thirdparty/dovecot/core.git/commitdiff
lib-fts: Fix simple tokenizer apostrophe handling.
authorTeemu Huovila <teemu.huovila@dovecot.fi>
Thu, 21 May 2015 10:29:15 +0000 (06:29 -0400)
committerTeemu Huovila <teemu.huovila@dovecot.fi>
Thu, 21 May 2015 10:29:15 +0000 (06:29 -0400)
Apostrophes and quotation marks are now treated as word breaks,
except U+0027 between non-wordbreak characters. The characters
U+2019 and U+FF07 are transformed to U+0027 before processing.

src/lib-fts/fts-tokenizer-generic-private.h
src/lib-fts/fts-tokenizer-generic.c
src/lib-fts/test-fts-tokenizer.c
src/lib-fts/word-properties.pl

index 1696759e64088eec7e792e2584cd21245acd1b33..1bda47c93b40a6640d905f87b384ae87b0736026 100644 (file)
@@ -40,8 +40,7 @@ struct generic_fts_tokenizer {
        struct fts_tokenizer tokenizer;
        unsigned int max_length;
        enum boundary_algorithm algorithm;
-       enum letter_type prev_letter; /* These two are basically the
-                                            state of the parsing. */
+       enum letter_type prev_letter;
        enum letter_type prev_prev_letter;
        size_t last_size; /* Bytes in latest utf8 character. */
        buffer_t *token;
index cf9681dc886095570b4d01bf1ccdb804845d2adb..d58484d77e24ee70262182220b36a2824e03a4a9 100644 (file)
 
 #define FTS_DEFAULT_TOKEN_MAX_LENGTH 30
 
-static unsigned char fts_ascii_word_boundaries[128] = {
+#define IS_NONASCII_APOSTROPHE(c) \
+       ((c) == 0x2019 || (c) == 0xFF07)
+#define IS_APOSTROPHE(c) \
+       ((c) == 0x0027 || IS_NONASCII_APOSTROPHE(c))
+
+static unsigned char fts_ascii_word_breaks[128] = {
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0-15 */
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 16-31 */
 
@@ -95,34 +100,60 @@ static const char *fts_uni_strndup(const unsigned char *data, size_t size)
        return t_strndup(data, pos);
 }
 
-static void
+static bool
 fts_tokenizer_generic_simple_current_token(struct generic_fts_tokenizer *tok,
                                            const char **token_r)
 {
-       *token_r = fts_uni_strndup(tok->token->data, tok->token->used);
+       const unsigned char *data;
+       size_t start = 0, len;
+
+       /* clean trailing and starting apostrophes. they were all made
+          into U+0027 earlier. */
+       data = tok->token->data;
+       len = tok->token->used;
+       while (len > 0 && data[len - 1] == '\'')
+               len--;
+       while (start < len && data[start] == '\'')
+               start++;
+
+       *token_r = len - start == 0 ? "" :
+               fts_uni_strndup(CONST_PTR_OFFSET(tok->token->data, start),
+                               len - start);
        buffer_set_used_size(tok->token, 0);
+       return (*token_r)[0] != '\0';
 }
 
-/* TODO: This is duplicated from unichar.c */
 static bool uint32_find(const uint32_t *data, unsigned int count,
                        uint32_t value, unsigned int *idx_r)
 {
        BINARY_NUMBER_SEARCH(data, count, value, idx_r);
 }
 
-static bool is_word_break(unichar_t c)
+static bool fts_ascii_word_break(unsigned char c)
+{
+       if (c < 0x80)
+               return fts_ascii_word_breaks[c] != 0;
+       return FALSE;
+}
+
+static bool fts_uni_word_break(unichar_t c)
 {
        unsigned int idx;
 
+       /* Override some apostrophes, which get special treatment. */
+       if (IS_APOSTROPHE(c))
+               return FALSE;
+
        /* Unicode General Punctuation, including deprecated characters. */
        if (c >= 0x2000 && c <= 0x206f)
                return TRUE;
-
        /* From word-break-data.c, which is generated from PropList.txt. */
        if (uint32_find(White_Space, N_ELEMENTS(White_Space), c, &idx))
                return TRUE;
        if (uint32_find(Dash, N_ELEMENTS(Dash), c, &idx))
                return TRUE;
+       if (uint32_find(Quotation_Mark, N_ELEMENTS(Quotation_Mark), c, &idx))
+               return TRUE;
        if (uint32_find(Terminal_Punctuation, N_ELEMENTS(Terminal_Punctuation), c, &idx))
                return TRUE;
        if (uint32_find(STerm, N_ELEMENTS(STerm), c, &idx))
@@ -133,17 +164,17 @@ static bool is_word_break(unichar_t c)
 }
 
 static bool
-data_is_word_boundary(const unsigned char *data, size_t size, size_t *i)
-{
-       unichar_t c;
-
-       if (data[*i] < 0x80)
-               return fts_ascii_word_boundaries[data[*i]] != 0;
-       /* unicode punctuation? */
-       if (uni_utf8_get_char_n(data + *i, size - *i, &c) <= 0)
-               i_unreached();
-       *i += uni_utf8_char_bytes(data[*i]) - 1;
-       return is_word_break(c);
+fts_apostrophe_word_break(struct generic_fts_tokenizer *tok, unichar_t c)
+{
+       if (IS_APOSTROPHE(c)) {
+               if (tok->prev_letter == LETTER_TYPE_SINGLE_QUOTE)
+                       return TRUE;
+               else
+                       tok->prev_letter = LETTER_TYPE_SINGLE_QUOTE;
+       } else {
+               tok->prev_letter = LETTER_TYPE_NONE;
+       }
+       return FALSE;
 }
 
 static void fts_tokenizer_generic_reset(struct fts_tokenizer *_tok)
@@ -160,10 +191,26 @@ static void fts_tokenizer_generic_reset(struct fts_tokenizer *_tok)
 static void tok_append_truncated(struct generic_fts_tokenizer *tok,
                                 const unsigned char *data, size_t size)
 {
+       size_t append_len, pos = 0, appended = 0;
+       unichar_t c;
+
        i_assert(tok->max_length >= tok->token->used);
+       append_len = I_MIN(size, tok->max_length - tok->token->used);
 
-       buffer_append(tok->token, data,
-                     I_MIN(size, tok->max_length - tok->token->used));
+       /* Append only one kind of apostrophes. Simplifies things when returning
+          token. */
+       while (pos < append_len) {
+               if (uni_utf8_get_char_n(data + pos, size - pos, &c) <= 0)
+                       i_unreached();
+               if (IS_NONASCII_APOSTROPHE(c)) {
+                       buffer_append(tok->token, data, pos);
+                       buffer_append_c(tok->token, '\'');
+                       appended = pos + 1;
+               }
+               pos += uni_utf8_char_bytes(data[pos]);
+       }
+       if (appended < append_len)
+               buffer_append(tok->token, data + appended, append_len - appended);
 }
 
 static int
@@ -175,21 +222,27 @@ fts_tokenizer_generic_next_simple(struct fts_tokenizer *_tok,
        struct generic_fts_tokenizer *tok =
                (struct generic_fts_tokenizer *)_tok;
        size_t i, char_start_i, len, start = 0;
+       unsigned int char_size;
+       unichar_t c;
 
-       for (i = 0; i < size; i++) {
+       for (i = 0; i < size; i += char_size) {
                char_start_i = i;
-               if (data_is_word_boundary(data, size, &i)) {
+               if (uni_utf8_get_char_n(data + i, size - i, &c) <= 0)
+                       i_unreached();
+               char_size = uni_utf8_char_bytes(data[i]);
+               if (fts_ascii_word_break(data[i]) || fts_uni_word_break(c) ||
+                   fts_apostrophe_word_break(tok, c)) {
                        len = char_start_i - start;
                        tok_append_truncated(tok, data + start, len);
                        if (tok->token->used == 0) {
-                               /* no text read yet */
-                               start = i + 1;
+                               start = i + char_size;
                                continue;
                        }
-                       /* word boundary found - return a new token */
-                       *skip_r = i + 1;
-                       fts_tokenizer_generic_simple_current_token(tok, token_r);
-                       return 1;
+
+                       if (fts_tokenizer_generic_simple_current_token(tok, token_r)) {
+                               *skip_r = i + char_size;
+                               return 1;
+                       }
                }
        }
        /* word boundary not found yet */
@@ -199,9 +252,10 @@ fts_tokenizer_generic_next_simple(struct fts_tokenizer *_tok,
 
        /* return the last token */
        if (size == 0 && tok->token->used > 0) {
-               fts_tokenizer_generic_simple_current_token(tok, token_r);
-               return 1;
+               if (fts_tokenizer_generic_simple_current_token(tok, token_r))
+                       return 1;
        }
+
        return 0;
 }
 
index 45953aff20677b4bda5ebe25899b69b60df1400b..94e6166d1b0f8ee916db5b18d9b8eab7cf84561a 100644 (file)
@@ -29,6 +29,8 @@ static const char *test_inputs[] = {
 
        "1.",
 
+       "'quoted text' 'word' 'hlo words' you're bad'''word '''pre post'''",
+
        /* whitespace: with Unicode(utf8) U+FF01(ef bc 81)(U+2000(e2 80 80) and
           U+205A(e2 81 9a) and U+205F(e2 81 9f) */
        "hello\xEF\xBC\x81world\r\nAnd\xE2\x80\x80there\twas: text "
@@ -99,6 +101,7 @@ test_tokenizer_inputoutput(struct fts_tokenizer *tok, const char *_input,
                outi++;
        }
        test_assert_idx(expected_output[outi] == NULL, outi);
+
        return outi+1;
 }
 
@@ -130,6 +133,9 @@ static void test_fts_tokenizer_generic_only(void)
 
                "1", NULL,
 
+               "quoted", "text", "word", "hlo", "words", "you're", "bad",
+               "word", "pre", "post", NULL,
+
                "hello", "world", "And",
                "there", "was", "text", "galore",
                "and", "more", NULL,
@@ -169,6 +175,9 @@ static void test_fts_tokenizer_generic_tr29_only(void)
 
                "1", NULL,
 
+               "quoted", "text", "word", "hlo", "words", "you're", "bad",
+               "word", "pre", "post", NULL,
+
                "hello", "world", "And",
                "there", "was", "text", "galore",
                "and", "more", NULL,
index 8bd08fa1ce05a58b8b32bab5628c1147fdc46788..c600d14373ab6a3af546466aa9b207cbafd046d7 100644 (file)
@@ -8,7 +8,7 @@ if ($which eq 'boundaries') {
     @categories = qw(CR LF Newline Extend Regional_Indicator Format Katakana Hebrew_Letter ALetter
                    Single_Quote Double_Quote MidNumLet MidLetter MidNum Numeric ExtendNumLet);
 } elsif ($which eq 'breaks') {
-    @categories = qw(White_Space Dash Terminal_Punctuation STerm Pattern_White_Space);
+    @categories = qw(White_Space Dash Quotation_Mark Terminal_Punctuation STerm Pattern_White_Space);
 } else {
     die "specify 'boundaries' or 'breaks'";
 }