lib-fts: Fix simple tokenizer apostrophe handling.

author Teemu Huovila <teemu.huovila@dovecot.fi>
Thu, 21 May 2015 10:29:15 +0000 (06:29 -0400)
committer Teemu Huovila <teemu.huovila@dovecot.fi>
Thu, 21 May 2015 10:29:15 +0000 (06:29 -0400)
diff --git a/src/lib-fts/fts-tokenizer-generic-private.h b/src/lib-fts/fts-tokenizer-generic-private.h

index 1696759e64088eec7e792e2584cd21245acd1b33..1bda47c93b40a6640d905f87b384ae87b0736026 100644 (file)
--- a/src/lib-fts/fts-tokenizer-generic-private.h
+++ b/src/lib-fts/fts-tokenizer-generic-private.h
@@ -40,8 +40,7 @@ struct generic_fts_tokenizer {
         struct fts_tokenizer tokenizer;
         unsigned int max_length;
         enum boundary_algorithm algorithm;
-       enum letter_type prev_letter; /* These two are basically the
-                                            state of the parsing. */
+       enum letter_type prev_letter;
         enum letter_type prev_prev_letter;
         size_t last_size; /* Bytes in latest utf8 character. */
         buffer_t *token;
diff --git a/src/lib-fts/fts-tokenizer-generic.c b/src/lib-fts/fts-tokenizer-generic.c

index cf9681dc886095570b4d01bf1ccdb804845d2adb..d58484d77e24ee70262182220b36a2824e03a4a9 100644 (file)
--- a/src/lib-fts/fts-tokenizer-generic.c
+++ b/src/lib-fts/fts-tokenizer-generic.c
@@ -11,7 +11,12 @@
  
  #define FTS_DEFAULT_TOKEN_MAX_LENGTH 30
  
-static unsigned char fts_ascii_word_boundaries[128] = {
+#define IS_NONASCII_APOSTROPHE(c) \
+       ((c) == 0x2019 || (c) == 0xFF07)
+#define IS_APOSTROPHE(c) \
+       ((c) == 0x0027 || IS_NONASCII_APOSTROPHE(c))
+
+static unsigned char fts_ascii_word_breaks[128] = {
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0-15 */
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 16-31 */
  
@@ -95,34 +100,60 @@ static const char *fts_uni_strndup(const unsigned char *data, size_t size)
         return t_strndup(data, pos);
  }
  
-static void
+static bool
  fts_tokenizer_generic_simple_current_token(struct generic_fts_tokenizer *tok,
                                             const char **token_r)
  {
-       *token_r = fts_uni_strndup(tok->token->data, tok->token->used);
+       const unsigned char *data;
+       size_t start = 0, len;
+
+       /* clean trailing and starting apostrophes. they were all made
+          into U+0027 earlier. */
+       data = tok->token->data;
+       len = tok->token->used;
+       while (len > 0 && data[len - 1] == '\'')
+               len--;
+       while (start < len && data[start] == '\'')
+               start++;
+
+       *token_r = len - start == 0 ? "" :
+               fts_uni_strndup(CONST_PTR_OFFSET(tok->token->data, start),
+                               len - start);
         buffer_set_used_size(tok->token, 0);
+       return (*token_r)[0] != '\0';
  }
  
-/* TODO: This is duplicated from unichar.c */
  static bool uint32_find(const uint32_t *data, unsigned int count,
                         uint32_t value, unsigned int *idx_r)
  {
         BINARY_NUMBER_SEARCH(data, count, value, idx_r);
  }
  
-static bool is_word_break(unichar_t c)
+static bool fts_ascii_word_break(unsigned char c)
+{
+       if (c < 0x80)
+               return fts_ascii_word_breaks[c] != 0;
+       return FALSE;
+}
+
+static bool fts_uni_word_break(unichar_t c)
  {
         unsigned int idx;
  
+       /* Override some apostrophes, which get special treatment. */
+       if (IS_APOSTROPHE(c))
+               return FALSE;
+
         /* Unicode General Punctuation, including deprecated characters. */
         if (c >= 0x2000 && c <= 0x206f)
                 return TRUE;
-
         /* From word-break-data.c, which is generated from PropList.txt. */
         if (uint32_find(White_Space, N_ELEMENTS(White_Space), c, &idx))
                 return TRUE;
         if (uint32_find(Dash, N_ELEMENTS(Dash), c, &idx))
                 return TRUE;
+       if (uint32_find(Quotation_Mark, N_ELEMENTS(Quotation_Mark), c, &idx))
+               return TRUE;
         if (uint32_find(Terminal_Punctuation, N_ELEMENTS(Terminal_Punctuation), c, &idx))
                 return TRUE;
         if (uint32_find(STerm, N_ELEMENTS(STerm), c, &idx))
@@ -133,17 +164,17 @@ static bool is_word_break(unichar_t c)
  }
  
  static bool
-data_is_word_boundary(const unsigned char *data, size_t size, size_t *i)
-{
-       unichar_t c;
-
-       if (data[*i] < 0x80)
-               return fts_ascii_word_boundaries[data[*i]] != 0;
-       /* unicode punctuation? */
-       if (uni_utf8_get_char_n(data + *i, size - *i, &c) <= 0)
-               i_unreached();
-       *i += uni_utf8_char_bytes(data[*i]) - 1;
-       return is_word_break(c);
+fts_apostrophe_word_break(struct generic_fts_tokenizer *tok, unichar_t c)
+{
+       if (IS_APOSTROPHE(c)) {
+               if (tok->prev_letter == LETTER_TYPE_SINGLE_QUOTE)
+                       return TRUE;
+               else
+                       tok->prev_letter = LETTER_TYPE_SINGLE_QUOTE;
+       } else {
+               tok->prev_letter = LETTER_TYPE_NONE;
+       }
+       return FALSE;
  }
  
  static void fts_tokenizer_generic_reset(struct fts_tokenizer *_tok)
@@ -160,10 +191,26 @@ static void fts_tokenizer_generic_reset(struct fts_tokenizer *_tok)
  static void tok_append_truncated(struct generic_fts_tokenizer *tok,
                                  const unsigned char *data, size_t size)
  {
+       size_t append_len, pos = 0, appended = 0;
+       unichar_t c;
+
         i_assert(tok->max_length >= tok->token->used);
+       append_len = I_MIN(size, tok->max_length - tok->token->used);
  
-       buffer_append(tok->token, data,
-                     I_MIN(size, tok->max_length - tok->token->used));
+       /* Append only one kind of apostrophes. Simplifies things when returning
+          token. */
+       while (pos < append_len) {
+               if (uni_utf8_get_char_n(data + pos, size - pos, &c) <= 0)
+                       i_unreached();
+               if (IS_NONASCII_APOSTROPHE(c)) {
+                       buffer_append(tok->token, data, pos);
+                       buffer_append_c(tok->token, '\'');
+                       appended = pos + 1;
+               }
+               pos += uni_utf8_char_bytes(data[pos]);
+       }
+       if (appended < append_len)
+               buffer_append(tok->token, data + appended, append_len - appended);
  }
  
  static int
@@ -175,21 +222,27 @@ fts_tokenizer_generic_next_simple(struct fts_tokenizer *_tok,
         struct generic_fts_tokenizer *tok =
                 (struct generic_fts_tokenizer *)_tok;
         size_t i, char_start_i, len, start = 0;
+       unsigned int char_size;
+       unichar_t c;
  
-       for (i = 0; i < size; i++) {
+       for (i = 0; i < size; i += char_size) {
                 char_start_i = i;
-               if (data_is_word_boundary(data, size, &i)) {
+               if (uni_utf8_get_char_n(data + i, size - i, &c) <= 0)
+                       i_unreached();
+               char_size = uni_utf8_char_bytes(data[i]);
+               if (fts_ascii_word_break(data[i]) || fts_uni_word_break(c) ||
+                   fts_apostrophe_word_break(tok, c)) {
                         len = char_start_i - start;
                         tok_append_truncated(tok, data + start, len);
                         if (tok->token->used == 0) {
-                               /* no text read yet */
-                               start = i + 1;
+                               start = i + char_size;
                                 continue;
                         }
-                       /* word boundary found - return a new token */
-                       *skip_r = i + 1;
-                       fts_tokenizer_generic_simple_current_token(tok, token_r);
-                       return 1;
+
+                       if (fts_tokenizer_generic_simple_current_token(tok, token_r)) {
+                               *skip_r = i + char_size;
+                               return 1;
+                       }
                 }
         }
         /* word boundary not found yet */
@@ -199,9 +252,10 @@ fts_tokenizer_generic_next_simple(struct fts_tokenizer *_tok,
  
         /* return the last token */
         if (size == 0 && tok->token->used > 0) {
-               fts_tokenizer_generic_simple_current_token(tok, token_r);
-               return 1;
+               if (fts_tokenizer_generic_simple_current_token(tok, token_r))
+                       return 1;
         }
+
         return 0;
  }
  
diff --git a/src/lib-fts/test-fts-tokenizer.c b/src/lib-fts/test-fts-tokenizer.c

index 45953aff20677b4bda5ebe25899b69b60df1400b..94e6166d1b0f8ee916db5b18d9b8eab7cf84561a 100644 (file)
--- a/src/lib-fts/test-fts-tokenizer.c
+++ b/src/lib-fts/test-fts-tokenizer.c
@@ -29,6 +29,8 @@ static const char *test_inputs[] = {
  
         "1.",
  
+       "'quoted text' 'word' 'hlo words' you're bad'''word '''pre post'''",
+
         /* whitespace: with Unicode(utf8) U+FF01(ef bc 81)(U+2000(e2 80 80) and
            U+205A(e2 81 9a) and U+205F(e2 81 9f) */
         "hello\xEF\xBC\x81world\r\nAnd\xE2\x80\x80there\twas: text "
@@ -99,6 +101,7 @@ test_tokenizer_inputoutput(struct fts_tokenizer *tok, const char *_input,
                 outi++;
         }
         test_assert_idx(expected_output[outi] == NULL, outi);
+
         return outi+1;
  }
  
@@ -130,6 +133,9 @@ static void test_fts_tokenizer_generic_only(void)
  
                 "1", NULL,
  
+               "quoted", "text", "word", "hlo", "words", "you're", "bad",
+               "word", "pre", "post", NULL,
+
                 "hello", "world", "And",
                 "there", "was", "text", "galore",
                 "and", "more", NULL,
@@ -169,6 +175,9 @@ static void test_fts_tokenizer_generic_tr29_only(void)
  
                 "1", NULL,
  
+               "quoted", "text", "word", "hlo", "words", "you're", "bad",
+               "word", "pre", "post", NULL,
+
                 "hello", "world", "And",
                 "there", "was", "text", "galore",
                 "and", "more", NULL,
diff --git a/src/lib-fts/word-properties.pl b/src/lib-fts/word-properties.pl

index 8bd08fa1ce05a58b8b32bab5628c1147fdc46788..c600d14373ab6a3af546466aa9b207cbafd046d7 100644 (file)
--- a/src/lib-fts/word-properties.pl
+++ b/src/lib-fts/word-properties.pl
@@ -8,7 +8,7 @@ if ($which eq 'boundaries') {
      @categories = qw(CR LF Newline Extend Regional_Indicator Format Katakana Hebrew_Letter ALetter
                     Single_Quote Double_Quote MidNumLet MidLetter MidNum Numeric ExtendNumLet);
  } elsif ($which eq 'breaks') {
-    @categories = qw(White_Space Dash Terminal_Punctuation STerm Pattern_White_Space);
+    @categories = qw(White_Space Dash Quotation_Mark Terminal_Punctuation STerm Pattern_White_Space);
  } else {
      die "specify 'boundaries' or 'breaks'";
  }
author	Teemu Huovila <teemu.huovila@dovecot.fi>
	Thu, 21 May 2015 10:29:15 +0000 (06:29 -0400)
committer	Teemu Huovila <teemu.huovila@dovecot.fi>
	Thu, 21 May 2015 10:29:15 +0000 (06:29 -0400)
src/lib-fts/fts-tokenizer-generic-private.h		patch | blob | blame | history
src/lib-fts/fts-tokenizer-generic.c		patch | blob | blame | history
src/lib-fts/test-fts-tokenizer.c		patch | blob | blame | history
src/lib-fts/word-properties.pl		patch | blob | blame | history