git.ipfire.org Git - thirdparty/dovecot/core.git/commitdiff
fts: Added FTS_BACKEND_FLAG_BUILD_SHORT_UTF8 to require sending only short UTF8 data...
author Timo Sirainen <tss@iki.fi>
Tue, 27 Nov 2012 01:49:25 +0000 (03:49 +0200)
committer Timo Sirainen <tss@iki.fi>
Tue, 27 Nov 2012 01:49:25 +0000 (03:49 +0200)
src/plugins/fts/fts-api-private.h
src/plugins/fts/fts-build-mail.c
src/plugins/fts/fts-parser.c
src/plugins/fts/fts-parser.h

index 05e43f1a4955feb27208f054c0863f5869671b68..663386234dba2bee607b2d057894148e4f084cda 100644 (file)
@@ -59,7 +59,9 @@ enum fts_backend_flags {
        /* Send only fully indexable words rather than randomly sized blocks */
        FTS_BACKEND_FLAG_BUILD_FULL_WORDS       = 0x04,
        /* Fuzzy search works */
-       FTS_BACKEND_FLAG_FUZZY_SEARCH           = 0x08
+       FTS_BACKEND_FLAG_FUZZY_SEARCH           = 0x08,
+       /* Don't allow 5-byte or 6-byte UTF8 sequences */
+       FTS_BACKEND_FLAG_BUILD_SHORT_UTF8       = 0x10
 };
 
 struct fts_backend {
index c6cec4314f459145afa246ec17855f74df6c6cdb..5e685ee5412b33d180c6be7810ac8260b3476796 100644 (file)
@@ -144,6 +144,7 @@ fts_build_body_begin(struct fts_mail_build_context *ctx, bool *binary_body_r)
        struct mail_storage *storage;
        const char *content_type;
        struct fts_backend_build_key key;
+       bool require_short_utf8;
 
        i_assert(ctx->body_parser == NULL);
 
@@ -158,9 +159,11 @@ fts_build_body_begin(struct fts_mail_build_context *ctx, bool *binary_body_r)
                return FALSE;
        }
 
-       
+       require_short_utf8 = (ctx->update_ctx->backend->flags &
+                             FTS_BACKEND_FLAG_BUILD_SHORT_UTF8) != 0;
+
        storage = mailbox_get_storage(ctx->mail->box);
-       if (fts_parser_init(mail_storage_get_user(storage),
+       if (fts_parser_init(mail_storage_get_user(storage), require_short_utf8,
                            content_type, ctx->content_disposition,
                            &ctx->body_parser)) {
                /* extract text using the the returned parser */
index 45e325ca49a9d8b9899168c359534082e8c8cb3f..8624d113d47c46ef3b8d5a932e96a43b580cbb76 100644 (file)
@@ -11,7 +11,7 @@ const struct fts_parser_vfuncs *parsers[] = {
        &fts_parser_script
 };
 
-bool fts_parser_init(struct mail_user *user,
+bool fts_parser_init(struct mail_user *user, bool require_short_utf8,
                     const char *content_type, const char *content_disposition,
                     struct fts_parser **parser_r)
 {
@@ -20,8 +20,10 @@ bool fts_parser_init(struct mail_user *user,
        for (i = 0; i < N_ELEMENTS(parsers); i++) {
                *parser_r = parsers[i]->try_init(user, content_type,
                                                 content_disposition);
-               if (*parser_r != NULL)
+               if (*parser_r != NULL) {
+                       (*parser_r)->require_short_utf8 = require_short_utf8;
                        return TRUE;
+               }
        }
        return FALSE;
 }
@@ -56,11 +58,15 @@ static void replace_nul_bytes(buffer_t *buf)
 
 void fts_parser_more(struct fts_parser *parser, struct message_block *block)
 {
+       bool valid_utf8;
+
        if (parser->v.more != NULL)
                parser->v.more(parser, block);
 
-       if (!uni_utf8_data_is_valid(block->data, block->size) ||
-           data_has_nuls(block->data, block->size)) {
+       valid_utf8 = parser->require_short_utf8 ?
+               uni_utf8_short_data_is_valid(block->data, block->size) :
+               uni_utf8_data_is_valid(block->data, block->size);
+       if (!valid_utf8 || data_has_nuls(block->data, block->size)) {
                /* output isn't valid UTF-8. make it. */
                if (parser->utf8_output == NULL) {
                        parser->utf8_output =
@@ -68,8 +74,14 @@ void fts_parser_more(struct fts_parser *parser, struct message_block *block)
                } else {
                        buffer_set_used_size(parser->utf8_output, 0);
                }
-               (void)uni_utf8_get_valid_data(block->data, block->size,
-                                             parser->utf8_output);
+               if (parser->require_short_utf8) {
+                       (void)uni_utf8_short_get_valid_data(block->data,
+                                                           block->size,
+                                                           parser->utf8_output);
+               } else {
+                       (void)uni_utf8_get_valid_data(block->data, block->size,
+                                                     parser->utf8_output);
+               }
                replace_nul_bytes(parser->utf8_output);
                block->data = parser->utf8_output->data;
                block->size = parser->utf8_output->used;
index 6b8e709d070f31d77006137b88179d151cf49e06..b396c6b3229c349bb464eb3e70738a5b98d24f9b 100644 (file)
@@ -15,12 +15,13 @@ struct fts_parser_vfuncs {
 struct fts_parser {
        struct fts_parser_vfuncs v;
        buffer_t *utf8_output;
+       bool require_short_utf8;
 };
 
 extern struct fts_parser_vfuncs fts_parser_html;
 extern struct fts_parser_vfuncs fts_parser_script;
 
-bool fts_parser_init(struct mail_user *user,
+bool fts_parser_init(struct mail_user *user, bool require_short_utf8,
                     const char *content_type, const char *content_disposition,
                     struct fts_parser **parser_r);
 struct fts_parser *fts_parser_text_init(void);