git.ipfire.org Git - thirdparty/dovecot/core.git/commitdiff
fts: Backends can now index non-text body parts if they support it.
author Timo Sirainen <tss@iki.fi>
Mon, 17 May 2010 16:06:57 +0000 (18:06 +0200)
committer Timo Sirainen <tss@iki.fi>
Mon, 17 May 2010 16:06:57 +0000 (18:06 +0200)
--HG--
branch : HEAD

src/plugins/fts-solr/fts-backend-solr.c
src/plugins/fts-squat/fts-backend-squat.c
src/plugins/fts/fts-api-private.h
src/plugins/fts/fts-api.c
src/plugins/fts/fts-api.h
src/plugins/fts/fts-storage.c

index 5de0770debf5632c9156bfbf3aa3c463d75edf29..d4b9c7ac6ddacea6e3148211e0afc8290cae4a48 100644 (file)
@@ -561,48 +561,78 @@ static void xml_encode_id(string_t *str, struct fts_backend *_backend,
        xml_encode(str, backend->id_box_name);
 }
 
-static int
-fts_backend_solr_build_more(struct fts_backend_build_context *_ctx,
-                           uint32_t uid, const unsigned char *data,
-                           size_t size, bool headers)
+static void
+fts_backend_solr_uid_changed(struct solr_fts_backend_build_context *ctx,
+                            uint32_t uid)
+{
+       if (ctx->post == NULL) {
+               ctx->post = solr_connection_post_begin(solr_conn);
+               str_append(ctx->cmd, "<add>");
+       } else {
+               str_append(ctx->cmd, "</field></doc>");
+       }
+       ctx->prev_uid = uid;
+       ctx->headers = FALSE;
+
+       fts_backend_solr_add_doc_prefix(ctx, uid);
+       str_printfa(ctx->cmd, "<field name=\"id\">");
+       xml_encode_id(ctx->cmd, ctx->ctx.backend, uid, ctx->uid_validity);
+       str_append(ctx->cmd, "</field>");
+}
+
+static void
+fts_backend_solr_build_hdr(struct fts_backend_build_context *_ctx,
+                          uint32_t uid)
 {
        struct solr_fts_backend_build_context *ctx =
                (struct solr_fts_backend_build_context *)_ctx;
-       string_t *cmd = ctx->cmd;
-
-       /* body comes first, then headers */
-       if (ctx->prev_uid != uid) {
-               /* uid changed */
-               if (ctx->post == NULL) {
-                       ctx->post = solr_connection_post_begin(solr_conn);
-                       str_append(cmd, "<add>");
-               } else {
-                       str_append(cmd, "</field></doc>");
-               }
-               ctx->prev_uid = uid;
 
-               fts_backend_solr_add_doc_prefix(ctx, uid);
-               str_printfa(cmd, "<field name=\"id\">");
-               xml_encode_id(cmd, _ctx->backend, uid, ctx->uid_validity);
-               str_append(cmd, "</field>");
+       if (uid != ctx->prev_uid)
+               fts_backend_solr_uid_changed(ctx, uid);
+       else {
+               i_assert(!ctx->headers);
+               str_append(ctx->cmd, "</field>");
+       }
 
-               ctx->headers = headers;
-               if (headers) {
-                       str_append(cmd, "<field name=\"hdr\">");
-               } else {
-                       str_append(cmd, "<field name=\"body\">");
-               }
-       } else if (headers && !ctx->headers) {
-               str_append(cmd, "</field><field name=\"hdr\">");
-       } else {
-               i_assert(!(!headers && ctx->headers));
+       ctx->headers = TRUE;
+       str_append(ctx->cmd, "<field name=\"hdr\">");
+}
+
+static bool
+fts_backend_solr_build_body_begin(struct fts_backend_build_context *_ctx,
+                                 uint32_t uid, const char *content_type,
+                                 const char *content_disposition ATTR_UNUSED)
+{
+       struct solr_fts_backend_build_context *ctx =
+               (struct solr_fts_backend_build_context *)_ctx;
+
+       if (!fts_backend_default_can_index(content_type))
+               return FALSE;
+
+       if (uid != ctx->prev_uid)
+               fts_backend_solr_uid_changed(ctx, uid);
+       else {
+               /* body comes first, then headers */
+               i_assert(!ctx->headers);
        }
 
-       xml_encode_data(cmd, data, size);
-       if (str_len(cmd) > SOLR_CMDBUF_SIZE-128) {
-               solr_connection_post_more(ctx->post, str_data(cmd),
-                                         str_len(cmd));
-               str_truncate(cmd, 0);
+       ctx->headers = FALSE;
+       str_append(ctx->cmd, "<field name=\"body\">");
+       return TRUE;
+}
+
+static int
+fts_backend_solr_build_more(struct fts_backend_build_context *_ctx,
+                           const unsigned char *data, size_t size)
+{
+       struct solr_fts_backend_build_context *ctx =
+               (struct solr_fts_backend_build_context *)_ctx;
+
+       xml_encode_data(ctx->cmd, data, size);
+       if (str_len(ctx->cmd) > SOLR_CMDBUF_SIZE-128) {
+               solr_connection_post_more(ctx->post, str_data(ctx->cmd),
+                                         str_len(ctx->cmd));
+               str_truncate(ctx->cmd, 0);
        }
        return 0;
 }
@@ -806,6 +836,9 @@ struct fts_backend fts_backend_solr = {
                fts_backend_solr_get_last_uid,
                fts_backend_solr_get_all_last_uids,
                fts_backend_solr_build_init,
+               fts_backend_solr_build_hdr,
+               fts_backend_solr_build_body_begin,
+               NULL,
                fts_backend_solr_build_more,
                fts_backend_solr_build_deinit,
                fts_backend_solr_expunge,
index 0f3d6e7cab077655f964c9b6c0516b203e77eb52..1f3afadb6ed9b4946c963a5ba83bd2662c100094 100644 (file)
@@ -21,6 +21,8 @@ struct squat_fts_backend {
 struct squat_fts_backend_build_context {
        struct fts_backend_build_context ctx;
        struct squat_trie_build_context *build_ctx;
+       enum squat_index_type squat_type;
+       uint32_t uid;
 };
 
 static void
@@ -127,18 +129,41 @@ fts_backend_squat_build_init(struct fts_backend *_backend, uint32_t *last_uid_r,
        return 0;
 }
 
+static void
+fts_backend_squat_build_hdr(struct fts_backend_build_context *_ctx,
+                           uint32_t uid)
+{
+       struct squat_fts_backend_build_context *ctx =
+               (struct squat_fts_backend_build_context *)_ctx;
+
+       ctx->squat_type = SQUAT_INDEX_TYPE_HEADER;
+       ctx->uid = uid;
+}
+
+static bool
+fts_backend_squat_build_body_begin(struct fts_backend_build_context *_ctx,
+                                  uint32_t uid, const char *content_type,
+                                  const char *content_disposition ATTR_UNUSED)
+{
+       struct squat_fts_backend_build_context *ctx =
+               (struct squat_fts_backend_build_context *)_ctx;
+
+       if (!fts_backend_default_can_index(content_type))
+               return FALSE;
+
+       ctx->squat_type = SQUAT_INDEX_TYPE_BODY;
+       ctx->uid = uid;
+       return TRUE;
+}
+
 static int
 fts_backend_squat_build_more(struct fts_backend_build_context *_ctx,
-                            uint32_t uid, const unsigned char *data,
-                            size_t size, bool headers)
+                            const unsigned char *data, size_t size)
 {
        struct squat_fts_backend_build_context *ctx =
                (struct squat_fts_backend_build_context *)_ctx;
-       enum squat_index_type squat_type;
 
-       squat_type = headers ? SQUAT_INDEX_TYPE_HEADER :
-               SQUAT_INDEX_TYPE_BODY;
-       return squat_trie_build_more(ctx->build_ctx, uid, squat_type,
+       return squat_trie_build_more(ctx->build_ctx, ctx->uid, ctx->squat_type,
                                     data, size);
 }
 
@@ -248,6 +273,9 @@ struct fts_backend fts_backend_squat = {
                fts_backend_squat_get_last_uid,
                NULL,
                fts_backend_squat_build_init,
+               fts_backend_squat_build_hdr,
+               fts_backend_squat_build_body_begin,
+               NULL,
                fts_backend_squat_build_more,
                fts_backend_squat_build_deinit,
                fts_backend_squat_expunge,
index 188d0b3b88f52f5573beca5915f361a6fbeb194f..51186df7727c0399fc2c1c1a7159aae7c579c741 100644 (file)
@@ -13,8 +13,13 @@ struct fts_backend_vfuncs {
 
        int (*build_init)(struct fts_backend *backend, uint32_t *last_uid_r,
                          struct fts_backend_build_context **ctx_r);
-       int (*build_more)(struct fts_backend_build_context *ctx, uint32_t uid,
-                         const unsigned char *data, size_t size, bool headers);
+       void (*build_hdr)(struct fts_backend_build_context *ctx, uint32_t uid);
+       bool (*build_body_begin)(struct fts_backend_build_context *ctx,
+                                uint32_t uid, const char *content_type,
+                                const char *content_disposition);
+       void (*build_body_end)(struct fts_backend_build_context *ctx);
+       int (*build_more)(struct fts_backend_build_context *ctx,
+                         const unsigned char *data, size_t size);
        int (*build_deinit)(struct fts_backend_build_context *ctx);
 
        void (*expunge)(struct fts_backend *backend, struct mail *mail);
@@ -80,6 +85,8 @@ struct fts_backend_lookup_context {
 void fts_backend_register(const struct fts_backend *backend);
 void fts_backend_unregister(const char *name);
 
+bool fts_backend_default_can_index(const char *content_type);
+
 void fts_filter_uids(ARRAY_TYPE(seq_range) *definite_dest,
                     const ARRAY_TYPE(seq_range) *definite_filter,
                     ARRAY_TYPE(seq_range) *maybe_dest,
index fd7b07af630fb36b0fcb3bee52d77b5304d42485..89c386f6b90d7bd06d4f5f0a10aa2dfc4e9c2d01 100644 (file)
@@ -99,10 +99,29 @@ int fts_backend_build_init(struct fts_backend *backend, uint32_t *last_uid_r,
        return ret;
 }
 
-int fts_backend_build_more(struct fts_backend_build_context *ctx, uint32_t uid,
-                          const unsigned char *data, size_t size, bool headers)
+void fts_backend_build_hdr(struct fts_backend_build_context *ctx, uint32_t uid)
 {
-       return ctx->backend->v.build_more(ctx, uid, data, size, headers);
+       ctx->backend->v.build_hdr(ctx, uid);
+}
+
+bool fts_backend_build_body_begin(struct fts_backend_build_context *ctx,
+                                 uint32_t uid, const char *content_type,
+                                 const char *content_disposition)
+{
+       return ctx->backend->v.build_body_begin(ctx, uid, content_type,
+                                               content_disposition);
+}
+
+void fts_backend_build_body_end(struct fts_backend_build_context *ctx)
+{
+       if (ctx->backend->v.build_body_end != NULL)
+               ctx->backend->v.build_body_end(ctx);
+}
+
+int fts_backend_build_more(struct fts_backend_build_context *ctx,
+                          const unsigned char *data, size_t size)
+{
+       return ctx->backend->v.build_more(ctx, data, size);
 }
 
 int fts_backend_build_deinit(struct fts_backend_build_context **_ctx)
@@ -321,3 +340,9 @@ int fts_backend_lookup_deinit(struct fts_backend_lookup_context **_ctx,
        pool_unref(&ctx->pool);
        return ret;
 }
+
+bool fts_backend_default_can_index(const char *content_type)
+{
+       return strncasecmp(content_type, "text/", 5) == 0 ||
+               strcasecmp(content_type, "message/rfc822") == 0;
+}
index 8bed62e4cd308c493c866d42e5b42f80b186f9da..1fcc5f78a39153368699a713c56016160c3cb854 100644 (file)
@@ -8,8 +8,12 @@ struct fts_backend_build_context;
 #include "seq-range-array.h"
 
 enum fts_lookup_flags {
+       /* Search within header and/or body.
+          At least one of these must be set. */
        FTS_LOOKUP_FLAG_HEADER  = 0x01,
        FTS_LOOKUP_FLAG_BODY    = 0x02,
+
+       /* The key must NOT be found */
        FTS_LOOKUP_FLAG_INVERT  = 0x04
 };
 
@@ -33,23 +37,42 @@ void fts_backend_deinit(struct fts_backend **backend);
 /* Get the last_uid for the mailbox. */
 int fts_backend_get_last_uid(struct fts_backend *backend, uint32_t *last_uid_r);
 /* Get last_uids for all mailboxes that might be backend mailboxes for a
-   virtual mailbox. Depending on virtual mailbox configuration, this function
-   may also return mailboxes that don't really even match the virtual mailbox
-   patterns. The caller should filter out the list itself. */
+   virtual mailbox. The backend can use mailbox_get_virtual_backend_boxes() or
+   mailbox_get_virtual_box_patterns() functions to get the list of mailboxes.
+
+   Depending on virtual mailbox configuration, this function may also return
+   mailboxes that don't even match the virtual mailbox patterns. The caller
+   needs to be able to ignore the unnecessary ones. */
 int fts_backend_get_all_last_uids(struct fts_backend *backend, pool_t pool,
                                  ARRAY_TYPE(fts_backend_uid_map) *last_uids);
 
-/* Initialize adding new data to the index. last_uid_r is set to the last UID
-   that exists in the index. */
+/* Initialize adding new data to the index. last_uid_r is set to the last
+   indexed message's IMAP UID */
 int fts_backend_build_init(struct fts_backend *backend, uint32_t *last_uid_r,
                           struct fts_backend_build_context **ctx_r);
-/* Add more contents to the index. The data must contain only full valid
-   UTF-8 characters, but it doesn't need to be NUL-terminated. size contains
-   the data size in bytes, not characters. headers is TRUE if the data contains
-   message headers instead of message body. */
-int fts_backend_build_more(struct fts_backend_build_context *ctx, uint32_t uid,
-                          const unsigned char *data, size_t size,
-                          bool headers);
+/* Switch to building index for mail's headers or MIME part headers. */
+void fts_backend_build_hdr(struct fts_backend_build_context *ctx, uint32_t uid);
+/* Switch to building index for the next body part. If backend doesn't want
+   to index this body part (based on content type/disposition check), it can
+   return FALSE and caller will skip to next part. The backend must return
+   TRUE for all text/xxx and message/rfc822 content types.
+
+   The content_type contains a valid parsed "type/subtype" string. For messages
+   without (valid) Content-Type header, the content_type is set to "text/plain".
+   The content_disposition is passed without parsing/validation if it exists,
+   otherwise it's NULL. */
+bool fts_backend_build_body_begin(struct fts_backend_build_context *ctx,
+                                 uint32_t uid, const char *content_type,
+                                 const char *content_disposition);
+/* Called once when the whole body part has been sent. */
+void fts_backend_build_body_end(struct fts_backend_build_context *ctx);
+/* Add more content to the index for the currently selected header/body part.
+   The data must contain only full valid UTF-8 characters, but it doesn't need
+   to be NUL-terminated. size contains the data size in bytes, not characters.
+   This function may be called many times and the data block sizes may be
+   small. Backend returns 0 if ok, -1 if build should be aborted. */
+int fts_backend_build_more(struct fts_backend_build_context *ctx,
+                          const unsigned char *data, size_t size);
 /* Finish adding new data to the index. */
 int fts_backend_build_deinit(struct fts_backend_build_context **ctx);
 
@@ -57,14 +80,15 @@ int fts_backend_build_deinit(struct fts_backend_build_context **ctx);
 bool fts_backend_is_building(struct fts_backend *backend);
 
 /* Expunge given mail from the backend. Note that the transaction may still
-   fail later. */
+   fail later, so backend shouldn't do anything irreversible. */
 void fts_backend_expunge(struct fts_backend *backend, struct mail *mail);
 /* Called after transaction has been committed or rollbacked. */
 void fts_backend_expunge_finish(struct fts_backend *backend,
                                struct mailbox *box, bool committed);
 
 /* Lock/unlock the backend for multiple lookups. Returns 1 if locked, 0 if
-   locking timeouted, -1 if error.
+   locking timeouted, -1 if error. If backend doesn't require locking, it
+   always returns 1.
 
    It's not required to call these functions manually, but if you're doing
    multiple lookup/filter operations this avoids multiple lock/unlock calls. */
@@ -74,10 +98,14 @@ void fts_backend_unlock(struct fts_backend *backend);
 /* Start building a FTS lookup. */
 struct fts_backend_lookup_context *
 fts_backend_lookup_init(struct fts_backend *backend);
-/* Add a new search key to the lookup. */
+/* Add a new search key to the lookup. The keys are ANDed together. */
 void fts_backend_lookup_add(struct fts_backend_lookup_context *ctx,
                            const char *key, enum fts_lookup_flags flags);
-/* Finish the lookup and return found UIDs. */
+/* Finish the lookup and return found UIDs. The definite_uids are returned
+   to client directly, while for maybe_uids Dovecot first verifies (by
+   opening and reading the mail) that they really do contain the searched
+   keys. The maybe_uids is useful with backends that can only filter out
+   messages, but can't definitively say if the search matched a message. */
 int fts_backend_lookup_deinit(struct fts_backend_lookup_context **ctx,
                              ARRAY_TYPE(seq_range) *definite_uids,
                              ARRAY_TYPE(seq_range) *maybe_uids,
index 5ed941c5801c207ba5ce68170fbb0d18096700db..b30ee92f91a76d144e5e4206d6d7b23dfa45e443 100644 (file)
@@ -6,6 +6,7 @@
 #include "str.h"
 #include "istream.h"
 #include "time-util.h"
+#include "rfc822-parser.h"
 #include "message-parser.h"
 #include "message-decoder.h"
 #include "mail-namespace.h"
@@ -40,6 +41,7 @@ struct fts_storage_build_context {
 
        uint32_t uid;
        string_t *headers;
+       char *content_type, *content_disposition;
 };
 
 struct fts_transaction_context {
@@ -77,20 +79,52 @@ static int fts_build_mail_flush_headers(struct fts_storage_build_context *ctx)
        if (str_len(ctx->headers) == 0)
                return 0;
 
-       if (fts_backend_build_more(ctx->build, ctx->uid, str_data(ctx->headers),
-                                  str_len(ctx->headers), TRUE) < 0)
+       fts_backend_build_hdr(ctx->build, ctx->uid);
+       if (fts_backend_build_more(ctx->build, str_data(ctx->headers),
+                                  str_len(ctx->headers)) < 0)
                return -1;
 
        str_truncate(ctx->headers, 0);
        return 0;
 }
 
-static bool fts_build_want_index_part(const struct message_block *block)
+static void fts_build_parse_content_type(struct fts_storage_build_context *ctx,
+                                        const struct message_header_line *hdr)
 {
-       /* we'll index only text/xxx and message/rfc822 parts for now */
-       return (block->part->flags &
-               (MESSAGE_PART_FLAG_TEXT |
-                MESSAGE_PART_FLAG_MESSAGE_RFC822)) != 0;
+       struct rfc822_parser_context parser;
+       string_t *content_type;
+
+       rfc822_parser_init(&parser, hdr->full_value, hdr->full_value_len, NULL);
+       (void)rfc822_skip_lwsp(&parser);
+
+       T_BEGIN {
+               content_type = t_str_new(64);
+               if (rfc822_parse_content_type(&parser, content_type) >= 0) {
+                       i_free(ctx->content_type);
+                       ctx->content_type = i_strdup(str_c(content_type));
+               }
+       } T_END;
+}
+
+static void
+fts_build_parse_content_disposition(struct fts_storage_build_context *ctx,
+                                   const struct message_header_line *hdr)
+{
+       /* just pass it as-is to backend. */
+       i_free(ctx->content_disposition);
+       ctx->content_disposition =
+               i_strndup(hdr->full_value, hdr->full_value_len);
+}
+
+static void fts_parse_mail_header(struct fts_storage_build_context *ctx,
+                                 const struct message_block *raw_block)
+{
+       const struct message_header_line *hdr = raw_block->hdr;
+
+       if (strcasecmp(hdr->name, "Content-Type") == 0)
+               fts_build_parse_content_type(ctx, hdr);
+       else if (strcasecmp(hdr->name, "Content-Disposition") == 0)
+               fts_build_parse_content_disposition(ctx, hdr);
 }
 
 static void fts_build_mail_header(struct fts_storage_build_context *ctx,
@@ -114,6 +148,7 @@ static int fts_build_mail(struct fts_storage_build_context *ctx, uint32_t uid)
        struct message_decoder_context *decoder;
        struct message_block raw_block, block;
        struct message_part *prev_part, *parts;
+       bool skip_body = FALSE, body_part = FALSE;
        int ret;
 
        ctx->uid = uid;
@@ -125,7 +160,8 @@ static int fts_build_mail(struct fts_storage_build_context *ctx, uint32_t uid)
        parser = message_parser_init(pool_datastack_create(), input,
                                     MESSAGE_HEADER_PARSER_FLAG_CLEAN_ONELINE,
                                     0);
-       decoder = message_decoder_init(MESSAGE_DECODER_FLAG_DTCASE);
+       decoder = message_decoder_init(MESSAGE_DECODER_FLAG_DTCASE |
+                                      MESSAGE_DECODER_FLAG_RETURN_BINARY);
        for (;;) {
                ret = message_parser_parse_next_block(parser, &raw_block);
                i_assert(ret != 0);
@@ -134,30 +170,62 @@ static int fts_build_mail(struct fts_storage_build_context *ctx, uint32_t uid)
                                ret = 0;
                        break;
                }
-               if (raw_block.hdr == NULL && raw_block.size != 0 &&
-                   !fts_build_want_index_part(&raw_block)) {
-                       /* skipping this body */
-                       continue;
+
+               if (raw_block.part != prev_part) {
+                       /* body part changed. we're now parsing the end of
+                          boundary, possibly followed by message epilogue */
+                       if (!skip_body && prev_part != NULL) {
+                               i_assert(body_part);
+                               fts_backend_build_body_end(ctx->build);
+                       }
+                       prev_part = raw_block.part;
+                       i_free_and_null(ctx->content_type);
+                       i_free_and_null(ctx->content_disposition);
+
+                       if (raw_block.size != 0) {
+                               /* multipart. skip until beginning of next
+                                  part's headers */
+                               skip_body = TRUE;
+                       }
+               }
+
+               if (raw_block.hdr != NULL) {
+                       /* always handle headers */
+               } else if (raw_block.size == 0) {
+                       /* end of headers */
+                       const char *content_type = ctx->content_type == NULL ?
+                               "text/plain" : ctx->content_type;
+
+                       skip_body = !fts_backend_build_body_begin(ctx->build,
+                                       ctx->uid, content_type,
+                                       ctx->content_disposition);
+                       body_part = TRUE;
+               } else {
+                       if (skip_body)
+                               continue;
                }
 
                if (!message_decoder_decode_next_block(decoder, &raw_block,
                                                       &block))
                        continue;
 
-               if (block.hdr != NULL)
+               if (block.hdr != NULL) {
+                       fts_parse_mail_header(ctx, &raw_block);
                        fts_build_mail_header(ctx, &block);
-               else if (block.size == 0) {
+               } else if (block.size == 0) {
                        /* end of headers */
                        str_append_c(ctx->headers, '\n');
                } else {
-                       if (fts_backend_build_more(ctx->build, ctx->uid,
-                                                  block.data, block.size,
-                                                  FALSE) < 0) {
+                       i_assert(body_part);
+                       if (fts_backend_build_more(ctx->build,
+                                                  block.data, block.size) < 0) {
                                ret = -1;
                                break;
                        }
                }
        }
+       if (!skip_body && body_part)
+               fts_backend_build_body_end(ctx->build);
        if (message_parser_deinit(&parser, &parts) < 0)
                mail_set_cache_corrupted(ctx->mail, MAIL_FETCH_MESSAGE_PARTS);
        message_decoder_deinit(&decoder);
@@ -483,6 +551,8 @@ static int fts_build_deinit(struct fts_storage_build_context **_ctx)
 
        str_free(&ctx->headers);
        mail_search_args_unref(&ctx->search_args);
+       i_free(ctx->content_type);
+       i_free(ctx->content_disposition);
        i_free(ctx);
        return ret;
 }