]> git.ipfire.org Git - thirdparty/dovecot/core.git/commitdiff
fts: Strip text/html mails to plaintext before sending them to FTS backend.
authorTimo Sirainen <tss@iki.fi>
Wed, 27 Jul 2011 14:58:27 +0000 (17:58 +0300)
committerTimo Sirainen <tss@iki.fi>
Wed, 27 Jul 2011 14:58:27 +0000 (17:58 +0300)
src/plugins/fts/Makefile.am
src/plugins/fts/fts-build-private.h
src/plugins/fts/fts-build.c
src/plugins/fts/fts-parser-html.c [new file with mode: 0644]
src/plugins/fts/fts-parser.c [new file with mode: 0644]
src/plugins/fts/fts-parser.h [new file with mode: 0644]
src/plugins/fts/html-entities.h [new file with mode: 0644]

index 738468781c368f332954df536887ecd8cfd72e43..8fac101ac7f1d5d1ac7a762dd5fb8e9ada34b745 100644 (file)
@@ -17,16 +17,20 @@ lib20_fts_plugin_la_SOURCES = \
        fts-build-indexer.c \
        fts-build-mailbox.c \
        fts-build-virtual.c \
+       fts-parser.c \
+       fts-parser-html.c \
        fts-plugin.c \
        fts-search.c \
        fts-search-serialize.c \
        fts-storage.c
 
 noinst_HEADERS = \
+       html-entities.h \
        fts-api.h \
        fts-api-private.h \
        fts-build.h \
        fts-build-private.h \
+       fts-parser.h \
        fts-plugin.h \
        fts-search-serialize.h \
        fts-storage.h
index 1db6fc6c74d9b7cf0bec71f5252d338d9aee750a..e035ec77703300e943f590d19827a0170be0095b 100644 (file)
@@ -27,6 +27,7 @@ struct fts_storage_build_context {
 
        uint32_t uid;
        char *content_type, *content_disposition;
+       struct fts_parser *body_parser;
 
        unsigned int binary_mime_parts:1;
        unsigned int dtcase:1;
index 6f842904c607ed76461302cdfd0d9559067815aa..d22aa5c025cb22267e9e8fcd0d3f568abc837661 100644 (file)
@@ -11,6 +11,7 @@
 #include "message-decoder.h"
 #include "../virtual/virtual-storage.h"
 #include "fts-api-private.h"
+#include "fts-parser.h"
 #include "fts-build-private.h"
 
 #define FTS_BUILD_NOTIFY_INTERVAL_SECS 10
@@ -103,13 +104,19 @@ static bool fts_build_body_begin(struct fts_storage_build_context *ctx)
        const char *content_type;
        struct fts_backend_build_key key;
 
+       i_assert(ctx->body_parser == NULL);
+
        memset(&key, 0, sizeof(key));
        key.uid = ctx->uid;
 
        content_type = ctx->content_type != NULL ?
                ctx->content_type : "text/plain";
-       if (strncmp(content_type, "text/", 5) == 0 ||
-           strncmp(content_type, "message/", 8) == 0) {
+       if (fts_parser_init(content_type, ctx->content_disposition,
+                           &ctx->body_parser)) {
+               /* extract text using the returned parser */
+               key.type = FTS_BACKEND_BUILD_KEY_BODY_PART;
+       } else if (strncmp(content_type, "text/", 5) == 0 ||
+                  strncmp(content_type, "message/", 8) == 0) {
                /* text body parts */
                key.type = FTS_BACKEND_BUILD_KEY_BODY_PART;
        } else {
@@ -161,6 +168,8 @@ int fts_build_mail(struct fts_storage_build_context *ctx, struct mail *mail)
                if (raw_block.part != prev_part) {
                        /* body part changed. we're now parsing the end of
                           boundary, possibly followed by message epilogue */
+                       if (ctx->body_parser != NULL)
+                               fts_parser_deinit(&ctx->body_parser);
                        fts_backend_update_unset_build_key(ctx->update_ctx);
                        prev_part = raw_block.part;
                        i_free_and_null(ctx->content_type);
@@ -195,6 +204,8 @@ int fts_build_mail(struct fts_storage_build_context *ctx, struct mail *mail)
                        /* end of headers */
                } else {
                        i_assert(body_part);
+                       if (ctx->body_parser != NULL)
+                               fts_parser_more(ctx->body_parser, &block);
                        if (fts_backend_update_build_more(ctx->update_ctx,
                                                          block.data,
                                                          block.size) < 0) {
diff --git a/src/plugins/fts/fts-parser-html.c b/src/plugins/fts/fts-parser-html.c
new file mode 100644 (file)
index 0000000..553eec4
--- /dev/null
@@ -0,0 +1,241 @@
+/* Copyright (c) 2011 Dovecot authors, see the included COPYING file */
+
+#include "lib.h"
+#include "buffer.h"
+#include "unichar.h"
+#include "message-parser.h"
+#include "fts-parser.h"
+
+/* Evaluates to non-zero when c is an ASCII space, tab, CR or LF.
+   The argument is expanded once per comparison, so it must not have side
+   effects.
+
+   Zero-width space (&#x200B;) apparently also belongs here, but that gets a
+   bit tricky to handle.. is it actually used anywhere? */
+#define HTML_WHITESPACE(c) \
+       ((c) == ' ' || (c) == '\t' || (c) == '\r' || (c) == '\n')
+
+/* States of the HTML-to-plaintext state machine driven by parse_data(). */
+enum html_state {
+       /* regular text: bytes are copied to the output, '<' and '&' start
+          a tag or an entity */
+       HTML_STATE_TEXT,
+       /* tag outside "quoted string" */
+       HTML_STATE_TAG,
+       /* tag inside "quoted string" */
+       HTML_STATE_TAG_QUOTED,
+       /* tag -> "escape\ (the byte after the backslash is skipped) */
+       HTML_STATE_TAG_QUOTED_ESCAPE,
+       /* script/style content: everything is dropped until the next '<' */
+       HTML_STATE_IGNORE,
+       /* comment */
+       HTML_STATE_COMMENT,
+       /* comment is ending, we've seen "--" and now just waiting for ">" */
+       HTML_STATE_COMMENT_END
+};
+
+/* Per-part state of the text/html parser. */
+struct html_fts_parser {
+       /* generic parser header; kept as the first member because the
+          callbacks cast struct fts_parser pointers back to this struct */
+       struct fts_parser parser;
+
+       /* current state of the state machine in parse_data() */
+       enum html_state state;
+       /* input: unparsed tail of the previous block, waiting for more data.
+          output: plaintext produced by the current fts_parser_html_more()
+          call */
+       buffer_t *input, *output;
+       /* set when a <style>/<script> tag name was recognized: the text
+          following the tag's '>' is ignored */
+       bool ignore_next_text;
+};
+
+/* Named HTML character entities and the Unicode code points they stand for.
+   The table is only read (by html_entity_get_unichar() in this file), so it
+   is file-local and read-only instead of a writable global symbol exported
+   from the plugin. */
+static const struct {
+       const char *name;
+       unichar_t chr;
+} html_entities[] = {
+#include "html-entities.h"
+};
+
+/* try_init() callback of the HTML parser.
+
+   Returns a newly allocated parser when content_type is exactly "text/html"
+   (compared case-insensitively), otherwise NULL. content_disposition is not
+   looked at. The returned parser must be released through its deinit()
+   callback.
+
+   content_type is read below, so it is no longer marked ATTR_UNUSED. */
+static struct fts_parser *
+fts_parser_html_try_init(const char *content_type,
+                        const char *content_disposition ATTR_UNUSED)
+{
+       struct html_fts_parser *parser;
+
+       if (strcasecmp(content_type, "text/html") != 0)
+               return NULL;
+
+       /* NOTE(review): relies on i_new() returning zeroed memory so that
+          parsing starts in HTML_STATE_TEXT - confirm */
+       parser = i_new(struct html_fts_parser, 1);
+       parser->parser = fts_parser_html;
+       parser->input = buffer_create_dynamic(default_pool, 512);
+       parser->output = buffer_create_dynamic(default_pool, 4096);
+       return &parser->parser;
+}
+
+/* Look at the bytes following a '<' and decide what kind of tag starts here.
+
+   data/size is the input after the '<'. The return value is the number of
+   input bytes consumed, counted from the '<' itself (the caller advances by
+   exactly that many bytes):
+     0 = not enough data yet to decide, call again with more input
+     1 = only the '<' was consumed, parser->state is now HTML_STATE_TAG
+     4 = the whole "<!--" was consumed, state is now HTML_STATE_COMMENT
+
+   For <style> and <script> tags parser->ignore_next_text is also set so
+   that the text following the tag is dropped.
+
+   The return type is size_t (not bool): the function returns byte counts. */
+static size_t
+parse_tag_name(struct html_fts_parser *parser,
+              const unsigned char *data, size_t size)
+{
+       size_t i;
+
+       if (size >= 3 && memcmp(data, "!--", 3) == 0) {
+               parser->state = HTML_STATE_COMMENT;
+               /* '<' + "!--". Consuming the whole opener keeps its last
+                  '-' from being taken as the beginning of the "--"
+                  terminator, so that "<!---->" is closed properly. */
+               return 1 + 3;
+       }
+
+       if (size > 5 && i_memcasecmp(data, "style", 5) == 0) {
+               i = 5;
+       } else if (size > 6 && i_memcasecmp(data, "script", 6) == 0) {
+               i = 6;
+       } else {
+               if (size <= 6) {
+                       /* too short to rule out "!--", "style" or "script"
+                          by length alone. that's fine if the end of the
+                          tag name is already visible. */
+                       for (i = 0; i < size; i++) {
+                               if (HTML_WHITESPACE(data[i]) ||
+                                   data[i] == '>')
+                                       break;
+                       }
+                       if (i == size) {
+                               /* need more data */
+                               return 0;
+                       }
+               }
+               parser->state = HTML_STATE_TAG;
+               return 1;
+       }
+       parser->state = HTML_STATE_TAG;
+       /* only an exact "style"/"script" name counts, not e.g. "styles" */
+       if (HTML_WHITESPACE(data[i]) || data[i] == '>')
+               parser->ignore_next_text = TRUE;
+       return 1;
+}
+
+/* Look up a named HTML entity (without the surrounding '&' and ';').
+
+   Returns TRUE and stores the code point in *chr_r when the name is known,
+   FALSE otherwise (*chr_r is left untouched).
+
+   Entity names are case-sensitive ("Dagger" and "dagger", "Agrave" and
+   "agrave" are different characters), so an exact match is tried first.
+   A case-insensitive match is kept only as a fallback for sloppy input such
+   as "&AMP;". */
+static bool html_entity_get_unichar(const char *name, unichar_t *chr_r)
+{
+       unsigned int i;
+
+       for (i = 0; i < N_ELEMENTS(html_entities); i++) {
+               if (strcmp(html_entities[i].name, name) == 0) {
+                       *chr_r = html_entities[i].chr;
+                       return TRUE;
+               }
+       }
+       for (i = 0; i < N_ELEMENTS(html_entities); i++) {
+               if (strcasecmp(html_entities[i].name, name) == 0) {
+                       *chr_r = html_entities[i].chr;
+                       return TRUE;
+               }
+       }
+       return FALSE;
+}
+
+/* Parse a named entity. data/size is the input following the '&'.
+
+   The return value is the number of input bytes consumed, counted from the
+   '&' itself (the caller advances by exactly that many bytes):
+     0     = the terminating ';' hasn't been seen yet, more data is needed
+     1     = broken entity (whitespace in the name or the name is too long):
+             only the '&' is dropped
+     other = '&' + name + ';' were consumed. If the name is known, the
+             character is appended to parser->output as UTF-8; unknown
+             names produce no output. */
+static size_t parse_entity(struct html_fts_parser *parser,
+                          const unsigned char *data, size_t size)
+{
+       char entity[10];
+       unichar_t chr;
+       size_t i;
+
+       for (i = 0; i < size; i++) {
+               /* the length check must come before the ';' check: otherwise
+                  a name of exactly sizeof(entity) bytes followed by ';'
+                  leaves the loop with i == sizeof(entity) and trips the
+                  assert below */
+               if (HTML_WHITESPACE(data[i]) || i >= sizeof(entity)) {
+                       /* broken entity */
+                       return 1;
+               }
+               if (data[i] == ';')
+                       break;
+       }
+       if (i == size)
+               return 0;
+
+       i_assert(i < sizeof(entity));
+       memcpy(entity, data, i); entity[i] = '\0';
+
+       if (html_entity_get_unichar(entity, &chr))
+               uni_ucs4_to_utf8_c(chr, parser->output);
+       /* '&' + name + ';' - without counting the '&' the ';' would be
+          left in the input and copied to the output as text */
+       return 1 + i + 1;
+}
+
+/* Feed data through the HTML state machine, appending extracted text to
+   parser->output.
+
+   Returns the number of bytes that were fully handled. A value smaller than
+   size means that the byte at that offset starts a token (tag name, entity
+   or possible comment terminator) that can't be decided without more
+   input; the caller has to present the data again from that offset. */
+static size_t
+parse_data(struct html_fts_parser *parser,
+          const unsigned char *data, size_t size)
+{
+       size_t pos, used;
+
+       for (pos = 0; pos < size; pos++) {
+               /* what follows the current byte */
+               const unsigned char *rest = data + pos + 1;
+               size_t rest_size = size - pos - 1;
+               char ch = data[pos];
+
+               switch (parser->state) {
+               case HTML_STATE_TEXT:
+                       if (ch != '<' && ch != '&') {
+                               buffer_append_c(parser->output, ch);
+                               break;
+                       }
+                       if (ch == '<')
+                               used = parse_tag_name(parser, rest, rest_size);
+                       else
+                               used = parse_entity(parser, rest, rest_size);
+                       if (used == 0) {
+                               /* token is incomplete, stop in front of it */
+                               return pos;
+                       }
+                       /* the loop's increment accounts for one byte */
+                       pos += used - 1;
+                       break;
+               case HTML_STATE_TAG:
+                       if (ch == '>') {
+                               if (parser->ignore_next_text)
+                                       parser->state = HTML_STATE_IGNORE;
+                               else
+                                       parser->state = HTML_STATE_TEXT;
+                       } else if (ch == '"') {
+                               parser->state = HTML_STATE_TAG_QUOTED;
+                       }
+                       break;
+               case HTML_STATE_TAG_QUOTED:
+                       if (ch == '\\')
+                               parser->state = HTML_STATE_TAG_QUOTED_ESCAPE;
+                       else if (ch == '"')
+                               parser->state = HTML_STATE_TAG;
+                       break;
+               case HTML_STATE_TAG_QUOTED_ESCAPE:
+                       /* the escaped byte itself is skipped */
+                       parser->state = HTML_STATE_TAG_QUOTED;
+                       break;
+               case HTML_STATE_IGNORE:
+                       if (ch == '<') {
+                               parser->state = HTML_STATE_TAG;
+                               parser->ignore_next_text = FALSE;
+                       }
+                       break;
+               case HTML_STATE_COMMENT:
+                       if (ch != '-')
+                               break;
+                       if (rest_size == 0) {
+                               /* can't tell yet if this begins "--" */
+                               return pos;
+                       }
+                       if (rest[0] == '-') {
+                               parser->state = HTML_STATE_COMMENT_END;
+                               pos++;
+                       }
+                       break;
+               case HTML_STATE_COMMENT_END:
+                       if (ch == '>')
+                               parser->state = HTML_STATE_TEXT;
+                       else if (!HTML_WHITESPACE(ch))
+                               parser->state = HTML_STATE_COMMENT;
+                       break;
+               }
+       }
+       return pos;
+}
+
+/* more() callback of the HTML parser.
+
+   Converts the HTML in block to plaintext. On return block->data and
+   block->size describe the extracted text, which lives in parser->output
+   and is overwritten by the next call. Input that ends in the middle of a
+   token is kept in parser->input and is parsed together with the beginning
+   of the next block. */
+static void fts_parser_html_more(struct fts_parser *_parser,
+                                struct message_block *block)
+{
+       struct html_fts_parser *parser = (struct html_fts_parser *)_parser;
+       size_t size, added, buf_orig_size;
+
+       buffer_set_used_size(parser->output, 0);
+
+       if (parser->input->used > 0) {
+               /* we didn't get enough input the last time to know
+                  what to do. */
+               buf_orig_size = parser->input->used;
+
+               /* only a bounded prefix of the new block is copied;
+                  presumably that is always enough to finish the token */
+               added = I_MIN(block->size, 128);
+               buffer_append(parser->input, block->data, added);
+               size = parse_data(parser, parser->input->data,
+                                 parser->input->used);
+
+               if (size < buf_orig_size) {
+                       /* the new data was too short to finish the buffered
+                          token (e.g. a block of only a couple of bytes).
+                          this used to be an assert failure. keep everything
+                          that is still unparsed, including the rest of this
+                          block, for the next call. */
+                       buffer_append(parser->input, block->data + added,
+                                     block->size - added);
+                       buffer_delete(parser->input, 0, size);
+                       block->data = parser->output->data;
+                       block->size = parser->output->used;
+                       return;
+               }
+               /* skip the part of the block that was parsed via the
+                  input buffer */
+               block->data += size - buf_orig_size;
+               block->size -= size - buf_orig_size;
+               buffer_set_used_size(parser->input, 0);
+       }
+       size = parse_data(parser, block->data, block->size);
+       /* save the unparsed tail for the next call */
+       buffer_append(parser->input, block->data + size, block->size - size);
+
+       block->data = parser->output->data;
+       block->size = parser->output->used;
+}
+
+/* deinit() callback of the HTML parser: frees both buffers and then the
+   parser itself. */
+static void fts_parser_html_deinit(struct fts_parser *_parser)
+{
+       struct html_fts_parser *html = (struct html_fts_parser *)_parser;
+
+       buffer_free(&html->input);
+       buffer_free(&html->output);
+       i_free(html);
+}
+
+/* Callback table of the text/html parser. The initializers are positional:
+   they must stay in the order of the members of struct fts_parser
+   (try_init, more, deinit). fts_parser_html_try_init() copies this struct
+   into each parser it creates. */
+struct fts_parser fts_parser_html = {
+       fts_parser_html_try_init,
+       fts_parser_html_more,
+       fts_parser_html_deinit
+};
diff --git a/src/plugins/fts/fts-parser.c b/src/plugins/fts/fts-parser.c
new file mode 100644 (file)
index 0000000..991d872
--- /dev/null
@@ -0,0 +1,35 @@
+/* Copyright (c) 2011 Dovecot authors, see the included COPYING file */
+
+#include "lib.h"
+#include "fts-parser.h"
+
+/* All known body parsers, tried in this order by fts_parser_init().
+   The table is private to this file and never modified, so it is static
+   and its slots are const. */
+static const struct fts_parser *const parsers[] = {
+       &fts_parser_html
+};
+
+/* Offer the body part to every parser in parsers[] until one accepts it.
+
+   Returns TRUE with the new parser in *parser_r when a parser accepted the
+   part. Returns FALSE when none did; *parser_r then holds the NULL returned
+   by the last parser tried. */
+bool fts_parser_init(const char *content_type, const char *content_disposition,
+                    struct fts_parser **parser_r)
+{
+       unsigned int idx = 0;
+
+       while (idx < N_ELEMENTS(parsers)) {
+               struct fts_parser *created;
+
+               created = parsers[idx]->try_init(content_type,
+                                                content_disposition);
+               *parser_r = created;
+               if (created != NULL)
+                       return TRUE;
+               idx++;
+       }
+       return FALSE;
+}
+
+/* Hand the block over to the parser's own more() callback. */
+void fts_parser_more(struct fts_parser *parser, struct message_block *block)
+{
+       (*parser->more)(parser, block);
+}
+
+/* Clear the caller's pointer, then let the parser free itself through its
+   deinit() callback. */
+void fts_parser_deinit(struct fts_parser **_parser)
+{
+       struct fts_parser *doomed = *_parser;
+
+       *_parser = NULL;
+       (*doomed->deinit)(doomed);
+}
diff --git a/src/plugins/fts/fts-parser.h b/src/plugins/fts/fts-parser.h
new file mode 100644 (file)
index 0000000..f4f0240
--- /dev/null
@@ -0,0 +1,20 @@
+#ifndef FTS_PARSER_H
+#define FTS_PARSER_H
+
+struct message_block;
+
+/* Callback table of a body part parser. Implementations embed this struct
+   in their own parser state. */
+struct fts_parser {
+       /* returns a new parser if this implementation handles the given
+          content type/disposition, NULL if it doesn't */
+       struct fts_parser *(*try_init)(const char *content_type,
+                                      const char *content_disposition);
+       /* processes one block of the body part; block is updated in place */
+       void (*more)(struct fts_parser *parser, struct message_block *block);
+       /* frees the parser */
+       void (*deinit)(struct fts_parser *parser);
+};
+
+/* Parser that strips text/html parts down to plaintext. */
+extern struct fts_parser fts_parser_html;
+
+/* Finds a parser for a body part with the given content type/disposition.
+   Returns TRUE and stores the parser in *parser_r if one was found,
+   FALSE if no parser handles the part. */
+bool fts_parser_init(const char *content_type, const char *content_disposition,
+                    struct fts_parser **parser_r);
+/* Runs the block through the parser. The parser updates block in place:
+   the HTML parser makes block->data/size point at the extracted text. */
+void fts_parser_more(struct fts_parser *parser, struct message_block *block);
+/* Frees the parser and sets *parser to NULL. */
+void fts_parser_deinit(struct fts_parser **parser);
+
+#endif
diff --git a/src/plugins/fts/html-entities.h b/src/plugins/fts/html-entities.h
new file mode 100644 (file)
index 0000000..a3a9f96
--- /dev/null
@@ -0,0 +1,253 @@
+{ "quot",      0x0022 },
+{ "amp",       0x0026 },
+{ "apos",      0x0027 },
+{ "lt",                0x003C },
+{ "gt",                0x003E },
+{ "nbsp",      0x00A0 },
+{ "iexcl",     0x00A1 },
+{ "cent",      0x00A2 },
+{ "pound",     0x00A3 },
+{ "curren",    0x00A4 },
+{ "yen",       0x00A5 },
+{ "brvbar",    0x00A6 },
+{ "sect",      0x00A7 },
+{ "uml",       0x00A8 },
+{ "copy",      0x00A9 },
+{ "ordf",      0x00AA },
+{ "laquo",     0x00AB },
+{ "not",       0x00AC },
+{ "shy",       0x00AD },
+{ "reg",       0x00AE },
+{ "macr",      0x00AF },
+{ "deg",       0x00B0 },
+{ "plusmn",    0x00B1 },
+{ "sup2",      0x00B2 },
+{ "sup3",      0x00B3 },
+{ "acute",     0x00B4 },
+{ "micro",     0x00B5 },
+{ "para",      0x00B6 },
+{ "middot",    0x00B7 },
+{ "cedil",     0x00B8 },
+{ "sup1",      0x00B9 },
+{ "ordm",      0x00BA },
+{ "raquo",     0x00BB },
+{ "frac14",    0x00BC },
+{ "frac12",    0x00BD },
+{ "frac34",    0x00BE },
+{ "iquest",    0x00BF },
+{ "Agrave",    0x00C0 },
+{ "Aacute",    0x00C1 },
+{ "Acirc",     0x00C2 },
+{ "Atilde",    0x00C3 },
+{ "Auml",      0x00C4 },
+{ "Aring",     0x00C5 },
+{ "AElig",     0x00C6 },
+{ "Ccedil",    0x00C7 },
+{ "Egrave",    0x00C8 },
+{ "Eacute",    0x00C9 },
+{ "Ecirc",     0x00CA },
+{ "Euml",      0x00CB },
+{ "Igrave",    0x00CC },
+{ "Iacute",    0x00CD },
+{ "Icirc",     0x00CE },
+{ "Iuml",      0x00CF },
+{ "ETH",       0x00D0 },
+{ "Ntilde",    0x00D1 },
+{ "Ograve",    0x00D2 },
+{ "Oacute",    0x00D3 },
+{ "Ocirc",     0x00D4 },
+{ "Otilde",    0x00D5 },
+{ "Ouml",      0x00D6 },
+{ "times",     0x00D7 },
+{ "Oslash",    0x00D8 },
+{ "Ugrave",    0x00D9 },
+{ "Uacute",    0x00DA },
+{ "Ucirc",     0x00DB },
+{ "Uuml",      0x00DC },
+{ "Yacute",    0x00DD },
+{ "THORN",     0x00DE },
+{ "szlig",     0x00DF },
+{ "agrave",    0x00E0 },
+{ "aacute",    0x00E1 },
+{ "acirc",     0x00E2 },
+{ "atilde",    0x00E3 },
+{ "auml",      0x00E4 },
+{ "aring",     0x00E5 },
+{ "aelig",     0x00E6 },
+{ "ccedil",    0x00E7 },
+{ "egrave",    0x00E8 },
+{ "eacute",    0x00E9 },
+{ "ecirc",     0x00EA },
+{ "euml",      0x00EB },
+{ "igrave",    0x00EC },
+{ "iacute",    0x00ED },
+{ "icirc",     0x00EE },
+{ "iuml",      0x00EF },
+{ "eth",       0x00F0 },
+{ "ntilde",    0x00F1 },
+{ "ograve",    0x00F2 },
+{ "oacute",    0x00F3 },
+{ "ocirc",     0x00F4 },
+{ "otilde",    0x00F5 },
+{ "ouml",      0x00F6 },
+{ "divide",    0x00F7 },
+{ "oslash",    0x00F8 },
+{ "ugrave",    0x00F9 },
+{ "uacute",    0x00FA },
+{ "ucirc",     0x00FB },
+{ "uuml",      0x00FC },
+{ "yacute",    0x00FD },
+{ "thorn",     0x00FE },
+{ "yuml",      0x00FF },
+{ "OElig",     0x0152 },
+{ "oelig",     0x0153 },
+{ "Scaron",    0x0160 },
+{ "scaron",    0x0161 },
+{ "Yuml",      0x0178 },
+{ "fnof",      0x0192 },
+{ "circ",      0x02C6 },
+{ "tilde",     0x02DC },
+{ "Alpha",     0x0391 },
+{ "Beta",      0x0392 },
+{ "Gamma",     0x0393 },
+{ "Delta",     0x0394 },
+{ "Epsilon",   0x0395 },
+{ "Zeta",      0x0396 },
+{ "Eta",       0x0397 },
+{ "Theta",     0x0398 },
+{ "Iota",      0x0399 },
+{ "Kappa",     0x039A },
+{ "Lambda",    0x039B },
+{ "Mu",                0x039C },
+{ "Nu",                0x039D },
+{ "Xi",                0x039E },
+{ "Omicron",   0x039F },
+{ "Pi",                0x03A0 },
+{ "Rho",       0x03A1 },
+{ "Sigma",     0x03A3 },
+{ "Tau",       0x03A4 },
+{ "Upsilon",   0x03A5 },
+{ "Phi",       0x03A6 },
+{ "Chi",       0x03A7 },
+{ "Psi",       0x03A8 },
+{ "Omega",     0x03A9 },
+{ "alpha",     0x03B1 },
+{ "beta",      0x03B2 },
+{ "gamma",     0x03B3 },
+{ "delta",     0x03B4 },
+{ "epsilon",   0x03B5 },
+{ "zeta",      0x03B6 },
+{ "eta",       0x03B7 },
+{ "theta",     0x03B8 },
+{ "iota",      0x03B9 },
+{ "kappa",     0x03BA },
+{ "lambda",    0x03BB },
+{ "mu",                0x03BC },
+{ "nu",                0x03BD },
+{ "xi",                0x03BE },
+{ "omicron",   0x03BF },
+{ "pi",                0x03C0 },
+{ "rho",       0x03C1 },
+{ "sigmaf",    0x03C2 },
+{ "sigma",     0x03C3 },
+{ "tau",       0x03C4 },
+{ "upsilon",   0x03C5 },
+{ "phi",       0x03C6 },
+{ "chi",       0x03C7 },
+{ "psi",       0x03C8 },
+{ "omega",     0x03C9 },
+{ "thetasym",  0x03D1 },
+{ "upsih",     0x03D2 },
+{ "piv",       0x03D6 },
+{ "ensp",      0x2002 },
+{ "emsp",      0x2003 },
+{ "thinsp",    0x2009 },
+{ "zwnj",      0x200C },
+{ "zwj",       0x200D },
+{ "lrm",       0x200E },
+{ "rlm",       0x200F },
+{ "ndash",     0x2013 },
+{ "mdash",     0x2014 },
+{ "lsquo",     0x2018 },
+{ "rsquo",     0x2019 },
+{ "sbquo",     0x201A },
+{ "ldquo",     0x201C },
+{ "rdquo",     0x201D },
+{ "bdquo",     0x201E },
+{ "dagger",    0x2020 },
+{ "Dagger",    0x2021 },
+{ "bull",      0x2022 },
+{ "hellip",    0x2026 },
+{ "permil",    0x2030 },
+{ "prime",     0x2032 },
+{ "Prime",     0x2033 },
+{ "lsaquo",    0x2039 },
+{ "rsaquo",    0x203A },
+{ "oline",     0x203E },
+{ "frasl",     0x2044 },
+{ "euro",      0x20AC },
+{ "image",     0x2111 },
+{ "weierp",    0x2118 },
+{ "real",      0x211C },
+{ "trade",     0x2122 },
+{ "alefsym",   0x2135 },
+{ "larr",      0x2190 },
+{ "uarr",      0x2191 },
+{ "rarr",      0x2192 },
+{ "darr",      0x2193 },
+{ "harr",      0x2194 },
+{ "crarr",     0x21B5 },
+{ "lArr",      0x21D0 },
+{ "uArr",      0x21D1 },
+{ "rArr",      0x21D2 },
+{ "dArr",      0x21D3 },
+{ "hArr",      0x21D4 },
+{ "forall",    0x2200 },
+{ "part",      0x2202 },
+{ "exist",     0x2203 },
+{ "empty",     0x2205 },
+{ "nabla",     0x2207 },
+{ "isin",      0x2208 },
+{ "notin",     0x2209 },
+{ "ni",                0x220B },
+{ "prod",      0x220F },
+{ "sum",       0x2211 },
+{ "minus",     0x2212 },
+{ "lowast",    0x2217 },
+{ "radic",     0x221A },
+{ "prop",      0x221D },
+{ "infin",     0x221E },
+{ "ang",       0x2220 },
+{ "and",       0x2227 },
+{ "or",                0x2228 },
+{ "cap",       0x2229 },
+{ "cup",       0x222A },
+{ "int",       0x222B },
+{ "there4",    0x2234 },
+{ "sim",       0x223C },
+{ "cong",      0x2245 },
+{ "asymp",     0x2248 },
+{ "ne",                0x2260 },
+{ "equiv",     0x2261 },
+{ "le",                0x2264 },
+{ "ge",                0x2265 },
+{ "sub",       0x2282 },
+{ "sup",       0x2283 },
+{ "nsub",      0x2284 },
+{ "sube",      0x2286 },
+{ "supe",      0x2287 },
+{ "oplus",     0x2295 },
+{ "otimes",    0x2297 },
+{ "perp",      0x22A5 },
+{ "sdot",      0x22C5 },
+{ "lceil",     0x2308 },
+{ "rceil",     0x2309 },
+{ "lfloor",    0x230A },
+{ "rfloor",    0x230B },
+{ "lang",      0x27E8 },
+{ "rang",      0x27E9 },
+{ "loz",       0x25CA },
+{ "spades",    0x2660 },
+{ "clubs",     0x2663 },
+{ "hearts",    0x2665 },
+{ "diams",     0x2666 }