]> git.ipfire.org Git - thirdparty/dovecot/core.git/commitdiff
lib-mail: Added message_snippet_generate() to produce a short text snippet of a mail.
authorTimo Sirainen <tss@iki.fi>
Fri, 16 Jan 2015 22:23:36 +0000 (00:23 +0200)
committerTimo Sirainen <tss@iki.fi>
Fri, 16 Jan 2015 22:23:36 +0000 (00:23 +0200)
src/lib-mail/Makefile.am
src/lib-mail/message-snippet.c [new file with mode: 0644]
src/lib-mail/message-snippet.h [new file with mode: 0644]
src/lib-mail/test-message-snippet.c [new file with mode: 0644]

index 085c3c68f117e0c783a8ef0bf9cfe2d15a934ed7..4f21529cbbc53aa48efbd619088ee896b1312f2a 100644 (file)
@@ -29,6 +29,7 @@ libmail_la_SOURCES = \
        message-part-serialize.c \
        message-search.c \
        message-size.c \
+       message-snippet.c \
        ostream-dot.c \
        quoted-printable.c \
        rfc2231-parser.c \
@@ -62,6 +63,7 @@ headers = \
        message-part-serialize.h \
        message-search.h \
        message-size.h \
+       message-snippet.h \
        ostream-dot.h \
        quoted-printable.h \
        rfc2231-parser.h \
@@ -87,6 +89,7 @@ test_programs = \
        test-message-id \
        test-message-parser \
        test-message-part \
+       test-message-snippet \
        test-ostream-dot \
        test-quoted-printable \
        test-rfc2231-parser
@@ -166,6 +169,10 @@ test_message_part_SOURCES = test-message-part.c
 test_message_part_LDADD = message-part.lo message-parser.lo message-header-parser.lo message-size.lo rfc822-parser.lo rfc2231-parser.lo $(test_libs)
 test_message_part_DEPENDENCIES = $(test_deps)
 
+# test-message-snippet: links the snippet generator with html2text, the link set of the message-decoder test and the message parser objects
+test_message_snippet_SOURCES = test-message-snippet.c
+test_message_snippet_LDADD = message-snippet.lo mail-html2text.lo $(test_message_decoder_LDADD) message-parser.lo message-header-parser.lo message-header-decode.lo message-size.lo
+test_message_snippet_DEPENDENCIES = $(test_deps)
+
 test_mail_html2text_SOURCES = test-mail-html2text.c
 test_mail_html2text_LDADD = mail-html2text.lo $(test_libs)
 test_mail_html2text_DEPENDENCIES = $(test_deps)
diff --git a/src/lib-mail/message-snippet.c b/src/lib-mail/message-snippet.c
new file mode 100644 (file)
index 0000000..2446258
--- /dev/null
@@ -0,0 +1,136 @@
+/* Copyright (c) 2015 Dovecot authors, see the included COPYING file */
+
+#include "lib.h"
+#include "buffer.h"
+#include "str.h"
+#include "istream.h"
+#include "mail-html2text.h"
+#include "message-parser.h"
+#include "message-decoder.h"
+#include "message-snippet.h"
+
+/* Line-oriented scanner state used while walking the decoded body text. */
+enum snippet_state {
+       /* at the start of a line; a '>' seen here marks a quoted line */
+       SNIPPET_STATE_NEWLINE = 0,
+       /* inside ordinary text that is copied into the snippet */
+       SNIPPET_STATE_NORMAL = 1,
+       /* inside a quoted line; everything up to the next LF is dropped */
+       SNIPPET_STATE_QUOTED = 2
+};
+
+/* Working state shared by message_snippet_generate() and snippet_generate()
+   while one message is being processed. */
+struct snippet_context {
+       /* destination string; text is appended to it */
+       string_t *snippet;
+       /* how many more characters may still be appended */
+       unsigned int chars_left;
+       /* current scanner state, kept across input blocks */
+       enum snippet_state state;
+       /* TRUE when whitespace was seen and a single separator space is
+          pending before the next non-whitespace character */
+       bool add_whitespace;
+       /* non-NULL only for text/html parts: the HTML-to-text converter */
+       struct mail_html2text *html2text;
+       /* scratch buffer receiving the converter's output for each block */
+       buffer_t *plain_output;
+};
+
+/* Feed one decoded body block into the snippet.
+
+   When ctx->html2text is set the block is first converted to plain text.
+   Runs of CR, LF, TAB and space are collapsed into a single separator
+   space, and for non-HTML input every line beginning with '>' is skipped
+   as quoted text.
+
+   Returns TRUE if more input may still be added and FALSE once the
+   character budget is used up. No separator space is emitted while the
+   snippet is still empty, none is left dangling at the end, and no more
+   than the budgeted number of characters is appended. */
+static bool snippet_generate(struct snippet_context *ctx,
+                            const unsigned char *data, size_t size)
+{
+       size_t i;
+       unsigned int count;
+
+       if (ctx->html2text != NULL) {
+               /* scan the converter's output instead of the raw HTML */
+               buffer_set_used_size(ctx->plain_output, 0);
+               mail_html2text_more(ctx->html2text, data, size,
+                                   ctx->plain_output);
+               data = ctx->plain_output->data;
+               size = ctx->plain_output->used;
+       }
+
+       /* message-decoder should feed us only valid and complete
+          UTF-8 input */
+       for (i = 0; i < size; i += count) {
+               count = 1;
+               switch (ctx->state) {
+               case SNIPPET_STATE_NEWLINE:
+                       if (data[i] == '>' && ctx->html2text == NULL) {
+                               ctx->state = SNIPPET_STATE_QUOTED;
+                               break;
+                       }
+                       ctx->state = SNIPPET_STATE_NORMAL;
+                       /* fallthrough */
+               case SNIPPET_STATE_NORMAL:
+                       if (data[i] == '\r' || data[i] == '\n' ||
+                           data[i] == '\t' || data[i] == ' ') {
+                               /* remember a pending separator, but never
+                                  put one in front of the first character */
+                               if (str_len(ctx->snippet) > 0)
+                                       ctx->add_whitespace = TRUE;
+                               if (data[i] == '\n')
+                                       ctx->state = SNIPPET_STATE_NEWLINE;
+                               break;
+                       }
+                       if (ctx->add_whitespace) {
+                               /* the space is only worth adding if the
+                                  character after it fits as well;
+                                  otherwise stop without a trailing space */
+                               if (ctx->chars_left < 2)
+                                       return FALSE;
+                               str_append_c(ctx->snippet, ' ');
+                               ctx->add_whitespace = FALSE;
+                               ctx->chars_left--;
+                       }
+                       /* test before decrementing so the unsigned counter
+                          never wraps around */
+                       if (ctx->chars_left == 0)
+                               return FALSE;
+                       ctx->chars_left--;
+                       /* copy the whole multi-byte sequence at once */
+                       count = uni_utf8_char_bytes(data[i]);
+                       i_assert(i + count <= size);
+                       str_append_n(ctx->snippet, data + i, count);
+                       break;
+               case SNIPPET_STATE_QUOTED:
+                       /* drop everything up to the end of the quoted line */
+                       if (data[i] == '\n')
+                               ctx->state = SNIPPET_STATE_NEWLINE;
+                       break;
+               }
+       }
+       return TRUE;
+}
+
+/* Parse and decode the MIME part readable from input and append a text
+   snippet, limited by max_snippet_chars, to snippet. A Content-Type outside
+   text/ ends processing before anything is appended. Returns 0 on success
+   and -1 if the input stream reported an error. */
+int message_snippet_generate(struct istream *input,
+                            unsigned int max_snippet_chars,
+                            string_t *snippet)
+{
+       struct message_block raw_block, block;
+       struct message_decoder_context *decoder;
+       struct message_parser_ctx *parser;
+       struct message_part *parts;
+       struct snippet_context ctx;
+       const char *ct;
+       pool_t pool;
+       int ret;
+
+       memset(&ctx, 0, sizeof(ctx));
+       ctx.chars_left = max_snippet_chars;
+       ctx.snippet = snippet;
+
+       /* the pool only backs the html2text scratch buffer */
+       pool = pool_alloconly_create("message snippet", 1024);
+       parser = message_parser_init(pool_datastack_create(), input, 0, 0);
+       decoder = message_decoder_init(NULL, 0);
+       for (;;) {
+               ret = message_parser_parse_next_block(parser, &raw_block);
+               if (ret <= 0)
+                       break;
+               if (!message_decoder_decode_next_block(decoder, &raw_block, &block))
+                       continue;
+               if (block.size > 0) {
+                       /* decoded data: stop as soon as the budget is used */
+                       if (!snippet_generate(&ctx, block.data, block.size))
+                               break;
+                       continue;
+               }
+               if (block.hdr != NULL)
+                       continue;
+
+               /* end of headers - verify that we can use this
+                  Content-Type. we get here only once, because we
+                  always handle only one non-multipart MIME part. */
+               ct = message_decoder_current_content_type(decoder);
+               if (ct == NULL) {
+                       /* text/plain */
+                       continue;
+               }
+               if (strcasecmp(ct, "text/html") == 0) {
+                       ctx.html2text = mail_html2text_init(MAIL_HTML2TEXT_FLAG_SKIP_QUOTED);
+                       ctx.plain_output = buffer_create_dynamic(pool, 1024);
+                       continue;
+               }
+               if (strncasecmp(ct, "text/", 5) != 0)
+                       break;
+       }
+       i_assert(ret != 0);
+       message_decoder_deinit(&decoder);
+       if (message_parser_deinit(&parser, &parts) < 0)
+               i_unreached();
+       if (ctx.html2text != NULL)
+               mail_html2text_deinit(&ctx.html2text);
+       pool_unref(&pool);
+
+       if (input->stream_errno != 0)
+               return -1;
+       return 0;
+}
diff --git a/src/lib-mail/message-snippet.h b/src/lib-mail/message-snippet.h
new file mode 100644 (file)
index 0000000..fe9c3b6
--- /dev/null
@@ -0,0 +1,14 @@
+#ifndef MESSAGE_SNIPPET_H
+#define MESSAGE_SNIPPET_H
+
+/* Generate a UTF-8 text snippet from the beginning of the given mail input
+   stream and append it to snippet. The stream is expected to start at the
+   headers of the MIME part whose snippet is being generated. Returns 0 if
+   ok, -1 if I/O error.
+
+   max_snippet_chars is the budget of UTF-8 characters for the snippet.
+   Runs of whitespace are collapsed into a single space and quoted text
+   (lines starting with '>' in plain text, quoted parts of HTML) is left out.
+
+   Currently only Content-Type: text/ is supported, others will result in
+   nothing being added to the string. */
+int message_snippet_generate(struct istream *input,
+                            unsigned int max_snippet_chars,
+                            string_t *snippet);
+
+#endif
diff --git a/src/lib-mail/test-message-snippet.c b/src/lib-mail/test-message-snippet.c
new file mode 100644 (file)
index 0000000..2444a9b
--- /dev/null
@@ -0,0 +1,80 @@
+/* Copyright (c) 2015 Dovecot authors, see the included COPYING file */
+
+#include "lib.h"
+#include "str.h"
+#include "istream.h"
+#include "message-snippet.h"
+#include "test-common.h"
+
+/* Test vectors: a raw MIME part (headers, empty line, body), the character
+   budget passed to message_snippet_generate() and the expected snippet. */
+static struct {
+       const char *input;
+       unsigned int max_snippet_chars;
+       const char *output;
+} tests[] = {
+       /* output is cut off once max_snippet_chars characters were added */
+       { "Content-Type: text/plain\n"
+         "\n"
+         "1234567890 234567890",
+         12,
+         "1234567890 2" },
+       /* a line starting with '>' is skipped; line breaks become spaces */
+       { "Content-Type: text/plain\n"
+         "\n"
+         "line1\n>quote2\nline2\n",
+         100,
+         "line1 line2" },
+       /* '>' only marks a quote at the very start of a line (" > line4" is
+          kept); whitespace runs collapse and trailing whitespace is dropped */
+       { "Content-Type: text/plain\n"
+         "\n"
+         "line1\n>quote2\n> quote3\n > line4\n\n  \t\t  \nline5\n  \t ",
+         100,
+         "line1 > line4 line5" },
+       /* the budget counts UTF-8 characters, not bytes */
+       { "Content-Type: text/plain; charset=utf-8\n"
+         "\n"
+         "hyv\xC3\xA4\xC3\xA4 p\xC3\xA4iv\xC3\xA4\xC3\xA4",
+         11,
+         "hyv\xC3\xA4\xC3\xA4 p\xC3\xA4iv\xC3\xA4" },
+       /* same text, delivered quoted-printable encoded */
+       { "Content-Type: text/plain; charset=utf-8\n"
+         "Content-Transfer-Encoding: quoted-printable\n"
+         "\n"
+         "hyv=C3=A4=C3=A4 p=C3=A4iv=C3=A4=C3=A4",
+         11,
+         "hyv\xC3\xA4\xC3\xA4 p\xC3\xA4iv\xC3\xA4" },
+
+       /* text/html: markup is removed, <blockquote> content is left out and
+          the decoded "&gt;" at a line start is not treated as a quote */
+       { "Content-Transfer-Encoding: quoted-printable\n"
+         "Content-Type: text/html;\n"
+         "      charset=utf-8\n"
+         "\n"
+         "<html><head><meta http-equiv=3D\"Content-Type\" content=3D\"text/html =\n"
+         "charset=3Dutf-8\"></head><body style=3D\"word-wrap: break-word; =\n"
+         "-webkit-nbsp-mode: space; -webkit-line-break: after-white-space;\" =\n"
+         "class=3D\"\">Hi,<div class=3D\"\"><br class=3D\"\"></div><div class=3D\"\">How =\n"
+         "is it going? <blockquote>quoted text is ignored</blockquote>\n"
+         "&gt; -foo\n"
+         "</div><br =class=3D\"\"></body></html>=\n",
+         100,
+         "Hi, How is it going? > -foo" },
+};
+
+/* Run message_snippet_generate() over every entry of tests[] and compare the
+   generated snippet with the expected output. */
+static void test_message_snippet(void)
+{
+       struct istream *input;
+       string_t *str;
+       unsigned int i = 0;
+
+       str = t_str_new(128);
+       test_begin("message snippet");
+       while (i < N_ELEMENTS(tests)) {
+               /* the same string is reused, so empty it for each case */
+               str_truncate(str, 0);
+               input = i_stream_create_from_data(tests[i].input, strlen(tests[i].input));
+               test_assert_idx(message_snippet_generate(input, tests[i].max_snippet_chars, str) == 0, i);
+               test_assert_idx(strcmp(tests[i].output, str_c(str)) == 0, i);
+               i_stream_destroy(&input);
+               i++;
+       }
+       test_end();
+}
+
+/* Test program entry point: passes the NULL-terminated list of test
+   functions to test_run() and returns its result as the exit status. */
+int main(void)
+{
+       static void (*test_functions[])(void) = {
+               test_message_snippet,
+               NULL
+       };
+       return test_run(test_functions);
+}