From: Timo Sirainen Date: Fri, 16 Jan 2015 22:23:36 +0000 (+0200) Subject: lib-mail: Added message_snippet_generate() to produce a short text snippet of a mail. X-Git-Tag: 2.2.16.rc1~149 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=7f7be2cbf68f8a202a688d5bc50f82483d461643;p=thirdparty%2Fdovecot%2Fcore.git lib-mail: Added message_snippet_generate() to produce a short text snippet of a mail. --- diff --git a/src/lib-mail/Makefile.am b/src/lib-mail/Makefile.am index 085c3c68f1..4f21529cbb 100644 --- a/src/lib-mail/Makefile.am +++ b/src/lib-mail/Makefile.am @@ -29,6 +29,7 @@ libmail_la_SOURCES = \ message-part-serialize.c \ message-search.c \ message-size.c \ + message-snippet.c \ ostream-dot.c \ quoted-printable.c \ rfc2231-parser.c \ @@ -62,6 +63,7 @@ headers = \ message-part-serialize.h \ message-search.h \ message-size.h \ + message-snippet.h \ ostream-dot.h \ quoted-printable.h \ rfc2231-parser.h \ @@ -87,6 +89,7 @@ test_programs = \ test-message-id \ test-message-parser \ test-message-part \ + test-message-snippet \ test-ostream-dot \ test-quoted-printable \ test-rfc2231-parser @@ -166,6 +169,10 @@ test_message_part_SOURCES = test-message-part.c test_message_part_LDADD = message-part.lo message-parser.lo message-header-parser.lo message-size.lo rfc822-parser.lo rfc2231-parser.lo $(test_libs) test_message_part_DEPENDENCIES = $(test_deps) +test_message_snippet_SOURCES = test-message-snippet.c +test_message_snippet_LDADD = message-snippet.lo mail-html2text.lo $(test_message_decoder_LDADD) message-parser.lo message-header-parser.lo message-header-decode.lo message-size.lo +test_message_snippet_DEPENDENCIES = $(test_deps) + test_mail_html2text_SOURCES = test-mail-html2text.c test_mail_html2text_LDADD = mail-html2text.lo $(test_libs) test_mail_html2text_DEPENDENCIES = $(test_deps) diff --git a/src/lib-mail/message-snippet.c b/src/lib-mail/message-snippet.c new file mode 100644 index 0000000000..2446258791 --- /dev/null +++ b/src/lib-mail/message-snippet.c @@ 
-0,0 +1,164 @@
+/* Copyright (c) 2015 Dovecot authors, see the included COPYING file */
+
+#include "lib.h"
+#include "buffer.h"
+#include "str.h"
+#include "istream.h"
+#include "mail-html2text.h"
+#include "message-parser.h"
+#include "message-decoder.h"
+#include "message-snippet.h"
+
+/* Per-line scanner state; lets plain text lines starting with '>' be skipped. */
+enum snippet_state {
+	/* beginning of the line */
+	SNIPPET_STATE_NEWLINE = 0,
+	/* within normal text */
+	SNIPPET_STATE_NORMAL,
+	/* within quoted text - skip until EOL */
+	SNIPPET_STATE_QUOTED
+};
+
+struct snippet_context {
+	/* output string the snippet is appended to */
+	string_t *snippet;
+	/* number of characters (not bytes) that may still be appended */
+	unsigned int chars_left;
+	enum snippet_state state;
+	/* whitespace was seen: one ' ' goes before the next visible character */
+	bool add_whitespace;
+	/* non-NULL for text/html input, which is converted to text first */
+	struct mail_html2text *html2text;
+	/* holds the html2text output of the block being processed */
+	buffer_t *plain_output;
+};
+
+/* Append the text of one decoded body block to ctx->snippet. Whitespace runs
+   are collapsed into a single space and, for non-HTML input, lines beginning
+   with '>' are skipped. A space is written only together with the character
+   following it, so an empty snippet isn't started with whitespace, no
+   dangling space is left when the text is cut off and the character budget
+   is never exceeded. Returns TRUE if more input can be added, FALSE once the
+   budget is used up. */
+static bool snippet_generate(struct snippet_context *ctx,
+			     const unsigned char *data, size_t size)
+{
+	unsigned int count, needed;
+	size_t i;
+
+	if (ctx->html2text != NULL) {
+		/* scan the converted text instead of the raw HTML */
+		buffer_set_used_size(ctx->plain_output, 0);
+		mail_html2text_more(ctx->html2text, data, size,
+				    ctx->plain_output);
+		data = ctx->plain_output->data;
+		size = ctx->plain_output->used;
+	}
+
+	/* message-decoder should feed us only valid and complete
+	   UTF-8 input */
+	for (i = 0; i < size; i += count) {
+		count = 1;
+		switch (ctx->state) {
+		case SNIPPET_STATE_NEWLINE:
+			if (data[i] == '>' && ctx->html2text == NULL) {
+				ctx->state = SNIPPET_STATE_QUOTED;
+				break;
+			}
+			ctx->state = SNIPPET_STATE_NORMAL;
+			/* fallthrough */
+		case SNIPPET_STATE_NORMAL:
+			if (data[i] == '\r' || data[i] == '\n' ||
+			    data[i] == '\t' || data[i] == ' ') {
+				/* leading whitespace is dropped: a space is
+				   queued only once the snippet has text */
+				if (str_len(ctx->snippet) > 0)
+					ctx->add_whitespace = TRUE;
+				if (data[i] == '\n')
+					ctx->state = SNIPPET_STATE_NEWLINE;
+				break;
+			}
+			/* the queued space and this character are added
+			   together or not at all, so truncation can't leave
+			   a trailing space or exceed the budget */
+			needed = ctx->add_whitespace ? 2 : 1;
+			if (ctx->chars_left < needed)
+				return FALSE;
+			ctx->chars_left -= needed;
+			if (ctx->add_whitespace) {
+				str_append_c(ctx->snippet, ' ');
+				ctx->add_whitespace = FALSE;
+			}
+			/* copy the whole multibyte sequence as one character */
+			count = uni_utf8_char_bytes(data[i]);
+			i_assert(i + count <= size);
+			str_append_n(ctx->snippet, data + i, count);
+			break;
+		case SNIPPET_STATE_QUOTED:
+			if (data[i] == '\n')
+				ctx->state = SNIPPET_STATE_NEWLINE;
+			break;
+		}
+	}
+	return TRUE;
+}
+
+/* Reads one MIME part (headers first) from input and appends a snippet of at
+   most max_snippet_chars characters of its text to snippet. Parts whose
+   Content-Type isn't a text/ type add nothing. Returns 0 if ok, -1 if the
+   stream has an error set. */
+int message_snippet_generate(struct istream *input,
+			     unsigned int max_snippet_chars,
+			     string_t *snippet)
+{
+	struct message_parser_ctx *parser;
+	struct message_part *parts;
+	struct message_decoder_context *decoder;
+	struct message_block raw_block, block;
+	struct snippet_context ctx;
+	pool_t pool;
+	int ret;
+
+	memset(&ctx, 0, sizeof(ctx));
+	pool = pool_alloconly_create("message snippet", 1024);
+	ctx.snippet = snippet;
+	ctx.chars_left = max_snippet_chars;
+
+	parser = message_parser_init(pool_datastack_create(), input, 0, 0);
+	decoder = message_decoder_init(NULL, 0);
+	while ((ret = message_parser_parse_next_block(parser, &raw_block)) > 0) {
+		if (!message_decoder_decode_next_block(decoder, &raw_block, &block))
+			continue;
+		if (block.size == 0) {
+			const char *ct;
+
+			if (block.hdr != NULL)
+				continue;
+
+			/* end of headers - verify that we can use this
+			   Content-Type. we get here only once, because we
+			   always handle only one non-multipart MIME part. */
+			ct = message_decoder_current_content_type(decoder);
+			if (ct == NULL)
+				/* text/plain */ ;
+			else if (strcasecmp(ct, "text/html") == 0) {
+				ctx.html2text = mail_html2text_init(MAIL_HTML2TEXT_FLAG_SKIP_QUOTED);
+				ctx.plain_output = buffer_create_dynamic(pool, 1024);
+			} else if (strncasecmp(ct, "text/", 5) != 0)
+				break;
+			continue;
+		}
+		if (!snippet_generate(&ctx, block.data, block.size))
+			break;
+	}
+	/* NOTE(review): 0 presumably means the parser needs more input, i.e. the
+	   stream is assumed to be blocking - confirm */
+	i_assert(ret != 0);
+	message_decoder_deinit(&decoder);
+	if (message_parser_deinit(&parser, &parts) < 0)
+		i_unreached();
+	if (ctx.html2text != NULL)
+		mail_html2text_deinit(&ctx.html2text);
+	pool_unref(&pool);
+	return input->stream_errno == 0 ? 0 : -1;
+}
diff --git a/src/lib-mail/message-snippet.h b/src/lib-mail/message-snippet.h
new file mode 100644
index 0000000000..fe9c3b69ef
--- /dev/null
+++ b/src/lib-mail/message-snippet.h
@@ -0,0 +1,14 @@
+#ifndef MESSAGE_SNIPPET_H
+#define MESSAGE_SNIPPET_H
+
+/* Generate UTF-8 text snippet from the beginning of the given mail input
+   stream. The stream is expected to start at the MIME part's headers whose
+   snippet is being generated. Returns 0 if ok, -1 if I/O error.
+
+   Currently only Content-Type: text/ is supported, others will result in an
+   empty string. */
+int message_snippet_generate(struct istream *input,
+			     unsigned int max_snippet_chars,
+			     string_t *snippet);
+
+#endif
diff --git a/src/lib-mail/test-message-snippet.c b/src/lib-mail/test-message-snippet.c
new file mode 100644
index 0000000000..2444a9b70d
--- /dev/null
+++ b/src/lib-mail/test-message-snippet.c
@@ -0,0 +1,92 @@
+/* Copyright (c) 2015 Dovecot authors, see the included COPYING file */
+
+#include "lib.h"
+#include "str.h"
+#include "istream.h"
+#include "message-snippet.h"
+#include "test-common.h"
+
+static struct {
+	const char *input;
+	unsigned int max_snippet_chars;
+	const char *output;
+} tests[] = {
+	{ "Content-Type: text/plain\n"
+	  "\n"
+	  "1234567890 234567890",
+	  12,
+	  "1234567890 2" },
+	{ "Content-Type: text/plain\n"
+	  "\n"
+	  "line1\n>quote2\nline2\n",
+	  100,
+	  "line1 line2" },
+	{ "Content-Type: text/plain\n"
+	  "\n"
+	  "line1\n>quote2\n> quote3\n > line4\n\n \t\t \nline5\n \t ",
+	  100,
+	  "line1 > line4 line5" },
+	{ "Content-Type: text/plain; charset=utf-8\n"
+	  "\n"
+	  "hyv\xC3\xA4\xC3\xA4 p\xC3\xA4iv\xC3\xA4\xC3\xA4",
+	  11,
+	  "hyv\xC3\xA4\xC3\xA4 p\xC3\xA4iv\xC3\xA4" },
+	{ "Content-Type: text/plain; charset=utf-8\n"
+	  "Content-Transfer-Encoding: quoted-printable\n"
+	  "\n"
+	  "hyv=C3=A4=C3=A4 p=C3=A4iv=C3=A4=C3=A4",
+	  11,
+	  "hyv\xC3\xA4\xC3\xA4 p\xC3\xA4iv\xC3\xA4" },
+	/* the space before a character that no longer fits isn't added */
+	{ "Content-Type: text/plain\n"
+	  "\n"
+	  "12345 6",
+	  5,
+	  "12345" },
+	/* whitespace before the first text isn't added */
+	{ "Content-Type: text/plain\n"
+	  "\n"
+	  " \n\thello",
+	  100,
+	  "hello" },
+
+	{ "Content-Transfer-Encoding: quoted-printable\n"
+	  "Content-Type: text/html;\n"
+	  " charset=utf-8\n"
+	  
"\n" + "Hi,

How =\n" + "is it going?
quoted text is ignored
\n" + "> -foo\n" + "

=\n",
+	  100,
+	  "Hi, How is it going? > -foo" },
+};
+/* Runs every tests[] entry through message_snippet_generate() and checks its result and output. */
+static void test_message_snippet(void)
+{
+	struct istream *input;
+	string_t *str;
+	unsigned int i = 0;
+
+	str = t_str_new(128);
+	test_begin("message snippet");
+	while (i < N_ELEMENTS(tests)) {
+		str_truncate(str, 0);
+		input = i_stream_create_from_data(tests[i].input, strlen(tests[i].input));
+		test_assert_idx(message_snippet_generate(input, tests[i].max_snippet_chars, str) == 0, i);
+		test_assert_idx(strcmp(tests[i].output, str_c(str)) == 0, i);
+		i_stream_destroy(&input);
+		i++;
+	}
+	test_end();
+}
+/* Hands the NULL-terminated list of test functions to test_run() and returns its result. */
+int main(void)
+{
+	static void (*test_functions[])(void) = { test_message_snippet, NULL };
+
+	return test_run(test_functions);
+}