--- /dev/null
+/* Copyright (c) 2015 Dovecot authors, see the included COPYING file */
+
+#include "lib.h"
+#include "buffer.h"
+#include "str.h"
+#include "istream.h"
+#include "mail-html2text.h"
+#include "message-parser.h"
+#include "message-decoder.h"
+#include "message-snippet.h"
+
+/* Per-line scanning state used by snippet_generate() while walking the
+   decoded body text one character at a time. */
+enum snippet_state {
+ /* beginning of the line */
+ SNIPPET_STATE_NEWLINE = 0,
+ /* within normal text */
+ SNIPPET_STATE_NORMAL,
+ /* within quoted text - skip until EOL. entered when a line of
+    non-HTML input begins with '>' */
+ SNIPPET_STATE_QUOTED
+};
+
+/* Working state shared between message_snippet_generate() and
+   snippet_generate() for a single snippet. */
+struct snippet_context {
+ /* caller-provided output string the snippet text is appended to */
+ string_t *snippet;
+ /* how many more characters may still be appended; initialized from
+    max_snippet_chars */
+ unsigned int chars_left;
+ /* current position within the line (start / normal / quoted) */
+ enum snippet_state state;
+ /* TRUE when a single space should be emitted before the next
+    non-whitespace character (runs of whitespace collapse to one) */
+ bool add_whitespace;
+ /* non-NULL only when the body is text/html; input is then converted
+    to plain text before scanning */
+ struct mail_html2text *html2text;
+ /* scratch buffer receiving the html2text output; emptied before each
+    converted block. only created together with html2text */
+ buffer_t *plain_output;
+};
+
+/* Scan one block of decoded body text and append its visible characters
+   to ctx->snippet.
+
+   If ctx->html2text is set, the block is first converted to plain text
+   into ctx->plain_output and the converted text is scanned instead.
+   Runs of CR/LF/TAB/space are collapsed into a single space, whitespace
+   before the first snippet character is dropped, and (for non-HTML
+   input only) lines beginning with '>' are skipped entirely.
+
+   Returns TRUE if the whole block was consumed and more input may
+   follow, FALSE once the character limit in ctx->chars_left has been
+   reached. The snippet never grows beyond the limit and never ends with
+   a separator space that was added only to be cut off. */
+static bool snippet_generate(struct snippet_context *ctx,
+			     const unsigned char *data, size_t size)
+{
+	size_t i;
+	unsigned int count;
+
+	if (ctx->html2text != NULL) {
+		/* reuse the scratch buffer: only this block's output is
+		   scanned below */
+		buffer_set_used_size(ctx->plain_output, 0);
+		mail_html2text_more(ctx->html2text, data, size,
+				    ctx->plain_output);
+		data = ctx->plain_output->data;
+		size = ctx->plain_output->used;
+	}
+
+	/* message-decoder should feed us only valid and complete
+	   UTF-8 input */
+	for (i = 0; i < size; i += count) {
+		count = 1;
+		switch (ctx->state) {
+		case SNIPPET_STATE_NEWLINE:
+			/* '>' quoting is a plain text convention; in HTML
+			   the quoted parts are handled by html2text */
+			if (data[i] == '>' && ctx->html2text == NULL) {
+				ctx->state = SNIPPET_STATE_QUOTED;
+				break;
+			}
+			ctx->state = SNIPPET_STATE_NORMAL;
+			/* fallthrough */
+		case SNIPPET_STATE_NORMAL:
+			if (data[i] == '\r' || data[i] == '\n' ||
+			    data[i] == '\t' || data[i] == ' ') {
+				/* skip any leading whitespace - a separator
+				   is needed only between two words */
+				if (str_len(ctx->snippet) > 0)
+					ctx->add_whitespace = TRUE;
+				if (data[i] == '\n')
+					ctx->state = SNIPPET_STATE_NEWLINE;
+				break;
+			}
+			if (ctx->add_whitespace) {
+				/* the separator is useful only if the
+				   following character fits as well. check
+				   before appending so that the limit is
+				   never exceeded. */
+				if (ctx->chars_left < 2)
+					return FALSE;
+				str_append_c(ctx->snippet, ' ');
+				ctx->chars_left--;
+				ctx->add_whitespace = FALSE;
+			}
+			/* test first, decrement after: the counter must not
+			   wrap around when the limit is hit */
+			if (ctx->chars_left == 0)
+				return FALSE;
+			ctx->chars_left--;
+			/* copy the whole multi-byte sequence so the limit
+			   counts characters, not bytes */
+			count = uni_utf8_char_bytes(data[i]);
+			i_assert(i + count <= size);
+			str_append_n(ctx->snippet, data + i, count);
+			break;
+		case SNIPPET_STATE_QUOTED:
+			if (data[i] == '\n')
+				ctx->state = SNIPPET_STATE_NEWLINE;
+			break;
+		}
+	}
+	return TRUE;
+}
+
+/* Read a message from input and append a short plain text preview of its
+   body to snippet, using at most max_snippet_chars characters.
+
+   Only a body whose Content-Type is missing or text/<anything> is used;
+   text/html is converted to plain text first. Any other Content-Type
+   stops processing and leaves the snippet untouched.
+
+   Returns 0 on success, -1 if the input stream has an error set. */
+int message_snippet_generate(struct istream *input,
+			     unsigned int max_snippet_chars,
+			     string_t *snippet)
+{
+	struct message_parser_ctx *msg_parser;
+	struct message_decoder_context *msg_decoder;
+	struct message_part *parts;
+	struct message_block raw, decoded;
+	struct snippet_context sctx;
+	const char *content_type;
+	pool_t pool;
+	int ret;
+
+	memset(&sctx, 0, sizeof(sctx));
+	sctx.snippet = snippet;
+	sctx.chars_left = max_snippet_chars;
+	pool = pool_alloconly_create("message snippet", 1024);
+
+	msg_parser = message_parser_init(pool_datastack_create(), input, 0, 0);
+	msg_decoder = message_decoder_init(NULL, 0);
+	for (;;) {
+		ret = message_parser_parse_next_block(msg_parser, &raw);
+		if (ret <= 0)
+			break;
+		if (!message_decoder_decode_next_block(msg_decoder, &raw,
+						       &decoded))
+			continue;
+
+		if (decoded.size != 0) {
+			/* body data - stop as soon as the snippet is full */
+			if (!snippet_generate(&sctx, decoded.data,
+					      decoded.size))
+				break;
+			continue;
+		}
+		if (decoded.hdr != NULL) {
+			/* a header line, nothing to do with it */
+			continue;
+		}
+
+		/* end of headers - verify that we can use this
+		   Content-Type. we get here only once, because we
+		   always handle only one non-multipart MIME part. */
+		content_type = message_decoder_current_content_type(msg_decoder);
+		if (content_type == NULL) {
+			/* no Content-Type: handled as text/plain */
+			continue;
+		}
+		if (strcasecmp(content_type, "text/html") == 0) {
+			sctx.html2text = mail_html2text_init(MAIL_HTML2TEXT_FLAG_SKIP_QUOTED);
+			sctx.plain_output = buffer_create_dynamic(pool, 1024);
+			continue;
+		}
+		if (strncasecmp(content_type, "text/", 5) != 0) {
+			/* not a text part - no snippet */
+			break;
+		}
+	}
+	i_assert(ret != 0);
+
+	message_decoder_deinit(&msg_decoder);
+	if (message_parser_deinit(&msg_parser, &parts) < 0)
+		i_unreached();
+	if (sctx.html2text != NULL)
+		mail_html2text_deinit(&sctx.html2text);
+	pool_unref(&pool);
+
+	if (input->stream_errno != 0)
+		return -1;
+	return 0;
+}
--- /dev/null
+/* Copyright (c) 2015 Dovecot authors, see the included COPYING file */
+
+#include "lib.h"
+#include "str.h"
+#include "istream.h"
+#include "message-snippet.h"
+#include "test-common.h"
+
+/* Test vectors for message_snippet_generate(): a complete message
+   (headers + body), the character limit to pass in, and the exact
+   snippet that is expected back. */
+static struct {
+ const char *input;
+ unsigned int max_snippet_chars;
+ const char *output;
+} tests[] = {
+ /* output is cut off after max_snippet_chars characters */
+ { "Content-Type: text/plain\n"
+   "\n"
+   "1234567890 234567890",
+   12,
+   "1234567890 2" },
+ /* a line starting with '>' is skipped; lines are joined with a
+    single space */
+ { "Content-Type: text/plain\n"
+   "\n"
+   "line1\n>quote2\nline2\n",
+   100,
+   "line1 line2" },
+ /* '>' counts as a quote only as the very first character of a
+    line; whitespace runs collapse to one space and trailing
+    whitespace is dropped */
+ { "Content-Type: text/plain\n"
+   "\n"
+   "line1\n>quote2\n> quote3\n > line4\n\n \t\t \nline5\n \t ",
+   100,
+   "line1 > line4 line5" },
+ /* the limit is counted in UTF-8 characters, not bytes */
+ { "Content-Type: text/plain; charset=utf-8\n"
+   "\n"
+   "hyv\xC3\xA4\xC3\xA4 p\xC3\xA4iv\xC3\xA4\xC3\xA4",
+   11,
+   "hyv\xC3\xA4\xC3\xA4 p\xC3\xA4iv\xC3\xA4" },
+ /* same text, but quoted-printable encoded in the message */
+ { "Content-Type: text/plain; charset=utf-8\n"
+   "Content-Transfer-Encoding: quoted-printable\n"
+   "\n"
+   "hyv=C3=A4=C3=A4 p=C3=A4iv=C3=A4=C3=A4",
+   11,
+   "hyv\xC3\xA4\xC3\xA4 p\xC3\xA4iv\xC3\xA4" },
+
+ /* text/html body: markup is removed, the <blockquote> content is
+    left out, and a line starting with '>' is kept because '>'
+    quoting is applied only to non-HTML input */
+ { "Content-Transfer-Encoding: quoted-printable\n"
+   "Content-Type: text/html;\n"
+   " charset=utf-8\n"
+   "\n"
+   "<html><head><meta http-equiv=3D\"Content-Type\" content=3D\"text/html =\n"
+   "charset=3Dutf-8\"></head><body style=3D\"word-wrap: break-word; =\n"
+   "-webkit-nbsp-mode: space; -webkit-line-break: after-white-space;\" =\n"
+   "class=3D\"\">Hi,<div class=3D\"\"><br class=3D\"\"></div><div class=3D\"\">How =\n"
+   "is it going? <blockquote>quoted text is ignored</blockquote>\n"
+   "> -foo\n"
+   "</div><br =class=3D\"\"></body></html>=\n",
+   100,
+   "Hi, How is it going? > -foo" },
+};
+
+/* Run every entry of tests[] through message_snippet_generate() and
+   check both the return value and the generated snippet text. */
+static void test_message_snippet(void)
+{
+ string_t *str = t_str_new(128);
+ struct istream *input;
+ unsigned int i;
+
+ test_begin("message snippet");
+ for (i = 0; i < N_ELEMENTS(tests); i++) {
+ /* the output string is shared between cases - empty it first */
+ str_truncate(str, 0);
+ input = i_stream_create_from_data(tests[i].input, strlen(tests[i].input));
+ /* generation itself must succeed... */
+ test_assert_idx(message_snippet_generate(input, tests[i].max_snippet_chars, str) == 0, i);
+ /* ...and produce exactly the expected text */
+ test_assert_idx(strcmp(tests[i].output, str_c(str)) == 0, i);
+ i_stream_destroy(&input);
+ }
+ test_end();
+}
+
+/* Test program entry point: hands the list of test functions to
+   test_run() and uses its result as the exit status. */
+int main(void)
+{
+	/* NULL-terminated list of the tests to run */
+	static void (*all_tests[])(void) = {
+		test_message_snippet,
+		NULL
+	};
+	int exit_code;
+
+	exit_code = test_run(all_tests);
+	return exit_code;
+}