From: Timo Sirainen <tss@iki.fi>
Date: Fri, 16 Jan 2015 22:15:44 +0000 (+0200)
Subject: lib-mail: Added mail-html2text API
X-Git-Tag: 2.2.16.rc1~150
X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=c9141125278100269eb3a907c911afe78c46717c;p=thirdparty%2Fdovecot%2Fcore.git

lib-mail: Added mail-html2text API
What makes it mail-specific is that it allows skipping over data inside
<blockquote>. This code probably doesn't parse HTML perfectly, but it is
hopefully good enough for HTML emails.
---

diff --git a/src/lib-mail/Makefile.am b/src/lib-mail/Makefile.am
index 9a5a6a516a..085c3c68f1 100644
--- a/src/lib-mail/Makefile.am
+++ b/src/lib-mail/Makefile.am
@@ -13,6 +13,7 @@ libmail_la_SOURCES = \
 	istream-header-filter.c \
 	istream-nonuls.c \
 	istream-qp-decoder.c \
+	mail-html2text.c \
 	mail-user-hash.c \
 	mbox-from.c \
 	message-address.c \
@@ -33,6 +34,9 @@ libmail_la_SOURCES = \
 	rfc2231-parser.c \
 	rfc822-parser.c
 
+noinst_HEADERS = \
+	html-entities.h
+
 headers = \
 	istream-attachment-connector.h \
 	istream-attachment-extractor.h \
@@ -43,6 +47,7 @@ headers = \
 	istream-qp.h \
 	mail-user-hash.h \
 	mbox-from.h \
+	mail-html2text.h \
 	mail-types.h \
 	message-address.h \
 	message-binary-part.h \
@@ -71,6 +76,7 @@ test_programs = \
 	test-istream-binary-converter \
 	test-istream-header-filter \
 	test-istream-qp-decoder \
+	test-mail-html2text \
 	test-mbox-from \
 	test-message-address \
 	test-message-date \
@@ -160,6 +166,10 @@ test_message_part_SOURCES = test-message-part.c
 test_message_part_LDADD = message-part.lo message-parser.lo message-header-parser.lo message-size.lo rfc822-parser.lo rfc2231-parser.lo $(test_libs)
 test_message_part_DEPENDENCIES = $(test_deps)
 
+test_mail_html2text_SOURCES = test-mail-html2text.c
+test_mail_html2text_LDADD = mail-html2text.lo $(test_libs)
+test_mail_html2text_DEPENDENCIES = $(test_deps)
+
 test_ostream_dot_SOURCES = test-ostream-dot.c
 test_ostream_dot_LDADD = ostream-dot.lo $(test_libs)
 test_ostream_dot_DEPENDENCIES = $(test_deps)
diff --git a/src/lib-mail/html-entities.h b/src/lib-mail/html-entities.h
new file mode 100644
index 0000000000..a3a9f96348
--- /dev/null
+++ b/src/lib-mail/html-entities.h
@@ -0,0 +1,253 @@
+{ "quot",	0x0022 },
+{ "amp",	0x0026 },
+{ "apos",	0x0027 },
+{ "lt",		0x003C },
+{ "gt",		0x003E },
+{ "nbsp",	0x00A0 },
+{ "iexcl",	0x00A1 },
+{ "cent",	0x00A2 },
+{ "pound",	0x00A3 },
+{ "curren",	0x00A4 },
+{ "yen",	0x00A5 },
+{ "brvbar",	0x00A6 },
+{ "sect",	0x00A7 },
+{ "uml",	0x00A8 },
+{ "copy",	0x00A9 },
+{ "ordf",	0x00AA },
+{ "laquo",	0x00AB },
+{ "not",	0x00AC },
+{ "shy",	0x00AD },
+{ "reg",	0x00AE },
+{ "macr",	0x00AF },
+{ "deg",	0x00B0 },
+{ "plusmn",	0x00B1 },
+{ "sup2",	0x00B2 },
+{ "sup3",	0x00B3 },
+{ "acute",	0x00B4 },
+{ "micro",	0x00B5 },
+{ "para",	0x00B6 },
+{ "middot",	0x00B7 },
+{ "cedil",	0x00B8 },
+{ "sup1",	0x00B9 },
+{ "ordm",	0x00BA },
+{ "raquo",	0x00BB },
+{ "frac14",	0x00BC },
+{ "frac12",	0x00BD },
+{ "frac34",	0x00BE },
+{ "iquest",	0x00BF },
+{ "Agrave",	0x00C0 },
+{ "Aacute",	0x00C1 },
+{ "Acirc",	0x00C2 },
+{ "Atilde",	0x00C3 },
+{ "Auml",	0x00C4 },
+{ "Aring",	0x00C5 },
+{ "AElig",	0x00C6 },
+{ "Ccedil",	0x00C7 },
+{ "Egrave",	0x00C8 },
+{ "Eacute",	0x00C9 },
+{ "Ecirc",	0x00CA },
+{ "Euml",	0x00CB },
+{ "Igrave",	0x00CC },
+{ "Iacute",	0x00CD },
+{ "Icirc",	0x00CE },
+{ "Iuml",	0x00CF },
+{ "ETH",	0x00D0 },
+{ "Ntilde",	0x00D1 },
+{ "Ograve",	0x00D2 },
+{ "Oacute",	0x00D3 },
+{ "Ocirc",	0x00D4 },
+{ "Otilde",	0x00D5 },
+{ "Ouml",	0x00D6 },
+{ "times",	0x00D7 },
+{ "Oslash",	0x00D8 },
+{ "Ugrave",	0x00D9 },
+{ "Uacute",	0x00DA },
+{ "Ucirc",	0x00DB },
+{ "Uuml",	0x00DC },
+{ "Yacute",	0x00DD },
+{ "THORN",	0x00DE },
+{ "szlig",	0x00DF },
+{ "agrave",	0x00E0 },
+{ "aacute",	0x00E1 },
+{ "acirc",	0x00E2 },
+{ "atilde",	0x00E3 },
+{ "auml",	0x00E4 },
+{ "aring",	0x00E5 },
+{ "aelig",	0x00E6 },
+{ "ccedil",	0x00E7 },
+{ "egrave",	0x00E8 },
+{ "eacute",	0x00E9 },
+{ "ecirc",	0x00EA },
+{ "euml",	0x00EB },
+{ "igrave",	0x00EC },
+{ "iacute",	0x00ED },
+{ "icirc",	0x00EE },
+{ "iuml",	0x00EF },
+{ "eth",	0x00F0 },
+{ "ntilde",	0x00F1 },
+{ "ograve",	0x00F2 },
+{ "oacute",	0x00F3 },
+{ "ocirc",	0x00F4 },
+{ "otilde",	0x00F5 },
+{ "ouml",	0x00F6 },
+{ "divide",	0x00F7 },
+{ "oslash",	0x00F8 },
+{ "ugrave",	0x00F9 },
+{ "uacute",	0x00FA },
+{ "ucirc",	0x00FB },
+{ "uuml",	0x00FC },
+{ "yacute",	0x00FD },
+{ "thorn",	0x00FE },
+{ "yuml",	0x00FF },
+{ "OElig",	0x0152 },
+{ "oelig",	0x0153 },
+{ "Scaron",	0x0160 },
+{ "scaron",	0x0161 },
+{ "Yuml",	0x0178 },
+{ "fnof",	0x0192 },
+{ "circ",	0x02C6 },
+{ "tilde",	0x02DC },
+{ "Alpha",	0x0391 },
+{ "Beta",	0x0392 },
+{ "Gamma",	0x0393 },
+{ "Delta",	0x0394 },
+{ "Epsilon",	0x0395 },
+{ "Zeta",	0x0396 },
+{ "Eta",	0x0397 },
+{ "Theta",	0x0398 },
+{ "Iota",	0x0399 },
+{ "Kappa",	0x039A },
+{ "Lambda",	0x039B },
+{ "Mu",		0x039C },
+{ "Nu",		0x039D },
+{ "Xi",		0x039E },
+{ "Omicron",	0x039F },
+{ "Pi",		0x03A0 },
+{ "Rho",	0x03A1 },
+{ "Sigma",	0x03A3 },
+{ "Tau",	0x03A4 },
+{ "Upsilon",	0x03A5 },
+{ "Phi",	0x03A6 },
+{ "Chi",	0x03A7 },
+{ "Psi",	0x03A8 },
+{ "Omega",	0x03A9 },
+{ "alpha",	0x03B1 },
+{ "beta",	0x03B2 },
+{ "gamma",	0x03B3 },
+{ "delta",	0x03B4 },
+{ "epsilon",	0x03B5 },
+{ "zeta",	0x03B6 },
+{ "eta",	0x03B7 },
+{ "theta",	0x03B8 },
+{ "iota",	0x03B9 },
+{ "kappa",	0x03BA },
+{ "lambda",	0x03BB },
+{ "mu",		0x03BC },
+{ "nu",		0x03BD },
+{ "xi",		0x03BE },
+{ "omicron",	0x03BF },
+{ "pi",		0x03C0 },
+{ "rho",	0x03C1 },
+{ "sigmaf",	0x03C2 },
+{ "sigma",	0x03C3 },
+{ "tau",	0x03C4 },
+{ "upsilon",	0x03C5 },
+{ "phi",	0x03C6 },
+{ "chi",	0x03C7 },
+{ "psi",	0x03C8 },
+{ "omega",	0x03C9 },
+{ "thetasym",	0x03D1 },
+{ "upsih",	0x03D2 },
+{ "piv",	0x03D6 },
+{ "ensp",	0x2002 },
+{ "emsp",	0x2003 },
+{ "thinsp",	0x2009 },
+{ "zwnj",	0x200C },
+{ "zwj",	0x200D },
+{ "lrm",	0x200E },
+{ "rlm",	0x200F },
+{ "ndash",	0x2013 },
+{ "mdash",	0x2014 },
+{ "lsquo",	0x2018 },
+{ "rsquo",	0x2019 },
+{ "sbquo",	0x201A },
+{ "ldquo",	0x201C },
+{ "rdquo",	0x201D },
+{ "bdquo",	0x201E },
+{ "dagger",	0x2020 },
+{ "Dagger",	0x2021 },
+{ "bull",	0x2022 },
+{ "hellip",	0x2026 },
+{ "permil",	0x2030 },
+{ "prime",	0x2032 },
+{ "Prime",	0x2033 },
+{ "lsaquo",	0x2039 },
+{ "rsaquo",	0x203A },
+{ "oline",	0x203E },
+{ "frasl",	0x2044 },
+{ "euro",	0x20AC },
+{ "image",	0x2111 },
+{ "weierp",	0x2118 },
+{ "real",	0x211C },
+{ "trade",	0x2122 },
+{ "alefsym",	0x2135 },
+{ "larr",	0x2190 },
+{ "uarr",	0x2191 },
+{ "rarr",	0x2192 },
+{ "darr",	0x2193 },
+{ "harr",	0x2194 },
+{ "crarr",	0x21B5 },
+{ "lArr",	0x21D0 },
+{ "uArr",	0x21D1 },
+{ "rArr",	0x21D2 },
+{ "dArr",	0x21D3 },
+{ "hArr",	0x21D4 },
+{ "forall",	0x2200 },
+{ "part",	0x2202 },
+{ "exist",	0x2203 },
+{ "empty",	0x2205 },
+{ "nabla",	0x2207 },
+{ "isin",	0x2208 },
+{ "notin",	0x2209 },
+{ "ni",		0x220B },
+{ "prod",	0x220F },
+{ "sum",	0x2211 },
+{ "minus",	0x2212 },
+{ "lowast",	0x2217 },
+{ "radic",	0x221A },
+{ "prop",	0x221D },
+{ "infin",	0x221E },
+{ "ang",	0x2220 },
+{ "and",	0x2227 },
+{ "or",		0x2228 },
+{ "cap",	0x2229 },
+{ "cup",	0x222A },
+{ "int",	0x222B },
+{ "there4",	0x2234 },
+{ "sim",	0x223C },
+{ "cong",	0x2245 },
+{ "asymp",	0x2248 },
+{ "ne",		0x2260 },
+{ "equiv",	0x2261 },
+{ "le",		0x2264 },
+{ "ge",		0x2265 },
+{ "sub",	0x2282 },
+{ "sup",	0x2283 },
+{ "nsub",	0x2284 },
+{ "sube",	0x2286 },
+{ "supe",	0x2287 },
+{ "oplus",	0x2295 },
+{ "otimes",	0x2297 },
+{ "perp",	0x22A5 },
+{ "sdot",	0x22C5 },
+{ "lceil",	0x2308 },
+{ "rceil",	0x2309 },
+{ "lfloor",	0x230A },
+{ "rfloor",	0x230B },
+{ "lang",	0x27E8 },
+{ "rang",	0x27E9 },
+{ "loz",	0x25CA },
+{ "spades",	0x2660 },
+{ "clubs",	0x2663 },
+{ "hearts",	0x2665 },
+{ "diams",	0x2666 }
diff --git a/src/lib-mail/mail-html2text.c b/src/lib-mail/mail-html2text.c
new file mode 100644
index 0000000000..e9b52ef52b
--- /dev/null
+++ b/src/lib-mail/mail-html2text.c
@@ -0,0 +1,323 @@
+/* Copyright (c) 2011-2015 Dovecot authors, see the included COPYING file */
+
+#include "lib.h"
+#include "buffer.h"
+#include "unichar.h"
+#include "message-parser.h"
+#include "mail-html2text.h"
+
+/* Nonzero for space, tab, CR and LF. c is evaluated several times. Zero-width
+   space (&#x200B;) might belong here too, but would be tricky to handle. */
+#define HTML_WHITESPACE(c) \
+	((c) == ' ' || (c) == '\t' || (c) == '\r' || (c) == '\n')
+
+enum html_state {	/* parse_data() state; TEXT is first, i.e. value 0 */
+	/* regular text; copied to output unless inside skipped blockquote */
+	HTML_STATE_TEXT,
+	/* inside <tag ...>, outside any quoted attribute value */
+	HTML_STATE_TAG,
+	/* inside a "double quoted" attribute value */
+	HTML_STATE_TAG_DQUOTED,
+	/* right after a backslash in a "double quoted" value */
+	HTML_STATE_TAG_DQUOTED_ESCAPE,
+	/* inside a 'single quoted' attribute value */
+	HTML_STATE_TAG_SQUOTED,
+	/* right after a backslash in a 'single quoted' value */
+	HTML_STATE_TAG_SQUOTED_ESCAPE,
+	/* inside <!-- comment -->; contents are dropped */
+	HTML_STATE_COMMENT,
+	/* comment is ending, we've seen "--" and now just waiting for ">" */
+	HTML_STATE_COMMENT_END,
+	/* inside <script>; everything up to </script> is dropped */
+	HTML_STATE_SCRIPT,
+	/* inside <style>; everything up to </style> is dropped */
+	HTML_STATE_STYLE,
+	/* inside <![CDATA[...]]>; contents are copied without tag parsing */
+	HTML_STATE_CDATA
+};
+
+struct mail_html2text {
+	enum mail_html2text_flags flags;
+	enum html_state state;	/* where parse_data() continues next time */
+	buffer_t *input;	/* unparsed tail of earlier input, if any */
+	unsigned int quote_level;	/* depth of open skipped <blockquote>s */
+	bool ignore_next_text;	/* never read; only cleared by parse_tag_name() */
+};
+
+static const struct {	/* named HTML entity -> Unicode code point */
+	const char *name;	/* entity name without the '&' and ';' */
+	unichar_t chr;
+} html_entities[] = {
+#include "html-entities.h"
+};
+
+/* Create a converter; release it with mail_html2text_deinit(). */
+struct mail_html2text *
+mail_html2text_init(enum mail_html2text_flags flags)
+{
+	struct mail_html2text *parser = i_new(struct mail_html2text, 1);
+
+	parser->input = buffer_create_dynamic(default_pool, 512);
+	parser->flags = flags;
+	return parser;
+}
+
+/* Classify the tag whose name starts at data, right after '<'. Returns the
+   number of bytes to skip including the '<', or 0 if more data is needed. */
+static size_t
+parse_tag_name(struct mail_html2text *ht,
+	       const unsigned char *data, size_t size)
+{
+	size_t i;
+
+	if (size >= 3 && memcmp(data, "!--", 3) == 0) {
+		ht->state = HTML_STATE_COMMENT;
+		return 3 + 1;
+	}
+	if (size >= 7 && i_memcasecmp(data, "script", 6) == 0 &&
+	    (HTML_WHITESPACE(data[6]) || data[6] == '>')) {
+		ht->state = HTML_STATE_SCRIPT;
+		return 7 + 1;
+	}
+	if (size >= 6 && i_memcasecmp(data, "style", 5) == 0 &&
+	    (HTML_WHITESPACE(data[5]) || data[5] == '>')) {
+		ht->state = HTML_STATE_STYLE;
+		return 6 + 1;
+	}
+	if (size >= 8 && i_memcasecmp(data, "![CDATA[", 8) == 0) {
+		ht->state = HTML_STATE_CDATA;
+		return 8 + 1;
+	}
+	/* "blockquote" needs 11 bytes: the delimiter data[10] is read as well */
+	if ((ht->flags & MAIL_HTML2TEXT_FLAG_SKIP_QUOTED) != 0) {
+		if (size >= 11 && i_memcasecmp(data, "blockquote", 10) == 0 &&
+		    (HTML_WHITESPACE(data[10]) || data[10] == '>')) {
+			ht->quote_level++;
+			ht->state = HTML_STATE_TAG;
+			return 1;
+		} else if (ht->quote_level > 0 &&
+			   size >= 12 && i_memcasecmp(data, "/blockquote>", 12) == 0) {
+			if (--ht->quote_level == 0)
+				ht->ignore_next_text = FALSE;
+			ht->state = HTML_STATE_TAG;
+			return 1;
+		}
+	}
+	if (size < 12) {
+		/* can we see the whole tag name? if not, ask for more data */
+		for (i = 0; i < size; i++) {
+			if (HTML_WHITESPACE(data[i]) || data[i] == '>')
+				break;
+		}
+		if (i == size)
+			return 0;
+	}
+	ht->state = HTML_STATE_TAG;
+	return 1;
+}
+
+static bool html_entity_get_unichar(const char *name, unichar_t *chr_r)
+{
+	unsigned int i;	/* index into html_entities[] */
+
+	for (i = 0; i < N_ELEMENTS(html_entities); i++) {
+		if (strcmp(html_entities[i].name, name) != 0)
+			continue; /* exact case: "eacute" is not "Eacute" */
+		*chr_r = html_entities[i].chr;
+		return TRUE;
+	}
+	return FALSE;	/* unknown name; *chr_r is left untouched */
+}
+
+/* Decode the named entity after '&' into output. Returns bytes to skip incl.
+   the '&'; 1 = broken entity, only '&' is skipped; 0 = need more data. */
+static size_t parse_entity(const unsigned char *data, size_t size,
+			   buffer_t *output)
+{
+	char name[10];
+	unichar_t chr;
+	size_t len;
+
+	for (len = 0; len < size; len++) {
+		if (len >= sizeof(name) || HTML_WHITESPACE(data[len]))
+			return 1;
+		if (data[len] == ';')
+			break;
+	}
+	if (len == size)
+		return 0;
+
+	i_assert(len < sizeof(name));
+	memcpy(name, data, len);
+	name[len] = '\0';
+	if (html_entity_get_unichar(name, &chr))
+		uni_ucs4_to_utf8_c(chr, output);
+	return len + 2;
+}
+
+static void mail_html2text_add_space(buffer_t *output)
+{
+	const unsigned char *data = output->data;
+	/* one separator at most, and never as the first byte of the output */
+	if (output->used > 0 && data[output->used-1] != ' ')
+		buffer_append_c(output, ' ');
+}
+
+/* Parse data; returns bytes consumed (fewer than size = need more input). */
+static size_t parse_data(struct mail_html2text *ht, const unsigned char *data,
+			 size_t size, buffer_t *output)
+{
+	size_t i, ret;
+
+	for (i = 0; i < size; i++) {
+		char c = data[i];
+
+		switch (ht->state) {
+		case HTML_STATE_TEXT: /* in skipped quotes entities are text too */
+			if (c == '<') {
+				ret = parse_tag_name(ht, data+i+1, size-i-1);
+				if (ret == 0)
+					return i;
+				i += ret - 1;
+			} else if (c == '&' && ht->quote_level == 0) {
+				ret = parse_entity(data+i+1, size-i-1, output);
+				if (ret == 0)
+					return i;
+				i += ret - 1;
+			} else if (ht->quote_level == 0) {
+				buffer_append_c(output, c);
+			}
+			break;
+		case HTML_STATE_TAG:
+			if (c == '"')
+				ht->state = HTML_STATE_TAG_DQUOTED;
+			else if (c == '\'')
+				ht->state = HTML_STATE_TAG_SQUOTED;
+			else if (c == '>') {
+				ht->state = HTML_STATE_TEXT;
+				mail_html2text_add_space(output);
+			}
+			break;
+		case HTML_STATE_TAG_DQUOTED:
+			if (c == '"')
+				ht->state = HTML_STATE_TAG;
+			else if (c == '\\')
+				ht->state = HTML_STATE_TAG_DQUOTED_ESCAPE;
+			break;
+		case HTML_STATE_TAG_DQUOTED_ESCAPE:
+			ht->state = HTML_STATE_TAG_DQUOTED;
+			break;
+		case HTML_STATE_TAG_SQUOTED:
+			if (c == '\'')
+				ht->state = HTML_STATE_TAG;
+			else if (c == '\\')
+				ht->state = HTML_STATE_TAG_SQUOTED_ESCAPE;
+			break;
+		case HTML_STATE_TAG_SQUOTED_ESCAPE:
+			ht->state = HTML_STATE_TAG_SQUOTED;
+			break;
+		case HTML_STATE_COMMENT:
+			if (c == '-') {
+				if (i+1 == size)
+					return i;
+				if (data[i+1] == '-') {
+					ht->state = HTML_STATE_COMMENT_END;
+					i++;
+				}
+			}
+			break;
+		case HTML_STATE_COMMENT_END:
+			if (c == '>')
+				ht->state = HTML_STATE_TEXT;
+			else if (!HTML_WHITESPACE(c))
+				ht->state = HTML_STATE_COMMENT;
+			break;
+		case HTML_STATE_SCRIPT:
+			if (c == '<') {
+				unsigned int max_len = I_MIN(size-i, 9);
+
+				if (i_memcasecmp(data+i, "</script>", max_len) == 0) {
+					if (max_len < 9)
+						return i;
+					mail_html2text_add_space(output);
+					ht->state = HTML_STATE_TEXT;
+					i += 8;
+				}
+			}
+			break;
+		case HTML_STATE_STYLE:
+			if (c == '<') {
+				unsigned int max_len = I_MIN(size-i, 8);
+
+				if (i_memcasecmp(data+i, "</style>", max_len) == 0) {
+					if (max_len < 8)
+						return i;
+					mail_html2text_add_space(output);
+					ht->state = HTML_STATE_TEXT;
+					i += 7;
+				}
+			}
+			break;
+		case HTML_STATE_CDATA:
+			if (c == ']') {
+				unsigned int max_len = I_MIN(size-i, 3);
+
+				if (i_memcasecmp(data+i, "]]>", max_len) == 0) {
+					if (max_len < 3)
+						return i;
+					ht->state = HTML_STATE_TEXT;
+					i += 2;
+					break;
+				}
+			}
+			if (ht->quote_level == 0)
+				buffer_append_c(output, c);
+			break;
+		}
+	}
+	return i;
+}
+
+/* Convert the next chunk of HTML, appending the resulting text to output. */
+void mail_html2text_more(struct mail_html2text *ht, const unsigned char *data,
+			 size_t size, buffer_t *output)
+{
+	size_t pos, inc_size, buf_orig_size;
+
+	i_assert(size > 0);
+
+	while (ht->input->used > 0) {
+		/* we didn't get enough input the last time to know
+		   what to do. */
+		buf_orig_size = ht->input->used;
+
+		inc_size = I_MIN(size, 128);
+		buffer_append(ht->input, data, inc_size);
+		pos = parse_data(ht, ht->input->data,
+				 ht->input->used, output);
+		if (pos >= buf_orig_size) {
+			/* old buffered bytes are all parsed: go on from data */
+			data += pos - buf_orig_size;
+			size -= pos - buf_orig_size;
+			buffer_set_used_size(ht->input, 0);
+		} else {
+			/* stuck inside the buffer: drop parsed part, add more */
+			buffer_delete(ht->input, 0, pos);
+			data += inc_size;
+			size -= inc_size;
+			if (size == 0)
+				return;
+		}
+	}
+	pos = parse_data(ht, data, size, output);
+	buffer_append(ht->input, data + pos, size - pos);
+}
+
+void mail_html2text_deinit(struct mail_html2text **_ht)
+{
+	struct mail_html2text *ht = *_ht;
+	/* clear the caller's pointer before the object is freed */
+	*_ht = NULL;
+	buffer_free(&ht->input);	/* any still unparsed input is discarded */
+	i_free(ht);
+}
diff --git a/src/lib-mail/mail-html2text.h b/src/lib-mail/mail-html2text.h
new file mode 100644
index 0000000000..7a2b54530c
--- /dev/null
+++ b/src/lib-mail/mail-html2text.h
@@ -0,0 +1,15 @@
+#ifndef MAIL_HTML2TEXT_H
+#define MAIL_HTML2TEXT_H
+
+enum mail_html2text_flags {
+	MAIL_HTML2TEXT_FLAG_SKIP_QUOTED	= 0x01	/* drop <blockquote> contents */
+};
+/* init once, call more() for each input chunk (size > 0), then deinit. */
+struct mail_html2text *
+mail_html2text_init(enum mail_html2text_flags flags);
+void mail_html2text_more(struct mail_html2text *ht,
+			 const unsigned char *data, size_t size,
+			 buffer_t *output);
+void mail_html2text_deinit(struct mail_html2text **ht);
+
+#endif
diff --git a/src/lib-mail/test-mail-html2text.c b/src/lib-mail/test-mail-html2text.c
new file mode 100644
index 0000000000..18f936bcb0
--- /dev/null
+++ b/src/lib-mail/test-mail-html2text.c
@@ -0,0 +1,80 @@
+/* Copyright (c) 2015 Dovecot authors, see the included COPYING file */
+
+#include "lib.h"
+#include "str.h"
+#include "istream.h"
+#include "mail-html2text.h"
+#include "test-common.h"
+
+static struct {
+	const char *input;
+	const char *output;
+} tests[] = {	/* each input is fed to the parser one byte at a time */
+	{ "a&amp;&lt;&clubs;&gt;b",
+	  "a&<\xE2\x99\xA3>b" },
+	{ "&", "" },
+	{ "&amp", "" },
+	/* tags, comments, script/style, skipped blockquotes and CDATA */
+	{ "a<style>stylesheet is ignored</style>b",
+	  "a b" },
+	{ "a<stylea>b</stylea>c",
+	  "a b c" },
+	{ "a<!--x <p foo=\"bar\">commented tags ignored also</p> y-->b",
+	  "ab" },
+	{ "a<script>javascript <p>foo</p> ignored</script>b",
+	  "a b" },
+	{ "a<scripta>b</scripta>c",
+	  "a b c" },
+	{ "a<blockquote><blockquote>second level</blockquote>ignored</blockquote>b",
+	  "a b" },
+	{ "a<![CDATA[<style>]] >b</style>]]>c",
+	  "a<style>]] >b</style>c" },
+	/* input that ends in the middle of a tag or CDATA section */
+	{ "a<foo", "a" },
+	{ "a<blockquote", "a" },
+	{ "a<blockquote>foo</blockquote", "a " },
+	{ "a<", "a" },
+	{ "a<![CDATA[b", "ab" },
+	{ "a<![CDATA[b]]", "ab" }
+};
+
+static const char *test_blockquote_input =	/* parsed with flags == 0 */
+	"a<blockquote>b<blockquote><blockquote>c</blockquote>d</blockquote>e</blockquote>f";
+static const char *test_blockquote_output = "a b c d e f";
+
+static void test_mail_html2text(void)
+{
+	string_t *str = t_str_new(128);
+	struct mail_html2text *ht;
+	unsigned int i, j;
+
+	test_begin("mail_html2text()");
+	for (i = 0; i < N_ELEMENTS(tests); i++) {
+		ht = mail_html2text_init(MAIL_HTML2TEXT_FLAG_SKIP_QUOTED);
+		for (j = 0; tests[i].input[j] != '\0'; j++) {
+			unsigned char c = tests[i].input[j];
+			mail_html2text_more(ht, &c, 1, str); /* 1 byte per call */
+		}
+		test_assert_idx(strcmp(str_c(str), tests[i].output) == 0, i);
+		mail_html2text_deinit(&ht);
+		str_truncate(str, 0);	/* str is reused by the next case */
+	}
+
+	/* test without skipping quoted: whole input passed in a single call */
+	ht = mail_html2text_init(0);
+	mail_html2text_more(ht, (const void *)test_blockquote_input,
+			    strlen(test_blockquote_input), str);
+	test_assert(strcmp(str_c(str), test_blockquote_output) == 0);
+	mail_html2text_deinit(&ht);
+
+	test_end();
+}
+
+int main(void)
+{
+	static void (*test_functions[])(void) = {
+		test_mail_html2text,
+		NULL	/* end-of-list marker */
+	};
+	return test_run(test_functions);
+}