]> git.ipfire.org Git - thirdparty/dovecot/core.git/commitdiff
lib-mail: Added mail-html2text API
author Timo Sirainen <tss@iki.fi>
Fri, 16 Jan 2015 22:15:44 +0000 (00:15 +0200)
committer Timo Sirainen <tss@iki.fi>
Fri, 16 Jan 2015 22:15:44 +0000 (00:15 +0200)
What makes it mail-specific is that it allows skipping over data inside
<blockquote>. This code probably doesn't parse HTML perfectly, but it is
hopefully good enough for HTML emails.

src/lib-mail/Makefile.am
src/lib-mail/html-entities.h [new file with mode: 0644]
src/lib-mail/mail-html2text.c [new file with mode: 0644]
src/lib-mail/mail-html2text.h [new file with mode: 0644]
src/lib-mail/test-mail-html2text.c [new file with mode: 0644]

index 9a5a6a516a92adcc6cb8767a7602caa19aed2cd0..085c3c68f117e0c783a8ef0bf9cfe2d15a934ed7 100644 (file)
@@ -13,6 +13,7 @@ libmail_la_SOURCES = \
        istream-header-filter.c \
        istream-nonuls.c \
        istream-qp-decoder.c \
+       mail-html2text.c \
        mail-user-hash.c \
        mbox-from.c \
        message-address.c \
@@ -33,6 +34,9 @@ libmail_la_SOURCES = \
        rfc2231-parser.c \
        rfc822-parser.c
 
+noinst_HEADERS = \
+       html-entities.h
+
 headers = \
        istream-attachment-connector.h \
        istream-attachment-extractor.h \
@@ -43,6 +47,7 @@ headers = \
        istream-qp.h \
        mail-user-hash.h \
        mbox-from.h \
+       mail-html2text.h \
        mail-types.h \
        message-address.h \
        message-binary-part.h \
@@ -71,6 +76,7 @@ test_programs = \
        test-istream-binary-converter \
        test-istream-header-filter \
        test-istream-qp-decoder \
+       test-mail-html2text \
        test-mbox-from \
        test-message-address \
        test-message-date \
@@ -160,6 +166,10 @@ test_message_part_SOURCES = test-message-part.c
 test_message_part_LDADD = message-part.lo message-parser.lo message-header-parser.lo message-size.lo rfc822-parser.lo rfc2231-parser.lo $(test_libs)
 test_message_part_DEPENDENCIES = $(test_deps)
 
+test_mail_html2text_SOURCES = test-mail-html2text.c
+test_mail_html2text_LDADD = mail-html2text.lo $(test_libs)
+test_mail_html2text_DEPENDENCIES = $(test_deps)
+
 test_ostream_dot_SOURCES = test-ostream-dot.c
 test_ostream_dot_LDADD = ostream-dot.lo $(test_libs)
 test_ostream_dot_DEPENDENCIES = $(test_deps)
diff --git a/src/lib-mail/html-entities.h b/src/lib-mail/html-entities.h
new file mode 100644 (file)
index 0000000..a3a9f96
--- /dev/null
@@ -0,0 +1,253 @@
+{ "quot",      0x0022 },
+{ "amp",       0x0026 },
+{ "apos",      0x0027 },
+{ "lt",                0x003C },
+{ "gt",                0x003E },
+{ "nbsp",      0x00A0 },
+{ "iexcl",     0x00A1 },
+{ "cent",      0x00A2 },
+{ "pound",     0x00A3 },
+{ "curren",    0x00A4 },
+{ "yen",       0x00A5 },
+{ "brvbar",    0x00A6 },
+{ "sect",      0x00A7 },
+{ "uml",       0x00A8 },
+{ "copy",      0x00A9 },
+{ "ordf",      0x00AA },
+{ "laquo",     0x00AB },
+{ "not",       0x00AC },
+{ "shy",       0x00AD },
+{ "reg",       0x00AE },
+{ "macr",      0x00AF },
+{ "deg",       0x00B0 },
+{ "plusmn",    0x00B1 },
+{ "sup2",      0x00B2 },
+{ "sup3",      0x00B3 },
+{ "acute",     0x00B4 },
+{ "micro",     0x00B5 },
+{ "para",      0x00B6 },
+{ "middot",    0x00B7 },
+{ "cedil",     0x00B8 },
+{ "sup1",      0x00B9 },
+{ "ordm",      0x00BA },
+{ "raquo",     0x00BB },
+{ "frac14",    0x00BC },
+{ "frac12",    0x00BD },
+{ "frac34",    0x00BE },
+{ "iquest",    0x00BF },
+{ "Agrave",    0x00C0 },
+{ "Aacute",    0x00C1 },
+{ "Acirc",     0x00C2 },
+{ "Atilde",    0x00C3 },
+{ "Auml",      0x00C4 },
+{ "Aring",     0x00C5 },
+{ "AElig",     0x00C6 },
+{ "Ccedil",    0x00C7 },
+{ "Egrave",    0x00C8 },
+{ "Eacute",    0x00C9 },
+{ "Ecirc",     0x00CA },
+{ "Euml",      0x00CB },
+{ "Igrave",    0x00CC },
+{ "Iacute",    0x00CD },
+{ "Icirc",     0x00CE },
+{ "Iuml",      0x00CF },
+{ "ETH",       0x00D0 },
+{ "Ntilde",    0x00D1 },
+{ "Ograve",    0x00D2 },
+{ "Oacute",    0x00D3 },
+{ "Ocirc",     0x00D4 },
+{ "Otilde",    0x00D5 },
+{ "Ouml",      0x00D6 },
+{ "times",     0x00D7 },
+{ "Oslash",    0x00D8 },
+{ "Ugrave",    0x00D9 },
+{ "Uacute",    0x00DA },
+{ "Ucirc",     0x00DB },
+{ "Uuml",      0x00DC },
+{ "Yacute",    0x00DD },
+{ "THORN",     0x00DE },
+{ "szlig",     0x00DF },
+{ "agrave",    0x00E0 },
+{ "aacute",    0x00E1 },
+{ "acirc",     0x00E2 },
+{ "atilde",    0x00E3 },
+{ "auml",      0x00E4 },
+{ "aring",     0x00E5 },
+{ "aelig",     0x00E6 },
+{ "ccedil",    0x00E7 },
+{ "egrave",    0x00E8 },
+{ "eacute",    0x00E9 },
+{ "ecirc",     0x00EA },
+{ "euml",      0x00EB },
+{ "igrave",    0x00EC },
+{ "iacute",    0x00ED },
+{ "icirc",     0x00EE },
+{ "iuml",      0x00EF },
+{ "eth",       0x00F0 },
+{ "ntilde",    0x00F1 },
+{ "ograve",    0x00F2 },
+{ "oacute",    0x00F3 },
+{ "ocirc",     0x00F4 },
+{ "otilde",    0x00F5 },
+{ "ouml",      0x00F6 },
+{ "divide",    0x00F7 },
+{ "oslash",    0x00F8 },
+{ "ugrave",    0x00F9 },
+{ "uacute",    0x00FA },
+{ "ucirc",     0x00FB },
+{ "uuml",      0x00FC },
+{ "yacute",    0x00FD },
+{ "thorn",     0x00FE },
+{ "yuml",      0x00FF },
+{ "OElig",     0x0152 },
+{ "oelig",     0x0153 },
+{ "Scaron",    0x0160 },
+{ "scaron",    0x0161 },
+{ "Yuml",      0x0178 },
+{ "fnof",      0x0192 },
+{ "circ",      0x02C6 },
+{ "tilde",     0x02DC },
+{ "Alpha",     0x0391 },
+{ "Beta",      0x0392 },
+{ "Gamma",     0x0393 },
+{ "Delta",     0x0394 },
+{ "Epsilon",   0x0395 },
+{ "Zeta",      0x0396 },
+{ "Eta",       0x0397 },
+{ "Theta",     0x0398 },
+{ "Iota",      0x0399 },
+{ "Kappa",     0x039A },
+{ "Lambda",    0x039B },
+{ "Mu",                0x039C },
+{ "Nu",                0x039D },
+{ "Xi",                0x039E },
+{ "Omicron",   0x039F },
+{ "Pi",                0x03A0 },
+{ "Rho",       0x03A1 },
+{ "Sigma",     0x03A3 },
+{ "Tau",       0x03A4 },
+{ "Upsilon",   0x03A5 },
+{ "Phi",       0x03A6 },
+{ "Chi",       0x03A7 },
+{ "Psi",       0x03A8 },
+{ "Omega",     0x03A9 },
+{ "alpha",     0x03B1 },
+{ "beta",      0x03B2 },
+{ "gamma",     0x03B3 },
+{ "delta",     0x03B4 },
+{ "epsilon",   0x03B5 },
+{ "zeta",      0x03B6 },
+{ "eta",       0x03B7 },
+{ "theta",     0x03B8 },
+{ "iota",      0x03B9 },
+{ "kappa",     0x03BA },
+{ "lambda",    0x03BB },
+{ "mu",                0x03BC },
+{ "nu",                0x03BD },
+{ "xi",                0x03BE },
+{ "omicron",   0x03BF },
+{ "pi",                0x03C0 },
+{ "rho",       0x03C1 },
+{ "sigmaf",    0x03C2 },
+{ "sigma",     0x03C3 },
+{ "tau",       0x03C4 },
+{ "upsilon",   0x03C5 },
+{ "phi",       0x03C6 },
+{ "chi",       0x03C7 },
+{ "psi",       0x03C8 },
+{ "omega",     0x03C9 },
+{ "thetasym",  0x03D1 },
+{ "upsih",     0x03D2 },
+{ "piv",       0x03D6 },
+{ "ensp",      0x2002 },
+{ "emsp",      0x2003 },
+{ "thinsp",    0x2009 },
+{ "zwnj",      0x200C },
+{ "zwj",       0x200D },
+{ "lrm",       0x200E },
+{ "rlm",       0x200F },
+{ "ndash",     0x2013 },
+{ "mdash",     0x2014 },
+{ "lsquo",     0x2018 },
+{ "rsquo",     0x2019 },
+{ "sbquo",     0x201A },
+{ "ldquo",     0x201C },
+{ "rdquo",     0x201D },
+{ "bdquo",     0x201E },
+{ "dagger",    0x2020 },
+{ "Dagger",    0x2021 },
+{ "bull",      0x2022 },
+{ "hellip",    0x2026 },
+{ "permil",    0x2030 },
+{ "prime",     0x2032 },
+{ "Prime",     0x2033 },
+{ "lsaquo",    0x2039 },
+{ "rsaquo",    0x203A },
+{ "oline",     0x203E },
+{ "frasl",     0x2044 },
+{ "euro",      0x20AC },
+{ "image",     0x2111 },
+{ "weierp",    0x2118 },
+{ "real",      0x211C },
+{ "trade",     0x2122 },
+{ "alefsym",   0x2135 },
+{ "larr",      0x2190 },
+{ "uarr",      0x2191 },
+{ "rarr",      0x2192 },
+{ "darr",      0x2193 },
+{ "harr",      0x2194 },
+{ "crarr",     0x21B5 },
+{ "lArr",      0x21D0 },
+{ "uArr",      0x21D1 },
+{ "rArr",      0x21D2 },
+{ "dArr",      0x21D3 },
+{ "hArr",      0x21D4 },
+{ "forall",    0x2200 },
+{ "part",      0x2202 },
+{ "exist",     0x2203 },
+{ "empty",     0x2205 },
+{ "nabla",     0x2207 },
+{ "isin",      0x2208 },
+{ "notin",     0x2209 },
+{ "ni",                0x220B },
+{ "prod",      0x220F },
+{ "sum",       0x2211 },
+{ "minus",     0x2212 },
+{ "lowast",    0x2217 },
+{ "radic",     0x221A },
+{ "prop",      0x221D },
+{ "infin",     0x221E },
+{ "ang",       0x2220 },
+{ "and",       0x2227 },
+{ "or",                0x2228 },
+{ "cap",       0x2229 },
+{ "cup",       0x222A },
+{ "int",       0x222B },
+{ "there4",    0x2234 },
+{ "sim",       0x223C },
+{ "cong",      0x2245 },
+{ "asymp",     0x2248 },
+{ "ne",                0x2260 },
+{ "equiv",     0x2261 },
+{ "le",                0x2264 },
+{ "ge",                0x2265 },
+{ "sub",       0x2282 },
+{ "sup",       0x2283 },
+{ "nsub",      0x2284 },
+{ "sube",      0x2286 },
+{ "supe",      0x2287 },
+{ "oplus",     0x2295 },
+{ "otimes",    0x2297 },
+{ "perp",      0x22A5 },
+{ "sdot",      0x22C5 },
+{ "lceil",     0x2308 },
+{ "rceil",     0x2309 },
+{ "lfloor",    0x230A },
+{ "rfloor",    0x230B },
+{ "lang",      0x27E8 },
+{ "rang",      0x27E9 },
+{ "loz",       0x25CA },
+{ "spades",    0x2660 },
+{ "clubs",     0x2663 },
+{ "hearts",    0x2665 },
+{ "diams",     0x2666 }
diff --git a/src/lib-mail/mail-html2text.c b/src/lib-mail/mail-html2text.c
new file mode 100644 (file)
index 0000000..e9b52ef
--- /dev/null
@@ -0,0 +1,323 @@
+/* Copyright (c) 2011-2015 Dovecot authors, see the included COPYING file */
+
+#include "lib.h"
+#include "buffer.h"
+#include "unichar.h"
+#include "message-parser.h"
+#include "mail-html2text.h"
+
+/* Evaluates to non-zero when c is an ASCII space, tab, CR or LF. The argument
+   is expanded several times, so it must not have side effects.
+
+   Zero-width space (&#x200B;) apparently also belongs here, but that gets a
+   bit tricky to handle. Is it actually used anywhere? */
+#define HTML_WHITESPACE(c) \
+       ((c) == ' ' || (c) == '\t' || (c) == '\r' || (c) == '\n')
+
+/* Parser states. The current state is stored in struct mail_html2text, so it
+   is preserved between mail_html2text_more() calls. */
+enum html_state {
+       /* regular text */
+       HTML_STATE_TEXT,
+       /* tag outside "quoted string" */
+       HTML_STATE_TAG,
+       /* tag inside "double quoted string" */
+       HTML_STATE_TAG_DQUOTED,
+       /* backslash was seen inside a "double quoted string": the next
+          character is skipped without being interpreted */
+       HTML_STATE_TAG_DQUOTED_ESCAPE,
+       /* tag inside 'single quoted string' */
+       HTML_STATE_TAG_SQUOTED,
+       /* backslash was seen inside a 'single quoted string': the next
+          character is skipped without being interpreted */
+       HTML_STATE_TAG_SQUOTED_ESCAPE,
+       /* comment */
+       HTML_STATE_COMMENT,
+       /* comment is ending, we've seen "--" and now just waiting for ">".
+          Whitespace keeps waiting, anything else goes back to the comment
+          state. */
+       HTML_STATE_COMMENT_END,
+       /* (java)script: everything is ignored until "</script>" */
+       HTML_STATE_SCRIPT,
+       /* CSS style: everything is ignored until "</style>" */
+       HTML_STATE_STYLE,
+       /* <![CDATA[...]]>: contents are copied as-is until "]]>" */
+       HTML_STATE_CDATA
+};
+
+/* Converter context that is carried across mail_html2text_more() calls. */
+struct mail_html2text {
+       /* flags given to mail_html2text_init() */
+       enum mail_html2text_flags flags;
+       /* current parser state */
+       enum html_state state;
+       /* input bytes that couldn't be parsed yet, because more data is
+          needed to decide what they mean */
+       buffer_t *input;
+       /* current <blockquote> nesting depth. Text is written to output only
+          while this is 0. It is changed only when
+          MAIL_HTML2TEXT_FLAG_SKIP_QUOTED is set. */
+       unsigned int quote_level;
+       /* NOTE(review): this is only ever cleared, never set or read, in
+          this file - looks unused, confirm before relying on it */
+       bool ignore_next_text;
+};
+
+/* Named HTML character entities and their Unicode code points. The
+   initializer list comes from html-entities.h. The table is only ever read,
+   so it is const. */
+static const struct {
+       const char *name;
+       unichar_t chr;
+} html_entities[] = {
+#include "html-entities.h"
+};
+
+/* Allocates a new HTML-to-text converter that uses the given flags and has
+   an empty pending-input buffer. The parser state isn't set explicitly here;
+   presumably i_new() zero-fills the struct so that parsing starts in
+   HTML_STATE_TEXT - TODO confirm. Free with mail_html2text_deinit(). */
+struct mail_html2text *
+mail_html2text_init(enum mail_html2text_flags flags)
+{
+       struct mail_html2text *ctx = i_new(struct mail_html2text, 1);
+
+       ctx->input = buffer_create_dynamic(default_pool, 512);
+       ctx->flags = flags;
+       return ctx;
+}
+
+/* Decides what kind of tag begins after a '<'. data points at the first byte
+   following the '<' and size is the number of bytes available there.
+
+   On success ht->state is updated and the return value is the number of
+   input bytes consumed, counted from (and including) the '<' itself. For
+   plain tags only the '<' is consumed and the rest is handled in
+   HTML_STATE_TAG. Returns 0 without modifying ht when more input is needed
+   to make the decision. */
+static size_t
+parse_tag_name(struct mail_html2text *ht,
+              const unsigned char *data, size_t size)
+{
+       size_t i;
+
+       if (size >= 3 && memcmp(data, "!--", 3) == 0) {
+               ht->state = HTML_STATE_COMMENT;
+               return 3 + 1;
+       }
+       /* the tag name must be followed by whitespace or '>', so that e.g.
+          <scripta> isn't mistaken for <script> */
+       if (size >= 7 && i_memcasecmp(data, "script", 6) == 0 &&
+           (HTML_WHITESPACE(data[6]) || data[6] == '>')) {
+               ht->state = HTML_STATE_SCRIPT;
+               return 7 + 1;
+       }
+       if (size >= 6 && i_memcasecmp(data, "style", 5) == 0 &&
+           (HTML_WHITESPACE(data[5]) || data[5] == '>')) {
+               ht->state = HTML_STATE_STYLE;
+               return 6 + 1;
+       }
+       if (size >= 8 && i_memcasecmp(data, "![CDATA[", 8) == 0) {
+               ht->state = HTML_STATE_CDATA;
+               return 8 + 1;
+       }
+
+       if ((ht->flags & MAIL_HTML2TEXT_FLAG_SKIP_QUOTED) != 0) {
+               /* data[10] is read below, so at least 11 bytes must be
+                  available */
+               if (size >= 11 && i_memcasecmp(data, "blockquote", 10) == 0 &&
+                   (HTML_WHITESPACE(data[10]) || data[10] == '>')) {
+                       ht->quote_level++;
+                       ht->state = HTML_STATE_TAG;
+                       return 1;
+               } else if (ht->quote_level > 0 &&
+                          size >= 12 && i_memcasecmp(data, "/blockquote>", 12) == 0) {
+                       if (--ht->quote_level == 0)
+                               ht->ignore_next_text = FALSE;
+                       ht->state = HTML_STATE_TAG;
+                       return 1;
+               }
+       }
+       if (size < 12) {
+               /* can we see the whole tag name? 12 bytes is the longest
+                  sequence matched above ("/blockquote>"), so with less
+                  data than that an unterminated name might still turn
+                  into one of the special tags. */
+               for (i = 0; i < size; i++) {
+                       if (HTML_WHITESPACE(data[i]) || data[i] == '>')
+                               break;
+               }
+               if (i == size) {
+                       /* need more data */
+                       return 0;
+               }
+       }
+       ht->state = HTML_STATE_TAG;
+       return 1;
+}
+
+/* Looks up the code point of a named HTML character entity. name is given
+   without the leading '&' and the trailing ';'. Returns TRUE and stores the
+   code point in chr_r if the name is known, FALSE otherwise.
+
+   Entity names are case-sensitive ("alpha" and "Alpha", "eacute" and
+   "Eacute" are different characters), so an exact match always wins. A
+   case-insensitive match is used only as a fallback for sloppy input such
+   as "&AMP;". */
+static bool html_entity_get_unichar(const char *name, unichar_t *chr_r)
+{
+       unsigned int i;
+
+       for (i = 0; i < N_ELEMENTS(html_entities); i++) {
+               if (strcmp(html_entities[i].name, name) == 0) {
+                       *chr_r = html_entities[i].chr;
+                       return TRUE;
+               }
+       }
+       /* no exact match - be lenient about the case */
+       for (i = 0; i < N_ELEMENTS(html_entities); i++) {
+               if (strcasecmp(html_entities[i].name, name) == 0) {
+                       *chr_r = html_entities[i].chr;
+                       return TRUE;
+               }
+       }
+       return FALSE;
+}
+
+/* Decodes one character entity. data points at the first byte after the '&'
+   and size is the number of bytes available there.
+
+   Returns 0 if the terminating ';' hasn't been seen yet (more input is
+   needed) and 1 if the entity is broken, i.e. whitespace or too many bytes
+   were found before the ';', in which case only the '&' is consumed.
+   Otherwise returns the length of the whole "&name;" sequence. A known
+   entity is appended to output as UTF-8, an unknown one is consumed without
+   writing anything. */
+static size_t parse_entity(const unsigned char *data, size_t size,
+                          buffer_t *output)
+{
+       char entity[10];
+       unichar_t ucs;
+       size_t i;
+
+       for (i = 0; ; i++) {
+               if (i == size) {
+                       /* no ';' yet */
+                       return 0;
+               }
+               if (i >= sizeof(entity) || HTML_WHITESPACE(data[i])) {
+                       /* broken entity */
+                       return 1;
+               }
+               if (data[i] == ';')
+                       break;
+       }
+
+       i_assert(i < sizeof(entity));
+       memcpy(entity, data, i);
+       entity[i] = '\0';
+
+       if (html_entity_get_unichar(entity, &ucs))
+               uni_ucs4_to_utf8_c(ucs, output);
+       /* the name plus '&' and ';' */
+       return i + 2;
+}
+
+/* Appends a single space to output, unless output is still empty or it
+   already ends with a space. */
+static void mail_html2text_add_space(buffer_t *output)
+{
+       const unsigned char *bytes;
+
+       if (output->used == 0)
+               return;
+
+       bytes = output->data;
+       if (bytes[output->used-1] != ' ')
+               buffer_append_c(output, ' ');
+}
+
+/* Runs the HTML state machine over data[0..size-1] and appends the
+   resulting plain text to output. The parser state is kept in ht, so
+   parsing can be continued by a later call.
+
+   Returns the number of bytes consumed. A value smaller than size means
+   that the parser stopped at a construct (tag name, entity or the
+   terminator of a comment, script, style or CDATA section) which can't be
+   decided without more input. The caller must present the unconsumed bytes
+   again together with the data that follows them. */
+static size_t
+parse_data(struct mail_html2text *ht,
+          const unsigned char *data, size_t size, buffer_t *output)
+{
+       size_t i, ret;
+
+       for (i = 0; i < size; i++) {
+               char c = data[i];
+
+               switch (ht->state) {
+               case HTML_STATE_TEXT:
+                       if (c == '<') {
+                               ret = parse_tag_name(ht, data+i+1, size-i-1);
+                               if (ret == 0)
+                                       return i;
+                               /* ret includes the '<' itself, which is
+                                  accounted for by the loop's i++ */
+                               i += ret - 1;
+                       } else if (c == '&') {
+                               size_t orig_used = output->used;
+
+                               ret = parse_entity(data+i+1, size-i-1, output);
+                               if (ret == 0)
+                                       return i;
+                               if (ht->quote_level > 0) {
+                                       /* inside skipped <blockquote>: drop
+                                          the decoded character the same
+                                          way as other quoted text */
+                                       buffer_set_used_size(output, orig_used);
+                               }
+                               i += ret - 1;
+                       } else if (ht->quote_level == 0) {
+                               buffer_append_c(output, c);
+                       }
+                       break;
+               case HTML_STATE_TAG:
+                       if (c == '"')
+                               ht->state = HTML_STATE_TAG_DQUOTED;
+                       else if (c == '\'')
+                               ht->state = HTML_STATE_TAG_SQUOTED;
+                       else if (c == '>') {
+                               /* a tag is replaced with a single space */
+                               ht->state = HTML_STATE_TEXT;
+                               mail_html2text_add_space(output);
+                       }
+                       break;
+               case HTML_STATE_TAG_DQUOTED:
+                       if (c == '"')
+                               ht->state = HTML_STATE_TAG;
+                       else if (c == '\\')
+                               ht->state = HTML_STATE_TAG_DQUOTED_ESCAPE;
+                       break;
+               case HTML_STATE_TAG_DQUOTED_ESCAPE:
+                       ht->state = HTML_STATE_TAG_DQUOTED;
+                       break;
+               case HTML_STATE_TAG_SQUOTED:
+                       if (c == '\'')
+                               ht->state = HTML_STATE_TAG;
+                       else if (c == '\\')
+                               ht->state = HTML_STATE_TAG_SQUOTED_ESCAPE;
+                       break;
+               case HTML_STATE_TAG_SQUOTED_ESCAPE:
+                       ht->state = HTML_STATE_TAG_SQUOTED;
+                       break;
+               case HTML_STATE_COMMENT:
+                       if (c == '-') {
+                               /* the next byte is needed to know if this
+                                  is "--" */
+                               if (i+1 == size)
+                                       return i;
+                               if (data[i+1] == '-') {
+                                       ht->state = HTML_STATE_COMMENT_END;
+                                       i++;
+                               }
+                       }
+                       break;
+               case HTML_STATE_COMMENT_END:
+                       if (c == '>')
+                               ht->state = HTML_STATE_TEXT;
+                       else if (!HTML_WHITESPACE(c))
+                               ht->state = HTML_STATE_COMMENT;
+                       break;
+               case HTML_STATE_SCRIPT:
+                       if (c == '<') {
+                               unsigned int max_len = I_MIN(size-i, 9);
+
+                               /* a matching prefix that is shorter than
+                                  the whole end tag means that more data is
+                                  needed */
+                               if (i_memcasecmp(data+i, "</script>", max_len) == 0) {
+                                       if (max_len < 9)
+                                               return i;
+                                       mail_html2text_add_space(output);
+                                       ht->state = HTML_STATE_TEXT;
+                                       i += 8;
+                               }
+                       }
+                       break;
+               case HTML_STATE_STYLE:
+                       if (c == '<') {
+                               unsigned int max_len = I_MIN(size-i, 8);
+
+                               if (i_memcasecmp(data+i, "</style>", max_len) == 0) {
+                                       if (max_len < 8)
+                                               return i;
+                                       mail_html2text_add_space(output);
+                                       ht->state = HTML_STATE_TEXT;
+                                       i += 7;
+                               }
+                       }
+                       break;
+               case HTML_STATE_CDATA:
+                       if (c == ']') {
+                               unsigned int max_len = I_MIN(size-i, 3);
+
+                               if (i_memcasecmp(data+i, "]]>", max_len) == 0) {
+                                       if (max_len < 3)
+                                               return i;
+                                       ht->state = HTML_STATE_TEXT;
+                                       i += 2;
+                                       break;
+                               }
+                       }
+                       /* CDATA contents are copied without interpreting
+                          them */
+                       if (ht->quote_level == 0)
+                               buffer_append_c(output, c);
+                       break;
+               }
+       }
+       return i;
+}
+
+/* Feeds size (> 0) bytes of HTML into the converter and appends the
+   resulting plain text to output. The input may be split at arbitrary byte
+   boundaries between calls: bytes that can't be interpreted yet are kept in
+   ht->input and parsed again once more data is available. */
+void mail_html2text_more(struct mail_html2text *ht,
+                        const unsigned char *data, size_t size,
+                        buffer_t *output)
+{
+       size_t pos, inc_size, buf_orig_size;
+
+       i_assert(size > 0);
+
+       while (ht->input->used > 0) {
+               /* we didn't get enough input the last time to know
+                  what to do. */
+               buf_orig_size = ht->input->used;
+
+               inc_size = I_MIN(size, 128);
+               buffer_append(ht->input, data, inc_size);
+               pos = parse_data(ht, ht->input->data,
+                                ht->input->used, output);
+               if (pos >= buf_orig_size) {
+                       /* everything that was buffered earlier got parsed.
+                          continue directly from the caller's data. */
+                       data += pos - buf_orig_size;
+                       size -= pos - buf_orig_size;
+                       buffer_set_used_size(ht->input, 0);
+               } else {
+                       /* the parser stopped within the previously buffered
+                          bytes. this happens not only when nothing could
+                          be parsed (pos == 0), but also when e.g. a broken
+                          entity is directly followed by another unfinished
+                          one. drop what was consumed and keep collecting
+                          more data into the buffer. */
+                       if (pos > 0)
+                               buffer_delete(ht->input, 0, pos);
+                       data += inc_size;
+                       size -= inc_size;
+                       if (size == 0)
+                               return;
+               }
+       }
+       pos = parse_data(ht, data, size, output);
+       /* remember the bytes that need more data before they can be parsed */
+       buffer_append(ht->input, data + pos, size - pos);
+}
+
+/* Frees the converter together with its pending-input buffer and sets the
+   caller's pointer to NULL. Input that is still waiting in the buffer is
+   discarded. */
+void mail_html2text_deinit(struct mail_html2text **_ht)
+{
+       struct mail_html2text *ctx = *_ht;
+
+       *_ht = NULL;
+       buffer_free(&ctx->input);
+       i_free(ctx);
+}
diff --git a/src/lib-mail/mail-html2text.h b/src/lib-mail/mail-html2text.h
new file mode 100644 (file)
index 0000000..7a2b545
--- /dev/null
@@ -0,0 +1,15 @@
+#ifndef MAIL_HTML2TEXT_H
+#define MAIL_HTML2TEXT_H
+
+enum mail_html2text_flags {
+       /* Don't output text that is inside <blockquote> elements. */
+       MAIL_HTML2TEXT_FLAG_SKIP_QUOTED = 0x01
+};
+
+/* Create a new HTML-to-text converter. */
+struct mail_html2text *
+mail_html2text_init(enum mail_html2text_flags flags);
+/* Parse more HTML input and append the resulting text to output. size must
+   be larger than 0. The input can be given in arbitrarily split pieces:
+   data that can't be interpreted yet is buffered internally until the next
+   call. */
+void mail_html2text_more(struct mail_html2text *ht,
+                        const unsigned char *data, size_t size,
+                        buffer_t *output);
+/* Free the converter and set *ht to NULL. */
+void mail_html2text_deinit(struct mail_html2text **ht);
+
+#endif
diff --git a/src/lib-mail/test-mail-html2text.c b/src/lib-mail/test-mail-html2text.c
new file mode 100644 (file)
index 0000000..18f936b
--- /dev/null
@@ -0,0 +1,80 @@
+/* Copyright (c) 2015 Dovecot authors, see the included COPYING file */
+
+#include "lib.h"
+#include "str.h"
+#include "istream.h"
+#include "mail-html2text.h"
+#include "test-common.h"
+
+/* Input/expected output pairs. They are run with
+   MAIL_HTML2TEXT_FLAG_SKIP_QUOTED and the input is fed one byte at a time. */
+static struct {
+       const char *input;
+       const char *output;
+} tests[] = {
+       /* entities; an unterminated entity produces no output */
+       { "a&amp;&lt;&clubs;&gt;b",
+         "a&<\xE2\x99\xA3>b" },
+       { "&", "" },
+       { "&amp", "" },
+
+       /* tags become a space; style, comment, script and quoted contents
+          are dropped; CDATA contents are copied as-is */
+       { "a<style>stylesheet is ignored</style>b",
+         "a b" },
+       { "a<stylea>b</stylea>c",
+         "a b c" },
+       { "a<!--x <p foo=\"bar\">commented tags ignored also</p> y-->b",
+         "ab" },
+       { "a<script>javascript <p>foo</p> ignored</script>b",
+         "a b" },
+       { "a<scripta>b</scripta>c",
+         "a b c" },
+       { "a<blockquote><blockquote>second level</blockquote>ignored</blockquote>b",
+         "a b" },
+       { "a<![CDATA[<style>]] >b</style>]]>c",
+         "a<style>]] >b</style>c" },
+
+       /* input that ends in the middle of a tag or a CDATA section */
+       { "a<foo", "a" },
+       { "a<blockquote", "a" },
+       { "a<blockquote>foo</blockquote", "a " },
+       { "a<", "a" },
+       { "a<![CDATA[b", "ab" },
+       { "a<![CDATA[b]]", "ab" }
+};
+
+/* Input and expected output for the run without
+   MAIL_HTML2TEXT_FLAG_SKIP_QUOTED: the text inside the blockquotes is kept
+   and each tag is replaced with a single space. */
+static const char *test_blockquote_input =
+       "a<blockquote>b<blockquote><blockquote>c</blockquote>d</blockquote>e</blockquote>f";
+static const char *test_blockquote_output = "a b c d e f";
+
+/* Runs every tests[] entry through a fresh converter one input byte at a
+   time, and then converts the blockquote sample in a single call without
+   MAIL_HTML2TEXT_FLAG_SKIP_QUOTED. */
+static void test_mail_html2text(void)
+{
+       struct mail_html2text *ht;
+       string_t *str = t_str_new(128);
+       const char *p;
+       unsigned int i;
+
+       test_begin("mail_html2text()");
+       for (i = 0; i < N_ELEMENTS(tests); i++) {
+               ht = mail_html2text_init(MAIL_HTML2TEXT_FLAG_SKIP_QUOTED);
+               /* single byte writes exercise the buffering of partially
+                  seen input */
+               for (p = tests[i].input; *p != '\0'; p++) {
+                       unsigned char byte = *p;
+
+                       mail_html2text_more(ht, &byte, 1, str);
+               }
+               test_assert_idx(strcmp(str_c(str), tests[i].output) == 0, i);
+               mail_html2text_deinit(&ht);
+               str_truncate(str, 0);
+       }
+
+       /* test without skipping quoted */
+       ht = mail_html2text_init(0);
+       mail_html2text_more(ht, (const void *)test_blockquote_input,
+                           strlen(test_blockquote_input), str);
+       test_assert(strcmp(str_c(str), test_blockquote_output) == 0);
+       mail_html2text_deinit(&ht);
+
+       test_end();
+}
+
+/* Test program entry point: passes the NULL-terminated list of test
+   functions to test_run() and returns its result. */
+int main(void)
+{
+       static void (*html2text_tests[])(void) = {
+               test_mail_html2text,
+               NULL
+       };
+
+       return test_run(html2text_tests);
+}