]> git.ipfire.org Git - thirdparty/rspamd.git/commitdiff
* Add simple html parser and tag balancing detector
authorVsevolod Stakhov <vsevolod@rambler-co.ru>
Fri, 15 May 2009 14:15:54 +0000 (18:15 +0400)
committerVsevolod Stakhov <vsevolod@rambler-co.ru>
Fri, 15 May 2009 14:15:54 +0000 (18:15 +0400)
* Add function for searching html tag

CMakeLists.txt
README.utf8.txt
src/expressions.c
src/html.c [new file with mode: 0644]
src/html.h [new file with mode: 0644]
src/message.c
src/message.h

index e346370d175457407ef313bb14830f20482af3b1..f744fff03a370b9b33fd0b4fe37e38d0a112d39c 100644 (file)
@@ -7,7 +7,7 @@ PROJECT(rspamd C)
 
 SET(RSPAMD_VERSION_MAJOR 0)
 SET(RSPAMD_VERSION_MINOR 1)
-SET(RSPAMD_VERSION_PATCH 1)
+SET(RSPAMD_VERSION_PATCH 2)
 
 SET(RSPAMD_VERSION         "${RSPAMD_VERSION_MAJOR}.${RSPAMD_VERSION_MINOR}.${RSPAMD_VERSION_PATCH}")
 SET(RSPAMD_MASTER_SITE_URL "http://cebka.pp.ru/hg/rspamd")
@@ -304,6 +304,7 @@ SET(RSPAMDSRC       src/modules.c
                                src/controller.c
                                src/cfg_utils.c
                                src/buffer.c
+                               src/html.c
                                src/lmtp.c
                                src/lmtp_proto.c)
 
@@ -339,6 +340,7 @@ SET(TESTDEPENDS     src/mem_pool.c
                                src/fuzzy.c
                                src/memcached.c
                                src/message.c
+                               src/html.c
                                src/expressions.c
                                src/statfile.c)
 
@@ -351,6 +353,7 @@ SET(UTILSDEPENDS src/mem_pool.c
                                src/fuzzy.c
                                src/expressions.c
                                src/message.c
+                               src/html.c
                                src/util.c)
 
 LIST(LENGTH PLUGINSSRC RSPAMD_MODULES_NUM)
index a52e380f4e2a8c1d33d1ca78e72f626aee5a4cf3..b27e2876f1ba63f3c80ffdaed1320fac7cce6999 100644 (file)
@@ -156,6 +156,8 @@ SOME_SYMBOL = "${to_blah} & !(${from_blah} | ${subject_blah})"
   * has_only_html_part - функция возвращает TRUE, если в сообщении есть только одна HTML часть
   * compare_recipients_distance - вычисляет процент схожих получателей письма. Принимает аргумент - порог в процентах похожести.
   * is_recipients_sorted - возвращает TRUE, если список получателей сортирован (работает только если число получателей >= 5).
+  * is_html_balanced - возвращает TRUE, если теги всех html частей сбалансированы
+  * has_html_tag - возвращает TRUE, если заданный html тег найден
 
 Модуль chartable.
 ================
index c7b88adb9560782634e4d676008cd3a69560be70..05bc12e88716c013be36deccf7666c86e86abe07 100644 (file)
@@ -29,6 +29,7 @@
 #include "message.h"
 #include "fuzzy.h"
 #include "expressions.h"
+#include "html.h"
 
 gboolean rspamd_compare_encoding (struct worker_task *task, GList *args);
 gboolean rspamd_header_exists (struct worker_task *task, GList *args);
@@ -43,6 +44,8 @@ gboolean rspamd_has_content_part_len (struct worker_task *task, GList *args);
 gboolean rspamd_has_only_html_part (struct worker_task *task, GList *args);
 gboolean rspamd_is_recipients_sorted (struct worker_task *task, GList *args);
 gboolean rspamd_compare_transfer_encoding (struct worker_task *task, GList *args);
+gboolean rspamd_is_html_balanced (struct worker_task *task, GList *args);
+gboolean rspamd_has_html_tag (struct worker_task *task, GList *args);
 
 /*
  * List of internal functions of rspamd
@@ -62,8 +65,10 @@ static struct _fl {
        { "content_type_is_type", rspamd_content_type_is_type },
        { "has_content_part", rspamd_has_content_part },
        { "has_content_part_len", rspamd_has_content_part_len },
+       { "has_html_tag", rspamd_has_html_tag },
        { "has_only_html_part", rspamd_has_only_html_part },
        { "header_exists", rspamd_header_exists },
+       { "is_html_balanced", rspamd_is_html_balanced },
        { "is_recipients_sorted", rspamd_is_recipients_sorted },
 };
 
@@ -1523,6 +1528,92 @@ rspamd_compare_transfer_encoding (struct worker_task *task, GList *args)
        return FALSE;
 }
 
+/*
+ * Expression function: returns FALSE as soon as one HTML text part of the
+ * task is marked as unbalanced, TRUE otherwise (including the case when the
+ * task has no HTML parts at all). The args list is not used.
+ */
+gboolean
+rspamd_is_html_balanced (struct worker_task *task, GList *args)
+{
+       struct mime_text_part *part;
+       GList *it;
+
+       for (it = g_list_first (task->text_parts); it != NULL; it = g_list_next (it)) {
+               part = it->data;
+               /* Only HTML parts take part in the decision */
+               if (part->is_html && !part->is_balanced) {
+                       return FALSE;
+               }
+       }
+
+       return TRUE;
+}
+
+/* Search state handed to search_html_node_callback through g_node_traverse */
+struct html_callback_data {
+       struct html_tag *tag;           /* tag definition that is searched for */
+       gboolean *res;                  /* set to TRUE when the tag is found */
+};
+
+/*
+ * Node traversal callback: when the visited node refers to the searched tag
+ * definition, store TRUE in the result and return TRUE to stop the
+ * traversal; otherwise return FALSE so the traversal goes on.
+ */
+static gboolean
+search_html_node_callback (GNode *node, gpointer data)
+{
+       struct html_callback_data *search = data;
+       struct html_node *visited = node->data;
+
+       /* Nodes without data (the tree root) can never match */
+       if (visited == NULL || visited->tag != search->tag) {
+               return FALSE;
+       }
+
+       *search->res = TRUE;
+       return TRUE;
+}
+
+/*
+ * Expression function: returns TRUE if the HTML tag named by the first
+ * argument occurs in the parsed node tree of any HTML text part of the task.
+ * Returns FALSE (after logging a warning) when no argument is given, when
+ * the argument cannot be resolved or when it does not name a known tag.
+ */
+gboolean
+rspamd_has_html_tag (struct worker_task *task, GList *args)
+{
+       struct mime_text_part *p;
+       GList *cur;
+       struct expression_argument *arg;
+       struct html_tag *tag;
+       gboolean res = FALSE;
+       struct html_callback_data cd;
+
+       if (args == NULL) {
+               msg_warn ("rspamd_has_html_tag: no parameters to function");
+               return FALSE;
+       }
+
+       arg = get_function_arg (args->data, task, TRUE);
+       /* Never pass an unresolved argument to the tag lookup or to '%s' */
+       if (arg == NULL || arg->data == NULL) {
+               msg_warn ("rspamd_has_html_tag: invalid argument to function");
+               return FALSE;
+       }
+
+       tag = get_tag_by_name (arg->data);
+       if (tag == NULL) {
+               msg_warn ("rspamd_has_html_tag: unknown tag type passed as argument: %s", (char *)arg->data);
+               return FALSE;
+       }
+
+       cur = g_list_first (task->text_parts);
+       cd.res = &res;
+       cd.tag = tag;
+
+       /* Stop walking the parts as soon as the callback reports a match */
+       while (cur && res == FALSE) {
+               p = cur->data;
+               if (p->is_html && p->html_nodes) {
+                       g_node_traverse (p->html_nodes, G_PRE_ORDER, G_TRAVERSE_ALL, -1, search_html_node_callback, &cd);
+               }
+               cur = g_list_next (cur);
+       }
+
+       return res;
+}
+
 /*
  * vi:ts=4
  */
diff --git a/src/html.c b/src/html.c
new file mode 100644 (file)
index 0000000..9a816c4
--- /dev/null
@@ -0,0 +1,319 @@
+/*
+ * Copyright (c) 2009, Rambler media
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY Rambler media ''AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL Rambler BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+#include "util.h"
+#include "main.h"
+#include "message.h"
+#include "html.h"
+
+sig_atomic_t tags_sorted = 0;
+
+static struct html_tag tag_defs[] =
+{
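+  /*
+   * W3C defined elements.
+   * NOTE: the table as a whole is not declared in sorted order ("nextid"
+   * and the proprietary elements below break it), so it has to be sorted
+   * with qsort () and tag_cmp before bsearch () may be used on it.
+   */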
+  { Tag_A,          "a",          (CM_INLINE)},
+  { Tag_ABBR,       "abbr",       (CM_INLINE)},
+  { Tag_ACRONYM,    "acronym",    (CM_INLINE)},
+  { Tag_ADDRESS,    "address",    (CM_BLOCK)},
+  { Tag_APPLET,     "applet",     (CM_OBJECT|CM_IMG|CM_INLINE|CM_PARAM)},
+  { Tag_AREA,       "area",       (CM_BLOCK|CM_EMPTY)},
+  { Tag_B,          "b",          (CM_INLINE)},
+  { Tag_BASE,       "base",       (CM_HEAD|CM_EMPTY)},
+  { Tag_BASEFONT,   "basefont",   (CM_INLINE|CM_EMPTY)},
+  { Tag_BDO,        "bdo",        (CM_INLINE)},
+  { Tag_BIG,        "big",        (CM_INLINE)},
+  { Tag_BLOCKQUOTE, "blockquote", (CM_BLOCK)},
+  { Tag_BODY,       "body",       (CM_HTML|CM_OPT|CM_OMITST)},
+  { Tag_BR,         "br",         (CM_INLINE|CM_EMPTY)},
+  { Tag_BUTTON,     "button",     (CM_INLINE)},
+  { Tag_CAPTION,    "caption",    (CM_TABLE)},
+  { Tag_CENTER,     "center",     (CM_BLOCK)},
+  { Tag_CITE,       "cite",       (CM_INLINE)},
+  { Tag_CODE,       "code",       (CM_INLINE)},
+  { Tag_COL,        "col",        (CM_TABLE|CM_EMPTY)},
+  { Tag_COLGROUP,   "colgroup",   (CM_TABLE|CM_OPT)},
+  { Tag_DD,         "dd",         (CM_DEFLIST|CM_OPT|CM_NO_INDENT)},
+  { Tag_DEL,        "del",        (CM_INLINE|CM_BLOCK|CM_MIXED)},
+  { Tag_DFN,        "dfn",        (CM_INLINE)},
+  { Tag_DIR,        "dir",        (CM_BLOCK|CM_OBSOLETE)},
+  { Tag_DIV,        "div",        (CM_BLOCK)},
+  { Tag_DL,         "dl",         (CM_BLOCK)},
+  { Tag_DT,         "dt",         (CM_DEFLIST|CM_OPT|CM_NO_INDENT)},
+  { Tag_EM,         "em",         (CM_INLINE)},
+  { Tag_FIELDSET,   "fieldset",   (CM_BLOCK)},
+  { Tag_FONT,       "font",       (CM_INLINE)},
+  { Tag_FORM,       "form",       (CM_BLOCK)},
+  { Tag_FRAME,      "frame",      (CM_FRAMES|CM_EMPTY)},
+  { Tag_FRAMESET,   "frameset",   (CM_HTML|CM_FRAMES)},
+  { Tag_H1,         "h1",         (CM_BLOCK|CM_HEADING)},
+  { Tag_H2,         "h2",         (CM_BLOCK|CM_HEADING)},
+  { Tag_H3,         "h3",         (CM_BLOCK|CM_HEADING)},
+  { Tag_H4,         "h4",         (CM_BLOCK|CM_HEADING)},
+  { Tag_H5,         "h5",         (CM_BLOCK|CM_HEADING)},
+  { Tag_H6,         "h6",         (CM_BLOCK|CM_HEADING)},
+  { Tag_HEAD,       "head",       (CM_HTML|CM_OPT|CM_OMITST)},
+  { Tag_HR,         "hr",         (CM_BLOCK|CM_EMPTY)},
+  { Tag_HTML,       "html",       (CM_HTML|CM_OPT|CM_OMITST)},
+  { Tag_I,          "i",          (CM_INLINE)},
+  { Tag_IFRAME,     "iframe",     (CM_INLINE)},
+  { Tag_IMG,        "img",        (CM_INLINE|CM_IMG|CM_EMPTY)},
+  { Tag_INPUT,      "input",      (CM_INLINE|CM_IMG|CM_EMPTY)},
+  { Tag_INS,        "ins",        (CM_INLINE|CM_BLOCK|CM_MIXED)},
+  { Tag_ISINDEX,    "isindex",    (CM_BLOCK|CM_EMPTY)},
+  { Tag_KBD,        "kbd",        (CM_INLINE)},
+  { Tag_LABEL,      "label",      (CM_INLINE)},
+  { Tag_LEGEND,     "legend",     (CM_INLINE)},
+  { Tag_LI,         "li",         (CM_LIST|CM_OPT|CM_NO_INDENT)},
+  { Tag_LINK,       "link",       (CM_HEAD|CM_EMPTY)},
+  { Tag_LISTING,    "listing",    (CM_BLOCK|CM_OBSOLETE)},
+  { Tag_MAP,        "map",        (CM_INLINE)},
+  { Tag_MENU,       "menu",       (CM_BLOCK|CM_OBSOLETE)},
+  { Tag_META,       "meta",       (CM_HEAD|CM_EMPTY)},
+  { Tag_NOFRAMES,   "noframes",   (CM_BLOCK|CM_FRAMES)},
+  { Tag_NOSCRIPT,   "noscript",   (CM_BLOCK|CM_INLINE|CM_MIXED)},
+  { Tag_OBJECT,     "object",     (CM_OBJECT|CM_HEAD|CM_IMG|CM_INLINE|CM_PARAM)},
+  { Tag_OL,         "ol",         (CM_BLOCK)},
+  { Tag_OPTGROUP,   "optgroup",   (CM_FIELD|CM_OPT)},
+  { Tag_OPTION,     "option",     (CM_FIELD|CM_OPT)},
+  { Tag_P,          "p",          (CM_BLOCK|CM_OPT)},
+  { Tag_PARAM,      "param",      (CM_INLINE|CM_EMPTY)},
+  { Tag_PLAINTEXT,  "plaintext",  (CM_BLOCK|CM_OBSOLETE)},
+  { Tag_PRE,        "pre",        (CM_BLOCK)},
+  { Tag_Q,          "q",          (CM_INLINE)},
+  { Tag_RB,         "rb",         (CM_INLINE)},
+  { Tag_RBC,        "rbc",        (CM_INLINE)},
+  { Tag_RP,         "rp",         (CM_INLINE)},
+  { Tag_RT,         "rt",         (CM_INLINE)},
+  { Tag_RTC,        "rtc",        (CM_INLINE)},
+  { Tag_RUBY,       "ruby",       (CM_INLINE)},
+  { Tag_S,          "s",          (CM_INLINE)},
+  { Tag_SAMP,       "samp",       (CM_INLINE)},
+  { Tag_SCRIPT,     "script",     (CM_HEAD|CM_MIXED|CM_BLOCK|CM_INLINE)},
+  { Tag_SELECT,     "select",     (CM_INLINE|CM_FIELD)},
+  { Tag_SMALL,      "small",      (CM_INLINE)},
+  { Tag_SPAN,       "span",       (CM_INLINE)},
+  { Tag_STRIKE,     "strike",     (CM_INLINE)},
+  { Tag_STRONG,     "strong",     (CM_INLINE)},
+  { Tag_STYLE,      "style",      (CM_HEAD)},
+  { Tag_SUB,        "sub",        (CM_INLINE)},
+  { Tag_SUP,        "sup",        (CM_INLINE)},
+  { Tag_TABLE,      "table",      (CM_BLOCK)},
+  { Tag_TBODY,      "tbody",      (CM_TABLE|CM_ROWGRP|CM_OPT)},
+  { Tag_TD,         "td",         (CM_ROW|CM_OPT|CM_NO_INDENT)},
+  { Tag_TEXTAREA,   "textarea",   (CM_INLINE|CM_FIELD)},
+  { Tag_TFOOT,      "tfoot",      (CM_TABLE|CM_ROWGRP|CM_OPT)},
+  { Tag_TH,         "th",         (CM_ROW|CM_OPT|CM_NO_INDENT)},
+  { Tag_THEAD,      "thead",      (CM_TABLE|CM_ROWGRP|CM_OPT)},
+  { Tag_TITLE,      "title",      (CM_HEAD)},
+  { Tag_TR,         "tr",         (CM_TABLE|CM_OPT)},
+  { Tag_TT,         "tt",         (CM_INLINE)},
+  { Tag_U,          "u",          (CM_INLINE)},
+  { Tag_UL,         "ul",         (CM_BLOCK)},
+  { Tag_VAR,        "var",        (CM_INLINE)},
+  { Tag_XMP,        "xmp",        (CM_BLOCK|CM_OBSOLETE)},
+  { Tag_NEXTID,     "nextid",     (CM_HEAD|CM_EMPTY)},
+
+  /* proprietary elements */
+  { Tag_ALIGN,      "align",      (CM_BLOCK)},
+  { Tag_BGSOUND,    "bgsound",    (CM_HEAD|CM_EMPTY)},
+  { Tag_BLINK,      "blink",      (CM_INLINE)},
+  { Tag_COMMENT,    "comment",    (CM_INLINE)},
+  { Tag_EMBED,      "embed",      (CM_INLINE|CM_IMG|CM_EMPTY)},
+  { Tag_ILAYER,     "ilayer",     (CM_INLINE)},
+  { Tag_KEYGEN,     "keygen",     (CM_INLINE|CM_EMPTY)},
+  { Tag_LAYER,      "layer",      (CM_BLOCK)},
+  { Tag_MARQUEE,    "marquee",    (CM_INLINE|CM_OPT)},
+  { Tag_MULTICOL,   "multicol",   (CM_BLOCK)},
+  { Tag_NOBR,       "nobr",       (CM_INLINE)},
+  { Tag_NOEMBED,    "noembed",    (CM_INLINE)},
+  { Tag_NOLAYER,    "nolayer",    (CM_BLOCK|CM_INLINE|CM_MIXED)},
+  { Tag_NOSAVE,     "nosave",     (CM_BLOCK)},
+  { Tag_SERVER,     "server",     (CM_HEAD|CM_MIXED|CM_BLOCK|CM_INLINE)},
+  { Tag_SERVLET,    "servlet",    (CM_OBJECT|CM_IMG|CM_INLINE|CM_PARAM)},
+  { Tag_SPACER,     "spacer",     (CM_INLINE|CM_EMPTY)},
+  { Tag_WBR,        "wbr",        (CM_INLINE|CM_EMPTY)},
+};
+
+/*
+ * Ordering function for qsort () and bsearch () over html_tag elements:
+ * compares the tag names ignoring ASCII case.
+ */
+static int
+tag_cmp (const void *m1, const void *m2)
+{
+       const char *lhs = ((const struct html_tag *)m1)->name;
+       const char *rhs = ((const struct html_tag *)m2)->name;
+
+       return g_ascii_strcasecmp (lhs, rhs);
+}
+
+/*
+ * Build a tree node for the text of a single tag (the characters between
+ * '<' and '>').
+ *
+ * pool - memory pool the html_node payload is allocated from
+ * text - NUL terminated tag text; it is modified temporarily while the tag
+ *        name is looked up and restored before returning
+ *
+ * Returns a new, unattached GNode, or NULL when the text is NULL/empty or
+ * the tag name is not present in tag_defs. The tag table must already be
+ * sorted when this function is called.
+ */
+static GNode*
+construct_html_node (memory_pool_t *pool, char *text)
+{
+       struct html_node *html;
+       struct html_tag key, *found;
+       char t;
+       size_t taglen;
+
+       if (text == NULL || *text == '\0') {
+               return NULL;
+       }
+
+       /* Measure the text only after it is known to be non-NULL and non-empty */
+       taglen = strlen (text);
+
+       html = memory_pool_alloc0 (pool, sizeof (struct html_node));
+
+       /* Check whether this tag is fully closed */
+       if (text[taglen - 1] == '/') {
+               html->flags |= FL_CLOSED;
+       }
+
+       /* Check xml tag */
+       if (*text == '?' && g_ascii_strncasecmp (text + 1, "xml", sizeof ("xml") - 1) == 0) {
+               /*
+                * An xml declaration has no closing counterpart, so mark it as
+                * closed: otherwise it would become a nesting level that can
+                * never be left and every such part would look unbalanced.
+                */
+               html->flags |= FL_XML | FL_CLOSED;
+               html->tag = NULL;
+       }
+       else {
+               if (*text == '/') {
+                       html->flags |= FL_CLOSING;
+                       text ++;
+               }
+
+               /* Find end of tag name */
+               key.name = text;
+               while (*text && g_ascii_isalnum (*(++text)));
+
+               /* Terminate the name in place for the lookup, then restore the text */
+               t = *text;
+               *text = '\0';
+               found = bsearch (&key, tag_defs, G_N_ELEMENTS (tag_defs), sizeof (struct html_tag), tag_cmp);
+               *text = t;
+
+               /* Unknown tags do not produce a node */
+               if (found == NULL) {
+                       return NULL;
+               }
+               html->tag = found;
+       }
+
+       return g_node_new (html);
+}
+
+/*
+ * Try to pair a closing tag with one of its still open ancestors.
+ *
+ * node      - node that was just appended to the tree
+ * cur_level - current nesting level; moved to the parent of the matched
+ *             ancestor when a pair is found
+ *
+ * Returns TRUE for nodes that are not closing tags and for closing tags that
+ * found their opening ancestor (in that case the closing node itself is
+ * destroyed and must not be used by the caller any more). Returns FALSE for
+ * a closing tag without a matching open ancestor; that node stays in the tree.
+ */
+static gboolean
+check_balance (GNode *node, GNode **cur_level)
+{
+       struct html_node *arg = node->data, *tmp;
+       GNode *cur;
+
+       if ((arg->flags & FL_CLOSING) == 0) {
+               return TRUE;
+       }
+
+       /* First of all check whether this tag is closing tag for parent node */
+       cur = node->parent;
+       while (cur && cur->data) {
+               tmp = cur->data;
+               /* XML nodes carry no tag definition and can never be a match */
+               if (tmp->tag != NULL && tmp->tag->id == arg->tag->id && (tmp->flags & FL_CLOSED) == 0) {
+                       msg_debug ("check_balance: found closing tag for parent '%s'", tmp->tag->name);
+                       tmp->flags |= FL_CLOSED;
+                       /* Destroy current node as we find corresponding parent node */
+                       g_node_destroy (node);
+                       /* Change level */
+                       *cur_level = cur->parent;
+                       return TRUE;
+               }
+               cur = cur->parent;
+       }
+
+       msg_debug ("check_balance: found unbalanced tag %s", arg->tag->name);
+       return FALSE;
+}
+
+/*
+ * Look up a tag definition by its name, ignoring ASCII case.
+ *
+ * Returns a pointer to the matching element of tag_defs, or NULL when the
+ * name is NULL or unknown.
+ */
+struct html_tag *
+get_tag_by_name (const char *name)
+{
+       struct html_tag key;
+
+       if (name == NULL) {
+               return NULL;
+       }
+
+       /*
+        * tag_defs is not declared in sorted order, and bsearch () requires a
+        * sorted array, so sort it here when no HTML part has triggered the
+        * lazy sort in add_html_node yet.
+        */
+       if (!tags_sorted) {
+               qsort (tag_defs, G_N_ELEMENTS (tag_defs), sizeof (struct html_tag), tag_cmp);
+               tags_sorted = 1;
+       }
+
+       key.name = name;
+
+       return bsearch (&key, tag_defs, G_N_ELEMENTS (tag_defs), sizeof (struct html_tag), tag_cmp);
+}
+
+/*
+ * Add the tag described by tag_text to the node tree of a text part.
+ *
+ * pool      - memory pool used for node payloads; the destructor of the
+ *             whole tree is registered in it when the root is created
+ * part      - text part owning the tree (part->html_nodes); its is_balanced
+ *             flag is cleared when a closing tag has no open counterpart
+ * tag_text  - NUL terminated text between '<' and '>'
+ * cur_level - in/out pointer to the node new tags are appended to
+ *
+ * Returns TRUE when the tag was processed (even if it turned out to be
+ * unbalanced) and FALSE when no node could be constructed for the text or
+ * there is no level to attach it to.
+ */
+gboolean
+add_html_node (memory_pool_t *pool, struct mime_text_part *part, char *tag_text, GNode **cur_level)
+{
+       GNode *new;
+       struct html_node *data;
+       const char *tag_name;
+
+       if (!tags_sorted) {
+               qsort (tag_defs, G_N_ELEMENTS (tag_defs), sizeof (struct html_tag), tag_cmp);
+               tags_sorted = 1;
+       }
+
+       /* First call of this function */
+       if (part->html_nodes == NULL) {
+               /* Insert root node */
+               new = g_node_new (NULL);
+               *cur_level = new;
+               part->html_nodes = new;
+               memory_pool_add_destructor (pool, (pool_destruct_func)g_node_destroy, part->html_nodes);
+       }
+
+       new = construct_html_node (pool, tag_text);
+       if (new == NULL) {
+               msg_debug ("add_html_node: cannot construct HTML node for text '%s'", tag_text);
+               /* The function is declared gboolean: report failure as FALSE, not -1 */
+               return FALSE;
+       }
+
+       if (! *cur_level) {
+               msg_debug ("add_html_node: bad parent node");
+               /* The node was never attached to the tree, so release it here */
+               g_node_destroy (new);
+               return FALSE;
+       }
+
+       data = new->data;
+       /* XML nodes have no tag definition to take the name from */
+       tag_name = data->tag != NULL ? data->tag->name : "xml";
+
+       g_node_append (*cur_level, new);
+       if (data->flags & FL_CLOSING) {
+               /* On success check_balance destroys 'new'; it is not touched afterwards */
+               if (!check_balance (new, cur_level)) {
+                       msg_debug ("add_html_node: mark part as unbalanced as it has not pairable closing tags");
+                       part->is_balanced = FALSE;
+               }
+       }
+       else if ((data->flags & FL_CLOSED) == 0) {
+               /* An opening tag becomes the level for the tags that follow */
+               msg_debug ("add_html_node: append opening tag: '%s'", tag_name);
+               *cur_level = new;
+       }
+       else {
+               msg_debug ("add_html_node: append closed tag: '%s'", tag_name);
+       }
+
+       return TRUE;
+}
+
+/*
+ * vi:ts=4
+ */
diff --git a/src/html.h b/src/html.h
new file mode 100644 (file)
index 0000000..70f20de
--- /dev/null
@@ -0,0 +1,210 @@
+/*
+ * Functions for simple html parsing
+ */
+
+#ifndef RSPAMD_HTML_H
+#define RSPAMD_HTML_H
+
+#include "config.h"
+#include "mem_pool.h"
+
+/* Known HTML tags */
+typedef enum
+{
+  Tag_UNKNOWN,  /**< Unknown tag! */
+  Tag_A,        /**< A */
+  Tag_ABBR,     /**< ABBR */
+  Tag_ACRONYM,  /**< ACRONYM */
+  Tag_ADDRESS,  /**< ADDRESS */
+  Tag_ALIGN,    /**< ALIGN */
+  Tag_APPLET,   /**< APPLET */
+  Tag_AREA,     /**< AREA */
+  Tag_B,        /**< B */
+  Tag_BASE,     /**< BASE */
+  Tag_BASEFONT, /**< BASEFONT */
+  Tag_BDO,      /**< BDO */
+  Tag_BGSOUND,  /**< BGSOUND */
+  Tag_BIG,      /**< BIG */
+  Tag_BLINK,    /**< BLINK */
+  Tag_BLOCKQUOTE,   /**< BLOCKQUOTE */
+  Tag_BODY,     /**< BODY */
+  Tag_BR,       /**< BR */
+  Tag_BUTTON,   /**< BUTTON */
+  Tag_CAPTION,  /**< CAPTION */
+  Tag_CENTER,   /**< CENTER */
+  Tag_CITE,     /**< CITE */
+  Tag_CODE,     /**< CODE */
+  Tag_COL,      /**< COL */
+  Tag_COLGROUP, /**< COLGROUP */
+  Tag_COMMENT,  /**< COMMENT */
+  Tag_DD,       /**< DD */
+  Tag_DEL,      /**< DEL */
+  Tag_DFN,      /**< DFN */
+  Tag_DIR,      /**< DIR */
+  Tag_DIV,      /**< DIV */
+  Tag_DL,       /**< DL */
+  Tag_DT,       /**< DT */
+  Tag_EM,       /**< EM */
+  Tag_EMBED,    /**< EMBED */
+  Tag_FIELDSET, /**< FIELDSET */
+  Tag_FONT,     /**< FONT */
+  Tag_FORM,     /**< FORM */
+  Tag_FRAME,    /**< FRAME */
+  Tag_FRAMESET, /**< FRAMESET */
+  Tag_H1,       /**< H1 */
+  Tag_H2,       /**< H2 */
+  Tag_H3,       /**< H3 */
+  Tag_H4,       /**< H4 */
+  Tag_H5,       /**< H5 */
+  Tag_H6,       /**< H6 */
+  Tag_HEAD,     /**< HEAD */
+  Tag_HR,       /**< HR */
+  Tag_HTML,     /**< HTML */
+  Tag_I,        /**< I */
+  Tag_IFRAME,   /**< IFRAME */
+  Tag_ILAYER,   /**< ILAYER */
+  Tag_IMG,      /**< IMG */
+  Tag_INPUT,    /**< INPUT */
+  Tag_INS,      /**< INS */
+  Tag_ISINDEX,  /**< ISINDEX */
+  Tag_KBD,      /**< KBD */
+  Tag_KEYGEN,   /**< KEYGEN */
+  Tag_LABEL,    /**< LABEL */
+  Tag_LAYER,    /**< LAYER */
+  Tag_LEGEND,   /**< LEGEND */
+  Tag_LI,       /**< LI */
+  Tag_LINK,     /**< LINK */
+  Tag_LISTING,  /**< LISTING */
+  Tag_MAP,      /**< MAP */
+  Tag_MARQUEE,  /**< MARQUEE */
+  Tag_MENU,     /**< MENU */
+  Tag_META,     /**< META */
+  Tag_MULTICOL, /**< MULTICOL */
+  Tag_NOBR,     /**< NOBR */
+  Tag_NOEMBED,  /**< NOEMBED */
+  Tag_NOFRAMES, /**< NOFRAMES */
+  Tag_NOLAYER,  /**< NOLAYER */
+  Tag_NOSAVE,   /**< NOSAVE */
+  Tag_NOSCRIPT, /**< NOSCRIPT */
+  Tag_OBJECT,   /**< OBJECT */
+  Tag_OL,       /**< OL */
+  Tag_OPTGROUP, /**< OPTGROUP */
+  Tag_OPTION,   /**< OPTION */
+  Tag_P,        /**< P */
+  Tag_PARAM,    /**< PARAM */
+  Tag_PLAINTEXT,/**< PLAINTEXT */
+  Tag_PRE,      /**< PRE */
+  Tag_Q,        /**< Q */
+  Tag_RB,       /**< RB */
+  Tag_RBC,      /**< RBC */
+  Tag_RP,       /**< RP */
+  Tag_RT,       /**< RT */
+  Tag_RTC,      /**< RTC */
+  Tag_RUBY,     /**< RUBY */
+  Tag_S,        /**< S */
+  Tag_SAMP,     /**< SAMP */
+  Tag_SCRIPT,   /**< SCRIPT */
+  Tag_SELECT,   /**< SELECT */
+  Tag_SERVER,   /**< SERVER */
+  Tag_SERVLET,  /**< SERVLET */
+  Tag_SMALL,    /**< SMALL */
+  Tag_SPACER,   /**< SPACER */
+  Tag_SPAN,     /**< SPAN */
+  Tag_STRIKE,   /**< STRIKE */
+  Tag_STRONG,   /**< STRONG */
+  Tag_STYLE,    /**< STYLE */
+  Tag_SUB,      /**< SUB */
+  Tag_SUP,      /**< SUP */
+  Tag_TABLE,    /**< TABLE */
+  Tag_TBODY,    /**< TBODY */
+  Tag_TD,       /**< TD */
+  Tag_TEXTAREA, /**< TEXTAREA */
+  Tag_TFOOT,    /**< TFOOT */
+  Tag_TH,       /**< TH */
+  Tag_THEAD,    /**< THEAD */
+  Tag_TITLE,    /**< TITLE */
+  Tag_TR,       /**< TR */
+  Tag_TT,       /**< TT */
+  Tag_U,        /**< U */
+  Tag_UL,       /**< UL */
+  Tag_VAR,      /**< VAR */
+  Tag_WBR,      /**< WBR */
+  Tag_XMP,      /**< XMP */
+  Tag_XML,             /**< XML */
+  Tag_NEXTID,   /**< NEXTID */
+
+  N_TAGS        /**< Must be last */
+} tag_id_t;
+
+#define CM_UNKNOWN      0
+/* Elements with no content. Map to HTML specification. */
+#define CM_EMPTY        (1 << 0)
+/* Elements that appear outside of "BODY". */
+#define CM_HTML         (1 << 1)
+/* Elements that can appear within HEAD. */
+#define CM_HEAD         (1 << 2)
+/* HTML "block" elements. */
+#define CM_BLOCK        (1 << 3)
+/* HTML "inline" elements. */
+#define CM_INLINE       (1 << 4)
+/* Elements that mark list item ("LI"). */
+#define CM_LIST         (1 << 5)
+/* Elements that mark definition list item ("DD", "DT"). */
+#define CM_DEFLIST      (1 << 6)
+/* Elements that can appear inside TABLE. */
+#define CM_TABLE        (1 << 7)
+/* Used for "THEAD", "TFOOT" or "TBODY". */
+#define CM_ROWGRP       (1 << 8)
+/* Used for "TD", "TH" */
+#define CM_ROW          (1 << 9)
+/* Elements whose content must be protected against white space movement.
+   Includes some elements that can be found in forms. */
+#define CM_FIELD        (1 << 10)
+/* Used to avoid propagating inline emphasis inside some elements
+   such as OBJECT or APPLET. */
+#define CM_OBJECT       (1 << 11)
+/* Elements that allow "PARAM". */
+#define CM_PARAM        (1 << 12)
+/* "FRAME", "FRAMESET", "NOFRAMES". Used in ParseFrameSet. */
+#define CM_FRAMES       (1 << 13)
+/* Heading elements (h1, h2, ...). */
+#define CM_HEADING      (1 << 14)
+/* Elements with an optional end tag. */
+#define CM_OPT          (1 << 15)
+/* Elements that use "align" attribute for vertical position. */
+#define CM_IMG          (1 << 16)
+/* Elements with inline and block model. Used to avoid calling InlineDup. */
+#define CM_MIXED        (1 << 17)
+/* Elements whose content needs to be indented only if containing one 
+   CM_BLOCK element. */
+#define CM_NO_INDENT    (1 << 18)
+/* Elements that are obsolete (such as "dir", "menu"). */
+#define CM_OBSOLETE     (1 << 19)
+/* User defined elements. Used to determine how attributes without value
+   should be printed. */
+#define CM_NEW          (1 << 20)
+/* Elements that cannot be omitted. */
+#define CM_OMITST       (1 << 21)
+
+/* XML tag */
+#define FL_XML                 (1 << 0)
+/* Closing tag */
+#define FL_CLOSING             (1 << 1)
+/* Fully closed tag (e.g. <a attrs />) */
+#define FL_CLOSED              (1 << 2)
+
+struct html_tag {
+       tag_id_t id;                    /* numeric identifier of the tag (Tag_* constant) */
+       const char *name;               /* tag name as it is written in markup */
+       int flags;                      /* content model: bitwise OR of CM_* values */
+};
+
+struct html_node {
+       struct html_tag *tag;           /* definition of the tag; NULL for nodes flagged FL_XML */
+       int flags;                      /* bitwise OR of FL_* values */
+};
+
+gboolean add_html_node (memory_pool_t *pool, struct mime_text_part *part, char *tag_text, GNode **cur_level);
+struct html_tag * get_tag_by_name (const char *name);
+
+#endif
index 5d344db62602abdf0b8c3dd3a91c9dbc294fe7a4..3024377d51ec325f72735b158489dc965482d320 100644 (file)
 #include "main.h"
 #include "message.h"
 #include "cfg_file.h"
+#include "html.h"
 #include "modules.h"
 
 GByteArray*
-strip_html_tags (GByteArray *src, int *stateptr)
+strip_html_tags (memory_pool_t *pool, struct mime_text_part *part, GByteArray *src, int *stateptr)
 {
-       uint8_t *tbuf = NULL, *p, *tp = NULL, *rp, c, lc;
+       uint8_t *tbuf = NULL, *p, *tp = NULL, *rp, *tbegin, c, lc;
        int br, i = 0, depth = 0, in_q = 0;
        int state = 0;
        GByteArray *buf;
+       GNode *level_ptr = NULL;
 
        if (stateptr)
                state = *stateptr;
@@ -59,6 +61,7 @@ strip_html_tags (GByteArray *src, int *stateptr)
                                }
                                if (state == 0) {
                                        lc = '<';
+                                       tbegin = p + 1;
                                        state = 1;
                                } else if (state == 1) {
                                        depth++;
@@ -101,7 +104,9 @@ strip_html_tags (GByteArray *src, int *stateptr)
                                        case 1: /* HTML/XML */
                                                lc = '>';
                                                in_q = state = 0;
-                                               
+                                               *p = '\0';
+                                               add_html_node (pool, part, tbegin, &level_ptr);
+                                               *p = '>';
                                                break;
                                                
                                        case 2: /* PHP */
@@ -220,9 +225,15 @@ reg_char:
                *rp = '\0';
                g_byte_array_set_size (buf, rp - buf->data);
        }
+       
+       /* Check tag balancing */
+       if (level_ptr && level_ptr->data != NULL) {
+                       part->is_balanced = FALSE;
+       }
 
-       if (stateptr)
+       if (stateptr) {
                *stateptr = state;
+       }
 
        return buf;
 }
@@ -287,8 +298,10 @@ process_text_part (struct worker_task *task, GByteArray *part_content, GMimeCont
 
                text_part = memory_pool_alloc (task->task_pool, sizeof (struct mime_text_part));
                text_part->orig = convert_text_to_utf (task, part_content, type, text_part);
-               text_part->content = strip_html_tags (part_content, NULL);
                text_part->is_html = TRUE;
+               text_part->is_balanced = TRUE;
+               text_part->html_nodes = NULL;
+               text_part->content = strip_html_tags (task->task_pool, text_part, part_content, NULL);
                text_part->fuzzy = fuzzy_init_byte_array (text_part->content, task->task_pool);
                memory_pool_add_destructor (task->task_pool, (pool_destruct_func)free_byte_array_callback, text_part->content);
                task->text_parts = g_list_prepend (task->text_parts, text_part);
@@ -591,10 +604,12 @@ process_learn (struct controller_session *session)
        return 0;
 }
 
+/*
+ * XXX: remove this function for learning
+ */
 GByteArray* 
 get_next_text_part (memory_pool_t *pool, GList *parts, GList **cur)
 {
-       GByteArray *ret = NULL;
        struct mime_part *p;
 
        if (*cur == NULL) {
@@ -611,6 +626,7 @@ get_next_text_part (memory_pool_t *pool, GList *parts, GList **cur)
                        msg_debug ("get_next_text_part: text/plain part");
                        return p->content;
                }
+#if 0
                else if (g_mime_content_type_is_type (p->type, "text", "html")) {
                        msg_debug ("get_next_text_part: try to strip html tags");
                        ret = strip_html_tags (p->content, NULL);
@@ -623,6 +639,7 @@ get_next_text_part (memory_pool_t *pool, GList *parts, GList **cur)
                        memory_pool_add_destructor (pool, (pool_destruct_func)free_byte_array_callback, ret);
                        return ret;
                }
+#endif
                *cur = g_list_next (*cur);
        }
        
index 9e9b5de1fb05c8f377c4b84344377393d204c7d0..72711638f5e4a3e913c84aba787597cb6a1dbcc9 100644 (file)
@@ -17,8 +17,10 @@ struct mime_part {
 struct mime_text_part {
        gboolean is_html;               /* TRUE for parts processed as HTML */
        gboolean is_raw;
+       gboolean is_balanced;           /* cleared when HTML tag balancing check fails */
        GByteArray *orig;
        GByteArray *content;            /* for HTML parts: output of strip_html_tags */
+       GNode *html_nodes;              /* root of the tag tree built by add_html_node, or NULL */
        fuzzy_hash_t *fuzzy;
 };