* Fix learning

author Vsevolod Stakhov <vsevolod@rambler-co.ru>

Fri, 25 Sep 2009 13:33:16 +0000 (17:33 +0400)

committer Vsevolod Stakhov <vsevolod@rambler-co.ru>

Fri, 25 Sep 2009 13:33:16 +0000 (17:33 +0400)
author Vsevolod Stakhov <vsevolod@rambler-co.ru>
Fri, 25 Sep 2009 13:33:16 +0000 (17:33 +0400)
committer Vsevolod Stakhov <vsevolod@rambler-co.ru>
Fri, 25 Sep 2009 13:33:16 +0000 (17:33 +0400)
diff --git a/rspamc.pl.in b/rspamc.pl.in

index 7e435bf36e1c10c8c10be41808e3e60e30d38fad..e82ec0b47ec1b1475e1d8b4cb46e223a06b957e1 100755 (executable)
--- a/rspamc.pl.in
+++ b/rspamc.pl.in
@@ -51,7 +51,7 @@ sub parse_config {
          if ($ctrl && $_ =~ /}/) {
              $ctrl = 0;
          }
-        if ($_ =~ /^.*type.*=.*(?:lmtp|delivery).*$/i) {
+        if ($_ =~ /^.*type.*=.*(?:lmtp|delivery|fuzzy).*$/i) {
              $skip = 1;
          }
          if ($skip && $_ =~ /}/) {
diff --git a/src/classifiers/winnow.c b/src/classifiers/winnow.c

index 94b342525866761b87e89a9ee8203de96211e2e0..6abd973edd12e0d633a709bf5be15e4a4079f3d1 100644 (file)
--- a/src/classifiers/winnow.c
+++ b/src/classifiers/winnow.c
@@ -149,7 +149,11 @@ winnow_classify (struct classifier_ctx *ctx, statfile_pool_t *pool, GTree *input
  void
  winnow_learn (struct classifier_ctx *ctx, statfile_pool_t *pool, char *symbol, GTree *input, int in_class)
  {
-       struct winnow_callback_data data;
+       struct winnow_callback_data data = { 
+               .file = NULL, 
+               .sum = 0,
+               .count = 0,
+       };
         GList *cur;
         struct statfile *st;
         
@@ -157,8 +161,6 @@ winnow_learn (struct classifier_ctx *ctx, statfile_pool_t *pool, char *symbol, G
         g_assert (ctx != NULL);
  
         data.pool = pool;
-       data.sum = 0;
-       data.count = 0;
         data.in_class = in_class;
         data.now = time (NULL);
         data.ctx = ctx;
diff --git a/src/controller.c b/src/controller.c

index 0e11b6ae15c4f87367b8d6b144bf1852b1874c7a..82dc02c0f9e16789ff2b5a120eb2ac93b1347049 100644 (file)
--- a/src/controller.c
+++ b/src/controller.c
@@ -306,7 +306,7 @@ process_command (struct controller_command *cmd, char **cmd_args, struct control
                                         return;
                                 }
  
-                               session->learn_symbol = *cmd_args;
+                               session->learn_symbol = memory_pool_strdup (session->session_pool, *cmd_args);
                                 cl = g_hash_table_lookup (session->cfg->classifiers_symbols, *cmd_args);
                                 if (cl == NULL) {
                                         r = snprintf (out_buf, sizeof (out_buf), "statfile %s is not defined" CRLF, *cmd_args);
@@ -399,12 +399,12 @@ controller_read_socket (f_str_t *in, void *arg)
  {
         struct controller_session *session = (struct controller_session *)arg;
         struct classifier_ctx *cls_ctx;
-       int len, i;
+       int len, i, r;
         char *s, **params, *cmd, out_buf[128];
+    struct worker_task *task;
+    struct mime_text_part *part;
         GList *comp_list, *cur = NULL;
         GTree *tokens = NULL;
-       GByteArray *content = NULL;
-       struct mime_part *p;
         f_str_t c;
  
         switch (session->state) {
@@ -450,33 +450,50 @@ controller_read_socket (f_str_t *in, void *arg)
                         break;
                 case STATE_LEARN:
                         session->learn_buf = in;
-                       process_learn (session);
-                       while ((content = get_next_text_part (session->session_pool, session->parts, &cur)) != NULL) {
-                               c.begin = content->data;
-                               c.len = content->len;
+               task = construct_task (session->worker);
+       
+               task->msg = memory_pool_alloc (task->task_pool, sizeof (f_str_t));
+               task->msg->begin = in->begin;
+               task->msg->len = in->len;
+
+               r = process_message (task);
+               if (r == -1) {
+                msg_warn ("read_socket: processing of message failed");
+                free_task (task, FALSE);
+                session->state = STATE_REPLY;
+                r = snprintf (out_buf, sizeof (out_buf), "cannot process message" CRLF);
+                rspamd_dispatcher_write (session->dispatcher, out_buf, r, FALSE, FALSE);
+                return FALSE;
+            } 
+            cur = g_list_first (task->text_parts);
+                       while (cur) {
+                               part = cur->data;
+                               if (part->is_empty) {
+                                       cur = g_list_next (cur);
+                                       continue;
+                               }
+                               c.begin = part->content->data;
+                               c.len = part->content->len;
+
                                 if (!session->learn_classifier->tokenizer->tokenize_func (session->learn_classifier->tokenizer, 
                                                         session->session_pool, &c, &tokens)) {
                                         i = snprintf (out_buf, sizeof (out_buf), "learn fail, tokenizer error" CRLF);
+                                       free_task (task, FALSE);
                                         if (!rspamd_dispatcher_write (session->dispatcher, out_buf, i, FALSE, FALSE)) {
                          return FALSE;
                      }
                                         session->state = STATE_REPLY;
                                         return TRUE;
                                 }
+                               cur = g_list_next (cur);
                         }
                         cls_ctx = session->learn_classifier->classifier->init_func (session->session_pool, session->learn_classifier);
                         session->learn_classifier->classifier->learn_func (cls_ctx, session->worker->srv->statfile_pool,
                                                                                                         session->learn_symbol, tokens, session->in_class);
                         session->worker->srv->stat->messages_learned ++;
  
-                       /* Clean learned parts */
-                       while ((cur = g_list_first (session->parts))) {
-                               session->parts = g_list_remove_link (session->parts, cur);
-                               p = (struct mime_part *)cur->data;
-                               g_byte_array_free (p->content, FALSE);
-                               g_list_free_1 (cur);
-                       }
  
+            free_task (task, FALSE);
                         i = snprintf (out_buf, sizeof (out_buf), "learn ok" CRLF);
                         if (!rspamd_dispatcher_write (session->dispatcher, out_buf, i, FALSE, FALSE)) {
                  return FALSE;
diff --git a/src/message.c b/src/message.c

index 1416f22ab7d4b22cbbbc3715a8575ab86edaa30c..f1886e687abda813dfc1529037b14ce7d340f978 100644 (file)
--- a/src/message.c
+++ b/src/message.c
@@ -770,161 +770,6 @@ process_message (struct worker_task *task)
         return 0;
  }
  
-#ifdef GMIME24
-static void
-mime_learn_foreach_callback (GMimeObject *parent, GMimeObject *part, gpointer user_data)
-#else
-static void
-mime_learn_foreach_callback (GMimeObject *part, gpointer user_data)
-#endif
-{
-       struct controller_session *session = (struct controller_session *)user_data;
-       struct mime_part *mime_part;
-       GMimeContentType *type;
-       GMimeDataWrapper *wrapper;
-       GMimeStream *part_stream;
-       GByteArray *part_content;
-       
-       /* 'part' points to the current part node that g_mime_message_foreach_part() is iterating over */
-       
-       /* find out what class 'part' is... */
-       if (GMIME_IS_MESSAGE_PART (part)) {
-               /* message/rfc822 or message/news */
-               GMimeMessage *message;
-               
-               /* g_mime_message_foreach_part() won't descend into
-                   child message parts, so if we want to count any
-                   subparts of this child message, we'll have to call
-                   g_mime_message_foreach_part() again here. */
-               message = g_mime_message_part_get_message ((GMimeMessagePart *) part);
-#ifdef GMIME24
-               g_mime_message_foreach (message, mime_learn_foreach_callback, session);
-#else
-               g_mime_message_foreach_part (message, mime_learn_foreach_callback, session);
-#endif
-               g_object_unref (message);
-       } else if (GMIME_IS_MESSAGE_PARTIAL (part)) {
-               /* message/partial */
-               
-               /* this is an incomplete message part, probably a
-                   large message that the sender has broken into
-                   smaller parts and is sending us bit by bit. we
-                   could save some info about it so that we could
-                   piece this back together again once we get all the
-                   parts? */
-       } else if (GMIME_IS_MULTIPART (part)) {
-               /* multipart/mixed, multipart/alternative, multipart/related, multipart/signed, multipart/encrypted, etc... */
-               
-               /* we'll get to finding out if this is a signed/encrypted multipart later... */
-       } else if (GMIME_IS_PART (part)) {
-               /* a normal leaf part, could be text/plain or image/jpeg etc */
-               wrapper = g_mime_part_get_content_object (GMIME_PART (part));
-               if (wrapper != NULL) {
-                       part_stream = g_mime_stream_mem_new ();
-                       if (g_mime_data_wrapper_write_to_stream (wrapper, part_stream) != -1) {
-                               g_mime_stream_mem_set_owner (GMIME_STREAM_MEM (part_stream), FALSE);
-                               part_content = g_mime_stream_mem_get_byte_array (GMIME_STREAM_MEM (part_stream));
-                               g_object_unref (part_stream);
-#ifdef GMIME24
-                               type = (GMimeContentType *)g_mime_object_get_content_type (GMIME_OBJECT (part));
-#else
-                               type = (GMimeContentType *)g_mime_part_get_content_type (GMIME_PART (part));
-#endif
-                               mime_part = memory_pool_alloc (session->session_pool, sizeof (struct mime_part));
-                               mime_part->type = type;
-                               mime_part->content = part_content;
-                               session->parts = g_list_prepend (session->parts, mime_part);
-                       }
-                       g_object_unref (wrapper);
-               }
-       } else {
-               g_assert_not_reached ();
-       }
-}
-
-int
-process_learn (struct controller_session *session)
-{
-       GMimeMessage *message;
-       GMimeParser *parser;
-       GMimeStream *stream;
-       GByteArray *tmp;
-    
-       tmp = memory_pool_alloc (session->session_pool, sizeof (GByteArray));
-       tmp->data = session->learn_buf->begin;
-       tmp->len = session->learn_buf->len;
-       stream = g_mime_stream_mem_new_with_byte_array (tmp);
-       /* 
-        * This causes g_mime_stream not to free memory by itself as it is memory allocated by
-        * pool allocator
-        */
-       g_mime_stream_mem_set_owner (GMIME_STREAM_MEM (stream), FALSE);
-
-       /* create a new parser object to parse the stream */
-       parser = g_mime_parser_new_with_stream (stream);
-
-       /* unref the stream (parser owns a ref, so this object does not actually get free'd until we destroy the parser) */
-       g_object_unref (stream);
-
-       /* parse the message from the stream */
-       message = g_mime_parser_construct_message (parser);
-       
-       memory_pool_add_destructor (session->session_pool, (pool_destruct_func)g_object_unref, message);
-
-#ifdef GMIME24
-       g_mime_message_foreach (message, mime_learn_foreach_callback, session);
-#else
-       g_mime_message_foreach_part (message, mime_learn_foreach_callback, session);
-#endif
-
-       /* free the parser (and the stream) */
-       g_object_unref (parser);
-       
-       return 0;
-}
-
-/*
- * XXX: remove this function for learning
- */
-GByteArray* 
-get_next_text_part (memory_pool_t *pool, GList *parts, GList **cur)
-{
-       struct mime_part *p;
-
-       if (*cur == NULL) {
-               *cur = g_list_first (parts);
-       }
-       else {
-               *cur = g_list_next (*cur);
-       }
-       
-       while (*cur) {
-               p = (*cur)->data;
-               /* For text/plain just return bytes */
-               if (g_mime_content_type_is_type (p->type, "text", "plain")) {
-                       msg_debug ("get_next_text_part: text/plain part");
-                       return p->content;
-               }
-#if 0
-               else if (g_mime_content_type_is_type (p->type, "text", "html")) {
-                       msg_debug ("get_next_text_part: try to strip html tags");
-                       ret = strip_html_tags (p->content, NULL);
-                       memory_pool_add_destructor (pool, (pool_destruct_func)free_byte_array_callback, ret);
-                       return ret;
-               }
-               else if (g_mime_content_type_is_type (p->type, "text", "xhtml")) {
-                       msg_debug ("get_next_text_part: try to strip html tags");
-                       ret = strip_html_tags (p->content, NULL);
-                       memory_pool_add_destructor (pool, (pool_destruct_func)free_byte_array_callback, ret);
-                       return ret;
-               }
-#endif
-               *cur = g_list_next (*cur);
-       }
-       
-       return NULL;
-}
-
  struct raw_header {
      struct raw_header *next;
      char *name;
diff --git a/src/message.h b/src/message.h

index 939379cedc0f6bed1d171586e9f63add676b2488..13b93b881bf05ca7e70d40514e527dadcb1d5b77 100644 (file)
--- a/src/message.h
+++ b/src/message.h
@@ -47,22 +47,6 @@ struct received_header {
   */
  int process_message (struct worker_task *task);
  
-/*
- * Process message for learning statfile classifier. 
- * It extract text and html parts and strip tags from html parts
- * @param session session that contains message
- * @return 0 allways (may be changed in future) 
- */
-int process_learn (struct controller_session *session);
-
-/**
- * Return next text part (or html with stripped tags) for specified list
- * @param pool memory pool in which place object
- * @param parts current position in list
- * @param cur pointer to which we save current position after processing
- */
-GByteArray* get_next_text_part (memory_pool_t *pool, GList *parts, GList **cur);
-
  void message_set_header (GMimeMessage *message, const char *field, const char *value);
  GList* message_get_header (memory_pool_t *pool, GMimeMessage *message, const char *field);
author	Vsevolod Stakhov <vsevolod@rambler-co.ru>
	Fri, 25 Sep 2009 13:33:16 +0000 (17:33 +0400)
committer	Vsevolod Stakhov <vsevolod@rambler-co.ru>
	Fri, 25 Sep 2009 13:33:16 +0000 (17:33 +0400)
rspamc.pl.in		patch | blob | blame | history
src/classifiers/winnow.c		patch | blob | blame | history
src/controller.c		patch | blob | blame | history
src/message.c		patch | blob | blame | history
src/message.h		patch | blob | blame | history