* New trie based url scanner (based on libcamel)

author Vsevolod Stakhov <vsevolod@rambler-co.ru>

Tue, 21 Sep 2010 16:11:34 +0000 (20:11 +0400)

committer Vsevolod Stakhov <vsevolod@rambler-co.ru>

Tue, 21 Sep 2010 16:11:34 +0000 (20:11 +0400)
author Vsevolod Stakhov <vsevolod@rambler-co.ru>
Tue, 21 Sep 2010 16:11:34 +0000 (20:11 +0400)
committer Vsevolod Stakhov <vsevolod@rambler-co.ru>
Tue, 21 Sep 2010 16:11:34 +0000 (20:11 +0400)
diff --git a/CMakeLists.txt b/CMakeLists.txt

index 39eb3fa2db682c37c8f8d512130f838478174d4c..e017e687517ac36e27e6150b6e095c323a572743 100644 (file)
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -509,6 +509,7 @@ SET(RSPAMDSRC       src/modules.c
                                 src/statfile.c
                                 src/statfile_sync.c
                                 src/symbols_cache.c
+                               src/trie.c
                                 src/upstream.c
                                 src/url.c
                                 src/util.c
@@ -550,6 +551,7 @@ SET(TESTSRC         test/rspamd_expression_test.c
  SET(TESTDEPENDS        src/mem_pool.c
                                 src/hash.c
                                 src/url.c
+                               src/trie.c
                                 src/util.c
                                 src/radix.c
                                 src/fuzzy.c
@@ -570,6 +572,7 @@ SET(EXPRSRC utils/expression_parser.c)
  SET(UTILSDEPENDS src/mem_pool.c
                                 src/hash.c
                                 src/url.c
+                               src/trie.c
                                 src/fuzzy.c
                                 src/expressions.c
                                 src/message.c
diff --git a/perl/lib/Mail/Rspamd/Client.pm b/perl/lib/Mail/Rspamd/Client.pm

index 4af7a02d908f26278e63c74a1417337626beb6bc..cbafcec8f5a12745db6f6a48ed3112c1c743fd9b 100644 (file)
--- a/perl/lib/Mail/Rspamd/Client.pm
+++ b/perl/lib/Mail/Rspamd/Client.pm
@@ -577,7 +577,10 @@ sub _create_connection {
                 do {
                         $server = $self->_select_server();
                         $tries ++;
-
+                       
+                       if ($server->{host} eq '*') {
+                               $server->{host} = '127.0.0.1';  
+                       }
                         $remote = IO::Socket::INET->new( Proto     => "tcp",
                                                 PeerAddr  => $server->{host},
                                                 PeerPort  => $server->{port},
@@ -610,6 +613,9 @@ sub _create_connection {
                 }
      }
      elsif ($hostdef =~ /^\s*(([^:]+):(\d+))\s*$/) {
+               if ($2 eq '*') {
+                       $2 = '127.0.0.1';       
+               }
                 $remote = IO::Socket::INET->new( Proto     => "tcp",
                                         PeerAddr  => $2,
                                         PeerPort  => $3,
@@ -627,6 +633,9 @@ sub _create_connection {
                 }
      }
      elsif ($hostdef =~ /^\s*([^:]+)\s*$/) {
+               if ($1 eq '*') {
+                       $1 = '127.0.0.1';       
+               }
                 $remote = IO::Socket::INET->new( Proto     => "tcp",
                                         PeerAddr  => $1,
                                         PeerPort  => $self->{control} ? 11334 : 11333,
diff --git a/rspamc.pl.in b/rspamc.pl.in

index a261f66c94c74f8dd5c285e8e96caedef7fb3c07..9843bf3848e02c1dda406666c8df068f45bec18b 100755 (executable)
--- a/rspamc.pl.in
+++ b/rspamc.pl.in
@@ -14,7 +14,7 @@ use Mail::Rspamd::Config;
  use Data::Dumper;
  
  my %cfg = (
-    'conf_file' => '@CMAKE_INSTALL_PREFIX@/etc/rspamd.conf',
+    'conf_file' => '@CMAKE_INSTALL_PREFIX@/etc/rspamd.xml',
      'command'   => 'SYMBOLS',
      'hosts'      => ['localhost:11333', ],
         'require_input' => 0,
@@ -53,21 +53,6 @@ imap format: imap:user:<username>:password:[<password>]:host:<hostname>:mbox:<mb
  Password may be omitted and then it would be asked in terminal
  imaps requires IO::Socket::SSL
  
-IMAP search strings samples:
-ALL - All messages in the mailbox;
-FROM <string> - Messages that contain the specified string in the envelope structure's FROM field;
-HEADER <field-name> <string> - Messages that have a header with the specified field-name and that 
-             contains the specified string in the text of the header (what comes after the colon);
-NEW - Messages that have the \\Recent flag set but not the \\Seen flag. 
-             This is functionally equivalent to "(RECENT UNSEEN)".
-OLD - Messages that do not have the \\Recent flag set.
-SEEN - Messages that have the \\Seen flag set.
-SENTBEFORE <date> - Messages whose [RFC-2822] Date: header (disregarding time and timezone) 
-             is earlier than the specified date.
-TO <string> - Messages that contain the specified string in the envelope structure's TO field.
-TEXT <string> - Messages that contain the specified string in the header or body of the message.
-OR <search-key1> <search-key2> - Messages that match either search key (same for AND and NOT operations).
-
  Version:   @RSPAMD_VERSION@
  EOD
         exit;
diff --git a/src/plugins/fuzzy_check.c b/src/plugins/fuzzy_check.c

index 4c78d33b7bb17c1a31b5c24b50825cf26ca6ffaa..edfc1caa8a8d51c39e3771c3187f8c350db21590 100644 (file)
--- a/src/plugins/fuzzy_check.c
+++ b/src/plugins/fuzzy_check.c
@@ -118,6 +118,7 @@ struct fuzzy_learn_session {
  };
  
  static struct fuzzy_ctx        *fuzzy_module_ctx = NULL;
+static const gchar              hex_digits[] = "0123456789abcdef";
  
  static int                      fuzzy_mime_filter (struct worker_task *task);
  static void                     fuzzy_symbol_callback (struct worker_task *task, void *unused);
@@ -296,6 +297,27 @@ fuzzy_normalize (int32_t in, double weight)
         return (double)in;
  }
  
+static const char *
+fuzzy_to_string (fuzzy_hash_t *h)
+{
+       static char strbuf [FUZZY_HASHLEN * 2 + 1];
+       int i;
+       guint8 byte;
+
+       for (i = 0; i < FUZZY_HASHLEN; i ++) {
+               byte = h->hash_pipe[i];
+               if (byte == '\0') {
+                       break;
+               }
+               strbuf[i * 2] = hex_digits[byte >> 4];
+               strbuf[i * 2 + 1] = hex_digits[byte & 0xf];
+       }
+
+       strbuf[i * 2] = '\0';
+
+       return strbuf;
+}
+
  int
  fuzzy_check_module_init (struct config_file *cfg, struct module_ctx **ctx)
  {
@@ -463,8 +485,8 @@ fuzzy_io_callback (int fd, short what, void *arg)
                                 symbol = map->symbol;
                                 nval = fuzzy_normalize (value, map->weight);
                         }
-                       msg_info ("<%s>, found fuzzy hash with weight: %.2f, in list: %d",
-                                       session->task->message_id, flag, nval);
+                       msg_info ("<%s>, found fuzzy hash '%s' with weight: %.2f, in list: %d",
+                                       session->task->message_id, fuzzy_to_string (session->h), flag, nval);
                         rspamd_snprintf (buf, sizeof (buf), "%d: %d / %.2f", flag, value, nval);
                         insert_result (session->task, symbol, nval, g_list_prepend (NULL, 
                                                 memory_pool_strdup (session->task->task_pool, buf)));
@@ -527,7 +549,8 @@ fuzzy_learn_callback (int fd, short what, void *arg)
                         goto err;
                 }
                 else if (buf[0] == 'O' && buf[1] == 'K') {
-                       msg_info ("added fuzzy hash for message <%s>", session->task->message_id);
+                       msg_info ("added fuzzy hash '%s' to list: %d for message <%s>",
+                                       fuzzy_to_string (session->h), session->flag, session->task->message_id);
                         r = rspamd_snprintf (buf, sizeof (buf), "OK" CRLF);
                         if (! rspamd_dispatcher_write (session->session->dispatcher, buf, r, FALSE, FALSE)) {
                                 return;
@@ -823,7 +846,7 @@ fuzzy_process_handler (struct controller_session *session, f_str_t * in)
                                                         return;
                                                 }
  
-                                               msg_info ("save hash of image: [%s]", checksum);
+                                               msg_info ("save hash of image: [%s] to list: %d", checksum, flag);
                                                 g_free (checksum);
                                         }
                                 }
@@ -852,9 +875,9 @@ fuzzy_process_handler (struct controller_session *session, f_str_t * in)
                                                         free_task (task, FALSE);
                                                         return;
                                                 }
-                                               msg_info ("save hash of part of type: %s/%s: [%s]",
+                                               msg_info ("save hash of part of type: %s/%s: [%s] to list %d",
                                                                 mime_part->type->type, mime_part->type->subtype,
-                                                               checksum);
+                                                               checksum, flag);
                                                 g_free (checksum);
                                 }
                         }
diff --git a/src/trie.c b/src/trie.c

new file mode 100644 (file)

index 0000000..945a2aa
--- /dev/null
+++ b/src/trie.c
@@ -0,0 +1,223 @@
+/* Copyright (c) 2010, Vsevolod Stakhov
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *       * Redistributions of source code must retain the above copyright
+ *         notice, this list of conditions and the following disclaimer.
+ *       * Redistributions in binary form must reproduce the above copyright
+ *         notice, this list of conditions and the following disclaimer in the
+ *         documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY Rambler media ''AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL Rambler BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * XXX: This code was derived from CamelTrie implementation (lgpl code) and
+ * is subject to be rewritten completely from scratch (or from bsd grep)
+ */
+
+#include "config.h"
+#include "mem_pool.h"
+#include "trie.h"
+
+
+rspamd_trie_t*
+rspamd_trie_create (gboolean icase)
+{
+       rspamd_trie_t                 *new;
+
+       new = g_malloc (sizeof (rspamd_trie_t));
+
+       new->icase = icase;
+       new->pool = memory_pool_new (memory_pool_get_size ());
+       new->root.fail = NULL;
+       new->root.final = 0;
+       new->root.id = 0;
+       new->root.next = NULL;
+       new->root.match = NULL;
+       new->fail_states = g_ptr_array_sized_new (8);
+
+       return new;
+}
+
+/*
+ * Insert a single character as a new level of the trie below state q
+ */
+static struct rspamd_trie_state *
+rspamd_trie_insert_char (rspamd_trie_t *trie, gint depth, struct rspamd_trie_state *q, gchar c)
+{
+       struct rspamd_trie_match     *m;
+
+       /* Insert new match into a chain */
+       m = memory_pool_alloc (trie->pool, sizeof (struct rspamd_trie_match));
+       m->next = q->match;
+       m->c = c;
+
+       q->match = m;
+       m->state = memory_pool_alloc (trie->pool, sizeof (struct rspamd_trie_state));
+       q = m->state;
+       q->match = NULL;
+       q->fail = &trie->root;
+       q->final = 0;
+       q->id = -1;
+
+       if (trie->fail_states->len < depth + 1) {
+               /* Grow fail states array */
+               guint size = trie->fail_states->len;
+
+               size = MAX (size + 64, depth + 1);
+               g_ptr_array_set_size (trie->fail_states, size);
+       }
+
+       q->next = trie->fail_states->pdata[depth];
+       trie->fail_states->pdata[depth] = q;
+
+       return q;
+}
+
+G_INLINE_FUNC struct rspamd_trie_match *
+check_match (struct rspamd_trie_state *s, gchar c)
+{
+       struct rspamd_trie_match         *m = s->match;
+
+       while (m && m->c != c) {
+               m = m->next;
+       }
+
+       return m;
+}
+
+void
+rspamd_trie_insert (rspamd_trie_t *trie, const gchar *pattern, gint pattern_id)
+{
+       const guchar               *p =  pattern;
+       struct rspamd_trie_state   *q, *q1, *r;
+       struct rspamd_trie_match   *m, *n;
+       gint                        i, depth = 0;
+       gchar                       c;
+
+       /* Insert pattern to the trie */
+
+       q = &trie->root;
+
+       while (*p) {
+               c = trie->icase ? g_ascii_tolower (*p) : *p;
+               m = check_match (q, c);
+               if (m == NULL) {
+                       /* Insert char at specified level depth */
+                       q = rspamd_trie_insert_char (trie, depth, q, c);
+               }
+               else {
+                       /* Switch current state to matched state */
+                       q = m->state;
+               }
+               p ++;
+               depth ++;
+       }
+
+       q->final = depth;
+       q->id = pattern_id;
+
+       /* Update fail states and build fail states graph */
+       /* Go through the whole depth of prefixes */
+       for (i = 0; i < trie->fail_states->len; i++) {
+               q = trie->fail_states->pdata[i];
+               while (q) {
+                       m = q->match;
+                       while (m) {
+                               c = m->c;
+                               q1 = m->state;
+                               r = q->fail;
+                               /* Walk r along the fail chain until a state with a transition on c is found (or r becomes NULL) */
+                               while (r && (n = check_match (r, c)) == NULL) {
+                                       r = r->fail;
+                               }
+
+                               /* We have found new fail location for character c, so set it in q1 */
+                               if (r != NULL) {
+                                       q1->fail = n->state;
+                                       if (q1->fail->final > q1->final) {
+                                               q1->final = q1->fail->final;
+                                       }
+                               }
+                               else {
+                                       /* Search from root */
+                                       if ((n = check_match (&trie->root, c))) {
+                                               q1->fail = n->state;
+                                       }
+                                       else {
+                                               q1->fail = &trie->root;
+                                       }
+                               }
+
+                               m = m->next;
+                       }
+
+                       q = q->next;
+               }
+       }
+}
+
+const gchar*
+rspamd_trie_lookup (rspamd_trie_t *trie, const gchar *buffer, gsize buflen, gint *matched_id)
+{
+       const guchar               *p = buffer, *prev, *pat;
+       struct rspamd_trie_state   *q;
+       struct rspamd_trie_match   *m = NULL;
+       gchar                       c;
+
+
+       q = &trie->root;
+       prev = p;
+       pat = p;
+
+       while (buflen) {
+               c = trie->icase ? g_ascii_tolower (*p) : *p;
+
+               while (q != NULL && (m = check_match (q, c)) == NULL) {
+                       q = q->fail;
+               }
+
+               if (q == &trie->root) {
+                       pat = prev;
+               }
+
+               if (q == NULL) {
+                       q = &trie->root;
+                       pat = p;
+               }
+               else if (m != NULL) {
+                       q = m->state;
+
+                       if (q->final) {
+                               if (matched_id) {
+                                       *matched_id = q->id;
+                               }
+                               return (const gchar *) pat;
+                       }
+               }
+               p ++;
+               prev = p;
+               buflen --;
+       }
+
+       return NULL;
+}
+
+void
+rspamd_trie_free (rspamd_trie_t *trie)
+{
+       g_ptr_array_free (trie->fail_states, TRUE);
+       memory_pool_delete (trie->pool);
+       g_free (trie);
+}
diff --git a/src/trie.h b/src/trie.h

new file mode 100644 (file)

index 0000000..f871162
--- /dev/null
+++ b/src/trie.h
@@ -0,0 +1,64 @@
+/* Copyright (c) 2010, Vsevolod Stakhov
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *       * Redistributions of source code must retain the above copyright
+ *         notice, this list of conditions and the following disclaimer.
+ *       * Redistributions in binary form must reproduce the above copyright
+ *         notice, this list of conditions and the following disclaimer in the
+ *         documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY Rambler media ''AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL Rambler BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+
+#ifndef TRIE_H_
+#define TRIE_H_
+
+#include "config.h"
+#include "mem_pool.h"
+
+/*
+ * Rspamd implements a basic character-keyed prefix trie with fail links between states
+ */
+
+struct rspamd_trie_match;
+
+struct rspamd_trie_state {
+       struct rspamd_trie_state *next;
+       struct rspamd_trie_state *fail;
+       struct rspamd_trie_match *match;
+       guint final;
+       gint id;
+};
+
+struct rspamd_trie_match {
+       struct rspamd_trie_match *next;
+       struct rspamd_trie_state *state;
+       gchar c;
+};
+
+typedef struct rspamd_trie_s {
+       struct rspamd_trie_state root;
+       GPtrArray *fail_states;
+       gboolean icase;
+       memory_pool_t *pool;
+} rspamd_trie_t;
+
+rspamd_trie_t* rspamd_trie_create (gboolean icase);
+
+void rspamd_trie_insert (rspamd_trie_t *trie, const gchar *pattern, gint pattern_id);
+const gchar* rspamd_trie_lookup (rspamd_trie_t *trie, const gchar *buffer, gsize buflen, gint *matched_id);
+void rspamd_trie_free (rspamd_trie_t *trie);
+
+#endif /* TRIE_H_ */
diff --git a/src/url.c b/src/url.c

index 2c508ebe18c6953cf1b627f31d683be517efd847..b839d7566801f6641342b1656685a7eb8785a394 100644 (file)
--- a/src/url.c
+++ b/src/url.c
@@ -28,6 +28,7 @@
  #include "fstring.h"
  #include "main.h"
  #include "message.h"
+#include "trie.h"
  
  #define POST_CHAR 1
  #define POST_CHAR_S "\001"
@@ -49,24 +50,55 @@ struct _proto {
         unsigned int                    need_ssl:1;
  };
  
-static const char              *text_url = "((https?|ftp)://)?"
-       "(\\b(?<![.\\@A-Za-z0-9-])" "(?: [A-Za-z0-9][A-Za-z0-9-]*(?:\\.[A-Za-z0-9-]+)*\\."
-       "(?i:com|net|org|biz|edu|gov|info|name|int|mil|aero|coop|jobs|mobi|museum|pro|travel"
-       "|cc|[rs]u|uk|ua|by|de|jp|fr|fi|no|no|ca|it|ro|cn|nl|at|nu|se"
-       "|[a-z]{2}" "(?(1)|(?=/)))" "(?!\\w)"
-       "|(?:\\d{1,3}\\.){3}\\d{1,3}(?(1)|(?=[/:]))"    /* ip in dotted view */
-       "|\\d{5,20}(?(1)|(?=[/:]))"     /* ip in numeric view */
-       ")" "(?::\\d{1,5})?"            /* port */
-       "(?!\\.\\w)"                            /* host part ended, no more of this further on */
-       "(?:[/?][;/?:@&=+\\$,[\\]\\-_.!~*'()A-Za-z0-9#%]*)?"    /* path (&query) */
-       "(?<![\\s>?!),.'\"\\]:])" "(?!@)" ")";
-static const char              *html_url = "(?: src|href)=\"?(" "((https?|ftp)://)?" "(\\b(?<![.\\@A-Za-z0-9-])" "(?: [A-Za-z0-9][A-Za-z0-9-]*(?:\\.[A-Za-z0-9-]+)*\\." "(?i:com|net|org|biz|edu|gov|info|name|int|mil|aero|coop|jobs|mobi|museum|pro|travel" "|[rs]u|uk|ua|by|de|jp|fr|fi|no|no|ca|it|ro|cn|nl|at|nu|se" "|[a-z]{2}" "(?(1)|(?=/)))" "(?!\\w)" "|(?:\\d{1,3}\\.){3}\\d{1,3}(?(1)|(?=[/:]))" ")" "(?::\\d{1,5})?"      /* port */
-       "(?!\\.\\w)"                            /* host part ended, no more of this further on */
-       "(?:[/?][;/?:@&=+\\$,[\\]\\-_.!~*'()A-Za-z0-9#%]*)?"    /* path (&query) */
-       "(?<![\\s>?!),.'\"\\]:])" "(?!@)" "))\"?";
-
-static short                    url_initialized = 0;
-GRegex                         *text_re, *html_re;
+typedef struct url_match_s {
+       const gchar *m_begin;
+       gsize m_len;
+       const gchar *pattern;
+       const gchar *prefix;
+} url_match_t;
+
+struct url_matcher {
+       const gchar *pattern;
+       const gchar *prefix;
+       gboolean (*start)(const gchar *begin, const gchar *end, const gchar *pos, url_match_t *match);
+       gboolean (*end)(const gchar *begin, const gchar *end, const gchar *pos, url_match_t *match);
+};
+
+static gboolean url_file_start (const gchar *begin, const gchar *end, const gchar *pos, url_match_t *match);
+static gboolean url_file_end (const gchar *begin, const gchar *end, const gchar *pos, url_match_t *match);
+
+static gboolean url_web_start (const gchar *begin, const gchar *end, const gchar *pos, url_match_t *match);
+static gboolean url_web_end (const gchar *begin, const gchar *end, const gchar *pos, url_match_t *match);
+
+static gboolean url_email_start (const gchar *begin, const gchar *end, const gchar *pos, url_match_t *match);
+static gboolean url_email_end (const gchar *begin, const gchar *end, const gchar *pos, url_match_t *match);
+
+struct url_matcher matchers[] = {
+               { "file://",            "",                     url_file_start,                 url_file_end    },
+               { "ftp://",                     "",             url_web_start,                  url_web_end             },
+               { "sftp://",            "",             url_web_start,                  url_web_end             },
+               { "http://",            "",             url_web_start,                  url_web_end             },
+               { "https://",           "",             url_web_start,                  url_web_end             },
+               { "news://",            "",             url_web_start,                  url_web_end             },
+               { "nntp://",            "",             url_web_start,                  url_web_end             },
+               { "telnet://",          "",             url_web_start,                  url_web_end             },
+               { "webcal://",          "",             url_web_start,                  url_web_end             },
+               { "mailto://",          "",             url_email_start,                url_email_end   },
+               { "callto://",          "",             url_web_start,                  url_web_end             },
+               { "h323:",                      "",             url_web_start,                  url_web_end             },
+               { "sip:",                       "",             url_web_start,                  url_web_end             },
+               { "www.",                       "http://",      url_web_start,                  url_web_end             },
+               { "ftp.",                       "ftp://",       url_web_start,                  url_web_end             },
+               { "@",                          "mailto://",url_email_start,            url_email_end   }
+};
+
+struct url_match_scanner {
+       struct url_matcher *matchers;
+       gsize matchers_count;
+       rspamd_trie_t *patterns;
+};
+
+struct url_match_scanner *url_scanner = NULL;
  
  static const struct _proto      protocol_backends[] = {
         {"file", 0, NULL, 1, 0, 0, 0},
@@ -78,40 +110,6 @@ static const struct _proto      protocol_backends[] = {
         {NULL, 0, NULL, 0, 0, 1, 0},
  };
  
-/* 
-   Table of "reserved" and "unsafe" characters.  Those terms are
-   rfc1738-speak, as such largely obsoleted by rfc2396 and later
-   specs, but the general idea remains.
-
-   A reserved character is the one that you can't decode without
-   changing the meaning of the URL.  For example, you can't decode
-   "/foo/%2f/bar" into "/foo///bar" because the number and contents of
-   path components is different.  Non-reserved characters can be
-   changed, so "/foo/%78/bar" is safe to change to "/foo/x/bar".  The
-   unsafe characters are loosely based on rfc1738, plus "$" and ",",
-   as recommended by rfc2396, and minus "~", which is very frequently
-   used (and sometimes unrecognized as %7E by broken servers).
-
-   An unsafe character is the one that should be encoded when URLs are
-   placed in foreign environments.  E.g. space and newline are unsafe
-   in HTTP contexts because HTTP uses them as separator and line
-   terminator, so they must be encoded to %20 and %0A respectively.
-   "*" is unsafe in shell context, etc.
-
-   We determine whether a character is unsafe through static table
-   lookup.  This code assumes ASCII character set and 8-bit chars.  */
-
-enum {
-       /* rfc1738 reserved chars + "$" and ",".  */
-       urlchr_reserved = 1,
-
-       /* rfc1738 unsafe chars, plus non-printables.  */
-       urlchr_unsafe = 2
-};
-
-#define urlchr_test(c, mask) (urlchr_table[(unsigned char)(c)] & (mask))
-#define URL_RESERVED_CHAR(c) urlchr_test(c, urlchr_reserved)
-#define URL_UNSAFE_CHAR(c) urlchr_test(c, urlchr_unsafe)
  /* Convert an ASCII hex digit to the corresponding number between 0
     and 15.  H should be a hexadecimal digit that satisfies isxdigit;
     otherwise, the result is undefined.  */
@@ -123,43 +121,44 @@ enum {
  #define XNUM_TO_DIGIT(x) ("0123456789ABCDEF"[x] + 0)
  #define XNUM_TO_digit(x) ("0123456789abcdef"[x] + 0)
  
-/* Shorthands for the table: */
-#define R  urlchr_reserved
-#define U  urlchr_unsafe
-#define RU R|U
-
-static const unsigned char      urlchr_table[256] = {
-       U, U, U, U, U, U, U, U,         /* NUL SOH STX ETX  EOT ENQ ACK BEL */
-       U, U, U, U, U, U, U, U,         /* BS  HT  LF  VT   FF  CR  SO  SI  */
-       U, U, U, U, U, U, U, U,         /* DLE DC1 DC2 DC3  DC4 NAK SYN ETB */
-       U, U, U, U, U, U, U, U,         /* CAN EM  SUB ESC  FS  GS  RS  US  */
-       U, 0, U, RU, R, U, R, 0,        /* SP  !   "   #    $   %   &   '   */
-       0, 0, 0, R, R, 0, 0, R,         /* (   )   *   +    ,   -   .   /   */
-       0, 0, 0, 0, 0, 0, 0, 0,         /* 0   1   2   3    4   5   6   7   */
-       0, 0, RU, R, U, R, U, R,        /* 8   9   :   ;    <   =   >   ?   */
-       RU, 0, 0, 0, 0, 0, 0, 0,        /* @   A   B   C    D   E   F   G   */
-       0, 0, 0, 0, 0, 0, 0, 0,         /* H   I   J   K    L   M   N   O   */
-       0, 0, 0, 0, 0, 0, 0, 0,         /* P   Q   R   S    T   U   V   W   */
-       0, 0, 0, RU, U, RU, U, 0,       /* X   Y   Z   [    \   ]   ^   _   */
-       U, 0, 0, 0, 0, 0, 0, 0,         /* `   a   b   c    d   e   f   g   */
-       0, 0, 0, 0, 0, 0, 0, 0,         /* h   i   j   k    l   m   n   o   */
-       0, 0, 0, 0, 0, 0, 0, 0,         /* p   q   r   s    t   u   v   w   */
-       0, 0, 0, U, U, U, 0, U,         /* x   y   z   {    |   }   ~   DEL */
-
-       U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
-       U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
-       U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
-       U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
-
-       U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
-       U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
-       U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
-       U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
+static guchar url_scanner_table[256] = {
+         1,  1,  1,  1,  1,  1,  1,  1,  1,  9,  9,  1,  1,  9,  1,  1,
+         1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
+        24,128,160,128,128,128,128,128,160,160,128,128,160,192,160,160,
+        68, 68, 68, 68, 68, 68, 68, 68, 68, 68,160,160, 32,128, 32,128,
+       160, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66,
+        66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66,160,160,160,128,128,
+       128, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66,
+        66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66,128,128,128,128,  1,
+         1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
+         1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
+         1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
+         1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
+         1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
+         1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
+         1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
+         1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1
  };
  
-#undef R
-#undef U
-#undef RU
+enum {
+       IS_CTRL         = (1 << 0),
+       IS_ALPHA        = (1 << 1),
+       IS_DIGIT        = (1 << 2),
+       IS_LWSP         = (1 << 3),
+       IS_SPACE        = (1 << 4),
+       IS_SPECIAL      = (1 << 5),
+       IS_DOMAIN       = (1 << 6),
+       IS_URLSAFE      = (1 << 7)
+};
+
+#define is_ctrl(x) ((url_scanner_table[(guchar)(x)] & IS_CTRL) != 0)
+#define is_lwsp(x) ((url_scanner_table[(guchar)(x)] & IS_LWSP) != 0)
+#define is_atom(x) ((url_scanner_table[(guchar)(x)] & (IS_SPECIAL|IS_SPACE|IS_CTRL)) == 0)
+#define is_alpha(x) ((url_scanner_table[(guchar)(x)] & IS_ALPHA) != 0)
+#define is_digit(x) ((url_scanner_table[(guchar)(x)] & IS_DIGIT) != 0)
+#define is_domain(x) ((url_scanner_table[(guchar)(x)] & IS_DOMAIN) != 0)
+#define is_urlsafe(x) ((url_scanner_table[(guchar)(x)] & (IS_ALPHA|IS_DIGIT|IS_URLSAFE)) != 0)
+
  
  static const char              *
  url_strerror (enum uri_errno err)
@@ -216,21 +215,15 @@ check_uri_file (unsigned char *name)
  static int
  url_init (void)
  {
-       GError                         *err = NULL;
-       if (url_initialized == 0) {
-               text_re = g_regex_new (text_url, G_REGEX_CASELESS | G_REGEX_MULTILINE | G_REGEX_OPTIMIZE | G_REGEX_EXTENDED, 0, &err);
-               if (err != NULL) {
-                       msg_info ("cannot init text url parsing regexp: %s", err->message);
-                       g_error_free (err);
-                       return -1;
+       int                             i;
+       if (url_scanner == NULL) {
+               url_scanner = g_malloc (sizeof (struct url_match_scanner));
+               url_scanner->matchers = matchers;
+               url_scanner->matchers_count = G_N_ELEMENTS (matchers);
+               url_scanner->patterns = rspamd_trie_create (TRUE);
+               for (i = 0; i < url_scanner->matchers_count; i ++) {
+                       rspamd_trie_insert (url_scanner->patterns, matchers[i].pattern, i);
                 }
-               html_re = g_regex_new (html_url, G_REGEX_CASELESS | G_REGEX_MULTILINE | G_REGEX_OPTIMIZE | G_REGEX_EXTENDED, 0, &err);
-               if (err != NULL) {
-                       msg_info ("cannot init html url parsing regexp: %s", err->message);
-                       g_error_free (err);
-                       return -1;
-               }
-               url_initialized = 1;
         }
  
         return 0;
@@ -398,15 +391,8 @@ url_strip (char *s)
         *t = '\0';
  }
  
-/* The core of url_escape_* functions.  Escapes the characters that
-   match the provided mask in urlchr_table.
-
-   If ALLOW_PASSTHROUGH is non-zero, a string with no unsafe chars
-   will be returned unchanged.  If ALLOW_PASSTHROUGH is zero, a
-   freshly allocated string will be returned in all cases.  */
-
  static char                    *
-url_escape_1 (const char *s, unsigned char mask, int allow_passthrough, memory_pool_t * pool)
+url_escape_1 (const char *s, int allow_passthrough, memory_pool_t * pool)
  {
         const char                     *p1;
         char                           *p2, *newstr;
@@ -414,8 +400,9 @@ url_escape_1 (const char *s, unsigned char mask, int allow_passthrough, memory_p
         int                             addition = 0;
  
         for (p1 = s; *p1; p1++)
-               if (urlchr_test (*p1, mask))
+               if (!is_urlsafe (*p1)) {
                         addition += 2;          /* Two more characters (hex digits) */
+               }
  
         if (!addition) {
                 if (allow_passthrough) {
@@ -433,7 +420,7 @@ url_escape_1 (const char *s, unsigned char mask, int allow_passthrough, memory_p
         p2 = newstr;
         while (*p1) {
                 /* Quote the characters that match the test mask. */
-               if (urlchr_test (*p1, mask)) {
+               if (!is_urlsafe (*p1)) {
                         unsigned char                   c = *p1++;
                         *p2++ = '%';
                         *p2++ = XNUM_TO_DIGIT (c >> 4);
@@ -453,7 +440,7 @@ url_escape_1 (const char *s, unsigned char mask, int allow_passthrough, memory_p
  char                           *
  url_escape (const char *s, memory_pool_t * pool)
  {
-       return url_escape_1 (s, urlchr_unsafe, 0, pool);
+       return url_escape_1 (s, 0, pool);
  }
  
  /* URL-escape the unsafe characters (see urlchr_table) in a given
@@ -462,7 +449,7 @@ url_escape (const char *s, memory_pool_t * pool)
  static char                    *
  url_escape_allow_passthrough (const char *s, memory_pool_t * pool)
  {
-       return url_escape_1 (s, urlchr_unsafe, 1, pool);
+       return url_escape_1 (s, 1, pool);
  }
  
  /* Decide whether the char at position P needs to be encoded.  (It is
@@ -481,7 +468,7 @@ char_needs_escaping (const char *p)
                         /* Garbled %.. sequence: encode `%'. */
                         return 1;
         }
-       else if (URL_UNSAFE_CHAR (*p) && !URL_RESERVED_CHAR (*p))
+       else if (! is_urlsafe (*p))
                 return 1;
         else
                 return 0;
@@ -574,7 +561,7 @@ unescape_single_char (char *str, char chr)
  static char                    *
  url_escape_dir (const char *dir, memory_pool_t * pool)
  {
-       char                           *newdir = url_escape_1 (dir, urlchr_unsafe | urlchr_reserved, 1, pool);
+       char                           *newdir = url_escape_1 (dir, 1, pool);
         if (newdir == dir)
                 return (char *)dir;
  
@@ -893,14 +880,252 @@ parse_uri (struct uri *uri, unsigned char *uristring, memory_pool_t * pool)
         return URI_ERRNO_OK;
  }
  
+/*
+ * Flat table of delimiter pairs: every even index holds an opening
+ * character and the odd index right after it holds the character that
+ * closes it (url_file_end and url_web_end read it as [i] / [i + 1]).
+ */
+static const gchar url_braces[] = {
+        '(', ')' ,
+        '{', '}' ,
+        '[', ']' ,
+        '<', '>' ,
+        '|', '|' ,
+        '\'', '\''
+};
+
+/*
+ * Report whether c is one of the characters that can open a bracketed or
+ * quoted span: ( { [ < | or a single quote.
+ */
+static gboolean
+is_open_brace (gchar c)
+{
+       switch (c) {
+       case '(':
+       case '{':
+       case '[':
+       case '<':
+       case '|':
+       case '\'':
+               return TRUE;
+       default:
+               return FALSE;
+       }
+}
+
+/*
+ * Start callback of the url_file_* pair: records pos as the beginning of
+ * the match in match->m_begin and unconditionally returns TRUE.
+ * begin and end are not used.
+ */
+static gboolean
+url_file_start (const gchar *begin, const gchar *end, const gchar *pos, url_match_t *match)
+{
+       match->m_begin = pos;
+       return TRUE;
+}
+/*
+ * End callback of the url_file_* pair: find where the match ends.
+ *
+ * Scanning starts right after match->pattern (skipping one optional '/').
+ * If the character found there opens one of the url_braces pairs, the
+ * matching closing character also terminates the scan. The scan then
+ * advances while the character is url-safe, is not the stop character and
+ * end has not been reached.
+ *
+ * On success match->m_len is set to the distance from match->m_begin
+ * (which the start callback must already have filled in) and TRUE is
+ * returned. FALSE is returned only if the scan position equals begin.
+ *
+ * NOTE(review): the call site in url_parse_text passes (p, pos, end, &m),
+ * i.e. pos and end in the opposite order to this signature - confirm which
+ * side is intended.
+ */
+static gboolean
+url_file_end (const gchar *begin, const gchar *end, const gchar *pos, url_match_t *match)
+{
+       const gchar                    *p;
+       /* '\0' means "no closing brace to stop at" */
+       gchar                           stop = '\0';
+       guint                           i;
+
+       p = pos + strlen (match->pattern);
+       if (p < end && *p == '/') {
+               p ++;
+       }
+
+       if (p < end) {
+               /* The table is a flat list of pairs, so walk all of it two slots at a time */
+               for (i = 0; i < G_N_ELEMENTS (url_braces); i += 2) {
+                       if (*p == url_braces[i]) {
+                               stop = url_braces[i + 1];
+                               break;
+                       }
+               }
+       }
+
+       while (p < end && *p != stop && is_urlsafe (*p)) {
+               p ++;
+       }
+
+       if (p == begin) {
+               return FALSE;
+       }
+       match->m_len = p - match->m_begin;
+
+       return TRUE;
+}
+
+
+/*
+ * Start callback of the url_web_* pair.
+ *
+ * A match that begins with "www" somewhere after the start of the buffer is
+ * rejected unless the preceding character is whitespace or an opening brace
+ * (see is_open_brace). Every other match is accepted.
+ *
+ * On acceptance pos is stored in match->m_begin, the same way
+ * url_file_start does, because the caller copies the url text starting at
+ * m_begin and url_web_end measures m_len from pos.
+ */
+static gboolean
+url_web_start (const gchar *begin, const gchar *end, const gchar *pos, url_match_t *match)
+{
+       /* Check what we have found */
+       if (pos > begin && *pos == 'w' && *(pos + 1) == 'w' && *(pos + 2) == 'w') {
+               if (!is_open_brace (*(pos - 1)) && !g_ascii_isspace (*(pos - 1))) {
+                       return FALSE;
+               }
+       }
+       match->m_begin = pos;
+
+       return TRUE;
+}
+/*
+ * End callback of the url_web_* pair: determine how far a web-style url
+ * extends.
+ *
+ * Scanning starts right after match->pattern and proceeds in stages:
+ *   1. an optional "user@" prefix made of atom characters,
+ *   2. the domain (dot-separated runs of domain characters),
+ *   3. an optional ":port" or ":password@domain",
+ *   4. an optional path/query made of url-safe characters, stopping early
+ *      at an unbalanced closing brace when the text after the pattern
+ *      started with an opening brace from url_braces,
+ *   5. trailing punctuation and double quotes are stripped.
+ *
+ * On success match->m_len is set to the length measured from pos and TRUE
+ * is returned. FALSE is returned when the character after the pattern is
+ * neither an atom nor a domain character, or when the password branch
+ * cannot be completed with "@domain".
+ *
+ * NOTE(review): several reads of *p are not guarded by p < end (kept as in
+ * the original) - confirm that the scanned buffer is terminated.
+ */
+static gboolean
+url_web_end (const gchar *begin, const gchar *end, const gchar *pos, url_match_t *match)
+{
+       const gchar                    *p, *c;
+       gchar                           open_brace = '\0', close_brace = '\0';
+       guint                           i, port;
+       /* Nesting depth of open_brace/close_brace seen inside the path */
+       int                             brace_stack = 0;
+       /* Set once the text after ':' has been classified as a password */
+       gboolean                        passwd = FALSE;
+
+       p = pos + strlen (match->pattern);
+       /* url_braces is a flat list of pairs: visit every even (opening) slot */
+       for (i = 0; i < G_N_ELEMENTS (url_braces); i += 2) {
+               if (*p == url_braces[i]) {
+                       close_brace = url_braces[i + 1];
+                       open_brace = *p;
+                       break;
+               }
+       }
+
+       /* find the end of the domain */
+       if (is_atom (*p)) {
+               /* might be a domain or user@domain */
+               c = p;
+               while (p < end) {
+                       if (!is_atom (*p)) {
+                               break;
+                       }
+
+                       p++;
+
+                       while (p < end && is_atom (*p)) {
+                               p++;
+                       }
+
+                       if ((p + 1) < end && *p == '.' && (is_atom (*(p + 1)) || *(p + 1) == '/')) {
+                               p++;
+                       }
+               }
+
+               /* No '@' after the atom run: it was not a user part, rescan it as a domain */
+               if (*p != '@') {
+                       p = c;
+               }
+               else {
+                       p++;
+               }
+
+               goto domain;
+       }
+       else if (is_domain (*p)) {
+domain:
+               while (p < end) {
+                       if (!is_domain (*p)) {
+                               break;
+                       }
+
+                       p++;
+
+                       while (p < end && is_domain (*p)) {
+                               p++;
+                       }
+
+                       if ((p + 1) < end && *p == '.' && (is_domain (*(p + 1)) || *(p + 1) == '/')) {
+                               p++;
+                       }
+               }
+       }
+       else {
+               return FALSE;
+       }
+
+       if (p < end) {
+               switch (*p) {
+               case ':': /* we either have a port or a password */
+                       p++;
+
+                       if (is_digit (*p) || passwd) {
+                               port = (*p++ - '0');
+
+                               /* Stop accumulating once the value can no longer be a port */
+                               while (p < end && is_digit (*p) && port < 65536) {
+                                       port = (port * 10) + (*p++ - '0');
+                               }
+
+                               if (!passwd && (port >= 65536 || *p == '@')) {
+                                       if (p < end) {
+                                               /* this must be a password? */
+                                               goto passwd;
+                                       }
+
+                                       p--;
+                               }
+                       }
+                       else {
+                               passwd:
+                               passwd = TRUE;
+                               c = p;
+
+                               while (p < end && is_atom (*p)) {
+                                       p++;
+                               }
+
+                               if ((p + 2) < end) {
+                                       if (*p == '@') {
+                                               p++;
+                                               if (is_domain (*p)) {
+                                                       goto domain;
+                                               }
+                                       }
+
+                                       return FALSE;
+                               }
+                       }
+
+                       if (p >= end || *p != '/') {
+                               break;
+                       }
+
+                       /* we have a '/' so there could be a path - fall through */
+               case '/': /* we've detected a path component to our url */
+                       p++;
+               case '?':
+                       while (p < end && is_urlsafe (*p)) {
+                               if (*p == open_brace) {
+                                       brace_stack++;
+                               }
+                               else if (*p == close_brace) {
+                                       brace_stack--;
+                                       /* More closers than openers: this closer belongs to the surrounding text */
+                                       if (brace_stack == -1) {
+                                               break;
+                                       }
+                               }
+                               p++;
+                       }
+
+                       break;
+               default:
+                       break;
+               }
+       }
+
+       /* urls are extremely unlikely to end with any
+        * punctuation, so strip any trailing
+        * punctuation off. Also strip off any closing
+        * double-quotes. */
+       while (p > pos && strchr (",.:;?!-|}])\"", p[-1])) {
+               p--;
+       }
+
+       match->m_len = (p - pos);
+
+       return TRUE;
+}
+
+
+/*
+ * Start callback of the url_email_* pair. Always returns FALSE and touches
+ * none of its arguments, so no match is ever accepted through it.
+ * NOTE(review): looks like a not-yet-implemented stub - confirm.
+ */
+static gboolean
+url_email_start (const gchar *begin, const gchar *end, const gchar *pos, url_match_t *match)
+{
+       return FALSE;
+}
+/*
+ * End callback of the url_email_* pair. Always returns FALSE and touches
+ * none of its arguments.
+ * NOTE(review): looks like a not-yet-implemented stub - confirm.
+ */
+static gboolean
+url_email_end (const gchar *begin, const gchar *end, const gchar *pos, url_match_t *match)
+{
+       return FALSE;
+}
+
  void
  url_parse_text (memory_pool_t * pool, struct worker_task *task, struct mime_text_part *part, gboolean is_html)
  {
-       GMatchInfo                     *info;
-       GError                         *err = NULL;
-       int                             rc;
+       struct url_matcher             *matcher;
+       int                             rc, idx;
         char                           *url_str = NULL;
         struct uri                     *new;
+       const guint8                   *p, *end, *pos;
+       url_match_t                     m;
  
         if (!part->orig->data || part->orig->len == 0) {
                 msg_warn ("got empty text part");
@@ -909,27 +1134,33 @@ url_parse_text (memory_pool_t * pool, struct worker_task *task, struct mime_text
  
         if (url_init () == 0) {
                 if (is_html) {
-                       rc = g_regex_match_full (html_re, (const char *)part->orig->data, part->orig->len, 0, 0, &info, &err);
+                       p = part->orig->data;
+                       end = p + part->orig->len;
                 }
                 else {
-                       rc = g_regex_match_full (text_re, (const char *)part->content->data, part->content->len, 0, 0, &info, &err);
-
+                       p = part->content->data;
+                       end = p + part->content->len;
                 }
-               if (rc) {
-                       while (g_match_info_matches (info)) {
-                               url_str = g_match_info_fetch (info, is_html ? 1 : 0);
-                               debug_task ("extracted string with regexp: '%s', html is %s", url_str, is_html ? "on" : "off");
-                               if (url_str != NULL) {
+               while (p < end) {
+                       if ((pos = rspamd_trie_lookup (url_scanner->patterns, p, end - p, &idx)) == NULL) {
+                               break;
+                       }
+                       else {
+                               matcher = &matchers[idx];
+                               m.pattern = matcher->pattern;
+                               m.prefix = matcher->prefix;
+                               if (matcher->start (p, pos, end, &m) && matcher->end (p, pos, end, &m)) {
+                                       url_str = memory_pool_alloc (task->task_pool, m.m_len + 1);
+                                       memcpy (url_str, m.m_begin, m.m_len);
+                                       url_str[m.m_len] = '\0';
                                         if (g_tree_lookup (is_html ? part->html_urls : part->urls, url_str) == NULL) {
                                                 new = memory_pool_alloc (pool, sizeof (struct uri));
                                                 if (new != NULL) {
                                                         g_strstrip (url_str);
                                                         rc = parse_uri (new, url_str, pool);
                                                         if (rc == URI_ERRNO_OK || rc == URI_ERRNO_NO_SLASHES || rc == URI_ERRNO_NO_HOST_SLASH) {
-                                                               if (g_tree_lookup (is_html ? part->html_urls : part->urls, url_str) == NULL) {
-                                                                       g_tree_insert (is_html ? part->html_urls : part->urls, url_str, new);
-                                                                       task->urls = g_list_prepend (task->urls, new);
-                                                               }
+                                                               g_tree_insert (is_html ? part->html_urls : part->urls, url_str, new);
+                                                               task->urls = g_list_prepend (task->urls, new);
                                                         }
                                                         else {
                                                                 msg_info ("extract of url '%s' failed: %s", url_str, url_strerror (rc));
@@ -937,19 +1168,10 @@ url_parse_text (memory_pool_t * pool, struct worker_task *task, struct mime_text
                                                 }
                                         }
                                 }
-                               memory_pool_add_destructor (task->task_pool, (pool_destruct_func) g_free, url_str);
-                               /* Get next match */
-                               g_match_info_next (info, &err);
+                               pos += strlen (matcher->pattern);
                         }
+                       p = pos;
                 }
-               else if (err != NULL) {
-                       debug_task ("error matching regexp: %s", err->message);
-                       g_free (err);
-               }
-               else {
-                       debug_task ("cannot find url pattern in given string");
-               }
-               g_match_info_free (info);
         }
  }
author	Vsevolod Stakhov <vsevolod@rambler-co.ru>
	Tue, 21 Sep 2010 16:11:34 +0000 (20:11 +0400)
committer	Vsevolod Stakhov <vsevolod@rambler-co.ru>
	Tue, 21 Sep 2010 16:11:34 +0000 (20:11 +0400)
CMakeLists.txt		patch \| blob \| blame \| history
perl/lib/Mail/Rspamd/Client.pm		patch \| blob \| blame \| history
rspamc.pl.in		patch \| blob \| blame \| history
src/plugins/fuzzy_check.c		patch \| blob \| blame \| history
src/trie.c	[new file with mode: 0644]	patch \| blob
src/trie.h	[new file with mode: 0644]	patch \| blob
src/url.c		patch \| blob \| blame \| history