git.ipfire.org Git - thirdparty/rspamd.git/commitdiff
Remove legacy fuzzy code completely.
author Vsevolod Stakhov <vsevolod@highsecure.ru>
Fri, 17 Jul 2015 23:10:56 +0000 (00:10 +0100)
committer Vsevolod Stakhov <vsevolod@highsecure.ru>
Fri, 17 Jul 2015 23:10:56 +0000 (00:10 +0100)
15 files changed:
src/fuzzy_storage.c
src/fuzzy_storage.h
src/libmime/message.h
src/libmime/mime_expressions.c
src/libserver/fuzzy_backend.c
src/libserver/protocol.c
src/libutil/CMakeLists.txt
src/libutil/fuzzy.c [deleted file]
src/libutil/fuzzy.h [deleted file]
src/lua/lua_mimepart.c
src/plugins/fuzzy_check.c
test/CMakeLists.txt
test/rspamd_fuzzy_test.c [deleted file]
test/rspamd_test_suite.c
test/tests.h

index f544c40907edc5148109928f4d1483893e949619..507c99349615016903cdc27de0e6edfad16b469b 100644 (file)
@@ -34,7 +34,6 @@
 #include "cfg_file.h"
 #include "url.h"
 #include "message.h"
-#include "fuzzy.h"
 #include "bloom.h"
 #include "map.h"
 #include "fuzzy_storage.h"
@@ -80,19 +79,11 @@ struct rspamd_fuzzy_storage_ctx {
        struct rspamd_fuzzy_backend *backend;
 };
 
-struct rspamd_legacy_fuzzy_node {
-       gint32 value;
-       gint32 flag;
-       guint64 time;
-       rspamd_fuzzy_t h;
-};
-
 struct fuzzy_session {
        struct rspamd_worker *worker;
        struct rspamd_fuzzy_cmd *cmd;
        gint fd;
        guint64 time;
-       gboolean legacy;
        rspamd_inet_addr_t *addr;
        struct rspamd_fuzzy_storage_ctx *ctx;
 };
@@ -114,28 +105,9 @@ rspamd_fuzzy_write_reply (struct fuzzy_session *session,
                struct rspamd_fuzzy_reply *rep)
 {
        gint r;
-       gchar buf[64];
-
-       if (session->legacy) {
-               if (rep->prob > 0.5) {
-                       if (session->cmd->cmd == FUZZY_CHECK) {
-                               r = rspamd_snprintf (buf, sizeof (buf), "OK %d %d" CRLF,
-                                               rep->value, rep->flag);
-                       }
-                       else {
-                               r = rspamd_snprintf (buf, sizeof (buf), "OK" CRLF);
-                       }
 
-               }
-               else {
-                       r = rspamd_snprintf (buf, sizeof (buf), "ERR" CRLF);
-               }
-               r = rspamd_inet_address_sendto (session->fd, buf, r, 0, session->addr);
-       }
-       else {
-               r = rspamd_inet_address_sendto (session->fd, rep, sizeof (*rep), 0,
-                               session->addr);
-       }
+       r = rspamd_inet_address_sendto (session->fd, rep, sizeof (*rep), 0,
+                       session->addr);
 
        if (r == -1) {
                if (errno == EINTR) {
@@ -240,8 +212,7 @@ accept_fuzzy_socket (gint fd, short what, void *arg)
        struct fuzzy_session session;
        gint r;
        guint8 buf[2048];
-       struct rspamd_fuzzy_cmd *cmd = NULL, lcmd;
-       struct legacy_fuzzy_cmd *l;
+       struct rspamd_fuzzy_cmd *cmd = NULL;
        enum rspamd_fuzzy_epoch epoch = RSPAMD_FUZZY_EPOCH_MAX;
 
        session.worker = worker;
@@ -262,22 +233,8 @@ accept_fuzzy_socket (gint fd, short what, void *arg)
                        return;
                }
 
-               if ((guint)r == sizeof (struct legacy_fuzzy_cmd)) {
-                       session.legacy = TRUE;
-                       l = (struct legacy_fuzzy_cmd *)buf;
-                       lcmd.version = 2;
-                       memcpy (lcmd.digest, l->hash, sizeof (lcmd.digest));
-                       lcmd.cmd = l->cmd;
-                       lcmd.flag = l->flag;
-                       lcmd.shingles_count = 0;
-                       lcmd.value = l->value;
-                       lcmd.tag = 0;
-                       cmd = &lcmd;
-                       epoch = RSPAMD_FUZZY_EPOCH6;
-               }
-               else if ((guint)r >= sizeof (struct rspamd_fuzzy_cmd)) {
+               if ((guint)r >= sizeof (struct rspamd_fuzzy_cmd)) {
                        /* Check shingles count sanity */
-                       session.legacy = FALSE;
                        cmd = (struct rspamd_fuzzy_cmd *)buf;
                        epoch = rspamd_fuzzy_command_valid (cmd, r);
                        if (epoch == RSPAMD_FUZZY_EPOCH_MAX) {
index e2803c52ee25f9603fda02c9cc1d4dca61fe45ff..b9997da8ba601bb8ad9cfb043e3bfb07c1107f5b 100644 (file)
@@ -3,7 +3,6 @@
 
 #include "config.h"
 #include "main.h"
-#include "fuzzy.h"
 #include "shingles.h"
 
 #define RSPAMD_FUZZY_VERSION 3
 #define FUZZY_WRITE 1
 #define FUZZY_DEL 2
 
-struct legacy_fuzzy_cmd {
-       u_char cmd;
-       guint32 blocksize;
-       gint32 value;
-       gint32 flag;
-       u_char hash[FUZZY_HASHLEN];
-};
-
 RSPAMD_PACKED(rspamd_fuzzy_cmd) {
        guint8 version;
        guint8 cmd;
index 04e7cd5f35767772c5afadf90ac82041150f7971..b509b23cd555950904c12f9563e5a716a024b36a 100644 (file)
@@ -7,7 +7,6 @@
 #define RSPAMD_MESSAGE_H
 
 #include "config.h"
-#include "fuzzy.h"
 
 struct rspamd_task;
 struct controller_session;
@@ -43,11 +42,8 @@ struct mime_text_part {
        GByteArray *content;
        struct html_content *html;
        GList *urls_offset;     /**< list of offsets of urls                                            */
-       rspamd_fuzzy_t *fuzzy;
-       rspamd_fuzzy_t *double_fuzzy;
        GMimeObject *parent;
        struct mime_part *mime_part;
-       rspamd_fstring_t *diff_str;
        GArray *words;
        GArray *normalized_words;
        guint nlines;
index bff70c1b7b225fd06413d9b676a4ed8d062f0ecd..a4c02989e00fa6b37c93acdb48793b7a26eda101 100644 (file)
@@ -27,7 +27,6 @@
 #include "cfg_file.h"
 #include "main.h"
 #include "message.h"
-#include "fuzzy.h"
 #include "mime_expressions.h"
 #include "html.h"
 #include "lua/lua_common.h"
index 7cd4faa91cacc6f08873aa31c1211839f49c6792..a5bf28c7ca19387b384801d323ee7bc233f24b1c 100644 (file)
 #include "config.h"
 #include "main.h"
 #include "fuzzy_backend.h"
-#include "fuzzy_storage.h"
 
 #include <sqlite3.h>
 
-/* Magic sequence for hashes file */
-#define FUZZY_FILE_MAGIC "rsh"
-
-struct rspamd_legacy_fuzzy_node {
-       gint32 value;
-       gint32 flag;
-       guint64 time;
-       rspamd_fuzzy_t h;
-};
-
 struct rspamd_fuzzy_backend {
        sqlite3 *db;
        char *path;
@@ -45,7 +34,6 @@ struct rspamd_fuzzy_backend {
        gsize expired;
 };
 
-
 static const char *create_tables_sql =
                "BEGIN;"
                "CREATE TABLE digests("
@@ -393,80 +381,11 @@ rspamd_fuzzy_backend_open_db (const gchar *path, GError **err)
        return bk;
 }
 
-/*
- * Convert old database to the new format
- */
-static gboolean
-rspamd_fuzzy_backend_convert (const gchar *path, int fd, GError **err)
-{
-       gchar tmpdb[PATH_MAX];
-       struct rspamd_fuzzy_backend *nbackend;
-       struct stat st;
-       gint off;
-       guint8 *map, *p, *end;
-       struct rspamd_legacy_fuzzy_node *n;
-
-       rspamd_snprintf (tmpdb, sizeof (tmpdb), "%s.converted", path);
-       (void)unlink (tmpdb);
-       nbackend = rspamd_fuzzy_backend_create_db (tmpdb, FALSE, err);
-
-       if (nbackend == NULL) {
-               return FALSE;
-       }
-
-       (void)fstat (fd, &st);
-       (void)lseek (fd, 0, SEEK_SET);
-
-       off = sizeof (FUZZY_FILE_MAGIC);
-       if (off >= st.st_size) {
-               msg_warn ("old fuzzy storage is empty or corrupted, remove it");
-       }
-       else {
-               if ((map = mmap (NULL, st.st_size - off, PROT_READ, MAP_SHARED, fd,
-                               0)) == MAP_FAILED) {
-                       g_set_error (err, rspamd_fuzzy_backend_quark (),
-                                       errno, "Cannot mmap file %s: %s",
-                                       path, strerror (errno));
-                       rspamd_fuzzy_backend_close (nbackend);
-
-                       return FALSE;
-               }
-
-               end = map + st.st_size;
-               p = map + off;
-
-               rspamd_fuzzy_backend_run_simple (RSPAMD_FUZZY_BACKEND_TRANSACTION_START,
-                               nbackend, NULL);
-               while (p < end) {
-                       n = (struct rspamd_legacy_fuzzy_node *)p;
-                       /* Convert node flag, digest, value, time  */
-                       if (rspamd_fuzzy_backend_run_stmt (nbackend, RSPAMD_FUZZY_BACKEND_INSERT,
-                                       (gint)n->flag, n->h.hash_pipe,
-                                       (gint64)n->value, n->time) != SQLITE_OK) {
-                               msg_warn ("Cannot execute init sql %s: %s",
-                                               prepared_stmts[RSPAMD_FUZZY_BACKEND_INSERT].sql,
-                                               sqlite3_errmsg (nbackend->db));
-                       }
-                       p += sizeof (struct rspamd_legacy_fuzzy_node);
-               }
-
-               munmap (map, st.st_size);
-               rspamd_fuzzy_backend_run_simple (RSPAMD_FUZZY_BACKEND_TRANSACTION_COMMIT,
-                               nbackend, NULL);
-       }
-
-       rspamd_fuzzy_backend_run_sql (create_index_sql, nbackend, NULL);
-       rspamd_fuzzy_backend_close (nbackend);
-       rename (tmpdb, path);
-
-       return TRUE;
-}
-
 struct rspamd_fuzzy_backend*
 rspamd_fuzzy_backend_open (const gchar *path, GError **err)
 {
-       gchar *dir, header[4];
-       gint fd, r;
+       gchar *dir;
+       gint fd;
        struct rspamd_fuzzy_backend *res;
        static const char sqlite_wal[] = "PRAGMA journal_mode=\"wal\";",
                        fallback_journal[] = "PRAGMA journal_mode=\"off\";";
@@ -501,21 +420,6 @@ rspamd_fuzzy_backend_open (const gchar *path, GError **err)
                        return NULL;
                }
        }
-       else {
-
-               /* Check for legacy format */
-               if ((r = read (fd, header, sizeof (header))) == sizeof (header)) {
-                       if (memcmp (header, FUZZY_FILE_MAGIC, sizeof (header) - 1) == 0) {
-                               msg_info ("Trying to convert old fuzzy database");
-                               if (!rspamd_fuzzy_backend_convert (path, fd, err)) {
-                                       close (fd);
-                                       return NULL;
-                               }
-                               msg_info ("Old database converted");
-                       }
-                       close (fd);
-               }
-       }
 
        close (fd);
 
index a4d78427f6ee3203761d98ea2a9198fc3977e193..1fedbbb46ea6796edc384cccdd6cd0d4e9653bb4 100644 (file)
@@ -645,20 +645,9 @@ write_hashes_to_log (struct rspamd_task *task, GString *logbuf)
        struct mime_text_part *text_part;
        guint i;
 
+       /* TODO: rework parts hashes */
        for (i = 0; i < task->text_parts->len; i ++) {
                text_part = g_ptr_array_index (task->text_parts, i);
-
-               if (text_part->fuzzy) {
-                       if (i != task->text_parts->len - 1) {
-                               rspamd_printf_gstring (logbuf,
-                                       " part: %Xd,",
-                                       text_part->fuzzy->h);
-                       }
-                       else {
-                               rspamd_printf_gstring (logbuf, " part: %Xd",
-                                       text_part->fuzzy->h);
-                       }
-               }
        }
 }
 
index 61e5d6d1590a3be3562f7625e51c2eb57c9b0023..338a740276230fac1519d035611f1f1e97cee3be 100644 (file)
@@ -6,7 +6,6 @@ SET(LIBRSPAMDUTILSRC
                                                                ${CMAKE_CURRENT_SOURCE_DIR}/diff.c
                                                                ${CMAKE_CURRENT_SOURCE_DIR}/expression.c
                                                                ${CMAKE_CURRENT_SOURCE_DIR}/fstring.c
-                                                               ${CMAKE_CURRENT_SOURCE_DIR}/fuzzy.c
                                                                ${CMAKE_CURRENT_SOURCE_DIR}/hash.c
                                                                ${CMAKE_CURRENT_SOURCE_DIR}/http.c
                                                                ${CMAKE_CURRENT_SOURCE_DIR}/keypairs_cache.c
diff --git a/src/libutil/fuzzy.c b/src/libutil/fuzzy.c
deleted file mode 100644 (file)
index 218065b..0000000
+++ /dev/null
@@ -1,557 +0,0 @@
-/*
- * Copyright (c) 2009-2012, Vsevolod Stakhov
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY AUTHOR ''AS IS'' AND ANY
- * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL AUTHOR BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-
-#include "config.h"
-#include "mem_pool.h"
-#include "fstring.h"
-#include "fuzzy.h"
-#include "message.h"
-#include "url.h"
-#include "main.h"
-#include "xxhash.h"
-
-#define ROLL_WINDOW_SIZE 9
-#define MIN_FUZZY_BLOCK_SIZE 3
-#define HASH_INIT      0x28021967
-
-static const char *b64 =
-       "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
-
-struct roll_state {
-       guint32 h[3];
-       gchar window[ROLL_WINDOW_SIZE];
-       gint n;
-};
-
-static struct roll_state rs;
-
-
-/* Rolling hash function based on Adler-32 checksum */
-static guint32
-fuzzy_roll_hash (guint c)
-{
-       /* Check window position */
-       if (rs.n == ROLL_WINDOW_SIZE) {
-               rs.n = 0;
-       }
-
-       rs.h[1] -= rs.h[0];
-       rs.h[1] += ROLL_WINDOW_SIZE * c;
-
-       rs.h[0] += c;
-       rs.h[0] -= rs.window[rs.n];
-
-       /* Save current symbol */
-       rs.window[rs.n] = c;
-       rs.n++;
-
-       rs.h[2] <<= 5;
-       rs.h[2] ^= c;
-
-       return rs.h[0] + rs.h[1] + rs.h[2];
-}
-
-/* A simple non-rolling hash, based on the FNV hash */
-static guint32
-fuzzy_fnv_hash (guint c, guint32 hval)
-{
-       hval ^= c;
-       hval +=
-               (hval << 1) + (hval << 4) + (hval << 7) + (hval << 8) + (hval << 24);
-       return hval;
-}
-
-/* Calculate blocksize depending on length of input */
-static guint32
-fuzzy_blocksize (guint32 len)
-{
-       guint32 nlen = MIN_FUZZY_BLOCK_SIZE;
-
-       while (nlen * (FUZZY_HASHLEN - 1) < len) {
-               nlen *= 2;
-       }
-       return nlen;
-}
-
-
-/* Update hash with new symbol */
-static void
-fuzzy_update (rspamd_fuzzy_t * h, guint c)
-{
-       h->rh = fuzzy_roll_hash (c);
-       h->h = fuzzy_fnv_hash (c, h->h);
-
-       if (h->rh % h->block_size == (h->block_size - 1)) {
-               h->hash_pipe[h->hi] = b64[h->h % 64];
-               if (h->hi < FUZZY_HASHLEN - 2) {
-                       h->h = HASH_INIT;
-                       h->hi++;
-               }
-       }
-}
-
-static void
-fuzzy_update2 (rspamd_fuzzy_t * h1, rspamd_fuzzy_t *h2, guint c)
-{
-       h1->rh = fuzzy_roll_hash (c);
-       h1->h = fuzzy_fnv_hash (c, h1->h);
-       h2->rh = h1->rh;
-       h2->h = fuzzy_fnv_hash (c, h2->h);
-
-       if (h1->rh % h1->block_size == (h1->block_size - 1)) {
-               h1->hash_pipe[h1->hi] = b64[h1->h % 64];
-               if (h1->hi < FUZZY_HASHLEN - 2) {
-                       h1->h = HASH_INIT;
-                       h1->hi++;
-               }
-       }
-       if (h2->rh % h2->block_size == (h2->block_size - 1)) {
-               h2->hash_pipe[h2->hi] = b64[h2->h % 64];
-               if (h2->hi < FUZZY_HASHLEN - 2) {
-                       h2->h = HASH_INIT;
-                       h2->hi++;
-               }
-       }
-}
-
-/*
- * Levenshtein distance between string1 and string2.
- *
- * Replace cost is normally 1, and 2 with nonzero xcost.
- */
-guint32
-rspamd_levinstein_distance (gchar *s1, gint len1, gchar *s2, gint len2)
-{
-       gint i;
-       gint *row;                              /* we only need to keep one row of costs */
-       gint *end;
-       gint half, nx;
-       gchar *sx, *char2p, char1;
-       gint *p, D, x, offset, c3;
-
-       /* strip common prefix */
-       while (len1 > 0 && len2 > 0 && *s1 == *s2) {
-               len1--;
-               len2--;
-               s1++;
-               s2++;
-       }
-
-       /* strip common suffix */
-       while (len1 > 0 && len2 > 0 && s1[len1 - 1] == s2[len2 - 1]) {
-               len1--;
-               len2--;
-       }
-
-       /* catch trivial cases */
-       if (len1 == 0) {
-               return len2;
-       }
-
-       if (len2 == 0) {
-               return len1;
-       }
-
-       /* make the inner cycle (i.e. string2) the longer one */
-       if (len1 > len2) {
-               nx = len1;
-               sx = s1;
-               len1 = len2;
-               len2 = nx;
-               s1 = s2;
-               s2 = sx;
-       }
-       /* check len1 == 1 separately */
-       if (len1 == 1) {
-               return len2 - (memchr (s2, *s1, len2) != NULL);
-       }
-
-       len1++;
-       len2++;
-       half = len1 >> 1;
-
-       /* initalize first row */
-       row = g_malloc (len2 * sizeof (gint));
-       end = row + len2 - 1;
-       for (i = 0; i < len2; i++) {
-               row[i] = i;
-       }
-
-       /* in this case we don't have to scan two corner triangles (of size len1/2)
-        * in the matrix because no best path can go throught them. note this
-        * breaks when len1 == len2 == 2 so the memchr() special case above is
-        * necessary */
-       row[0] = len1 - half - 1;
-       for (i = 1; i < len1; i++) {
-               char1 = s1[i - 1];
-               /* skip the upper triangle */
-               if (i >= len1 - half) {
-                       offset = i - (len1 - half);
-                       char2p = s2 + offset;
-                       p = row + offset;
-                       c3 = *(p++) + (char1 != *(char2p++));
-                       x = *p;
-                       x++;
-                       D = x;
-                       if (x > c3)
-                               x = c3;
-                       *(p++) = x;
-               }
-               else {
-                       p = row + 1;
-                       char2p = s2;
-                       D = x = i;
-               }
-               /* skip the lower triangle */
-               if (i <= half + 1)
-                       end = row + len2 + i - half - 2;
-               /* main */
-               while (p <= end) {
-                       c3 = --D + (char1 != *(char2p++));
-                       x++;
-                       if (x > c3)
-                               x = c3;
-                       D = *p;
-                       D++;
-                       if (x > D)
-                               x = D;
-                       *(p++) = x;
-               }
-               /* lower triangle sentinel */
-               if (i <= half) {
-                       c3 = --D + (char1 != *char2p);
-                       x++;
-                       if (x > c3)
-                               x = c3;
-                       *p = x;
-               }
-       }
-
-       i = *end;
-       g_free (row);
-       return i;
-}
-
-/* Calculate fuzzy hash for specified string */
-rspamd_fuzzy_t *
-rspamd_fuzzy_init (rspamd_fstring_t * in, rspamd_mempool_t * pool)
-{
-       rspamd_fuzzy_t *new;
-       guint i, repeats = 0;
-       gchar *c = in->begin, last = '\0';
-       gsize real_len = 0;
-
-       new = rspamd_mempool_alloc0 (pool, sizeof (rspamd_fuzzy_t));
-       bzero (&rs, sizeof (rs));
-       for (i = 0; i < in->len; i++) {
-               if (*c == last) {
-                       repeats++;
-               }
-               else {
-                       repeats = 0;
-               }
-               if (!g_ascii_isspace (*c) && !g_ascii_ispunct (*c) && repeats < 3) {
-                       real_len++;
-               }
-               last = *c;
-               c++;
-       }
-
-       new->block_size = fuzzy_blocksize (real_len);
-       c = in->begin;
-
-       for (i = 0; i < in->len; i++) {
-               if (*c == last) {
-                       repeats++;
-               }
-               else {
-                       repeats = 0;
-               }
-               if (!g_ascii_isspace (*c) && !g_ascii_ispunct (*c) && repeats < 3) {
-                       fuzzy_update (new, *c);
-               }
-               last = *c;
-               c++;
-       }
-
-       /* Check whether we have more bytes in a rolling window */
-       if (new->rh != 0) {
-               new->hash_pipe[new->hi] = b64[new->h % 64];
-       }
-
-       return new;
-}
-
-rspamd_fuzzy_t *
-rspamd_fuzzy_from_byte_array (GByteArray * in, rspamd_mempool_t * pool)
-{
-       rspamd_fstring_t f;
-
-       f.begin = (gchar *)in->data;
-       f.len = in->len;
-
-       return rspamd_fuzzy_init (&f, pool);
-}
-
-void
-rspamd_fuzzy_from_text_part (struct mime_text_part *part,
-       rspamd_mempool_t *pool,
-       gsize max_diff)
-{
-       rspamd_fuzzy_t *new, *new2;
-       gchar *c, *end, *begin, *p;
-       gsize real_len = 0, len = part->content->len;
-       GList *cur_offset;
-       struct process_exception *cur_ex = NULL;
-       gunichar uc;
-       gboolean write_diff = FALSE;
-
-       cur_offset = part->urls_offset;
-       if (cur_offset != NULL) {
-               cur_ex = cur_offset->data;
-       }
-
-       begin = (gchar *)part->content->data;
-       c = begin;
-       new = rspamd_mempool_alloc0 (pool, sizeof (rspamd_fuzzy_t));
-       new2 = rspamd_mempool_alloc0 (pool, sizeof (rspamd_fuzzy_t));
-       bzero (&rs, sizeof (rs));
-       end = c + len;
-
-       if (IS_PART_UTF (part)) {
-               while (c < end) {
-                       if (cur_ex != NULL && (gint)cur_ex->pos == c - begin) {
-                               c += cur_ex->len + 1;
-                               cur_offset = g_list_next (cur_offset);
-                               if (cur_offset != NULL) {
-                                       cur_ex = cur_offset->data;
-                               }
-                       }
-                       else {
-                               uc = g_utf8_get_char (c);
-                               if (g_unichar_isalnum (uc)) {
-                                       p = g_utf8_next_char (c);
-                                       real_len += p - c;
-                               }
-                               else {
-                                       p = g_utf8_next_char (c);
-                               }
-                               c = p;
-                       }
-               }
-       }
-       else {
-               while (c < end) {
-                       if (cur_ex != NULL && (gint)cur_ex->pos == c - begin) {
-                               c += cur_ex->len + 1;
-                               cur_offset = g_list_next (cur_offset);
-                               if (cur_offset != NULL) {
-                                       cur_ex = cur_offset->data;
-                               }
-                       }
-                       else {
-                               if (!g_ascii_isspace (*c) && !g_ascii_ispunct (*c)) {
-                                       real_len++;
-                               }
-                               c++;
-                       }
-               }
-       }
-
-       write_diff = real_len > 0 && real_len < max_diff;
-
-       if (write_diff) {
-               part->diff_str = rspamd_fstralloc (pool, real_len + 1);
-       }
-       else {
-               part->diff_str = NULL;
-       }
-
-       new->block_size = fuzzy_blocksize (real_len);
-       new2->block_size = new->block_size * 2;
-
-       cur_offset = part->urls_offset;
-       if (cur_offset != NULL) {
-               cur_ex = cur_offset->data;
-       }
-
-       begin = (gchar *)part->content->data;
-       c = begin;
-       end = c + len;
-       if (IS_PART_UTF (part)) {
-
-               while (c < end) {
-                       if (cur_ex != NULL && (gint)cur_ex->pos == c - begin) {
-                               c += cur_ex->len + 1;
-                               cur_offset = g_list_next (cur_offset);
-                               if (cur_offset != NULL) {
-                                       cur_ex = cur_offset->data;
-                               }
-                       }
-                       else {
-                               uc = g_utf8_get_char (c);
-                               if (g_unichar_isalnum (uc)) {
-                                       fuzzy_update2 (new, new2, uc);
-                                       if (write_diff) {
-                                               rspamd_fstrappend_u (part->diff_str, uc);
-                                       }
-                               }
-                               c = g_utf8_next_char (c);
-                       }
-               }
-       }
-       else {
-               while (c < end) {
-                       if (cur_ex != NULL && (gint)cur_ex->pos == c - begin) {
-                               c += cur_ex->len + 1;
-                               cur_offset = g_list_next (cur_offset);
-                               if (cur_offset != NULL) {
-                                       cur_ex = cur_offset->data;
-                               }
-                       }
-                       else {
-                               if (!g_ascii_isspace (*c) && !g_ascii_ispunct (*c)) {
-                                       fuzzy_update2 (new, new2, *c);
-                                       if (write_diff) {
-                                               rspamd_fstrappend_c (part->diff_str, *c);
-                                       }
-                               }
-                               c++;
-                       }
-               }
-       }
-
-       /* Check whether we have more bytes in a rolling window */
-       if (new->rh != 0) {
-               new->hash_pipe[new->hi] = b64[new->h % 64];
-       }
-       if (new2->rh != 0) {
-               new2->hash_pipe[new2->hi] = b64[new2->h % 64];
-       }
-
-       part->fuzzy = new;
-       part->double_fuzzy = new2;
-}
-
-/* Compare score of difference between two hashes 0 - different hashes, 100 - identical hashes */
-gint
-rspamd_fuzzy_compare (rspamd_fuzzy_t * h1, rspamd_fuzzy_t * h2)
-{
-       gint res, l1, l2;
-
-       /* If we have hashes of different size, input strings are too different */
-       if (h1->block_size != h2->block_size) {
-               return 0;
-       }
-
-       l1 = strlen (h1->hash_pipe);
-       l2 = strlen (h2->hash_pipe);
-
-       if (l1 == 0 || l2 == 0) {
-               if (l1 == 0 && l2 == 0) {
-                       return 100;
-               }
-               else {
-                       return 0;
-               }
-       }
-
-       res = rspamd_levinstein_distance (h1->hash_pipe, l1, h2->hash_pipe, l2);
-       res = 100 - (2 * res * 100) / (l1 + l2);
-
-       return res;
-}
-
-gint
-rspamd_fuzzy_compare_parts (struct mime_text_part *p1, struct mime_text_part *p2)
-{
-       if (p1->fuzzy != NULL && p2->fuzzy != NULL) {
-               if (p1->fuzzy->block_size == p2->fuzzy->block_size) {
-                       return rspamd_fuzzy_compare (p1->fuzzy, p2->fuzzy);
-               }
-               else if (p1->double_fuzzy->block_size == p2->fuzzy->block_size) {
-                       return rspamd_fuzzy_compare (p1->double_fuzzy, p2->fuzzy);
-               }
-               else if (p2->double_fuzzy->block_size == p1->fuzzy->block_size) {
-                       return rspamd_fuzzy_compare (p2->double_fuzzy, p1->fuzzy);
-               }
-       }
-
-       return 0;
-}
-
-gint
-rspamd_fuzzy_len (rspamd_fuzzy_t *h)
-{
-       gint len;
-       void *nullpos;
-
-       nullpos = memchr (h->hash_pipe, '\0', sizeof (h->hash_pipe));
-
-       if (nullpos == NULL) {
-               len = sizeof (h->hash_pipe);
-       }
-       else {
-               len = (char *)nullpos - h->hash_pipe;
-       }
-
-       return len;
-}
-
-guint
-rspamd_fuzzy_hash (gconstpointer key)
-{
-       rspamd_fuzzy_t *fh = (rspamd_fuzzy_t *)key;
-       XXH64_state_t xxh;
-
-       XXH64_reset (&xxh, rspamd_hash_seed ());
-
-       XXH64_update (&xxh, &fh->block_size, sizeof (fh->block_size));
-       XXH64_update (&xxh, fh->hash_pipe, rspamd_fuzzy_len (fh));
-
-       return XXH64_digest (&xxh);
-}
-
-gboolean
-rspamd_fuzzy_equal (gconstpointer v1, gconstpointer v2)
-{
-       rspamd_fuzzy_t *fh1= (rspamd_fuzzy_t *)v1,
-                       *fh2 = (rspamd_fuzzy_t *)v2;
-
-       if (fh1->block_size == fh2->block_size) {
-               gint l1 = rspamd_fuzzy_len (fh1),
-                       l2 = rspamd_fuzzy_len (fh2);
-
-               if (l1 == l2) {
-                       return (memcmp (fh1->hash_pipe, fh2->hash_pipe, l1) == 0);
-               }
-       }
-
-       return FALSE;
-}
-
-/*
- * vi:ts=4
- */
diff --git a/src/libutil/fuzzy.h b/src/libutil/fuzzy.h
deleted file mode 100644 (file)
index 813599c..0000000
+++ /dev/null
@@ -1,77 +0,0 @@
-/**
- * @file fuzzy.h
- * Fuzzy hashes API
- */
-
-#ifndef RSPAMD_FUZZY_H
-#define RSPAMD_FUZZY_H
-
-#include "config.h"
-#include "mem_pool.h"
-#include "fstring.h"
-
-#define FUZZY_HASHLEN 64
-
-typedef struct fuzzy_hash_s {
-       gchar hash_pipe[FUZZY_HASHLEN];     /**< result hash                                    */
-       guint32 block_size;                 /**< current blocksize                              */
-       guint32 rh;                         /**< roll hash value                                */
-       guint32 h;                              /**< hash of block                                      */
-       guint32 hi;                         /**< current index in hash pipe             */
-} rspamd_fuzzy_t;
-
-struct mime_text_part;
-
-/**
- * Calculate fuzzy hash for specified string
- * @param in input string
- * @param pool pool object
- * @return fuzzy_hash object allocated in pool
- */
-rspamd_fuzzy_t * rspamd_fuzzy_init (rspamd_fstring_t *in, rspamd_mempool_t *pool);
-/**
- * Calculate fuzzy hash for specified byte array
- * @param in input string
- * @param pool pool object
- * @return fuzzy_hash object allocated in pool
- */
-rspamd_fuzzy_t * rspamd_fuzzy_from_byte_array (GByteArray *in, rspamd_mempool_t *pool);
-
-/**
- * Calculate fuzzy hash for specified text part
- * @param part text part object
- * @param pool pool object
- * @param max_diff maximum text length to use diff algorithm in comparasions
- * @return fuzzy_hash object allocated in pool
- */
-void rspamd_fuzzy_from_text_part (struct mime_text_part *part,
-       rspamd_mempool_t *pool,
-       gsize max_diff);
-
-/**
- * Compare score of difference between two hashes
- * @param h1 first hash
- * @param h2 second hash
- * @return result in percents 0 - different hashes, 100 - identical hashes
- */
-gint rspamd_fuzzy_compare (rspamd_fuzzy_t *h1, rspamd_fuzzy_t *h2);
-
-/*
- * Compare two text parts and return percents of difference
- */
-gint rspamd_fuzzy_compare_parts (struct mime_text_part *p1, struct mime_text_part *p2);
-
-/*
- * Calculate levenstein distance between two strings. Note: this algorithm should be used
- * only for short texts - it runs too slow on long ones.
- */
-guint32 rspamd_levinstein_distance (gchar *s1, gint len1, gchar *s2, gint len2);
-
-/*
- * Hash table utilities
- */
-gint rspamd_fuzzy_len (rspamd_fuzzy_t *h);
-guint rspamd_fuzzy_hash (gconstpointer key);
-gboolean rspamd_fuzzy_equal (gconstpointer v1, gconstpointer v2);
-
-#endif
index 085295b183bd36963acd0b3d8c709d0499c0e621..323ebfd320b1489c2bd8a2a0b348dbe9f008bfda 100644 (file)
@@ -85,12 +85,6 @@ LUA_FUNCTION_DEF (textpart, is_empty);
  * @return {bool} whether a part is HTML part
  */
 LUA_FUNCTION_DEF (textpart, is_html);
-/***
- * @method text_part:get_fuzzy()
- * Returns base32 encoded value of fuzzy hash of the specified part
- * @return {string} fuzzy hash value
- */
-LUA_FUNCTION_DEF (textpart, get_fuzzy);
 /***
  * @method text_part:get_language()
  * Returns the code of the most used unicode script in the text part. Does not work with raw parts
@@ -103,16 +97,6 @@ LUA_FUNCTION_DEF (textpart, get_language);
  * @return {mimepart} mimepart object
  */
 LUA_FUNCTION_DEF (textpart, get_mimepart);
-/***
- * @method text_part:compare_distance(other)
- * Calculates the difference to another text part.  This function is intended to work with
- * the parts of `multipart/alternative` container only. If the two parts are not the parts of the
- * same `multipart/alternative` container, then they are considered as unrelated and
- * `-1` is returned.
- * @param {text_part} other text part to compare
- * @return {integer} commodity percentage (e.g. the same strings give `100`, different give `0` and unrelated give `-1`)
- */
-LUA_FUNCTION_DEF (textpart, compare_distance);
 
 static const struct luaL_reg textpartlib_m[] = {
        LUA_INTERFACE_DEF (textpart, is_utf),
@@ -121,10 +105,8 @@ static const struct luaL_reg textpartlib_m[] = {
        LUA_INTERFACE_DEF (textpart, get_lines_count),
        LUA_INTERFACE_DEF (textpart, is_empty),
        LUA_INTERFACE_DEF (textpart, is_html),
-       LUA_INTERFACE_DEF (textpart, get_fuzzy),
        LUA_INTERFACE_DEF (textpart, get_language),
        LUA_INTERFACE_DEF (textpart, get_mimepart),
-       LUA_INTERFACE_DEF (textpart, compare_distance),
        {"__tostring", rspamd_lua_class_tostring},
        {NULL, NULL}
 };
@@ -353,24 +335,6 @@ lua_textpart_is_html (lua_State * L)
        return 1;
 }
 
-static gint
-lua_textpart_get_fuzzy (lua_State * L)
-{
-       struct mime_text_part *part = lua_check_textpart (L);
-       gchar *out;
-
-       if (part == NULL || IS_PART_EMPTY (part)) {
-               lua_pushnil (L);
-               return 1;
-       }
-
-       out = rspamd_encode_base32 (part->fuzzy->hash_pipe,
-                       strlen (part->fuzzy->hash_pipe));
-       lua_pushstring (L, out);
-       g_free (out);
-
-       return 1;
-}
 
 static gint
 lua_textpart_get_language (lua_State * L)
@@ -408,60 +372,6 @@ lua_textpart_get_mimepart (lua_State * L)
        return 1;
 }
 
-static gint
-lua_textpart_compare_distance (lua_State * L)
-{
-       struct mime_text_part *part = lua_check_textpart (L), *other;
-       void *ud = luaL_checkudata (L, 2, "rspamd{textpart}");
-       gint diff = -1;
-       GMimeObject *parent;
-       const GMimeContentType *ct;
-
-       luaL_argcheck (L, ud != NULL, 2, "'textpart' expected");
-       other = ud ? *((struct mime_text_part **)ud) : NULL;
-
-       if (other != NULL && part->parent && part->parent == other->parent) {
-               parent = part->parent;
-               ct = g_mime_object_get_content_type (parent);
-#ifndef GMIME24
-               if (ct == NULL ||
-                       !g_mime_content_type_is_type (ct, "multipart", "alternative")) {
-#else
-               if (ct == NULL ||
-                       !g_mime_content_type_is_type ((GMimeContentType *)ct, "multipart",
-                       "alternative")) {
-#endif
-                       diff = -1;
-
-               }
-               else {
-                       if (!IS_PART_EMPTY (part) && !IS_PART_EMPTY (other)) {
-                               if (part->diff_str != NULL && other->diff_str != NULL) {
-                                       diff = rspamd_diff_distance (part->diff_str,
-                                                       other->diff_str);
-                               }
-                               else {
-                                       diff = rspamd_fuzzy_compare_parts (part, other);
-                               }
-                       }
-                       else if ((IS_PART_EMPTY (part) &&
-                               !IS_PART_EMPTY (other)) || (!IS_PART_EMPTY (part) &&
-                                               IS_PART_EMPTY (other))) {
-                               /* Empty and non empty parts are different */
-                               diff = 0;
-                       }
-               }
-       }
-       else {
-               diff = -1;
-       }
-
-
-       lua_pushinteger (L, diff);
-
-       return 1;
-}
-
 /* Mimepart implementation */
 
 static gint
index ec849da54c6ff341e60a0ea0087eacbc273ea536..857033ec058e47ed27d8c893e1ec78ad355da113 100644 (file)
@@ -563,46 +563,34 @@ fuzzy_cmd_from_text_part (struct fuzzy_rule *rule,
        rspamd_fstring_t *word;
        GArray *words;
 
-       if (legacy || part->words == NULL || part->words->len == 0) {
-               cmd = rspamd_mempool_alloc0 (pool, sizeof (*cmd));
+       shcmd = rspamd_mempool_alloc0 (pool, sizeof (*shcmd));
 
-               cmd->shingles_count = 0;
-               rspamd_strlcpy (cmd->digest, part->fuzzy->hash_pipe, sizeof (cmd->digest));
+       /*
+        * Generate hash from all words in the part
+        */
+       g_assert (blake2b_init_key (&st, BLAKE2B_OUTBYTES, rule->hash_key->str,
+                       rule->hash_key->len) != -1);
+       words = fuzzy_preprocess_words (part, pool);
 
-               if (size != NULL) {
-                       *size = sizeof (struct rspamd_fuzzy_cmd);
-               }
+       for (i = 0; i < words->len; i ++) {
+               word = &g_array_index (words, rspamd_fstring_t, i);
+               blake2b_update (&st, word->begin, word->len);
        }
-       else {
-               shcmd = rspamd_mempool_alloc0 (pool, sizeof (*shcmd));
-
-               /*
-                * Generate hash from all words in the part
-                */
-               g_assert (blake2b_init_key (&st, BLAKE2B_OUTBYTES, rule->hash_key->str,
-                               rule->hash_key->len) != -1);
-               words = fuzzy_preprocess_words (part, pool);
+       blake2b_final (&st, shcmd->basic.digest, sizeof (shcmd->basic.digest));
 
-               for (i = 0; i < words->len; i ++) {
-                       word = &g_array_index (words, rspamd_fstring_t, i);
-                       blake2b_update (&st, word->begin, word->len);
-               }
-               blake2b_final (&st, shcmd->basic.digest, sizeof (shcmd->basic.digest));
-
-               msg_debug ("loading shingles with key %*xs", 16, rule->shingles_key->str);
-               sh = rspamd_shingles_generate (words,
-                               rule->shingles_key->str, pool,
-                               rspamd_shingles_default_filter, NULL);
-               if (sh != NULL) {
-                       memcpy (&shcmd->sgl, sh, sizeof (shcmd->sgl));
-                       shcmd->basic.shingles_count = RSPAMD_SHINGLE_SIZE;
-               }
+       msg_debug ("loading shingles with key %*xs", 16, rule->shingles_key->str);
+       sh = rspamd_shingles_generate (words,
+                       rule->shingles_key->str, pool,
+                       rspamd_shingles_default_filter, NULL);
+       if (sh != NULL) {
+               memcpy (&shcmd->sgl, sh, sizeof (shcmd->sgl));
+               shcmd->basic.shingles_count = RSPAMD_SHINGLE_SIZE;
+       }
 
-               cmd = (struct rspamd_fuzzy_cmd *)shcmd;
+       cmd = (struct rspamd_fuzzy_cmd *)shcmd;
 
-               if (size != NULL) {
-                       *size = sizeof (struct rspamd_fuzzy_shingle_cmd);
-               }
+       if (size != NULL) {
+               *size = sizeof (struct rspamd_fuzzy_shingle_cmd);
        }
 
        cmd->tag = ottery_rand_uint32 ();
@@ -959,7 +947,6 @@ fuzzy_generate_commands (struct rspamd_task *task, struct fuzzy_rule *rule,
        struct mime_part *mime_part;
        struct rspamd_image *image;
        struct rspamd_fuzzy_cmd *cmd;
-       gsize hashlen;
        guint i;
        GPtrArray *res;
 
index 7c3312634ce30c44422d549496254d562f22fb7f..584bbbd4d41b71200d84a82ea0c1a1b38beac737 100644 (file)
@@ -1,6 +1,5 @@
 SET(TESTSRC            rspamd_mem_pool_test.c
                                rspamd_statfile_test.c
-                               rspamd_fuzzy_test.c
                                rspamd_url_test.c
                                rspamd_dns_test.c
                                rspamd_async_test.c
diff --git a/test/rspamd_fuzzy_test.c b/test/rspamd_fuzzy_test.c
deleted file mode 100644 (file)
index b1f1f5d..0000000
+++ /dev/null
@@ -1,76 +0,0 @@
-#include "config.h"
-#include "main.h"
-#include "fuzzy.h"
-#include "tests.h"
-
-static char *s1 = "This is sample test text.\r\n"
-                                 "abcdefghijklmnopqrstuvwx.\r\n"
-                                 "abcdefghijklmnopqrstuvwx.\r\n"
-                                 "abcdefghijklmnopqrstuvwx.\r\n"
-                                 "abcdefghijklmnopqrstuvwx.\r\n"
-                                 "abcdefghijklmnopqrstuvwx.\r\n"
-                                 "abcdefghijklmnopqrstuvwx.\r\n"
-                                 "abcdefghijklmnopqrstuvwx.\r\n"
-                                 "abcdefghijklmnopqrstuvwx.\r\n";
-static char *s2 = "This is sample test text.\r\n"
-                                 "abcdefghijklmnopqrstuvwx.\r\n"
-                                 "abcdefghijklmnopzrstuvwx.\r\n"
-                                 "abcdefghijklmnopqrstuvwx.\r\n"
-                                 "abcdefghijklmnopqrstuvwx.\r\n"
-                                 "abcdefghijklmnopqrstuvwx.\r\n"
-                                 "abcdefghijklmnopqrstuvwx.\r\n"
-                                 "abcdefghijklmnopqrstuvwx.\r\n"
-                                 "abcdefghijklmnopqrstuvwx.\r\n";
-static char *s3 = "";
-static char *s4 = "abcdefghijklmn\r\n";
-static char *s5 = "This is sample test text.\r\n"
-                                 "abcdefghijklmnopqrstuvwx.\r\n"
-                                 "abcdefghijklmnopzrstuvwx.\r\n"
-                                 "abcdefghijklmnopqrstuvwx.\r\n"
-                                 "abcdefghijklmnopqrstuvwx.\r\n"
-                                 "abcdefghijklmnopqrstuvwx.\r\n"
-                                 "abcdefghijklmnopqrstuvwx.\r\n"
-                                 "abcdefghijklmnopqrstuvwx.\r\n"
-                                 "abcdefghijklmnopqrstuvwx.\r\n";
-
-void 
-rspamd_fuzzy_test_func ()
-{
-       rspamd_mempool_t *pool;
-       rspamd_fuzzy_t *h1, *h2, *h3, *h4, *h5;
-       rspamd_fstring_t f1, f2, f3, f4, f5;
-       int diff2;
-
-       pool = rspamd_mempool_new (1024);
-       f1.begin = s1;
-       f1.len = strlen (s1);
-       f2.begin = s2;
-       f2.len = strlen (s2);
-       f3.begin = s3;
-       f3.len = strlen (s3);
-       f4.begin = s4;
-       f4.len = strlen (s4);
-       f5.begin = s5;
-       f5.len = strlen (s5);
-
-       h1 = rspamd_fuzzy_init (&f1, pool);
-       h2 = rspamd_fuzzy_init (&f2, pool);
-       h3 = rspamd_fuzzy_init (&f3, pool);
-       h4 = rspamd_fuzzy_init (&f4, pool);
-       h5 = rspamd_fuzzy_init (&f5, pool);
-
-       diff2 = rspamd_fuzzy_compare (h2, h5);
-       msg_debug ("rspamd_fuzzy_test_func: s1, s2 difference between strings is %d", rspamd_fuzzy_compare (h1, h2));
-       msg_debug ("rspamd_fuzzy_test_func: s1, s3 difference between strings is %d", rspamd_fuzzy_compare (h1, h3));
-       msg_debug ("rspamd_fuzzy_test_func: s3, s4 difference between strings is %d", rspamd_fuzzy_compare (h3, h4));
-       msg_debug ("rspamd_fuzzy_test_func: s2, s4 difference between strings is %d", rspamd_fuzzy_compare (h2, h4));
-       msg_debug ("rspamd_fuzzy_test_func: s2, s5 difference between strings is %d", diff2);
-       
-       /* Identical strings */
-       if (diff2 != 100) {
-               msg_err ("hash difference is %d", diff2);
-               g_assert (diff2 == 100);
-       }
-
-       rspamd_mempool_delete (pool);
-}
index 5dc854560fbe07ed01b19ed8c72ee49c44a7ab16..c1a2e27f5fad4b5627e603681f9bb5110f998785 100644 (file)
@@ -45,7 +45,6 @@ main (int argc, char **argv)
        g_log_set_default_handler (rspamd_glib_log_function, rspamd_main->logger);
 
        g_test_add_func ("/rspamd/mem_pool", rspamd_mem_pool_test_func);
-       g_test_add_func ("/rspamd/fuzzy", rspamd_fuzzy_test_func);
        g_test_add_func ("/rspamd/url", rspamd_url_test_func);
        g_test_add_func ("/rspamd/statfile", rspamd_statfile_test_func);
        g_test_add_func ("/rspamd/radix", rspamd_radix_test_func);
index f0dab9c0273f5948fa9c56e0ef480d59b74a1128..a2ba05b8415d15c9898a90efe57491957fb650d1 100644 (file)
@@ -11,9 +11,6 @@ void rspamd_url_test_func (void);
 /* Memory pools */
 void rspamd_mem_pool_test_func (void);
 
-/* Fuzzy hashes */
-void rspamd_fuzzy_test_func (void);
-
 /* Stat file */
 void rspamd_statfile_test_func (void);