]> git.ipfire.org Git - thirdparty/rspamd.git/commitdiff
[Feature] Implement min-hash shingles for DCT data from images
authorVsevolod Stakhov <vsevolod@highsecure.ru>
Fri, 9 Dec 2016 12:50:35 +0000 (12:50 +0000)
committerVsevolod Stakhov <vsevolod@highsecure.ru>
Fri, 9 Dec 2016 12:51:08 +0000 (12:51 +0000)
src/libutil/shingles.c
src/libutil/shingles.h

index 0f35a07e094de879d91c1e82f276b66568bce7de..9d1068c3145567fc73eca51b1031b97186b74833 100644 (file)
@@ -20,7 +20,7 @@
 #define SHINGLES_WINDOW 3
 
 struct rspamd_shingle* RSPAMD_OPTIMIZE("unroll-loops")
-rspamd_shingles_generate (GArray *input,
+rspamd_shingles_from_text (GArray *input,
                const guchar key[16],
                rspamd_mempool_t *pool,
                rspamd_shingles_filter filter,
@@ -160,6 +160,119 @@ rspamd_shingles_generate (GArray *input,
        return res;
 }
 
+struct rspamd_shingle* RSPAMD_OPTIMIZE("unroll-loops")
+rspamd_shingles_from_image (gdouble *dct,
+               const guchar key[16],
+               rspamd_mempool_t *pool,
+               rspamd_shingles_filter filter,
+               gpointer filterd,
+               enum rspamd_shingle_alg alg)
+{      /*
+        * Builds one shingle out of the values in `dct`:
+        *  1. derive RSPAMD_SHINGLE_SIZE hashing keys from `key`;
+        *  2. for every key, hash a sliding window of SHINGLES_WINDOW values
+        *     into a row of `hlen` 64-bit hashes;
+        *  3. reduce every row to a single value by calling `filter`.
+        * The returned shingle is allocated from `pool` when it is non-NULL and
+        * with g_malloc otherwise.
+        */
+       struct rspamd_shingle *shingle;
+       guint64 **hashes; /* one row of hlen hashes per derived key */
+       rspamd_sipkey_t keys[RSPAMD_SHINGLE_SIZE];
+       guchar shabuf[rspamd_cryptobox_HASHBYTES], *out_key;
+       const guchar *cur_key;
+       gdouble d;
+       rspamd_cryptobox_hash_state_t bs;
+       guint64 val;
+       gint i, j, k;
+       gsize hlen, beg = 0;
+       enum rspamd_cryptobox_fast_hash_type ht;
+       guint64 res[SHINGLES_WINDOW * RSPAMD_SHINGLE_SIZE], seed; /* res: per-key window of the last SHINGLES_WINDOW element hashes */
+
+       if (pool != NULL) {
+               shingle = rspamd_mempool_alloc (pool, sizeof (*shingle));
+       }
+       else {
+               shingle = g_malloc (sizeof (*shingle));
+       }
+
+       rspamd_cryptobox_hash_init (&bs, NULL, 0);
+       cur_key = key;
+       out_key = (guchar *)&keys[0];
+
+       /* Allocate the per-key rows of hashes and derive the per-key hashing keys */
+       hashes = g_slice_alloc (sizeof (*hashes) * RSPAMD_SHINGLE_SIZE);
+       hlen = 64 - SHINGLES_WINDOW + 1; /* number of full windows over 64 values */
+
+       for (i = 0; i < RSPAMD_SHINGLE_SIZE; i ++) {
+               hashes[i] = g_slice_alloc (hlen * sizeof (guint64));
+               /*
+                * The keys form a chain: the first one is the digest of the
+                * caller's key, every following one is the digest of the
+                * previously derived key. Only the first 16 bytes of each digest
+                * are copied into the key slot (no xor-folding of the digest
+                * halves is performed here).
+                * NOTE(review): the pointer walk below assumes that each
+                * rspamd_sipkey_t slot is 16 bytes wide - confirm.
+                */
+               rspamd_cryptobox_hash_update (&bs, cur_key, 16);
+               rspamd_cryptobox_hash_final (&bs, shabuf);
+
+               for (j = 0; j < 16; j ++) {
+                       out_key[j] = shabuf[j];
+               }
+
+               rspamd_cryptobox_hash_init (&bs, NULL, 0); /* fresh state for the next link of the chain */
+               cur_key = out_key;
+               out_key += 16;
+       }
+
+       switch (alg) { /* map the shingle algorithm to a fast hash type */
+       case RSPAMD_SHINGLES_OLD: /* same hash type as RSPAMD_SHINGLES_MUMHASH */
+               ht = RSPAMD_CRYPTOBOX_MUMHASH;
+               break;
+       case RSPAMD_SHINGLES_XXHASH:
+               ht = RSPAMD_CRYPTOBOX_XXHASH64;
+               break;
+       case RSPAMD_SHINGLES_MUMHASH:
+               ht = RSPAMD_CRYPTOBOX_MUMHASH;
+               break;
+       default:
+               ht = RSPAMD_CRYPTOBOX_HASHFAST_INDEPENDENT;
+               break;
+       }
+
+       memset (res, 0, sizeof (res)); /* every window starts zero-filled */
+
+       for (i = 0; i <=  64; i ++) {
+               if (i - beg >= SHINGLES_WINDOW || i == 64) { /*
+                        * beg starts at 0 and grows by one each time this branch
+                        * runs, so the branch is taken for i = SHINGLES_WINDOW..64
+                        * and beg covers 0..hlen-1, i.e. every slot of a row is
+                        * written exactly once.
+                        */
+                       for (j = 0; j < RSPAMD_SHINGLE_SIZE; j ++) {
+                               /* Shift the window of key j one slot to the left, dropping the oldest entry */
+                               for (k = 0; k < SHINGLES_WINDOW - 1; k ++) {
+                                       res[j * SHINGLES_WINDOW + k] =
+                                                       res[j * SHINGLES_WINDOW + k + 1];
+                               }
+
+                               d = dct[beg]; /* only dct[0..hlen-1] is read by this loop */
+                               /* Hash the current value with key j and put it into the freed last slot */
+                               memcpy (&seed, keys[j], sizeof (seed)); /* seed = first sizeof (seed) bytes of the j-th key */
+                               res[j * SHINGLES_WINDOW + SHINGLES_WINDOW - 1] =
+                                               rspamd_cryptobox_fast_hash_specific (ht,
+                                                               &d, sizeof (d),
+                                                               seed);
+                               val = 0; /* xor-fold the window: entry k is shifted right by 8 * (SHINGLES_WINDOW - k - 1) bits, the newest one is used as is */
+                               for (k = 0; k < SHINGLES_WINDOW; k ++) {
+                                       val ^= res[j * SHINGLES_WINDOW + k] >>
+                                                       (8 * (SHINGLES_WINDOW - k - 1));
+                               }
+
+                               g_assert (hlen > beg); /* beg must stay inside the row of hlen hashes */
+                               hashes[j][beg] = val;
+                       }
+                       beg++;
+               }
+       }
+
+       /* Reduce every row of hashes to a single shingle value with the filter, then release the row */
+       for (i = 0; i < RSPAMD_SHINGLE_SIZE; i ++) {
+               shingle->hashes[i] = filter (hashes[i], hlen,
+                               i, key, filterd);
+               g_slice_free1 (hlen * sizeof (guint64), hashes[i]);
+       }
+
+       g_slice_free1 (sizeof (*hashes) * RSPAMD_SHINGLE_SIZE, hashes); /* release the array of row pointers */
+
+       return shingle;
+}
+
 
 guint64
 rspamd_shingles_default_filter (guint64 *input, gsize count,
index fd7d3bfc70e52b358ac5c916f8f0d0869b597e6e..5f455cdf85aa299a005ee609284c9fe62711999c 100644 (file)
@@ -51,7 +51,23 @@ typedef guint64 (*rspamd_shingles_filter) (guint64 *input, gsize count,
  * @param filterd opaque data for filtering function
  * @return shingles array
  */
-struct rspamd_shingle* rspamd_shingles_generate (GArray *input,
+struct rspamd_shingle* rspamd_shingles_from_text (GArray *input,
+               const guchar key[16],
+               rspamd_mempool_t *pool,
+               rspamd_shingles_filter filter,
+               gpointer filterd,
+               enum rspamd_shingle_alg alg);
+
+/**
+ * Generate shingles from the DCT matrix of an image
+ * @param dct discrete cosine transform matrix (must be 64x64)
+ * @param key secret key used to generate shingles
+ * @param pool pool to allocate shingles array
+ * @param filter hashes filtering function
+ * @param filterd opaque data for filtering function
+ * @return shingles array
+ */
+struct rspamd_shingle* rspamd_shingles_from_image (gdouble *dct,
                const guchar key[16],
                rspamd_mempool_t *pool,
                rspamd_shingles_filter filter,