]> git.ipfire.org Git - thirdparty/dovecot/core.git/commitdiff
lib: Add bloom filter support
author: Aki Tuomi <aki.tuomi@dovecot.fi>
Mon, 27 Nov 2017 11:46:40 +0000 (13:46 +0200)
committer: Aki Tuomi <aki.tuomi@open-xchange.com>
Thu, 30 Nov 2017 19:23:13 +0000 (21:23 +0200)
src/lib/Makefile.am
src/lib/bloomfilter.c [new file with mode: 0644]
src/lib/bloomfilter.h [new file with mode: 0644]
src/lib/test-bloomfilter.c [new file with mode: 0644]
src/lib/test-lib.inc

index 5a2baefcf9c99e714f514eed829be252e18c2f7a..a353b4e1f4422070d809372a58a396d28d756ea5 100644 (file)
@@ -18,6 +18,7 @@ liblib_la_SOURCES = \
        base32.c \
        base64.c \
        bits.c \
+       bloomfilter.c \
        bsearch-insert-pos.c \
        buffer.c \
        child-wait.c \
@@ -173,6 +174,7 @@ headers = \
        base32.h \
        base64.h \
        bits.h \
+       bloomfilter.h \
        bsearch-insert-pos.h \
        buffer.h \
        byteorder.h \
@@ -330,6 +332,7 @@ test_lib_SOURCES = \
        test-base32.c \
        test-base64.c \
        test-bits.c \
+       test-bloomfilter.c \
        test-bsearch-insert-pos.c \
        test-buffer.c \
        test-byteorder.c \
diff --git a/src/lib/bloomfilter.c b/src/lib/bloomfilter.c
new file mode 100644 (file)
index 0000000..8bedb98
--- /dev/null
@@ -0,0 +1,140 @@
+#include "lib.h"
+#include "bloomfilter.h"
+#include "murmurhash3.h"
+#include "md5.h"
+#include "randgen.h"
+
+#include <math.h>
+
+/* State of a single bloom filter instance. */
+struct bloomfilter {
+       /* pool that both this struct and the bitmap are allocated from */
+       pool_t pool;
+       /* number of references; the filter is freed when this drops to 0 */
+       int refcount;
+       /* number of bits in the filter; hash results are reduced modulo this */
+       size_t size;
+       /* number of bloomfilter_set_data() calls, saturating at (size_t)-1 */
+       size_t total_added;
+       /* number of hash functions in k, not counting the NULL terminator */
+       unsigned int nk;
+       /* seed handed to every hash function call */
+       uint32_t seed;
+
+       /* NULL-terminated array of hash functions, owned by the caller */
+       bloomfilter_hash_func_t *const *k;
+
+       /* bit array of size/CHAR_BIT + 1 bytes */
+       uint8_t *bitmap;
+};
+
+/* The bitmap is addressed bit by bit: bit idx lives in byte idx/CHAR_BIT,
+   at position idx%CHAR_BIT counted from the least significant bit. */
+#define BITMAP_BYTE(map, idx) ((map)[((idx)/CHAR_BIT)])
+#define BITMAP_MASK(idx) (0x1<<((idx)%CHAR_BIT))
+/* Evaluates to non-zero if bit idx is set in map */
+#define BITMAP_HAS_BIT(map, idx) ((BITMAP_BYTE(map, idx) & BITMAP_MASK(idx)) != 0)
+/* Sets bit idx in map */
+#define BITMAP_SET_BIT(map, idx) (BITMAP_BYTE(map, idx) |= BITMAP_MASK(idx))
+/* Number of digest bytes bloomfilter_hash_fold() reads */
+#define BLOOMFILTER_HASH_BYTES 16
+
+/* Default NULL-terminated list of hash functions: only murmurhash3 is
+   used. */
+bloomfilter_hash_func_t *const bloomfilter_default_functions[] = {
+       bloomfilter_murmur3_hash,
+       NULL
+};
+
+/* Reduces a BLOOMFILTER_HASH_BYTES long digest into a single size_t and
+   mixes the given seed into it. */
+static inline size_t
+bloomfilter_hash_fold(unsigned char result[STATIC_ARRAY BLOOMFILTER_HASH_BYTES],
+                     uint32_t seed)
+{
+#ifdef _LP64
+       /* 64 bit build: start from the seed placed in the upper 32 bits
+          and xor in both big-endian 64 bit halves of the digest. */
+       size_t folded = ((size_t)seed) << 32;
+
+       folded ^= be64_to_cpu_unaligned(&result[0]);
+       folded ^= be64_to_cpu_unaligned(&result[8]);
+       return folded;
+#else
+       /* otherwise: start from the seed and xor in each of the four
+          big-endian 32 bit words of the digest. */
+       size_t folded = seed;
+       unsigned int offset;
+
+       for (offset = 0; offset < BLOOMFILTER_HASH_BYTES; offset += 4)
+               folded ^= be32_to_cpu_unaligned(&result[offset]);
+       return folded;
+#endif
+}
+
+/* Hash function based on murmurhash3_128(): hashes len bytes of data with
+   the given seed and folds the digest into a size_t. */
+size_t bloomfilter_murmur3_hash(const void *data, size_t len, uint32_t seed)
+{
+       unsigned char digest[MURMURHASH3_128_RESULTBYTES];
+
+       /* the seed goes into murmurhash3_128() itself, so a zero seed is
+          used for the folding step */
+       murmurhash3_128(data, len, seed, digest);
+       return bloomfilter_hash_fold(digest, 0);
+}
+
+/* Hash function based on md5_get_digest(): hashes len bytes of data and
+   folds the digest into a size_t together with the seed. */
+size_t bloomfilter_md5_hash(const void *data, size_t len, uint32_t seed)
+{
+       unsigned char digest[MD5_RESULTLEN];
+
+       md5_get_digest(data, len, digest);
+       /* md5_get_digest() is not given the seed, so it is mixed in while
+          folding */
+       return bloomfilter_hash_fold(digest, seed);
+}
+
+/* Allocates a new bloom filter from pool.
+
+   size is the number of bits in the filter and must be larger than 0;
+   every hash result is reduced modulo size to select a bit.
+   hash_functions is a NULL-terminated array that must contain at least
+   one function. Only the pointer is stored, so the array has to stay
+   valid for as long as the filter is in use.
+
+   The returned filter has a reference count of 1 and a seed filled in by
+   random_fill(). */
+struct bloomfilter *
+bloomfilter_create(pool_t pool, size_t size,
+                  bloomfilter_hash_func_t *const *hash_functions)
+{
+       struct bloomfilter *bf;
+       unsigned int nk = 0;
+
+       /* validate all arguments before anything gets allocated */
+       i_assert(size > 0);
+       i_assert(hash_functions != NULL);
+       while (hash_functions[nk] != NULL)
+               nk++;
+       i_assert(nk > 0);
+
+       bf = p_new(pool, struct bloomfilter, 1);
+       bf->pool = pool;
+       /* size/CHAR_BIT rounds down, so allocate an extra byte to cover
+          the remaining bits */
+       bf->bitmap = p_malloc(pool, size/CHAR_BIT + 1);
+       bf->k = hash_functions;
+       bf->nk = nk;
+       bf->size = size;
+       random_fill(&bf->seed, sizeof(bf->seed));
+       bf->refcount = 1;
+       return bf;
+}
+
+/* Takes one more reference to bf. */
+void bloomfilter_ref(struct bloomfilter *bf)
+{
+       /* a filter without any references left must not be used */
+       i_assert(bf->refcount > 0);
+       bf->refcount += 1;
+}
+
+/* Drops one reference and clears the caller's pointer. Does nothing if
+   *_bf is already NULL. The filter is freed once the last reference is
+   gone. */
+void bloomfilter_unref(struct bloomfilter **_bf)
+{
+       struct bloomfilter *bf = *_bf;
+
+       if (bf == NULL)
+               return;
+       *_bf = NULL;
+       i_assert(bf->refcount > 0);
+
+       bf->refcount--;
+       if (bf->refcount == 0) {
+               /* in case system pool was used .. */
+               p_free(bf->pool, bf->bitmap);
+               p_free(bf->pool, bf);
+       }
+}
+
+/* Returns how many times bloomfilter_set_data() has been called on bf.
+   This is only an estimate of the item count: the counter is bumped on
+   every call and stops growing at (size_t)-1. */
+size_t bloomfilter_estimated_item_count(struct bloomfilter *bf)
+{
+       const size_t count = bf->total_added;
+
+       return count;
+}
+
+/* Returns TRUE if the bit selected by every hash function is set for the
+   given data, FALSE as soon as one of them is not set. data may be NULL
+   only when len is 0. */
+bool bloomfilter_has_data(struct bloomfilter *bf, const void *data, size_t len)
+{
+       unsigned int i;
+
+       i_assert(data != NULL || len == 0);
+       for (i = 0; bf->k[i] != NULL; i++) {
+               const size_t bit = bf->k[i](data, len, bf->seed) % bf->size;
+
+               if (!BITMAP_HAS_BIT(bf->bitmap, bit))
+                       return FALSE;
+       }
+       return TRUE;
+}
+
+/* Adds the given data to the filter by setting the bit selected by each
+   hash function. data may be NULL only when len is 0. */
+void bloomfilter_set_data(struct bloomfilter *bf, const void *data, size_t len)
+{
+       unsigned int i;
+
+       i_assert(data != NULL || len == 0);
+       /* the counter is only an estimate, so let it saturate at the
+          largest size_t value instead of wrapping */
+       if (bf->total_added != (size_t)-1)
+               bf->total_added += 1;
+       for (i = 0; bf->k[i] != NULL; i++) {
+               const size_t bit = bf->k[i](data, len, bf->seed) % bf->size;
+
+               BITMAP_SET_BIT(bf->bitmap, bit);
+       }
+}
diff --git a/src/lib/bloomfilter.h b/src/lib/bloomfilter.h
new file mode 100644 (file)
index 0000000..02956c5
--- /dev/null
@@ -0,0 +1,117 @@
+#ifndef BLOOMFILTER_H
+#define BLOOMFILTER_H
+
+#include "buffer.h"
+
+/* Short explanation of bloom filter:
+
+A bloom filter is a space-efficient probabilistic filter. Each element
+that gets added is hashed through one or more hashing functions, and for
+each function the bit at index (hash modulo table size) is set.
+
+When seeing if there is an element set, it will check that each
+hashing function result modulo table size bit is set. If any of them
+is not set, the element is missing. If all of them are set, the
+element is probably present.
+
+A bloom filter will never report a false negative, but it might
+report a false positive value.
+
+Elements cannot be removed from this bloom filter.
+*/
+
+struct bloomfilter;
+
+/* Hash function used by the filter: hashes len bytes of data using the
+   given seed. The filter reduces the returned value modulo its size. */
+typedef size_t bloomfilter_hash_func_t(const void *data, size_t len, uint32_t seed);
+
+/* Create a bloom filter with size bits, allocated from pool.
+   hash_functions is a NULL-terminated array with at least one hash
+   function. The array is not copied, so it must stay valid while the
+   filter is in use. size must be larger than 0. The returned filter has
+   one reference. */
+struct bloomfilter *
+bloomfilter_create(pool_t pool, size_t size,
+                  bloomfilter_hash_func_t *const *hash_functions) ATTR_RETURNS_NONNULL;
+
+/* Some helpers */
+/* Creates a filter from pool using bloomfilter_default_functions */
+#define p_bloomfilter_create(pool, size) \
+       bloomfilter_create(pool, size, bloomfilter_default_functions)
+/* Same as p_bloomfilter_create(), allocating from default_pool */
+#define i_bloomfilter_create(size) p_bloomfilter_create(default_pool, size)
+/* Same as p_bloomfilter_create(), allocating from the pool returned by
+   pool_datastack_create() */
+#define t_bloomfilter_create(size) \
+       p_bloomfilter_create(pool_datastack_create(), size)
+
+/* Reference counting. bloomfilter_unref() sets *_bf to NULL and frees the
+   filter when the last reference is dropped. It does nothing if *_bf is
+   already NULL. */
+void bloomfilter_ref(struct bloomfilter *bf);
+void bloomfilter_unref(struct bloomfilter **_bf);
+
+/* Returns estimated number of items in this filter. This is the number
+   of bloomfilter_set_data() calls, so an element that is added twice is
+   counted twice. */
+size_t bloomfilter_estimated_item_count(struct bloomfilter *bf);
+
+/* Returns TRUE if the element is probably in the filter, FALSE if it is
+   not. data may be NULL only if len is 0. */
+bool bloomfilter_has_data(struct bloomfilter *bf, const void *data, size_t len) ATTR_NULL(2);
+
+/* Inserts element into filter. data may be NULL only if len is 0. */
+void bloomfilter_set_data(struct bloomfilter *bf, const void *data, size_t len) ATTR_NULL(2);
+
+/* Returns TRUE if the NUL-terminated string is probably in the filter.
+   The terminating NUL is not part of the hashed data. */
+static inline bool
+bloomfilter_has_string(struct bloomfilter *bf, const char *data)
+{
+       const size_t len = strlen(data);
+
+       return bloomfilter_has_data(bf, data, len);
+}
+
+/* Inserts the NUL-terminated string into the filter. The terminating NUL
+   is not part of the hashed data. */
+static inline void
+bloomfilter_set_string(struct bloomfilter *bf, const char *data)
+{
+       const size_t len = strlen(data);
+
+       bloomfilter_set_data(bf, data, len);
+}
+
+/* Inserts every string of a NULL-terminated string array into the
+   filter. */
+static inline void
+bloomfilter_set_strings(struct bloomfilter *bf, const char *const *datum)
+{
+       size_t i;
+
+       for (i = 0; datum[i] != NULL; i++)
+               bloomfilter_set_data(bf, datum[i], strlen(datum[i]));
+}
+
+/* Returns TRUE if the used part of the buffer is probably in the
+   filter. */
+static inline bool
+bloomfilter_has_buffer(struct bloomfilter *bf, const buffer_t *data)
+{
+       const void *bytes = data->data;
+       const size_t len = data->used;
+
+       return bloomfilter_has_data(bf, bytes, len);
+}
+
+/* Inserts the used part of the buffer into the filter. */
+static inline void
+bloomfilter_set_buffer(struct bloomfilter *bf, const buffer_t *data)
+{
+       const void *bytes = data->data;
+       const size_t len = data->used;
+
+       bloomfilter_set_data(bf, bytes, len);
+}
+
+/* Returns TRUE if the integer is probably in the filter. The in-memory
+   bytes of the intmax_t value are what gets hashed. */
+static inline bool
+bloomfilter_has_int(struct bloomfilter *bf, intmax_t value)
+{
+       const void *bytes = &value;
+
+       return bloomfilter_has_data(bf, bytes, sizeof(intmax_t));
+}
+
+/* Inserts the integer into the filter. The in-memory bytes of the
+   intmax_t value are what gets hashed. */
+static inline void
+bloomfilter_set_int(struct bloomfilter *bf, intmax_t value)
+{
+       const void *bytes = &value;
+
+       bloomfilter_set_data(bf, bytes, sizeof(intmax_t));
+}
+
+/* Returns TRUE if the unsigned integer is probably in the filter. The
+   in-memory bytes of the uintmax_t value are what gets hashed. */
+static inline bool
+bloomfilter_has_uint(struct bloomfilter *bf, uintmax_t value)
+{
+       const void *bytes = &value;
+
+       return bloomfilter_has_data(bf, bytes, sizeof(uintmax_t));
+}
+
+/* Inserts the unsigned integer into the filter. The in-memory bytes of
+   the uintmax_t value are what gets hashed. */
+static inline void
+bloomfilter_set_uint(struct bloomfilter *bf, uintmax_t value)
+{
+       const void *bytes = &value;
+
+       bloomfilter_set_data(bf, bytes, sizeof(uintmax_t));
+}
+
+/* Hash functions that can be listed in the hash_functions array given to
+   bloomfilter_create(). The murmur3 variant passes the seed on to
+   murmurhash3_128(); the md5 variant mixes the seed into the folded
+   digest. */
+size_t
+bloomfilter_murmur3_hash(const void *data, size_t len, uint32_t seed) ATTR_PURE;
+size_t
+bloomfilter_md5_hash(const void *data, size_t len, uint32_t seed) ATTR_PURE;
+
+/* By default, only murmur3 is used. The array is NULL-terminated. */
+extern bloomfilter_hash_func_t *const bloomfilter_default_functions[];
+
+#endif
diff --git a/src/lib/test-bloomfilter.c b/src/lib/test-bloomfilter.c
new file mode 100644 (file)
index 0000000..68b0bb2
--- /dev/null
@@ -0,0 +1,32 @@
+#include "test-lib.h"
+#include "randgen.h"
+#include "bloomfilter.h"
+
+/* Basic sanity test: inserted items must be reported as present, looking
+   up a missing item must be harmless, and the item counter must match the
+   number of insertions. */
+void test_bloomfilter(void)
+{
+       const char *const strings[] = {
+               "correct", "horse", "battery", "staple", NULL
+       };
+       struct bloomfilter *bf;
+       unsigned int i;
+
+       test_begin("bloomfilter");
+       bf = i_bloomfilter_create(18);
+
+       /* insert four strings and one integer */
+       bloomfilter_set_strings(bf, strings);
+       bloomfilter_set_int(bf, 500);
+
+       /* a bloom filter has no false negatives, so all of them must be
+          found */
+       i = 0;
+       while (strings[i] != NULL) {
+               test_assert(bloomfilter_has_string(bf, strings[i]));
+               i++;
+       }
+       test_assert(bloomfilter_has_int(bf, 500));
+
+       /* a lookup of something never inserted may return either result,
+          it only has to work without problems */
+       (void)bloomfilter_has_string(bf, "hello, world");
+
+       /* four strings + one integer */
+       test_assert(bloomfilter_estimated_item_count(bf) == 5);
+
+       bloomfilter_unref(&bf);
+
+       test_end();
+}
index 2d995b77319d6f403a82a8f76640b0fe6665953c..ca89c5133827d4b1e28bea40b5f2d0c85a820efb 100644 (file)
@@ -8,6 +8,7 @@ FATAL(fatal_array)
 TEST(test_base32)
 TEST(test_base64)
 TEST(test_bits)
+TEST(test_bloomfilter)
 TEST(test_bsearch_insert_pos)
 TEST(test_buffer)
 TEST(test_byteorder)