base32.c \
base64.c \
bits.c \
+ bloomfilter.c \
bsearch-insert-pos.c \
buffer.c \
child-wait.c \
base32.h \
base64.h \
bits.h \
+ bloomfilter.h \
bsearch-insert-pos.h \
buffer.h \
byteorder.h \
test-base32.c \
test-base64.c \
test-bits.c \
+ test-bloomfilter.c \
test-bsearch-insert-pos.c \
test-buffer.c \
test-byteorder.c \
--- /dev/null
+#include "lib.h"
+#include "bloomfilter.h"
+#include "murmurhash3.h"
+#include "md5.h"
+#include "randgen.h"
+
+#include <math.h>
+
+struct bloomfilter { /* state of one reference-counted bloom filter */
+ pool_t pool; /* pool that this struct and bitmap were allocated from */
+ int refcount; /* set to 1 on creation; filter is freed when it drops to 0 */
+ size_t size; /* number of addressable bits; hash values are reduced modulo this */
+ size_t total_added; /* number of bloomfilter_set_data() calls, capped at (size_t)-1 */
+ unsigned int nk; /* number of hash functions in k, NULL terminator excluded */
+ uint32_t seed; /* randomly filled at creation, passed to every hash function */
+
+ bloomfilter_hash_func_t *const *k; /* NULL-terminated hash function list; stored as given, not copied */
+
+ uint8_t *bitmap; /* size/CHAR_BIT + 1 bytes of bit storage */
+};
+
+/* Bit accessors for a byte array used as a bitmap: bit `idx` lives in byte
+   idx/CHAR_BIT, at bit position idx%CHAR_BIT counted from the least
+   significant bit. Both macros expand `idx` twice, so pass an expression
+   without side effects. */
+#define BITMAP_HAS_BIT(map, idx) \
+	(((map)[((idx)/CHAR_BIT)] & (0x1<<((idx)%CHAR_BIT))) != 0)
+#define BITMAP_SET_BIT(map, idx) \
+	((map)[((idx)/CHAR_BIT)] |= (0x1<<((idx)%CHAR_BIT)))
+/* Number of digest bytes consumed by bloomfilter_hash_fold(). */
+#define BLOOMFILTER_HASH_BYTES 16
+
+/* Default hash function list: murmurhash3 only. The trailing NULL marks the
+   end of the list, which is how bloomfilter_create() counts the entries. */
+bloomfilter_hash_func_t *const bloomfilter_default_functions[] = {
+	bloomfilter_murmur3_hash, NULL
+};
+
+/* Reduce a BLOOMFILTER_HASH_BYTES long digest to a single size_t by XORing
+   its big-endian words together with `seed`. Hash functions that already
+   mixed the seed into the digest pass seed=0. */
+static inline size_t
+bloomfilter_hash_fold(unsigned char result[STATIC_ARRAY BLOOMFILTER_HASH_BYTES],
+		      uint32_t seed)
+{
+	size_t folded;
+
+#ifdef _LP64
+	/* _LP64: XOR the two 64 bit halves of the digest, with the seed
+	   XORed into the upper 32 bits. */
+	folded = ((size_t)seed) << 32;
+	folded ^= be64_to_cpu_unaligned(&result[0]);
+	folded ^= be64_to_cpu_unaligned(&result[8]);
+#else
+	/* no _LP64: XOR the four successive 32 bit words of the digest
+	   and the seed into one value. */
+	folded = seed;
+	folded ^= be32_to_cpu_unaligned(&result[0]);
+	folded ^= be32_to_cpu_unaligned(&result[4]);
+	folded ^= be32_to_cpu_unaligned(&result[8]);
+	folded ^= be32_to_cpu_unaligned(&result[12]);
+#endif
+	return folded;
+}
+
+/* Hash `len` bytes at `data` with 128 bit murmurhash3 using `seed`, then
+   fold the digest into a size_t. */
+size_t bloomfilter_murmur3_hash(const void *data, size_t len, uint32_t seed)
+{
+	unsigned char digest[MURMURHASH3_128_RESULTBYTES];
+
+	/* the seed is given to murmurhash3 itself, so the fold step gets 0
+	   instead of applying the seed a second time */
+	murmurhash3_128(data, len, seed, digest);
+	return bloomfilter_hash_fold(digest, 0);
+}
+
+/* Hash `len` bytes at `data` with MD5 and fold the digest into a size_t.
+   The MD5 call takes no seed, so `seed` is mixed in by the fold step. */
+size_t bloomfilter_md5_hash(const void *data, size_t len, uint32_t seed)
+{
+	unsigned char digest[MD5_RESULTLEN];
+
+	md5_get_digest(data, len, digest);
+	return bloomfilter_hash_fold(digest, seed);
+}
+
+/* Create a bloom filter holding `size` bits, allocated from `pool`.
+
+   `hash_functions` must be a non-NULL, NULL-terminated array with at least
+   one hash function. The array pointer is stored as-is (not copied), so it
+   has to remain valid for the lifetime of the filter.
+
+   All preconditions are asserted before anything is allocated. The filter
+   is returned with a reference count of 1 and a randomly filled seed. */
+struct bloomfilter *
+bloomfilter_create(pool_t pool, size_t size,
+		   bloomfilter_hash_func_t *const *hash_functions)
+{
+	unsigned int count = 0;
+
+	i_assert(size > 0);
+	i_assert(hash_functions != NULL);
+	while (hash_functions[count] != NULL)
+		count++;
+	i_assert(count > 0);
+
+	/* p_new() is relied upon to hand back zeroed memory for the fields
+	   not assigned below (e.g. total_added) */
+	struct bloomfilter *bf = p_new(pool, struct bloomfilter, 1);
+	bf->pool = pool;
+	bf->refcount = 1;
+	bf->size = size;
+	bf->nk = count;
+	bf->k = hash_functions;
+	/* allocate extra byte to round up result */
+	bf->bitmap = p_malloc(pool, size/CHAR_BIT + 1);
+	random_fill(&bf->seed, sizeof(bf->seed));
+	return bf;
+}
+
+/* Take one more reference to the filter. Asserts that the filter still has
+   at least one reference. */
+void bloomfilter_ref(struct bloomfilter *bf)
+{
+	i_assert(bf->refcount > 0);
+	bf->refcount += 1;
+}
+
+/* Drop one reference and set *_bf to NULL. A NULL *_bf is ignored. When the
+   last reference goes away, the bitmap and the filter itself are freed back
+   to the pool they came from. */
+void bloomfilter_unref(struct bloomfilter **_bf)
+{
+	struct bloomfilter *bf = *_bf;
+
+	if (bf == NULL)
+		return;
+	*_bf = NULL;
+
+	i_assert(bf->refcount > 0);
+	bf->refcount--;
+	if (bf->refcount > 0)
+		return;
+
+	/* in case system pool was used .. */
+	p_free(bf->pool, bf->bitmap);
+	p_free(bf->pool, bf);
+}
+
+size_t bloomfilter_estimated_item_count(struct bloomfilter *bf)
+{
+ return bf->total_added; /* counts every bloomfilter_set_data() call, so re-added items count again */
+}
+
+/* Check whether `len` bytes at `data` may have been added to the filter.
+   Returns FALSE as soon as one hash function maps to an unset bit, TRUE if
+   the bits for all hash functions are set. `data` may be NULL only when
+   `len` is 0. */
+bool bloomfilter_has_data(struct bloomfilter *bf, const void *data, size_t len)
+{
+	i_assert(data != NULL || len == 0);
+
+	for (unsigned int i = 0; bf->k[i] != NULL; i++) {
+		size_t bit = bf->k[i](data, len, bf->seed) % bf->size;
+
+		if (!BITMAP_HAS_BIT(bf->bitmap, bit))
+			return FALSE;
+	}
+	return TRUE;
+}
+
+/* Add `len` bytes at `data` to the filter: bump the insertion counter and
+   set one bit per hash function. `data` may be NULL only when `len` is 0. */
+void bloomfilter_set_data(struct bloomfilter *bf, const void *data, size_t len)
+{
+	i_assert(data != NULL || len == 0);
+
+	/* total added will cap at size_t, because it's an estimate */
+	if (bf->total_added != (size_t)-1)
+		bf->total_added++;
+
+	for (unsigned int i = 0; bf->k[i] != NULL; i++) {
+		size_t bit = bf->k[i](data, len, bf->seed) % bf->size;
+
+		BITMAP_SET_BIT(bf->bitmap, bit);
+	}
+}
--- /dev/null
+#ifndef BLOOMFILTER_H
+#define BLOOMFILTER_H
+
+#include "buffer.h"
+
+/* Short explanation of bloom filter:
+
+A bloom filter is a space-efficient probabilistic filter. The idea is
+that each element that gets added is hashed through one or more hashing
+functions, and for each function the bit at index (hash modulo table
+size) is set.
+
+To test whether an element is present, the filter checks that the bit
+at index (hash modulo table size) is set for every hashing function.
+If any of them is not set, the element is missing. If all of them are
+set, the element is probably present.
+
+A bloom filter will never report a false negative, but it might
+report a false positive value.
+
+Elements cannot be removed from this bloom filter.
+*/
+
+struct bloomfilter;
+
+typedef size_t bloomfilter_hash_func_t(const void *data, size_t len, uint32_t seed);
+
+/* Create a bloom filter with `size` bits (size must be > 0), allocated from
+   `pool`. `hash_functions` is a NULL-terminated array of
+   bloomfilter_hash_func_t pointers with at least one entry; the array is
+   referenced, not copied, so it must stay valid while the filter is in use.
+   The returned filter starts with one reference. */
+struct bloomfilter *
+bloomfilter_create(pool_t pool, size_t size,
+ bloomfilter_hash_func_t *const *hash_functions) ATTR_RETURNS_NONNULL;
+
+/* Helpers that create a filter with bloomfilter_default_functions:
+   p_ takes an explicit pool, i_ uses default_pool and t_ uses a pool
+   obtained from pool_datastack_create(). */
+#define p_bloomfilter_create(pool, size) \
+ bloomfilter_create(pool, size, bloomfilter_default_functions)
+#define i_bloomfilter_create(size) p_bloomfilter_create(default_pool, size)
+#define t_bloomfilter_create(size) \
+ p_bloomfilter_create(pool_datastack_create(), size)
+
+/* Reference counting. bloomfilter_unref() sets *_bf to NULL and frees the
+   filter once the last reference is dropped; a NULL *_bf is ignored. */
+void bloomfilter_ref(struct bloomfilter *bf);
+void bloomfilter_unref(struct bloomfilter **_bf);
+
+/* Returns estimated number of items in this filter. This is the number of
+   bloomfilter_set_data() calls made, so items added twice count twice. */
+size_t bloomfilter_estimated_item_count(struct bloomfilter *bf);
+
+/* Returns TRUE if the element is probably in the filter, FALSE if it is
+   not. data may be NULL only when len is 0. */
+bool bloomfilter_has_data(struct bloomfilter *bf, const void *data, size_t len) ATTR_NULL(2);
+
+/* Inserts element into filter. data may be NULL only when len is 0. */
+void bloomfilter_set_data(struct bloomfilter *bf, const void *data, size_t len) ATTR_NULL(2);
+
+/* Membership test for a NUL-terminated string; the terminator itself is not
+   part of the hashed data. */
+static inline bool
+bloomfilter_has_string(struct bloomfilter *bf, const char *data)
+{
+	size_t len = strlen(data);
+
+	return bloomfilter_has_data(bf, data, len);
+}
+
+/* Insert a NUL-terminated string; the terminator itself is not part of the
+   hashed data. */
+static inline void
+bloomfilter_set_string(struct bloomfilter *bf, const char *data)
+{
+	size_t len = strlen(data);
+
+	bloomfilter_set_data(bf, data, len);
+}
+
+/* Insert every string of a NULL-terminated string array, one element per
+   string. */
+static inline void
+bloomfilter_set_strings(struct bloomfilter *bf, const char *const *datum)
+{
+	for (; *datum != NULL; datum++)
+		bloomfilter_set_data(bf, *datum, strlen(*datum));
+}
+
+/* Membership test for the used portion of a buffer (data->used bytes
+   starting at data->data). */
+static inline bool
+bloomfilter_has_buffer(struct bloomfilter *bf, const buffer_t *data)
+{
+	const void *bytes = data->data;
+	size_t used = data->used;
+
+	return bloomfilter_has_data(bf, bytes, used);
+}
+
+/* Insert the used portion of a buffer (data->used bytes starting at
+   data->data). */
+static inline void
+bloomfilter_set_buffer(struct bloomfilter *bf, const buffer_t *data)
+{
+	const void *bytes = data->data;
+	size_t used = data->used;
+
+	bloomfilter_set_data(bf, bytes, used);
+}
+
+/* Membership test for a signed integer, hashed as the in-memory bytes of an
+   intmax_t. */
+static inline bool
+bloomfilter_has_int(struct bloomfilter *bf, intmax_t value)
+{
+	const void *bytes = &value;
+
+	return bloomfilter_has_data(bf, bytes, sizeof(value));
+}
+
+/* Insert a signed integer, hashed as the in-memory bytes of an intmax_t. */
+static inline void
+bloomfilter_set_int(struct bloomfilter *bf, intmax_t value)
+{
+	const void *bytes = &value;
+
+	bloomfilter_set_data(bf, bytes, sizeof(value));
+}
+
+/* Membership test for an unsigned integer, hashed as the in-memory bytes of
+   a uintmax_t. */
+static inline bool
+bloomfilter_has_uint(struct bloomfilter *bf, uintmax_t value)
+{
+	const void *bytes = &value;
+
+	return bloomfilter_has_data(bf, bytes, sizeof(value));
+}
+
+/* Insert an unsigned integer, hashed as the in-memory bytes of a
+   uintmax_t. */
+static inline void
+bloomfilter_set_uint(struct bloomfilter *bf, uintmax_t value)
+{
+	const void *bytes = &value;
+
+	bloomfilter_set_data(bf, bytes, sizeof(value));
+}
+
+size_t
+bloomfilter_murmur3_hash(const void *data, size_t len, uint32_t seed) ATTR_PURE; /* seeded 128 bit murmurhash3, folded into a size_t */
+size_t
+bloomfilter_md5_hash(const void *data, size_t len, uint32_t seed) ATTR_PURE; /* MD5 digest folded into a size_t together with seed */
+
+/* By default, only murmur3 is used. The list is NULL-terminated and can be
+   passed to bloomfilter_create() as hash_functions. */
+extern bloomfilter_hash_func_t *const bloomfilter_default_functions[];
+
+#endif
--- /dev/null
+#include "test-lib.h"
+#include "randgen.h"
+#include "bloomfilter.h"
+
+void test_bloomfilter(void)
+{
+ test_begin("bloomfilter");
+ struct bloomfilter *bf = i_bloomfilter_create(18); /* 18 bit filter with the default hash functions */
+ const char *const strings[] = {
+ "correct", "horse", "battery", "staple", NULL
+ };
+
+ /* set some items: four strings and one integer, five insertions in total */
+ bloomfilter_set_strings(bf, strings);
+ bloomfilter_set_int(bf, 500);
+
+ /* make sure they exist: a bloom filter never reports a false negative */
+ for(unsigned int i = 0; strings[i] != NULL; i++) {
+ test_assert(bloomfilter_has_string(bf, strings[i]));
+ }
+
+ test_assert(bloomfilter_has_int(bf, 500));
+
+ /* make sure nothing bad happens with non-existing items; the result is
+    ignored because a false positive would be a legitimate answer */
+ (void)bloomfilter_has_string(bf, "hello, world");
+
+ test_assert(bloomfilter_estimated_item_count(bf) == 5); /* 4 strings + 1 int */
+
+ bloomfilter_unref(&bf);
+
+ test_end();
+}
TEST(test_base32)
TEST(test_base64)
TEST(test_bits)
+TEST(test_bloomfilter)
TEST(test_bsearch_insert_pos)
TEST(test_buffer)
TEST(test_byteorder)