git.ipfire.org Git - thirdparty/git.git/commitdiff
commit-graph: reuse existing Bloom filters during write
author Garima Singh <garima.singh@microsoft.com>
Mon, 6 Apr 2020 16:59:50 +0000 (16:59 +0000)
committer Junio C Hamano <gitster@pobox.com>
Mon, 6 Apr 2020 18:08:37 +0000 (11:08 -0700)
Add logic to:
a) parse Bloom filter information from the commit graph file, and
b) reuse existing Bloom filters.

See Documentation/technical/commit-graph-format for the format in which
the Bloom filter information is written to the commit graph file.

To read Bloom filter for a given commit with lexicographic position
'i' we need to:
1. Read BIDX[i], which gives the index in BDAT just past the end of the
   filter of commit i (equivalently, the starting index of the filter
   of commit i+1). It is called end_index in the code.

2. For i>0, read BIDX[i-1] which will give us the starting index in BDAT
   for filter of commit i. It is called the start_index in the code.
   For the first commit, where i = 0, Bloom filter data starts at the
   beginning, just past the header in the BDAT chunk. Hence, start_index
   will be 0.

3. The length of the filter will be end_index - start_index, because
   BIDX[i] gives the cumulative number of 8-bit words (bytes) up to and
   including the ith commit's filter.

Whether a Bloom filter that is not already present is computed is
controlled by the compute_if_not_present flag.

Helped-by: Derrick Stolee <dstolee@microsoft.com>
Signed-off-by: Garima Singh <garima.singh@microsoft.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
bloom.c
bloom.h
commit-graph.c
t/helper/test-bloom.c

diff --git a/bloom.c b/bloom.c
index a16eee92331183cb18232037c7d659646008b4ca..0f714dd76ae6ba7b4b77003e7052f422e993ca06 100644 (file)
--- a/bloom.c
+++ b/bloom.c
@@ -4,6 +4,8 @@
 #include "diffcore.h"
 #include "revision.h"
 #include "hashmap.h"
+#include "commit-graph.h"
+#include "commit.h"
 
 define_commit_slab(bloom_filter_slab, struct bloom_filter);
 
@@ -26,6 +28,36 @@ static inline unsigned char get_bitmask(uint32_t pos)
        return ((unsigned char)1) << (pos & (BITS_PER_WORD - 1));
 }
 
+static int load_bloom_filter_from_graph(struct commit_graph *g,
+                                  struct bloom_filter *filter,
+                                  struct commit *c)
+{
+       uint32_t lex_pos, start_index, end_index;
+
+       while (c->graph_pos < g->num_commits_in_base)
+               g = g->base_graph;
+
+       /* The commit graph commit 'c' lives in doesn't carry bloom filters. */
+       if (!g->chunk_bloom_indexes)
+               return 0;
+
+       lex_pos = c->graph_pos - g->num_commits_in_base;
+
+       end_index = get_be32(g->chunk_bloom_indexes + 4 * lex_pos);
+
+       if (lex_pos > 0)
+               start_index = get_be32(g->chunk_bloom_indexes + 4 * (lex_pos - 1));
+       else
+               start_index = 0;
+
+       filter->len = end_index - start_index;
+       filter->data = (unsigned char *)(g->chunk_bloom_data +
+                                       sizeof(unsigned char) * start_index +
+                                       BLOOMDATA_CHUNK_HEADER_SIZE);
+
+       return 1;
+}
+
 /*
  * Calculate the murmur3 32-bit hash value for the given data
  * using the given seed.
@@ -127,7 +159,8 @@ void init_bloom_filters(void)
 }
 
 struct bloom_filter *get_bloom_filter(struct repository *r,
-                                     struct commit *c)
+                                     struct commit *c,
+                                         int compute_if_not_present)
 {
        struct bloom_filter *filter;
        struct bloom_filter_settings settings = DEFAULT_BLOOM_FILTER_SETTINGS;
@@ -140,6 +173,20 @@ struct bloom_filter *get_bloom_filter(struct repository *r,
 
        filter = bloom_filter_slab_at(&bloom_filters, c);
 
+       if (!filter->data) {
+               load_commit_graph_info(r, c);
+               if (c->graph_pos != COMMIT_NOT_FROM_GRAPH &&
+                       r->objects->commit_graph->chunk_bloom_indexes) {
+                       if (load_bloom_filter_from_graph(r->objects->commit_graph, filter, c))
+                               return filter;
+                       else
+                               return NULL;
+               }
+       }
+
+       if (filter->data || !compute_if_not_present)
+               return filter;
+
        repo_diff_setup(r, &diffopt);
        diffopt.flags.recursive = 1;
        diffopt.max_changes = max_changes;
diff --git a/bloom.h b/bloom.h
index 85ab8e9423dcf9bddeb6ada36ee49dc7a65aab7a..760d7122374ba115483ad47d3f1153dcc285a3a5 100644 (file)
--- a/bloom.h
+++ b/bloom.h
@@ -32,6 +32,7 @@ struct bloom_filter_settings {
 
 #define DEFAULT_BLOOM_FILTER_SETTINGS { 1, 7, 10 }
 #define BITS_PER_WORD 8
+#define BLOOMDATA_CHUNK_HEADER_SIZE 3 * sizeof(uint32_t)
 
 /*
  * A bloom_filter struct represents a data segment to
@@ -79,6 +80,7 @@ void add_key_to_filter(const struct bloom_key *key,
 void init_bloom_filters(void);
 
 struct bloom_filter *get_bloom_filter(struct repository *r,
-                                     struct commit *c);
+                                     struct commit *c,
+                                     int compute_if_not_present);
 
 #endif
\ No newline at end of file
diff --git a/commit-graph.c b/commit-graph.c
index a8b6b5cca5dc6435f3b036c1d1c3994b91313714..77668629e27dbbae783cfd2364fee78d41bf446d 100644 (file)
--- a/commit-graph.c
+++ b/commit-graph.c
@@ -1086,7 +1086,7 @@ static void write_graph_chunk_bloom_indexes(struct hashfile *f,
                        ctx->commits.nr);
 
        while (list < last) {
-               struct bloom_filter *filter = get_bloom_filter(ctx->r, *list);
+               struct bloom_filter *filter = get_bloom_filter(ctx->r, *list, 0);
                cur_pos += filter->len;
                display_progress(progress, ++i);
                hashwrite_be32(f, cur_pos);
@@ -1115,7 +1115,7 @@ static void write_graph_chunk_bloom_data(struct hashfile *f,
        hashwrite_be32(f, settings->bits_per_entry);
 
        while (list < last) {
-               struct bloom_filter *filter = get_bloom_filter(ctx->r, *list);
+               struct bloom_filter *filter = get_bloom_filter(ctx->r, *list, 0);
                display_progress(progress, ++i);
                hashwrite(f, filter->data, filter->len * sizeof(unsigned char));
                list++;
@@ -1296,7 +1296,7 @@ static void compute_bloom_filters(struct write_commit_graph_context *ctx)
 
        for (i = 0; i < ctx->commits.nr; i++) {
                struct commit *c = sorted_commits[i];
-               struct bloom_filter *filter = get_bloom_filter(ctx->r, c);
+               struct bloom_filter *filter = get_bloom_filter(ctx->r, c, 1);
                ctx->total_bloom_filter_data_size += sizeof(unsigned char) * filter->len;
                display_progress(progress, i + 1);
        }
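diff --git a/t/helper/test-bloom.c b/t/helper/test-bloom.c
index f18d1b722e1c969563c2478a84fa53128335236f..ce412664ba95f9e86f884fab9a3e9af80a3c5935 100644 (file)
--- a/t/helper/test-bloom.c
+++ b/t/helper/test-bloom.c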
@@ -39,7 +39,7 @@ static void get_bloom_filter_for_commit(const struct object_id *commit_oid)
        struct bloom_filter *filter;
        setup_git_directory();
        c = lookup_commit(the_repository, commit_oid);
-       filter = get_bloom_filter(the_repository, c);
+       filter = get_bloom_filter(the_repository, c, 1);
        print_bloom_filter(filter);
 }