]> git.ipfire.org Git - thirdparty/kernel/linux.git/commitdiff
bcachefs: Use a genradix for reading journal entries
authorKent Overstreet <kent.overstreet@gmail.com>
Mon, 21 Mar 2022 04:15:53 +0000 (00:15 -0400)
committerKent Overstreet <kent.overstreet@linux.dev>
Sun, 22 Oct 2023 21:09:30 +0000 (17:09 -0400)
Previously, the journal read path used a linked list for storing the
journal entries we read from disk. But there's been a bug that's been
causing journal_flush_delay to incorrectly be set to 0, leading to far
more journal entries than is normal being written out, which then means
filesystems are no longer able to start due to the O(n^2) behaviour of
inserting into/searching that linked list.

Fix this by switching to a radix tree.

Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
fs/bcachefs/bcachefs.h
fs/bcachefs/journal.c
fs/bcachefs/journal.h
fs/bcachefs/journal_io.c
fs/bcachefs/journal_io.h
fs/bcachefs/recovery.c
fs/bcachefs/recovery.h
fs/bcachefs/super.c

index f2bb23162b4ac328a56342c4ed8cd863a2ac37c5..43e921b91d85f807892ce7f5e0be90a2659d9348 100644 (file)
@@ -894,7 +894,8 @@ mempool_t           bio_bounce_pages;
        mempool_t               btree_bounce_pool;
 
        struct journal          journal;
-       struct list_head        journal_entries;
+       GENRADIX(struct journal_replay *) journal_entries;
+       u64                     journal_entries_base_seq;
        struct journal_keys     journal_keys;
        struct list_head        journal_iters;
 
index f87f76553bf402fb8b91f700d7de18cda4c1cc38..f8b57de31d93a9bb65a21fe3ec232688b987815c 100644 (file)
@@ -1039,17 +1039,25 @@ void bch2_fs_journal_stop(struct journal *j)
        cancel_delayed_work_sync(&j->write_work);
 }
 
-int bch2_fs_journal_start(struct journal *j, u64 cur_seq,
-                         struct list_head *journal_entries)
+int bch2_fs_journal_start(struct journal *j, u64 cur_seq)
 {
        struct bch_fs *c = container_of(j, struct bch_fs, journal);
        struct journal_entry_pin_list *p;
-       struct journal_replay *i;
+       struct journal_replay *i, **_i;
+       struct genradix_iter iter;
+       bool had_entries = false;
+       unsigned ptr;
        u64 last_seq = cur_seq, nr, seq;
 
-       if (!list_empty(journal_entries))
-               last_seq = le64_to_cpu(list_last_entry(journal_entries,
-                               struct journal_replay, list)->j.last_seq);
+       genradix_for_each_reverse(&c->journal_entries, iter, _i) {
+               i = *_i;
+
+               if (!i || i->ignore)
+                       continue;
+
+               last_seq = le64_to_cpu(i->j.last_seq);
+               break;
+       }
 
        nr = cur_seq - last_seq;
 
@@ -1071,14 +1079,14 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq,
        j->pin.back             = cur_seq;
        atomic64_set(&j->seq, cur_seq - 1);
 
-       if (list_empty(journal_entries))
-               j->last_empty_seq = cur_seq - 1;
-
        fifo_for_each_entry_ptr(p, &j->pin, seq)
                journal_pin_list_init(p, 1);
 
-       list_for_each_entry(i, journal_entries, list) {
-               unsigned ptr;
+       genradix_for_each(&c->journal_entries, iter, _i) {
+               i = *_i;
+
+               if (!i || i->ignore)
+                       continue;
 
                seq = le64_to_cpu(i->j.seq);
                BUG_ON(seq >= cur_seq);
@@ -1094,9 +1102,11 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq,
                p->devs.nr = 0;
                for (ptr = 0; ptr < i->nr_ptrs; ptr++)
                        bch2_dev_list_add_dev(&p->devs, i->ptrs[ptr].dev);
+
+               had_entries = true;
        }
 
-       if (list_empty(journal_entries))
+       if (!had_entries)
                j->last_empty_seq = cur_seq;
 
        spin_lock(&j->lock);
index c287ecf643aa14d69e54a43ba24b25b945754a9d..59453dcfa4e937137404e7be0452a7e30114356c 100644 (file)
@@ -510,7 +510,7 @@ int bch2_dev_journal_alloc(struct bch_dev *);
 void bch2_dev_journal_stop(struct journal *, struct bch_dev *);
 
 void bch2_fs_journal_stop(struct journal *);
-int bch2_fs_journal_start(struct journal *, u64, struct list_head *);
+int bch2_fs_journal_start(struct journal *, u64);
 
 void bch2_dev_journal_exit(struct bch_dev *);
 int bch2_dev_journal_init(struct bch_dev *, struct bch_sb *);
index fad142196daa8edcdfcaaf5c75dbe5f5d7726c22..c84c0b8409065a8661e4774c775c9515fef1da58 100644 (file)
 #include "replicas.h"
 #include "trace.h"
 
-static void __journal_replay_free(struct journal_replay *i)
+static inline u32 journal_entry_radix_idx(struct bch_fs *c,
+                                         struct jset *j)
 {
-       list_del(&i->list);
+       return (le64_to_cpu(j->seq) - c->journal_entries_base_seq) & (~0U >> 1);
+}
+
+static void __journal_replay_free(struct bch_fs *c,
+                                 struct journal_replay *i)
+{
+       struct journal_replay **p =
+               genradix_ptr(&c->journal_entries, journal_entry_radix_idx(c, &i->j));
+
+       BUG_ON(*p != i);
+       *p = NULL;
        kvpfree(i, offsetof(struct journal_replay, j) +
                vstruct_bytes(&i->j));
-
 }
 
 static void journal_replay_free(struct bch_fs *c, struct journal_replay *i)
@@ -29,13 +39,12 @@ static void journal_replay_free(struct bch_fs *c, struct journal_replay *i)
        i->ignore = true;
 
        if (!c->opts.read_entire_journal)
-               __journal_replay_free(i);
+               __journal_replay_free(c, i);
 }
 
 struct journal_list {
        struct closure          cl;
        struct mutex            lock;
-       struct list_head        *head;
        int                     ret;
 };
 
@@ -51,19 +60,30 @@ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca,
                             struct journal_list *jlist, struct jset *j,
                             bool bad)
 {
-       struct journal_replay *i, *pos, *dup = NULL;
+       struct genradix_iter iter;
+       struct journal_replay **_i, *i, *dup;
        struct journal_ptr *ptr;
-       struct list_head *where;
        size_t bytes = vstruct_bytes(j);
        u64 last_seq = 0;
        int ret = JOURNAL_ENTRY_ADD_OK;
 
+       /*
+        * Xarrays are indexed by a ulong, not a u64, so we can't index them by
+        * sequence number directly:
+        * Assume instead that they will all fall within the range of +-2billion
+        * of the filrst one we find.
+        */
+       if (!c->journal_entries_base_seq)
+               c->journal_entries_base_seq = max_t(s64, 1, le64_to_cpu(j->seq) - S32_MAX);
+
+#if 0
        list_for_each_entry_reverse(i, jlist->head, list) {
                if (!JSET_NO_FLUSH(&i->j)) {
                        last_seq = le64_to_cpu(i->j.last_seq);
                        break;
                }
        }
+#endif
 
        /* Is this entry older than the range we need? */
        if (!c->opts.read_entire_journal &&
@@ -73,29 +93,21 @@ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca,
        }
 
        /* Drop entries we don't need anymore */
-       if (!JSET_NO_FLUSH(j)) {
-               list_for_each_entry_safe(i, pos, jlist->head, list) {
+       if (!JSET_NO_FLUSH(j) && !c->opts.read_entire_journal) {
+               genradix_for_each(&c->journal_entries, iter, _i) {
+                       i = *_i;
+
+                       if (!i)
+                               continue;
+
                        if (le64_to_cpu(i->j.seq) >= le64_to_cpu(j->last_seq))
                                break;
                        journal_replay_free(c, i);
                }
        }
 
-       list_for_each_entry_reverse(i, jlist->head, list) {
-               if (le64_to_cpu(j->seq) > le64_to_cpu(i->j.seq)) {
-                       where = &i->list;
-                       goto add;
-               }
-       }
-
-       where = jlist->head;
-add:
-       dup = where->next != jlist->head
-               ? container_of(where->next, struct journal_replay, list)
-               : NULL;
-
-       if (dup && le64_to_cpu(j->seq) != le64_to_cpu(dup->j.seq))
-               dup = NULL;
+       _i = genradix_ptr(&c->journal_entries, journal_entry_radix_idx(c, j));
+       dup = _i ? *_i : NULL;
 
        /*
         * Duplicate journal entries? If so we want the one that didn't have a
@@ -131,10 +143,19 @@ add:
        if (dup) {
                i->nr_ptrs = dup->nr_ptrs;
                memcpy(i->ptrs, dup->ptrs, sizeof(dup->ptrs));
-               __journal_replay_free(dup);
+               __journal_replay_free(c, dup);
        }
 
-       list_add(&i->list, where);
+       _i = genradix_ptr_alloc(&c->journal_entries,
+                               journal_entry_radix_idx(c, &i->j),
+                               GFP_KERNEL);
+       if (!_i) {
+               bch_err(c, "failed to allocate c->journal_entries entry");
+               ret = -ENOMEM;
+               goto out;
+       }
+
+       *_i = i;
 found:
        for (ptr = i->ptrs; ptr < i->ptrs + i->nr_ptrs; ptr++) {
                if (ptr->dev == ca->dev_idx) {
@@ -913,7 +934,8 @@ static void bch2_journal_read_device(struct closure *cl)
        struct bch_fs *c = ca->fs;
        struct journal_list *jlist =
                container_of(cl->parent, struct journal_list, cl);
-       struct journal_replay *r;
+       struct journal_replay *r, **_r;
+       struct genradix_iter iter;
        struct journal_read_buf buf = { NULL, 0 };
        u64 min_seq = U64_MAX;
        unsigned i;
@@ -956,7 +978,12 @@ static void bch2_journal_read_device(struct closure *cl)
        ja->sectors_free = ca->mi.bucket_size;
 
        mutex_lock(&jlist->lock);
-       list_for_each_entry(r, jlist->head, list) {
+       genradix_for_each(&c->journal_entries, iter, _r) {
+               r = *_r;
+
+               if (!r)
+                       continue;
+
                for (i = 0; i < r->nr_ptrs; i++) {
                        if (r->ptrs[i].dev == ca->dev_idx &&
                            sector_to_bucket(ca, r->ptrs[i].sector) == ja->buckets[ja->cur_idx]) {
@@ -1022,11 +1049,11 @@ void bch2_journal_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
        }
 }
 
-int bch2_journal_read(struct bch_fs *c, struct list_head *list,
-                     u64 *blacklist_seq, u64 *start_seq)
+int bch2_journal_read(struct bch_fs *c, u64 *blacklist_seq, u64 *start_seq)
 {
        struct journal_list jlist;
-       struct journal_replay *i, *t;
+       struct journal_replay *i, **_i, *prev = NULL;
+       struct genradix_iter radix_iter;
        struct bch_dev *ca;
        unsigned iter;
        struct printbuf buf = PRINTBUF;
@@ -1037,7 +1064,6 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list,
 
        closure_init_stack(&jlist.cl);
        mutex_init(&jlist.lock);
-       jlist.head = list;
        jlist.ret = 0;
 
        for_each_member_device(ca, c, iter) {
@@ -1061,22 +1087,21 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list,
        if (jlist.ret)
                return jlist.ret;
 
-       if (list_empty(list)) {
-               bch_info(c, "journal read done, but no entries found");
-               return 0;
-       }
-
-       i = list_last_entry(list, struct journal_replay, list);
-       *start_seq = le64_to_cpu(i->j.seq) + 1;
+       *start_seq = 0;
 
        /*
         * Find most recent flush entry, and ignore newer non flush entries -
         * those entries will be blacklisted:
         */
-       list_for_each_entry_safe_reverse(i, t, list, list) {
-               if (i->ignore)
+       genradix_for_each_reverse(&c->journal_entries, radix_iter, _i) {
+               i = *_i;
+
+               if (!i || i->ignore)
                        continue;
 
+               if (!*start_seq)
+                       *start_seq = le64_to_cpu(i->j.seq) + 1;
+
                if (!JSET_NO_FLUSH(&i->j)) {
                        last_seq        = le64_to_cpu(i->j.last_seq);
                        *blacklist_seq  = le64_to_cpu(i->j.seq) + 1;
@@ -1086,6 +1111,11 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list,
                journal_replay_free(c, i);
        }
 
+       if (!*start_seq) {
+               bch_info(c, "journal read done, but no entries found");
+               return 0;
+       }
+
        if (!last_seq) {
                fsck_err(c, "journal read done, but no entries found after dropping non-flushes");
                ret = -1;
@@ -1093,8 +1123,10 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list,
        }
 
        /* Drop blacklisted entries and entries older than last_seq: */
-       list_for_each_entry_safe(i, t, list, list) {
-               if (i->ignore)
+       genradix_for_each(&c->journal_entries, radix_iter, _i) {
+               i = *_i;
+
+               if (!i || i->ignore)
                        continue;
 
                seq = le64_to_cpu(i->j.seq);
@@ -1113,8 +1145,10 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list,
 
        /* Check for missing entries: */
        seq = last_seq;
-       list_for_each_entry(i, list, list) {
-               if (i->ignore)
+       genradix_for_each(&c->journal_entries, radix_iter, _i) {
+               i = *_i;
+
+               if (!i || i->ignore)
                        continue;
 
                BUG_ON(seq > le64_to_cpu(i->j.seq));
@@ -1136,11 +1170,9 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list,
                               !bch2_journal_seq_is_blacklisted(c, seq, false))
                                seq++;
 
-                       if (i->list.prev != list) {
-                               struct journal_replay *p = list_prev_entry(i, list);
-
-                               bch2_journal_ptrs_to_text(&buf1, c, p);
-                               pr_buf(&buf1, " size %zu", vstruct_sectors(&p->j, c->block_bits));
+                       if (prev) {
+                               bch2_journal_ptrs_to_text(&buf1, c, prev);
+                               pr_buf(&buf1, " size %zu", vstruct_sectors(&prev->j, c->block_bits));
                        } else
                                pr_buf(&buf1, "(none)");
                        bch2_journal_ptrs_to_text(&buf2, c, i);
@@ -1157,10 +1189,11 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list,
                        printbuf_exit(&buf2);
                }
 
+               prev = i;
                seq++;
        }
 
-       list_for_each_entry(i, list, list) {
+       genradix_for_each(&c->journal_entries, radix_iter, _i) {
                struct jset_entry *entry;
                struct bkey_i *k, *_n;
                struct bch_replicas_padded replicas = {
@@ -1169,7 +1202,8 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list,
                };
                unsigned ptr;
 
-               if (i->ignore)
+               i = *_i;
+               if (!i || i->ignore)
                        continue;
 
                ret = jset_validate_entries(c, &i->j, READ);
index f2001835e43e215f725331f1cc591f736a29b6ef..30e995c81fc4d72e97efc1ea77977e681ec02fc7 100644 (file)
@@ -7,7 +7,6 @@
  * during cache_registration
  */
 struct journal_replay {
-       struct list_head        list;
        struct journal_ptr {
                u8              dev;
                u32             bucket;
@@ -53,7 +52,7 @@ void bch2_journal_entry_to_text(struct printbuf *, struct bch_fs *,
 void bch2_journal_ptrs_to_text(struct printbuf *, struct bch_fs *,
                               struct journal_replay *);
 
-int bch2_journal_read(struct bch_fs *, struct list_head *, u64 *, u64 *);
+int bch2_journal_read(struct bch_fs *, u64 *, u64 *);
 
 void bch2_journal_write(struct closure *);
 
index 9120ed26250ef8f8d498b42f74b01b747e37368f..07ce6a540856f44065f3622b698ff2c6705da2e5 100644 (file)
@@ -433,16 +433,16 @@ void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *i
 
 /* sort and dedup all keys in the journal: */
 
-void bch2_journal_entries_free(struct list_head *list)
+void bch2_journal_entries_free(struct bch_fs *c)
 {
-
-       while (!list_empty(list)) {
-               struct journal_replay *i =
-                       list_first_entry(list, struct journal_replay, list);
-               list_del(&i->list);
-               kvpfree(i, offsetof(struct journal_replay, j) +
-                       vstruct_bytes(&i->j));
-       }
+       struct journal_replay **i;
+       struct genradix_iter iter;
+
+       genradix_for_each(&c->journal_entries, iter, i)
+               if (*i)
+                       kvpfree(*i, offsetof(struct journal_replay, j) +
+                               vstruct_bytes(&(*i)->j));
+       genradix_free(&c->journal_entries);
 }
 
 /*
@@ -476,15 +476,18 @@ void bch2_journal_keys_free(struct journal_keys *keys)
 
 static int journal_keys_sort(struct bch_fs *c)
 {
-       struct journal_replay *i;
+       struct genradix_iter iter;
+       struct journal_replay *i, **_i;
        struct jset_entry *entry;
        struct bkey_i *k, *_n;
        struct journal_keys *keys = &c->journal_keys;
        struct journal_key *src, *dst;
        size_t nr_keys = 0;
 
-       list_for_each_entry(i, &c->journal_entries, list) {
-               if (i->ignore)
+       genradix_for_each(&c->journal_entries, iter, _i) {
+               i = *_i;
+
+               if (!i || i->ignore)
                        continue;
 
                if (!keys->journal_seq_base)
@@ -503,8 +506,10 @@ static int journal_keys_sort(struct bch_fs *c)
        if (!keys->d)
                return -ENOMEM;
 
-       list_for_each_entry(i, &c->journal_entries, list) {
-               if (i->ignore)
+       genradix_for_each(&c->journal_entries, iter, _i) {
+               i = *_i;
+
+               if (!i || i->ignore)
                        continue;
 
                BUG_ON(le64_to_cpu(i->j.seq) - keys->journal_seq_base > U32_MAX);
@@ -751,10 +756,8 @@ static int journal_replay_entry_early(struct bch_fs *c,
 }
 
 static int journal_replay_early(struct bch_fs *c,
-                               struct bch_sb_field_clean *clean,
-                               struct list_head *journal)
+                               struct bch_sb_field_clean *clean)
 {
-       struct journal_replay *i;
        struct jset_entry *entry;
        int ret;
 
@@ -767,8 +770,13 @@ static int journal_replay_early(struct bch_fs *c,
                                return ret;
                }
        } else {
-               list_for_each_entry(i, journal, list) {
-                       if (i->ignore)
+               struct genradix_iter iter;
+               struct journal_replay *i, **_i;
+
+               genradix_for_each(&c->journal_entries, iter, _i) {
+                       i = *_i;
+
+                       if (!i || i->ignore)
                                continue;
 
                        vstruct_for_each(&i->j, entry) {
@@ -1093,17 +1101,17 @@ int bch2_fs_recovery(struct bch_fs *c)
        }
 
        if (!c->sb.clean || c->opts.fsck || c->opts.keep_journal) {
-               struct journal_replay *i;
+               struct genradix_iter iter;
+               struct journal_replay **i;
 
                bch_verbose(c, "starting journal read");
-               ret = bch2_journal_read(c, &c->journal_entries,
-                                       &blacklist_seq, &journal_seq);
+               ret = bch2_journal_read(c, &blacklist_seq, &journal_seq);
                if (ret)
                        goto err;
 
-               list_for_each_entry_reverse(i, &c->journal_entries, list)
-                       if (!i->ignore) {
-                               last_journal_entry = &i->j;
+               genradix_for_each_reverse(&c->journal_entries, iter, i)
+                       if (*i && !(*i)->ignore) {
+                               last_journal_entry = &(*i)->j;
                                break;
                        }
 
@@ -1152,7 +1160,7 @@ use_clean:
 
        zero_out_btree_mem_ptr(&c->journal_keys);
 
-       ret = journal_replay_early(c, clean, &c->journal_entries);
+       ret = journal_replay_early(c, clean);
        if (ret)
                goto err;
 
@@ -1175,8 +1183,7 @@ use_clean:
                }
        }
 
-       ret = bch2_fs_journal_start(&c->journal, journal_seq,
-                                   &c->journal_entries);
+       ret = bch2_fs_journal_start(&c->journal, journal_seq);
        if (ret)
                goto err;
 
@@ -1380,7 +1387,7 @@ out:
 
        if (!c->opts.keep_journal) {
                bch2_journal_keys_free(&c->journal_keys);
-               bch2_journal_entries_free(&c->journal_entries);
+               bch2_journal_entries_free(c);
        }
        kfree(clean);
        if (ret)
@@ -1401,7 +1408,6 @@ int bch2_fs_initialize(struct bch_fs *c)
        struct qstr lostfound = QSTR("lost+found");
        const char *err = "cannot allocate memory";
        struct bch_dev *ca;
-       LIST_HEAD(journal);
        unsigned i;
        int ret;
 
@@ -1441,7 +1447,7 @@ int bch2_fs_initialize(struct bch_fs *c)
         * journal_res_get() will crash if called before this has
         * set up the journal.pin FIFO and journal.cur pointer:
         */
-       bch2_fs_journal_start(&c->journal, 1, &journal);
+       bch2_fs_journal_start(&c->journal, 1);
        bch2_journal_set_replay_done(&c->journal);
 
        err = "error going read-write";
index 30580a8984a13b2de02f9ef8ace9d3999f28da02..ab8b116ac7db0cb486d2e0e475ee4856e5f9974d 100644 (file)
@@ -55,7 +55,7 @@ void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *,
                                                struct btree *);
 
 void bch2_journal_keys_free(struct journal_keys *);
-void bch2_journal_entries_free(struct list_head *);
+void bch2_journal_entries_free(struct bch_fs *);
 
 int bch2_fs_recovery(struct bch_fs *);
 int bch2_fs_initialize(struct bch_fs *);
index 2c3d0546f2b60d7317f3282cb86335f8c233d890..689c82f6cb5d31405639bcb7cd8233c7d02437fe 100644 (file)
@@ -457,7 +457,7 @@ static void __bch2_fs_free(struct bch_fs *c)
        bch2_io_clock_exit(&c->io_clock[READ]);
        bch2_fs_compress_exit(c);
        bch2_journal_keys_free(&c->journal_keys);
-       bch2_journal_entries_free(&c->journal_entries);
+       bch2_journal_entries_free(c);
        percpu_free_rwsem(&c->mark_lock);
        free_percpu(c->online_reserved);
 
@@ -676,7 +676,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
        INIT_WORK(&c->journal_seq_blacklist_gc_work,
                  bch2_blacklist_entries_gc);
 
-       INIT_LIST_HEAD(&c->journal_entries);
        INIT_LIST_HEAD(&c->journal_iters);
 
        INIT_LIST_HEAD(&c->fsck_errors);