// SPDX-License-Identifier: GPL-2.0

#include "bcachefs.h"
#include "bset.h"
#include "btree_journal_iter.h"
#include "journal_io.h"

#include <linux/sort.h>

/*
 * For managing keys we read from the journal: until journal replay works,
 * normal btree lookups need to be able to find and return keys from the
 * journal where they overwrite what's in the btree, so we have a special
 * iterator and operations for the regular btree iter code to use:
 */

static int __journal_key_cmp(enum btree_id	l_btree_id,
			     unsigned		l_level,
			     struct bpos	l_pos,
			     const struct journal_key *r)
{
	return (cmp_int(l_btree_id,	r->btree_id) ?:
		cmp_int(l_level,	r->level) ?:
		bpos_cmp(l_pos,		r->k->k.p));
}

static int journal_key_cmp(const struct journal_key *l, const struct journal_key *r)
{
	return __journal_key_cmp(l->btree_id, l->level, l->k->k.p, r);
}

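/*
 * journal_keys is kept as a gap buffer: a sorted array with a run of unused
 * slots (the "gap") at keys->gap, so repeated insertions near the same
 * position stay cheap. idx_to_pos() maps a dense index in 0..keys->nr to the
 * actual array position, skipping over the gap:
 */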
static inline size_t idx_to_pos(struct journal_keys *keys, size_t idx)
{
	size_t gap_size = keys->size - keys->nr;

	if (idx >= keys->gap)
		idx += gap_size;
	return idx;
}

static inline struct journal_key *idx_to_key(struct journal_keys *keys, size_t idx)
{
	return keys->d + idx_to_pos(keys, idx);
}

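/*
 * Binary search over the dense index space: returns the first index whose key
 * compares >= (id, level, pos), i.e. the insert position for that triple:
 */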
static size_t __bch2_journal_key_search(struct journal_keys *keys,
					enum btree_id id, unsigned level,
					struct bpos pos)
{
	size_t l = 0, r = keys->nr, m;

	while (l < r) {
		m = l + ((r - l) >> 1);
		if (__journal_key_cmp(id, level, pos, idx_to_key(keys, m)) > 0)
			l = m + 1;
		else
			r = m;
	}

	BUG_ON(l < keys->nr &&
	       __journal_key_cmp(id, level, pos, idx_to_key(keys, l)) > 0);

	BUG_ON(l &&
	       __journal_key_cmp(id, level, pos, idx_to_key(keys, l - 1)) <= 0);

	return l;
}

static size_t bch2_journal_key_search(struct journal_keys *keys,
				      enum btree_id id, unsigned level,
				      struct bpos pos)
{
	return idx_to_pos(keys, __bch2_journal_key_search(keys, id, level, pos));
}

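/*
 * Return the next journal key >= pos and <= end_pos that hasn't been marked
 * overwritten. *idx caches the search position: callers pass 0 on the first
 * call and pass the same *idx back in to resume the scan without repeating
 * the binary search.
 */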
struct bkey_i *bch2_journal_keys_peek_upto(struct bch_fs *c, enum btree_id btree_id,
					   unsigned level, struct bpos pos,
					   struct bpos end_pos, size_t *idx)
{
	struct journal_keys *keys = &c->journal_keys;
	unsigned iters = 0;
	struct journal_key *k;
search:
	if (!*idx)
		*idx = __bch2_journal_key_search(keys, btree_id, level, pos);

	while ((k = *idx < keys->nr ? idx_to_key(keys, *idx) : NULL)) {
		if (__journal_key_cmp(btree_id, level, end_pos, k) < 0)
			return NULL;

		if (__journal_key_cmp(btree_id, level, pos, k) <= 0 &&
		    !k->overwritten)
			return k->k;

		(*idx)++;
		iters++;
		if (iters == 10) {
			*idx = 0;
			goto search;
		}
	}

	return NULL;
}

struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *c, enum btree_id btree_id,
					   unsigned level, struct bpos pos)
{
	size_t idx = 0;

	return bch2_journal_keys_peek_upto(c, btree_id, level, pos, pos, &idx);
}

static void journal_iters_fix(struct bch_fs *c)
{
	struct journal_keys *keys = &c->journal_keys;
	/* The key we just inserted is immediately before the gap: */
	size_t gap_end = keys->gap + (keys->size - keys->nr);
	struct btree_and_journal_iter *iter;

	/*
	 * If an iterator points one after the key we just inserted, decrement
	 * the iterator so it points at the key we just inserted - if the
	 * decrement was unnecessary, bch2_btree_and_journal_iter_peek() will
	 * handle that:
	 */
	list_for_each_entry(iter, &c->journal_iters, journal.list)
		if (iter->journal.idx == gap_end)
			iter->journal.idx = keys->gap - 1;
}

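/*
 * Iterators store raw array positions, so when the gap moves they have to be
 * corrected: first undo the offset imposed by the old gap location, then
 * apply the offset for the new one:
 */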
static void journal_iters_move_gap(struct bch_fs *c, size_t old_gap, size_t new_gap)
{
	struct journal_keys *keys = &c->journal_keys;
	struct journal_iter *iter;
	size_t gap_size = keys->size - keys->nr;

	list_for_each_entry(iter, &c->journal_iters, list) {
		if (iter->idx > old_gap)
			iter->idx -= gap_size;
		if (iter->idx >= new_gap)
			iter->idx += gap_size;
	}
}

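/*
 * Insert @k into the journal keys for replay; "take" because on success this
 * takes ownership of @k: it's marked allocated and freed along with the
 * journal keys.
 */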
int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id,
				 unsigned level, struct bkey_i *k)
{
	struct journal_key n = {
		.btree_id	= id,
		.level		= level,
		.k		= k,
		.allocated	= true,
		/*
		 * Ensure these keys are done last by journal replay, to unblock
		 * journal reclaim:
		 */
		.journal_seq	= U32_MAX,
	};
	struct journal_keys *keys = &c->journal_keys;
	size_t idx = bch2_journal_key_search(keys, id, level, k->k.p);

	BUG_ON(test_bit(BCH_FS_RW, &c->flags));

	if (idx < keys->size &&
	    journal_key_cmp(&n, &keys->d[idx]) == 0) {
		if (keys->d[idx].allocated)
			kfree(keys->d[idx].k);
		keys->d[idx] = n;
		return 0;
	}

	if (idx > keys->gap)
		idx -= keys->size - keys->nr;

	if (keys->nr == keys->size) {
		struct journal_keys new_keys = {
			.nr			= keys->nr,
			.size			= max_t(size_t, keys->size, 8) * 2,
		};

		new_keys.d = kvmalloc_array(new_keys.size, sizeof(new_keys.d[0]), GFP_KERNEL);
		if (!new_keys.d) {
			bch_err(c, "%s: error allocating new key array (size %zu)",
				__func__, new_keys.size);
			return -BCH_ERR_ENOMEM_journal_key_insert;
		}

		/* Since @keys was full, there was no gap: */
		memcpy(new_keys.d, keys->d, sizeof(keys->d[0]) * keys->nr);
		kvfree(keys->d);
		*keys = new_keys;

		/* And now the gap is at the end: */
		keys->gap = keys->nr;
	}

	journal_iters_move_gap(c, keys->gap, idx);

	move_gap(keys->d, keys->nr, keys->size, keys->gap, idx);
	keys->gap = idx;

	keys->nr++;
	keys->d[keys->gap++] = n;

	journal_iters_fix(c);

	return 0;
}

/*
 * Can only be used from the recovery thread while we're still RO - can't be
 * used once we've got RW, as journal_keys is at that point used by multiple
 * threads:
 */
int bch2_journal_key_insert(struct bch_fs *c, enum btree_id id,
			    unsigned level, struct bkey_i *k)
{
	struct bkey_i *n;
	int ret;

	n = kmalloc(bkey_bytes(&k->k), GFP_KERNEL);
	if (!n)
		return -BCH_ERR_ENOMEM_journal_key_insert;

	bkey_copy(n, k);
	ret = bch2_journal_key_insert_take(c, id, level, n);
	if (ret)
		kfree(n);
	return ret;
}

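/*
 * Deletion is expressed as inserting a whiteout: an empty (deleted) key at
 * @pos, which overrides whatever is in the btree at that position:
 */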
int bch2_journal_key_delete(struct bch_fs *c, enum btree_id id,
			    unsigned level, struct bpos pos)
{
	struct bkey_i whiteout;

	bkey_init(&whiteout.k);
	whiteout.k.p = pos;

	return bch2_journal_key_insert(c, id, level, &whiteout);
}

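/*
 * Mark the journal copy of a key as overwritten, so the journal iterators
 * below skip it:
 */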
void bch2_journal_key_overwritten(struct bch_fs *c, enum btree_id btree,
				  unsigned level, struct bpos pos)
{
	struct journal_keys *keys = &c->journal_keys;
	size_t idx = bch2_journal_key_search(keys, btree, level, pos);

	if (idx < keys->size &&
	    keys->d[idx].btree_id	== btree &&
	    keys->d[idx].level		== level &&
	    bpos_eq(keys->d[idx].k->k.p, pos))
		keys->d[idx].overwritten = true;
}

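/*
 * Advance to the next array position, hopping over the gap when we reach it:
 */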
static void bch2_journal_iter_advance(struct journal_iter *iter)
{
	if (iter->idx < iter->keys->size) {
		iter->idx++;
		if (iter->idx == iter->keys->gap)
			iter->idx += iter->keys->size - iter->keys->nr;
	}
}

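/*
 * Peek at the key at the current position, skipping keys marked overwritten;
 * returns a null key once we run past the keys for this btree/level:
 */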
static struct bkey_s_c bch2_journal_iter_peek(struct journal_iter *iter)
{
	struct journal_key *k = iter->keys->d + iter->idx;

	while (k < iter->keys->d + iter->keys->size &&
	       k->btree_id	== iter->btree_id &&
	       k->level		== iter->level) {
		if (!k->overwritten)
			return bkey_i_to_s_c(k->k);

		bch2_journal_iter_advance(iter);
		k = iter->keys->d + iter->idx;
	}

	return bkey_s_c_null;
}

static void bch2_journal_iter_exit(struct journal_iter *iter)
{
	list_del(&iter->list);
}

static void bch2_journal_iter_init(struct bch_fs *c,
				   struct journal_iter *iter,
				   enum btree_id id, unsigned level,
				   struct bpos pos)
{
	iter->btree_id	= id;
	iter->level	= level;
	iter->keys	= &c->journal_keys;
	iter->idx	= bch2_journal_key_search(&c->journal_keys, id, level, pos);
}

static struct bkey_s_c bch2_journal_iter_peek_btree(struct btree_and_journal_iter *iter)
{
	return bch2_btree_node_iter_peek_unpack(&iter->node_iter,
						iter->b, &iter->unpacked);
}

static void bch2_journal_iter_advance_btree(struct btree_and_journal_iter *iter)
{
	bch2_btree_node_iter_advance(&iter->node_iter, iter->b);
}

void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *iter)
{
	if (bpos_eq(iter->pos, SPOS_MAX))
		iter->at_end = true;
	else
		iter->pos = bpos_successor(iter->pos);
}

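/*
 * Merge the btree node iterator with the journal iterator: advance both past
 * iter->pos, then return whichever key comes first, with the journal winning
 * ties so journal keys override what's in the btree:
 */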
struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *iter)
{
	struct bkey_s_c btree_k, journal_k, ret;
again:
	if (iter->at_end)
		return bkey_s_c_null;

	while ((btree_k = bch2_journal_iter_peek_btree(iter)).k &&
	       bpos_lt(btree_k.k->p, iter->pos))
		bch2_journal_iter_advance_btree(iter);

	while ((journal_k = bch2_journal_iter_peek(&iter->journal)).k &&
	       bpos_lt(journal_k.k->p, iter->pos))
		bch2_journal_iter_advance(&iter->journal);

	ret = journal_k.k &&
		(!btree_k.k || bpos_le(journal_k.k->p, btree_k.k->p))
		? journal_k
		: btree_k;

	if (ret.k && iter->b && bpos_gt(ret.k->p, iter->b->data->max_key))
		ret = bkey_s_c_null;

	if (ret.k) {
		iter->pos = ret.k->p;
		if (bkey_deleted(ret.k)) {
			bch2_btree_and_journal_iter_advance(iter);
			goto again;
		}
	} else {
		iter->pos = SPOS_MAX;
		iter->at_end = true;
	}

	return ret;
}

void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *iter)
{
	bch2_journal_iter_exit(&iter->journal);
}

void __bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *iter,
						  struct bch_fs *c,
						  struct btree *b,
						  struct btree_node_iter node_iter,
						  struct bpos pos)
{
	memset(iter, 0, sizeof(*iter));

	iter->b = b;
	iter->node_iter = node_iter;
	bch2_journal_iter_init(c, &iter->journal, b->c.btree_id, b->c.level, pos);
	INIT_LIST_HEAD(&iter->journal.list);
	iter->pos = b->data->min_key;
	iter->at_end = false;
}

/*
 * this version is used by btree_gc before filesystem has gone RW and
 * multithreaded, so uses the journal_iters list:
 */
void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *iter,
						struct bch_fs *c,
						struct btree *b)
{
	struct btree_node_iter node_iter;

	bch2_btree_node_iter_init_from_start(&node_iter, b);
	__bch2_btree_and_journal_iter_init_node_iter(iter, c, b, node_iter, b->data->min_key);
	list_add(&iter->journal.list, &c->journal_iters);
}

/* sort and dedup all keys in the journal: */

void bch2_journal_entries_free(struct bch_fs *c)
{
	struct journal_replay **i;
	struct genradix_iter iter;

	genradix_for_each(&c->journal_entries, iter, i)
		if (*i)
			kvpfree(*i, offsetof(struct journal_replay, j) +
				vstruct_bytes(&(*i)->j));
	genradix_free(&c->journal_entries);
}

/*
 * When keys compare equal, oldest compares first:
 */
static int journal_sort_key_cmp(const void *_l, const void *_r)
{
	const struct journal_key *l = _l;
	const struct journal_key *r = _r;

	return  journal_key_cmp(l, r) ?:
		cmp_int(l->journal_seq, r->journal_seq) ?:
		cmp_int(l->journal_offset, r->journal_offset);
}

void bch2_journal_keys_free(struct journal_keys *keys)
{
	struct journal_key *i;

	move_gap(keys->d, keys->nr, keys->size, keys->gap, keys->nr);
	keys->gap = keys->nr;

	for (i = keys->d; i < keys->d + keys->nr; i++)
		if (i->allocated)
			kfree(i->k);

	kvfree(keys->d);
	keys->d = NULL;
	keys->nr = keys->gap = keys->size = 0;
}

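/*
 * Sort, then compact: runs of keys with the same btree_id/level/pos collapse
 * down to the last entry in the run - the newest version, since oldest
 * compares first:
 */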
static void __journal_keys_sort(struct journal_keys *keys)
{
	struct journal_key *src, *dst;

	sort(keys->d, keys->nr, sizeof(keys->d[0]), journal_sort_key_cmp, NULL);

	src = dst = keys->d;
	while (src < keys->d + keys->nr) {
		while (src + 1 < keys->d + keys->nr &&
		       src[0].btree_id	== src[1].btree_id &&
		       src[0].level	== src[1].level &&
		       bpos_eq(src[0].k->k.p, src[1].k->k.p))
			src++;

		*dst++ = *src++;
	}

	keys->nr = dst - keys->d;
}

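/*
 * Two passes over the journal entries: the first counts keys so the buffer
 * can be sized up front (falling back to a smaller buffer plus incremental
 * compaction if that allocation fails), the second fills it; then one final
 * sort and dedup, leaving the gap at the end:
 */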
int bch2_journal_keys_sort(struct bch_fs *c)
{
	struct genradix_iter iter;
	struct journal_replay *i, **_i;
	struct jset_entry *entry;
	struct bkey_i *k;
	struct journal_keys *keys = &c->journal_keys;
	size_t nr_keys = 0, nr_read = 0;

	genradix_for_each(&c->journal_entries, iter, _i) {
		i = *_i;

		if (!i || i->ignore)
			continue;

		for_each_jset_key(k, entry, &i->j)
			nr_keys++;
	}

	if (!nr_keys)
		return 0;

	keys->size = roundup_pow_of_two(nr_keys);

	keys->d = kvmalloc_array(keys->size, sizeof(keys->d[0]), GFP_KERNEL);
	if (!keys->d) {
		bch_err(c, "Failed to allocate buffer for sorted journal keys (%zu keys); trying slowpath",
			nr_keys);

		do {
			keys->size >>= 1;
			keys->d = kvmalloc_array(keys->size, sizeof(keys->d[0]), GFP_KERNEL);
		} while (!keys->d && keys->size > nr_keys / 8);

		if (!keys->d) {
			bch_err(c, "Failed to allocate %zu size buffer for sorted journal keys; exiting",
				keys->size);
			return -BCH_ERR_ENOMEM_journal_keys_sort;
		}
	}

	genradix_for_each(&c->journal_entries, iter, _i) {
		i = *_i;

		if (!i || i->ignore)
			continue;

		cond_resched();

		for_each_jset_key(k, entry, &i->j) {
			if (keys->nr == keys->size) {
				__journal_keys_sort(keys);

				if (keys->nr > keys->size * 7 / 8) {
					bch_err(c, "Too many journal keys for slowpath; have %zu compacted, buf size %zu, processed %zu/%zu",
						keys->nr, keys->size, nr_read, nr_keys);
					return -BCH_ERR_ENOMEM_journal_keys_sort;
				}
			}

			keys->d[keys->nr++] = (struct journal_key) {
				.btree_id	= entry->btree_id,
				.level		= entry->level,
				.k		= k,
				.journal_seq	= le64_to_cpu(i->j.seq),
				.journal_offset	= k->_data - i->j._data,
			};

			nr_read++;
		}
	}

	__journal_keys_sort(keys);
	keys->gap = keys->nr;

	bch_verbose(c, "Journal keys: %zu read, %zu after sorting and compacting", nr_keys, keys->nr);
	return 0;
}
);