// SPDX-License-Identifier: GPL-2.0

#include "bcachefs.h"
#include "bset.h"
#include "btree_journal_iter.h"
#include "journal_io.h"

#include <linux/sort.h>

/*
 * For managing keys we read from the journal: until journal replay is
 * finished, normal btree lookups need to be able to find and return keys from
 * the journal where they overwrite what's in the btree, so we have a special
 * iterator and operations for the regular btree iter code to use:
 */

static int __journal_key_cmp(enum btree_id l_btree_id,
			     unsigned l_level,
			     struct bpos l_pos,
			     const struct journal_key *r)
{
	return (cmp_int(l_btree_id, r->btree_id) ?:
		cmp_int(l_level, r->level) ?:
		bpos_cmp(l_pos, r->k->k.p));
}

static int journal_key_cmp(const struct journal_key *l, const struct journal_key *r)
{
	return __journal_key_cmp(l->btree_id, l->level, l->k->k.p, r);
}
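
/*
 * journal_keys is a gap buffer: a sorted array of keys->size slots with an
 * unused gap of (keys->size - keys->nr) slots starting at keys->gap, so that
 * repeated insertions near the same position are cheap. For example, with
 * nr = 5, size = 8, gap = 2:
 *
 *	d: [k0][k1][ --- gap --- ][k2][k3][k4]
 *
 * Translating a logical index to an array position just skips the gap:
 */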
static inline size_t idx_to_pos(struct journal_keys *keys, size_t idx)
{
	size_t gap_size = keys->size - keys->nr;

	if (idx >= keys->gap)
		idx += gap_size;
	return idx;
}

static inline struct journal_key *idx_to_key(struct journal_keys *keys, size_t idx)
{
	return keys->d + idx_to_pos(keys, idx);
}
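
/*
 * Standard lower_bound binary search over logical (gap-free) indices: returns
 * the first index whose key compares >= the search position, or keys->nr if
 * there is none.
 */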
static size_t __bch2_journal_key_search(struct journal_keys *keys,
					enum btree_id id, unsigned level,
					struct bpos pos)
{
	size_t l = 0, r = keys->nr, m;

	while (l < r) {
		m = l + ((r - l) >> 1);
		if (__journal_key_cmp(id, level, pos, idx_to_key(keys, m)) > 0)
			l = m + 1;
		else
			r = m;
	}

	BUG_ON(l < keys->nr &&
	       __journal_key_cmp(id, level, pos, idx_to_key(keys, l)) > 0);

	BUG_ON(l &&
	       __journal_key_cmp(id, level, pos, idx_to_key(keys, l - 1)) <= 0);

	return l;
}

static size_t bch2_journal_key_search(struct journal_keys *keys,
				      enum btree_id id, unsigned level,
				      struct bpos pos)
{
	return idx_to_pos(keys, __bch2_journal_key_search(keys, id, level, pos));
}
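
/*
 * Return the first non-overwritten journal key in [pos, end_pos] for the
 * given btree/level. *idx caches the array index across calls so a caller
 * iterating forwards can resume where it left off; if we churn through
 * several entries without returning one (the iters == 10 heuristic below),
 * the cached index is presumably stale and we redo the binary search.
 */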
struct bkey_i *bch2_journal_keys_peek_upto(struct bch_fs *c, enum btree_id btree_id,
					   unsigned level, struct bpos pos,
					   struct bpos end_pos, size_t *idx)
{
	struct journal_keys *keys = &c->journal_keys;
	unsigned iters = 0;
	struct journal_key *k;

	BUG_ON(*idx > keys->nr);
search:
	if (!*idx)
		*idx = __bch2_journal_key_search(keys, btree_id, level, pos);

	while ((k = *idx < keys->nr ? idx_to_key(keys, *idx) : NULL)) {
		if (__journal_key_cmp(btree_id, level, end_pos, k) < 0)
			return NULL;

		if (__journal_key_cmp(btree_id, level, pos, k) <= 0 &&
		    !k->overwritten)
			return k->k;

		(*idx)++;
		iters++;
		if (iters == 10) {
			*idx = 0;
			goto search;
		}
	}

	return NULL;
}

struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *c, enum btree_id btree_id,
					   unsigned level, struct bpos pos)
{
	size_t idx = 0;

	return bch2_journal_keys_peek_upto(c, btree_id, level, pos, pos, &idx);
}
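
/*
 * Example usage (illustrative only, not from the original source): during
 * recovery, a lookup can check the journal for a newer version of a key
 * before trusting what's in the btree:
 *
 *	struct bkey_i *k = bch2_journal_keys_peek_slot(c, BTREE_ID_extents,
 *						       0, pos);
 *	if (k)
 *		... use the journal version instead of the btree's ...
 */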

static void journal_iters_fix(struct bch_fs *c)
{
	struct journal_keys *keys = &c->journal_keys;
	/* The key we just inserted is immediately before the gap: */
	size_t gap_end = keys->gap + (keys->size - keys->nr);
	struct btree_and_journal_iter *iter;

	/*
	 * If an iterator points one after the key we just inserted, decrement
	 * the iterator so it points at the key we just inserted - if the
	 * decrement was unnecessary, bch2_btree_and_journal_iter_peek() will
	 * handle that:
	 */
	list_for_each_entry(iter, &c->journal_iters, journal.list)
		if (iter->journal.idx == gap_end)
			iter->journal.idx = keys->gap - 1;
}
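
/*
 * Moving the gap shifts the array positions on one side of it by gap_size;
 * patch up live iterators (which store array positions, not logical indices)
 * so they keep pointing at the same key:
 */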
static void journal_iters_move_gap(struct bch_fs *c, size_t old_gap, size_t new_gap)
{
	struct journal_keys *keys = &c->journal_keys;
	struct journal_iter *iter;
	size_t gap_size = keys->size - keys->nr;

	list_for_each_entry(iter, &c->journal_iters, list) {
		if (iter->idx > old_gap)
			iter->idx -= gap_size;
		if (iter->idx >= new_gap)
			iter->idx += gap_size;
	}
}
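
/*
 * Insert a key into journal_keys, taking ownership of @k: if a key already
 * exists at the same btree/level/pos it is replaced in place; otherwise the
 * gap is moved to the insertion point (growing the array first if it's full)
 * and the new key dropped into the gap.
 */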
int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id,
				 unsigned level, struct bkey_i *k)
{
	struct journal_key n = {
		.btree_id	= id,
		.level		= level,
		.k		= k,
		.allocated	= true,
		/*
		 * Ensure these keys are done last by journal replay, to unblock
		 * journal reclaim:
		 */
		.journal_seq	= U32_MAX,
	};
	struct journal_keys *keys = &c->journal_keys;
	size_t idx = bch2_journal_key_search(keys, id, level, k->k.p);

	BUG_ON(test_bit(BCH_FS_RW, &c->flags));

	if (idx < keys->size &&
	    journal_key_cmp(&n, &keys->d[idx]) == 0) {
		if (keys->d[idx].allocated)
			kfree(keys->d[idx].k);
		keys->d[idx] = n;
		return 0;
	}

	if (idx > keys->gap)
		idx -= keys->size - keys->nr;

	if (keys->nr == keys->size) {
		struct journal_keys new_keys = {
			.nr	= keys->nr,
			.size	= max_t(size_t, keys->size, 8) * 2,
		};

		new_keys.d = kvmalloc_array(new_keys.size, sizeof(new_keys.d[0]), GFP_KERNEL);
		if (!new_keys.d) {
			bch_err(c, "%s: error allocating new key array (size %zu)",
				__func__, new_keys.size);
			return -BCH_ERR_ENOMEM_journal_key_insert;
		}

		/* Since @keys was full, there was no gap: */
		memcpy(new_keys.d, keys->d, sizeof(keys->d[0]) * keys->nr);
		kvfree(keys->d);
		keys->d		= new_keys.d;
		keys->nr	= new_keys.nr;
		keys->size	= new_keys.size;

		/* And now the gap is at the end: */
		keys->gap	= keys->nr;
	}

	journal_iters_move_gap(c, keys->gap, idx);

	move_gap(keys->d, keys->nr, keys->size, keys->gap, idx);
	keys->gap = idx;

	keys->nr++;
	keys->d[keys->gap++] = n;

	journal_iters_fix(c);

	return 0;
}

/*
 * Can only be used from the recovery thread while we're still RO - can't be
 * used once we've gone RW, as journal_keys is at that point used by multiple
 * threads:
 */
int bch2_journal_key_insert(struct bch_fs *c, enum btree_id id,
			    unsigned level, struct bkey_i *k)
{
	struct bkey_i *n;
	int ret;

	n = kmalloc(bkey_bytes(&k->k), GFP_KERNEL);
	if (!n)
		return -BCH_ERR_ENOMEM_journal_key_insert;

	bkey_copy(n, k);
	ret = bch2_journal_key_insert_take(c, id, level, n);
	if (ret)
		kfree(n);
	return ret;
}
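
/*
 * Deletion is just insertion of a whiteout - a zero-size deleted key that
 * overrides whatever is at @pos in the btree:
 */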
int bch2_journal_key_delete(struct bch_fs *c, enum btree_id id,
			    unsigned level, struct bpos pos)
{
	struct bkey_i whiteout;

	bkey_init(&whiteout.k);
	whiteout.k.p = pos;

	return bch2_journal_key_insert(c, id, level, &whiteout);
}

void bch2_journal_key_overwritten(struct bch_fs *c, enum btree_id btree,
				  unsigned level, struct bpos pos)
{
	struct journal_keys *keys = &c->journal_keys;
	size_t idx = bch2_journal_key_search(keys, btree, level, pos);

	if (idx < keys->size &&
	    keys->d[idx].btree_id == btree &&
	    keys->d[idx].level == level &&
	    bpos_eq(keys->d[idx].k->k.p, pos))
		keys->d[idx].overwritten = true;
}
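
/* Advance one array position, hopping over the gap if we land on it: */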
static void bch2_journal_iter_advance(struct journal_iter *iter)
{
	if (iter->idx < iter->keys->size) {
		iter->idx++;
		if (iter->idx == iter->keys->gap)
			iter->idx += iter->keys->size - iter->keys->nr;
	}
}
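
/*
 * Peek at the key at the iterator's current position, skipping keys flagged
 * as overwritten and stopping at the end of this btree/level:
 */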
static struct bkey_s_c bch2_journal_iter_peek(struct journal_iter *iter)
{
	struct journal_key *k = iter->keys->d + iter->idx;

	while (k < iter->keys->d + iter->keys->size &&
	       k->btree_id == iter->btree_id &&
	       k->level == iter->level) {
		if (!k->overwritten)
			return bkey_i_to_s_c(k->k);

		bch2_journal_iter_advance(iter);
		k = iter->keys->d + iter->idx;
	}

	return bkey_s_c_null;
}

static void bch2_journal_iter_exit(struct journal_iter *iter)
{
	list_del(&iter->list);
}

static void bch2_journal_iter_init(struct bch_fs *c,
				   struct journal_iter *iter,
				   enum btree_id id, unsigned level,
				   struct bpos pos)
{
	iter->btree_id = id;
	iter->level = level;
	iter->keys = &c->journal_keys;
	iter->idx = bch2_journal_key_search(&c->journal_keys, id, level, pos);
}

static struct bkey_s_c bch2_journal_iter_peek_btree(struct btree_and_journal_iter *iter)
{
	return bch2_btree_node_iter_peek_unpack(&iter->node_iter,
						iter->b, &iter->unpacked);
}

static void bch2_journal_iter_advance_btree(struct btree_and_journal_iter *iter)
{
	bch2_btree_node_iter_advance(&iter->node_iter, iter->b);
}

void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *iter)
{
	if (bpos_eq(iter->pos, SPOS_MAX))
		iter->at_end = true;
	else
		iter->pos = bpos_successor(iter->pos);
}
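
/*
 * Merge the btree and journal iterators: return whichever of the two next
 * keys sorts first, with the journal key winning ties, since journal
 * contents override the btree until replay has finished:
 */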
struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *iter)
{
	struct bkey_s_c btree_k, journal_k, ret;
again:
	if (iter->at_end)
		return bkey_s_c_null;

	while ((btree_k = bch2_journal_iter_peek_btree(iter)).k &&
	       bpos_lt(btree_k.k->p, iter->pos))
		bch2_journal_iter_advance_btree(iter);

	while ((journal_k = bch2_journal_iter_peek(&iter->journal)).k &&
	       bpos_lt(journal_k.k->p, iter->pos))
		bch2_journal_iter_advance(&iter->journal);

	ret = journal_k.k &&
		(!btree_k.k || bpos_le(journal_k.k->p, btree_k.k->p))
		? journal_k
		: btree_k;

	if (ret.k && iter->b && bpos_gt(ret.k->p, iter->b->data->max_key))
		ret = bkey_s_c_null;

	if (ret.k) {
		iter->pos = ret.k->p;
		if (bkey_deleted(ret.k)) {
			bch2_btree_and_journal_iter_advance(iter);
			goto again;
		}
	} else {
		iter->pos = SPOS_MAX;
		iter->at_end = true;
	}

	return ret;
}

void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *iter)
{
	bch2_journal_iter_exit(&iter->journal);
}

void __bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *iter,
						  struct bch_fs *c,
						  struct btree *b,
						  struct btree_node_iter node_iter,
						  struct bpos pos)
{
	memset(iter, 0, sizeof(*iter));

	iter->b = b;
	iter->node_iter = node_iter;
	bch2_journal_iter_init(c, &iter->journal, b->c.btree_id, b->c.level, pos);
	INIT_LIST_HEAD(&iter->journal.list);
	iter->pos = b->data->min_key;
	iter->at_end = false;
}

/*
 * This version is used by btree_gc before the filesystem has gone RW and
 * multithreaded, so it uses the journal_iters list:
 */
void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *iter,
						struct bch_fs *c,
						struct btree *b)
{
	struct btree_node_iter node_iter;

	bch2_btree_node_iter_init_from_start(&node_iter, b);
	__bch2_btree_and_journal_iter_init_node_iter(iter, c, b, node_iter, b->data->min_key);
	list_add(&iter->journal.list, &c->journal_iters);
}

/* sort and dedup all keys in the journal: */

void bch2_journal_entries_free(struct bch_fs *c)
{
	struct journal_replay **i;
	struct genradix_iter iter;

	genradix_for_each(&c->journal_entries, iter, i)
		if (*i)
			kvpfree(*i, offsetof(struct journal_replay, j) +
				vstruct_bytes(&(*i)->j));
	genradix_free(&c->journal_entries);
}

/*
 * When keys compare equal, oldest compares first:
 */
static int journal_sort_key_cmp(const void *_l, const void *_r)
{
	const struct journal_key *l = _l;
	const struct journal_key *r = _r;

	return journal_key_cmp(l, r) ?:
		cmp_int(l->journal_seq, r->journal_seq) ?:
		cmp_int(l->journal_offset, r->journal_offset);
}
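
/*
 * Drop a reference on journal_keys; the last ref frees everything, including
 * keys we allocated ourselves via bch2_journal_key_insert():
 */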
void bch2_journal_keys_put(struct bch_fs *c)
{
	struct journal_keys *keys = &c->journal_keys;
	struct journal_key *i;

	BUG_ON(atomic_read(&keys->ref) <= 0);

	if (!atomic_dec_and_test(&keys->ref))
		return;

	move_gap(keys->d, keys->nr, keys->size, keys->gap, keys->nr);
	keys->gap = keys->nr;

	for (i = keys->d; i < keys->d + keys->nr; i++)
		if (i->allocated)
			kfree(i->k);

	kvfree(keys->d);
	keys->d = NULL;
	keys->nr = keys->gap = keys->size = 0;

	bch2_journal_entries_free(c);
}
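
/*
 * The sort keeps duplicates adjacent, oldest first (see the comment on
 * journal_sort_key_cmp()); the dedup pass below then keeps only the newest
 * version of each key:
 */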
static void __journal_keys_sort(struct journal_keys *keys)
{
	struct journal_key *src, *dst;

	sort(keys->d, keys->nr, sizeof(keys->d[0]), journal_sort_key_cmp, NULL);

	src = dst = keys->d;
	while (src < keys->d + keys->nr) {
		while (src + 1 < keys->d + keys->nr &&
		       src[0].btree_id == src[1].btree_id &&
		       src[0].level == src[1].level &&
		       bpos_eq(src[0].k->k.p, src[1].k->k.p))
			src++;

		*dst++ = *src++;
	}

	keys->nr = dst - keys->d;
}
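
/*
 * Read every key from every journal entry into one flat array, then sort and
 * dedup. If we can't allocate room for all the keys up front, fall back to a
 * smaller buffer and compact (sort + dedup) whenever it fills up:
 */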
int bch2_journal_keys_sort(struct bch_fs *c)
{
	struct genradix_iter iter;
	struct journal_replay *i, **_i;
	struct jset_entry *entry;
	struct bkey_i *k;
	struct journal_keys *keys = &c->journal_keys;
	size_t nr_keys = 0, nr_read = 0;

	genradix_for_each(&c->journal_entries, iter, _i) {
		i = *_i;

		if (!i || i->ignore)
			continue;

		for_each_jset_key(k, entry, &i->j)
			nr_keys++;
	}

	if (!nr_keys)
		return 0;

	keys->size = roundup_pow_of_two(nr_keys);

	keys->d = kvmalloc_array(keys->size, sizeof(keys->d[0]), GFP_KERNEL);
	if (!keys->d) {
		bch_err(c, "Failed to allocate buffer for sorted journal keys (%zu keys); trying slowpath",
			nr_keys);

		do {
			keys->size >>= 1;
			keys->d = kvmalloc_array(keys->size, sizeof(keys->d[0]), GFP_KERNEL);
		} while (!keys->d && keys->size > nr_keys / 8);

		if (!keys->d) {
			bch_err(c, "Failed to allocate %zu size buffer for sorted journal keys; exiting",
				keys->size);
			return -BCH_ERR_ENOMEM_journal_keys_sort;
		}
	}

	genradix_for_each(&c->journal_entries, iter, _i) {
		i = *_i;

		if (!i || i->ignore)
			continue;

		cond_resched();

		for_each_jset_key(k, entry, &i->j) {
			if (keys->nr == keys->size) {
				__journal_keys_sort(keys);

				if (keys->nr > keys->size * 7 / 8) {
					bch_err(c, "Too many journal keys for slowpath; have %zu compacted, buf size %zu, processed %zu/%zu",
						keys->nr, keys->size, nr_read, nr_keys);
					return -BCH_ERR_ENOMEM_journal_keys_sort;
				}
			}

			keys->d[keys->nr++] = (struct journal_key) {
				.btree_id	= entry->btree_id,
				.level		= entry->level,
				.k		= k,
				.journal_seq	= le64_to_cpu(i->j.seq),
				.journal_offset	= k->_data - i->j._data,
			};

			nr_read++;
		}
	}

	__journal_keys_sort(keys);
	keys->gap = keys->nr;

	bch_verbose(c, "Journal keys: %zu read, %zu after sorting and compacting", nr_keys, keys->nr);
	return 0;
}