fs/bcachefs/btree_update_interior.c
1 // SPDX-License-Identifier: GPL-2.0
2
3 #include "bcachefs.h"
4 #include "alloc_foreground.h"
5 #include "bkey_buf.h"
6 #include "bkey_methods.h"
7 #include "btree_cache.h"
8 #include "btree_gc.h"
9 #include "btree_journal_iter.h"
10 #include "btree_update.h"
11 #include "btree_update_interior.h"
12 #include "btree_io.h"
13 #include "btree_iter.h"
14 #include "btree_locking.h"
15 #include "buckets.h"
16 #include "clock.h"
17 #include "error.h"
18 #include "extents.h"
19 #include "journal.h"
20 #include "journal_reclaim.h"
21 #include "keylist.h"
22 #include "recovery_passes.h"
23 #include "replicas.h"
24 #include "super-io.h"
25 #include "trace.h"
26
27 #include <linux/random.h>
28
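/*
 * String names for the btree update modes, generated by the
 * BTREE_UPDATE_MODES() x-macro (presumably defined next to the mode enum in
 * btree_update_interior.h); the modes referenced in this file are
 * BTREE_UPDATE_none, _node, _root and _update.
 */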
29 static const char * const bch2_btree_update_modes[] = {
30 #define x(t) #t,
31 BTREE_UPDATE_MODES()
32 #undef x
33 NULL
34 };
35
36 static int bch2_btree_insert_node(struct btree_update *, struct btree_trans *,
37 btree_path_idx_t, struct btree *, struct keylist *);
38 static void bch2_btree_update_add_new_node(struct btree_update *, struct btree *);
39
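/*
 * Get a new mutable btree_path pointing at @pos/@level with its locks
 * dropped: callers lock the node themselves later (see
 * btree_update_nodes_written() and btree_split()) and then point the path at
 * it.
 */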
40 static btree_path_idx_t get_unlocked_mut_path(struct btree_trans *trans,
41 enum btree_id btree_id,
42 unsigned level,
43 struct bpos pos)
44 {
45 btree_path_idx_t path_idx = bch2_path_get(trans, btree_id, pos, level + 1, level,
46 BTREE_ITER_NOPRESERVE|
47 BTREE_ITER_INTENT, _RET_IP_);
48 path_idx = bch2_btree_path_make_mut(trans, path_idx, true, _RET_IP_);
49
50 struct btree_path *path = trans->paths + path_idx;
51 bch2_btree_path_downgrade(trans, path);
52 __bch2_btree_path_unlock(trans, path);
53 return path_idx;
54 }
55
56 /*
57 * Verify that child nodes correctly span parent node's range:
58 */
59 int bch2_btree_node_check_topology(struct btree_trans *trans, struct btree *b)
60 {
61 struct bch_fs *c = trans->c;
62 struct bpos node_min = b->key.k.type == KEY_TYPE_btree_ptr_v2
63 ? bkey_i_to_btree_ptr_v2(&b->key)->v.min_key
64 : b->data->min_key;
65 struct btree_and_journal_iter iter;
66 struct bkey_s_c k;
67 struct printbuf buf = PRINTBUF;
68 struct bkey_buf prev;
69 int ret = 0;
70
71 BUG_ON(b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
72 !bpos_eq(bkey_i_to_btree_ptr_v2(&b->key)->v.min_key,
73 b->data->min_key));
74
75 if (!b->c.level)
76 return 0;
77
78 bch2_bkey_buf_init(&prev);
79 bkey_init(&prev.k->k);
80 bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b);
81
82 while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
83 if (k.k->type != KEY_TYPE_btree_ptr_v2)
84 goto out;
85
86 struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k);
87
88 struct bpos expected_min = bkey_deleted(&prev.k->k)
89 ? node_min
90 : bpos_successor(prev.k->k.p);
91
92 if (!bpos_eq(expected_min, bp.v->min_key)) {
93 bch2_topology_error(c);
94
95 printbuf_reset(&buf);
96 prt_str(&buf, "end of prev node doesn't match start of next node\n");
97 prt_printf(&buf, " in btree %s level %u node ",
98 bch2_btree_id_str(b->c.btree_id), b->c.level);
99 bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
100 prt_str(&buf, "\n prev ");
101 bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(prev.k));
102 prt_str(&buf, "\n next ");
103 bch2_bkey_val_to_text(&buf, c, k);
104
105 need_fsck_err(c, btree_node_topology_bad_min_key, "%s", buf.buf);
106 goto topology_repair;
107 }
108
109 bch2_bkey_buf_reassemble(&prev, c, k);
110 bch2_btree_and_journal_iter_advance(&iter);
111 }
112
113 if (bkey_deleted(&prev.k->k)) {
114 bch2_topology_error(c);
115
116 printbuf_reset(&buf);
117 prt_str(&buf, "empty interior node\n");
118 prt_printf(&buf, " in btree %s level %u node ",
119 bch2_btree_id_str(b->c.btree_id), b->c.level);
120 bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
121
122 need_fsck_err(c, btree_node_topology_empty_interior_node, "%s", buf.buf);
123 goto topology_repair;
124 } else if (!bpos_eq(prev.k->k.p, b->key.k.p)) {
125 bch2_topology_error(c);
126
127 printbuf_reset(&buf);
128 prt_str(&buf, "last child node doesn't end at end of parent node\n");
129 prt_printf(&buf, " in btree %s level %u node ",
130 bch2_btree_id_str(b->c.btree_id), b->c.level);
131 bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
132 prt_str(&buf, "\n last key ");
133 bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(prev.k));
134
135 need_fsck_err(c, btree_node_topology_bad_max_key, "%s", buf.buf);
136 goto topology_repair;
137 }
138 out:
139 fsck_err:
140 bch2_btree_and_journal_iter_exit(&iter);
141 bch2_bkey_buf_exit(&prev, c);
142 printbuf_exit(&buf);
143 return ret;
144 topology_repair:
145 if ((c->recovery_passes_explicit & BIT_ULL(BCH_RECOVERY_PASS_check_topology)) &&
146 c->curr_recovery_pass > BCH_RECOVERY_PASS_check_topology) {
147 bch2_inconsistent_error(c);
148 ret = -BCH_ERR_btree_need_topology_repair;
149 } else {
150 ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology);
151 }
152 goto out;
153 }
154
155 /* Calculate ideal packed bkey format for new btree nodes: */
156
157 static void __bch2_btree_calc_format(struct bkey_format_state *s, struct btree *b)
158 {
159 struct bkey_packed *k;
160 struct bset_tree *t;
161 struct bkey uk;
162
163 for_each_bset(b, t)
164 bset_tree_for_each_key(b, t, k)
165 if (!bkey_deleted(k)) {
166 uk = bkey_unpack_key(b, k);
167 bch2_bkey_format_add_key(s, &uk);
168 }
169 }
170
171 static struct bkey_format bch2_btree_calc_format(struct btree *b)
172 {
173 struct bkey_format_state s;
174
175 bch2_bkey_format_init(&s);
176 bch2_bkey_format_add_pos(&s, b->data->min_key);
177 bch2_bkey_format_add_pos(&s, b->data->max_key);
178 __bch2_btree_calc_format(&s, b);
179
180 return bch2_bkey_format_done(&s);
181 }
182
183 static size_t btree_node_u64s_with_format(struct btree_nr_keys nr,
184 struct bkey_format *old_f,
185 struct bkey_format *new_f)
186 {
187 /* stupid integer promotion rules */
188 ssize_t delta =
189 (((int) new_f->key_u64s - old_f->key_u64s) *
190 (int) nr.packed_keys) +
191 (((int) new_f->key_u64s - BKEY_U64s) *
192 (int) nr.unpacked_keys);
193
194 BUG_ON(delta + nr.live_u64s < 0);
195
196 return nr.live_u64s + delta;
197 }
198
199 /**
200 * bch2_btree_node_format_fits - check if we could rewrite node with a new format
201 *
202 * @c: filesystem handle
203 * @b: btree node to rewrite
204 * @nr: number of keys for new node (i.e. b->nr)
205 * @new_f: bkey format to translate keys to
206 *
207 * Returns: true if all re-packed keys will be able to fit in a new node.
208 *
209 * Assumes all keys will successfully pack with the new format.
210 */
211 static bool bch2_btree_node_format_fits(struct bch_fs *c, struct btree *b,
212 struct btree_nr_keys nr,
213 struct bkey_format *new_f)
214 {
215 size_t u64s = btree_node_u64s_with_format(nr, &b->format, new_f);
216
217 return __vstruct_bytes(struct btree_node, u64s) < btree_buf_bytes(b);
218 }
219
220 /* Btree node freeing/allocation: */
221
222 static void __btree_node_free(struct btree_trans *trans, struct btree *b)
223 {
224 struct bch_fs *c = trans->c;
225
226 trace_and_count(c, btree_node_free, trans, b);
227
228 BUG_ON(btree_node_write_blocked(b));
229 BUG_ON(btree_node_dirty(b));
230 BUG_ON(btree_node_need_write(b));
231 BUG_ON(b == btree_node_root(c, b));
232 BUG_ON(b->ob.nr);
233 BUG_ON(!list_empty(&b->write_blocked));
234 BUG_ON(b->will_make_reachable);
235
236 clear_btree_node_noevict(b);
237
238 mutex_lock(&c->btree_cache.lock);
239 list_move(&b->list, &c->btree_cache.freeable);
240 mutex_unlock(&c->btree_cache.lock);
241 }
242
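/*
 * Free a node that's part of the in-memory btree: take its write lock, remove
 * it from the hash table and move it to the freeable list, then invalidate
 * any paths still pointing at it.
 */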
243 static void bch2_btree_node_free_inmem(struct btree_trans *trans,
244 struct btree_path *path,
245 struct btree *b)
246 {
247 struct bch_fs *c = trans->c;
248 unsigned i, level = b->c.level;
249
250 bch2_btree_node_lock_write_nofail(trans, path, &b->c);
251 bch2_btree_node_hash_remove(&c->btree_cache, b);
252 __btree_node_free(trans, b);
253 six_unlock_write(&b->c.lock);
254 mark_btree_node_locked_noreset(path, level, BTREE_NODE_INTENT_LOCKED);
255
256 trans_for_each_path(trans, path, i)
257 if (path->l[level].b == b) {
258 btree_node_unlock(trans, path, level);
259 path->l[level].b = ERR_PTR(-BCH_ERR_no_btree_node_init);
260 }
261 }
262
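/*
 * Free a node that @as allocated but never made reachable (see the error path
 * in btree_split()): clear its flags, pull it out of the hash table, return
 * it to @as's prealloc list, and drop the ref that will_make_reachable held
 * on @as->cl.
 */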
263 static void bch2_btree_node_free_never_used(struct btree_update *as,
264 struct btree_trans *trans,
265 struct btree *b)
266 {
267 struct bch_fs *c = as->c;
268 struct prealloc_nodes *p = &as->prealloc_nodes[b->c.lock.readers != NULL];
269 struct btree_path *path;
270 unsigned i, level = b->c.level;
271
272 BUG_ON(!list_empty(&b->write_blocked));
273 BUG_ON(b->will_make_reachable != (1UL|(unsigned long) as));
274
275 b->will_make_reachable = 0;
276 closure_put(&as->cl);
277
278 clear_btree_node_will_make_reachable(b);
279 clear_btree_node_accessed(b);
280 clear_btree_node_dirty_acct(c, b);
281 clear_btree_node_need_write(b);
282
283 mutex_lock(&c->btree_cache.lock);
284 list_del_init(&b->list);
285 bch2_btree_node_hash_remove(&c->btree_cache, b);
286 mutex_unlock(&c->btree_cache.lock);
287
288 BUG_ON(p->nr >= ARRAY_SIZE(p->b));
289 p->b[p->nr++] = b;
290
291 six_unlock_intent(&b->c.lock);
292
293 trans_for_each_path(trans, path, i)
294 if (path->l[level].b == b) {
295 btree_node_unlock(trans, path, level);
296 path->l[level].b = ERR_PTR(-BCH_ERR_no_btree_node_init);
297 }
298 }
299
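/*
 * Allocate a new btree node: take the key and open buckets from
 * c->btree_reserve_cache if it has entries to spare, otherwise allocate
 * btree_sectors(c) worth of space from the allocator, retrying if the write
 * point doesn't have a whole node's worth of sectors free. The in-memory
 * node allocation can't fail because the caller holds the cannibalize lock.
 */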
300 static struct btree *__bch2_btree_node_alloc(struct btree_trans *trans,
301 struct disk_reservation *res,
302 struct closure *cl,
303 bool interior_node,
304 unsigned flags)
305 {
306 struct bch_fs *c = trans->c;
307 struct write_point *wp;
308 struct btree *b;
309 BKEY_PADDED_ONSTACK(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp;
310 struct open_buckets obs = { .nr = 0 };
311 struct bch_devs_list devs_have = (struct bch_devs_list) { 0 };
312 enum bch_watermark watermark = flags & BCH_WATERMARK_MASK;
313 unsigned nr_reserve = watermark < BCH_WATERMARK_reclaim
314 ? BTREE_NODE_RESERVE
315 : 0;
316 int ret;
317
318 mutex_lock(&c->btree_reserve_cache_lock);
319 if (c->btree_reserve_cache_nr > nr_reserve) {
320 struct btree_alloc *a =
321 &c->btree_reserve_cache[--c->btree_reserve_cache_nr];
322
323 obs = a->ob;
324 bkey_copy(&tmp.k, &a->k);
325 mutex_unlock(&c->btree_reserve_cache_lock);
326 goto mem_alloc;
327 }
328 mutex_unlock(&c->btree_reserve_cache_lock);
329
330 retry:
331 ret = bch2_alloc_sectors_start_trans(trans,
332 c->opts.metadata_target ?:
333 c->opts.foreground_target,
334 0,
335 writepoint_ptr(&c->btree_write_point),
336 &devs_have,
337 res->nr_replicas,
338 min(res->nr_replicas,
339 c->opts.metadata_replicas_required),
340 watermark, 0, cl, &wp);
341 if (unlikely(ret))
342 return ERR_PTR(ret);
343
344 if (wp->sectors_free < btree_sectors(c)) {
345 struct open_bucket *ob;
346 unsigned i;
347
348 open_bucket_for_each(c, &wp->ptrs, ob, i)
349 if (ob->sectors_free < btree_sectors(c))
350 ob->sectors_free = 0;
351
352 bch2_alloc_sectors_done(c, wp);
353 goto retry;
354 }
355
356 bkey_btree_ptr_v2_init(&tmp.k);
357 bch2_alloc_sectors_append_ptrs(c, wp, &tmp.k, btree_sectors(c), false);
358
359 bch2_open_bucket_get(c, wp, &obs);
360 bch2_alloc_sectors_done(c, wp);
361 mem_alloc:
362 b = bch2_btree_node_mem_alloc(trans, interior_node);
363 six_unlock_write(&b->c.lock);
364 six_unlock_intent(&b->c.lock);
365
366 /* we hold cannibalize_lock: */
367 BUG_ON(IS_ERR(b));
368 BUG_ON(b->ob.nr);
369
370 bkey_copy(&b->key, &tmp.k);
371 b->ob = obs;
372
373 return b;
374 }
375
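/*
 * Take one of @as's preallocated nodes and initialize it as an empty, dirty
 * node at @level for as->btree_id, inserting it into the btree node hash
 * table.
 */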
376 static struct btree *bch2_btree_node_alloc(struct btree_update *as,
377 struct btree_trans *trans,
378 unsigned level)
379 {
380 struct bch_fs *c = as->c;
381 struct btree *b;
382 struct prealloc_nodes *p = &as->prealloc_nodes[!!level];
383 int ret;
384
385 BUG_ON(level >= BTREE_MAX_DEPTH);
386 BUG_ON(!p->nr);
387
388 b = p->b[--p->nr];
389
390 btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent);
391 btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_write);
392
393 set_btree_node_accessed(b);
394 set_btree_node_dirty_acct(c, b);
395 set_btree_node_need_write(b);
396
397 bch2_bset_init_first(b, &b->data->keys);
398 b->c.level = level;
399 b->c.btree_id = as->btree_id;
400 b->version_ondisk = c->sb.version;
401
402 memset(&b->nr, 0, sizeof(b->nr));
403 b->data->magic = cpu_to_le64(bset_magic(c));
404 memset(&b->data->_ptr, 0, sizeof(b->data->_ptr));
405 b->data->flags = 0;
406 SET_BTREE_NODE_ID(b->data, as->btree_id);
407 SET_BTREE_NODE_LEVEL(b->data, level);
408
409 if (b->key.k.type == KEY_TYPE_btree_ptr_v2) {
410 struct bkey_i_btree_ptr_v2 *bp = bkey_i_to_btree_ptr_v2(&b->key);
411
412 bp->v.mem_ptr = 0;
413 bp->v.seq = b->data->keys.seq;
414 bp->v.sectors_written = 0;
415 }
416
417 SET_BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data, true);
418
419 bch2_btree_build_aux_trees(b);
420
421 ret = bch2_btree_node_hash_insert(&c->btree_cache, b, level, as->btree_id);
422 BUG_ON(ret);
423
424 trace_and_count(c, btree_node_alloc, trans, b);
425 bch2_increment_clock(c, btree_sectors(c), WRITE);
426 return b;
427 }
428
429 static void btree_set_min(struct btree *b, struct bpos pos)
430 {
431 if (b->key.k.type == KEY_TYPE_btree_ptr_v2)
432 bkey_i_to_btree_ptr_v2(&b->key)->v.min_key = pos;
433 b->data->min_key = pos;
434 }
435
436 static void btree_set_max(struct btree *b, struct bpos pos)
437 {
438 b->key.k.p = pos;
439 b->data->max_key = pos;
440 }
441
442 static struct btree *bch2_btree_node_alloc_replacement(struct btree_update *as,
443 struct btree_trans *trans,
444 struct btree *b)
445 {
446 struct btree *n = bch2_btree_node_alloc(as, trans, b->c.level);
447 struct bkey_format format = bch2_btree_calc_format(b);
448
449 /*
450 * The keys might expand with the new format - if they wouldn't fit in
451 * the btree node anymore, use the old format for now:
452 */
453 if (!bch2_btree_node_format_fits(as->c, b, b->nr, &format))
454 format = b->format;
455
456 SET_BTREE_NODE_SEQ(n->data, BTREE_NODE_SEQ(b->data) + 1);
457
458 btree_set_min(n, b->data->min_key);
459 btree_set_max(n, b->data->max_key);
460
461 n->data->format = format;
462 btree_node_set_format(n, format);
463
464 bch2_btree_sort_into(as->c, n, b);
465
466 btree_node_reset_sib_u64s(n);
467 return n;
468 }
469
470 static struct btree *__btree_root_alloc(struct btree_update *as,
471 struct btree_trans *trans, unsigned level)
472 {
473 struct btree *b = bch2_btree_node_alloc(as, trans, level);
474
475 btree_set_min(b, POS_MIN);
476 btree_set_max(b, SPOS_MAX);
477 b->data->format = bch2_btree_calc_format(b);
478
479 btree_node_set_format(b, b->data->format);
480 bch2_btree_build_aux_trees(b);
481
482 return b;
483 }
484
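/*
 * Return @as's unused preallocated nodes: stash their keys and open buckets
 * in c->btree_reserve_cache if there's room (otherwise put the open buckets),
 * then free the in-memory nodes.
 */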
485 static void bch2_btree_reserve_put(struct btree_update *as, struct btree_trans *trans)
486 {
487 struct bch_fs *c = as->c;
488 struct prealloc_nodes *p;
489
490 for (p = as->prealloc_nodes;
491 p < as->prealloc_nodes + ARRAY_SIZE(as->prealloc_nodes);
492 p++) {
493 while (p->nr) {
494 struct btree *b = p->b[--p->nr];
495
496 mutex_lock(&c->btree_reserve_cache_lock);
497
498 if (c->btree_reserve_cache_nr <
499 ARRAY_SIZE(c->btree_reserve_cache)) {
500 struct btree_alloc *a =
501 &c->btree_reserve_cache[c->btree_reserve_cache_nr++];
502
503 a->ob = b->ob;
504 b->ob.nr = 0;
505 bkey_copy(&a->k, &b->key);
506 } else {
507 bch2_open_buckets_put(c, &b->ob);
508 }
509
510 mutex_unlock(&c->btree_reserve_cache_lock);
511
512 btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent);
513 btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_write);
514 __btree_node_free(trans, b);
515 six_unlock_write(&b->c.lock);
516 six_unlock_intent(&b->c.lock);
517 }
518 }
519 }
520
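/*
 * Preallocate the leaf and interior nodes this update might need, holding the
 * btree cache cannibalize lock so the in-memory allocations in
 * __bch2_btree_node_alloc() can't fail.
 */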
521 static int bch2_btree_reserve_get(struct btree_trans *trans,
522 struct btree_update *as,
523 unsigned nr_nodes[2],
524 unsigned flags,
525 struct closure *cl)
526 {
527 struct btree *b;
528 unsigned interior;
529 int ret = 0;
530
531 BUG_ON(nr_nodes[0] + nr_nodes[1] > BTREE_RESERVE_MAX);
532
533 /*
534 * Protects reaping from the btree node cache and using the btree node
535 * open bucket reserve:
536 */
537 ret = bch2_btree_cache_cannibalize_lock(trans, cl);
538 if (ret)
539 return ret;
540
541 for (interior = 0; interior < 2; interior++) {
542 struct prealloc_nodes *p = as->prealloc_nodes + interior;
543
544 while (p->nr < nr_nodes[interior]) {
545 b = __bch2_btree_node_alloc(trans, &as->disk_res, cl,
546 interior, flags);
547 if (IS_ERR(b)) {
548 ret = PTR_ERR(b);
549 goto err;
550 }
551
552 p->b[p->nr++] = b;
553 }
554 }
555 err:
556 bch2_btree_cache_cannibalize_unlock(trans);
557 return ret;
558 }
559
560 /* Asynchronous interior node update machinery */
561
562 static void bch2_btree_update_free(struct btree_update *as, struct btree_trans *trans)
563 {
564 struct bch_fs *c = as->c;
565
566 if (as->took_gc_lock)
567 up_read(&c->gc_lock);
568 as->took_gc_lock = false;
569
570 bch2_journal_pin_drop(&c->journal, &as->journal);
571 bch2_journal_pin_flush(&c->journal, &as->journal);
572 bch2_disk_reservation_put(c, &as->disk_res);
573 bch2_btree_reserve_put(as, trans);
574
575 bch2_time_stats_update(&c->times[BCH_TIME_btree_interior_update_total],
576 as->start_time);
577
578 mutex_lock(&c->btree_interior_update_lock);
579 list_del(&as->unwritten_list);
580 list_del(&as->list);
581
582 closure_debug_destroy(&as->cl);
583 mempool_free(as, &c->btree_interior_update_pool);
584
585 /*
586 * Have to do the wakeup with btree_interior_update_lock still held,
587 * since being on btree_interior_update_list is our ref on @c:
588 */
589 closure_wake_up(&c->btree_interior_update_wait);
590
591 mutex_unlock(&c->btree_interior_update_lock);
592 }
593
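/*
 * Append @b's key to @keys (as->old_keys or as->new_keys), recording the
 * level the key lives at (b->c.level + 1) in the v2 pointer's mem_ptr field;
 * btree_update_nodes_written_trans() reads it back when running triggers.
 */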
594 static void btree_update_add_key(struct btree_update *as,
595 struct keylist *keys, struct btree *b)
596 {
597 struct bkey_i *k = &b->key;
598
599 BUG_ON(bch2_keylist_u64s(keys) + k->k.u64s >
600 ARRAY_SIZE(as->_old_keys));
601
602 bkey_copy(keys->top, k);
603 bkey_i_to_btree_ptr_v2(keys->top)->v.mem_ptr = b->c.level + 1;
604
605 bch2_keylist_push(keys);
606 }
607
608 /*
609 * The transactional part of an interior btree node update, where we journal the
610 * update we did to the interior node and update alloc info:
611 */
612 static int btree_update_nodes_written_trans(struct btree_trans *trans,
613 struct btree_update *as)
614 {
615 struct jset_entry *e = bch2_trans_jset_entry_alloc(trans, as->journal_u64s);
616 int ret = PTR_ERR_OR_ZERO(e);
617 if (ret)
618 return ret;
619
620 memcpy(e, as->journal_entries, as->journal_u64s * sizeof(u64));
621
622 trans->journal_pin = &as->journal;
623
624 for_each_keylist_key(&as->old_keys, k) {
625 unsigned level = bkey_i_to_btree_ptr_v2(k)->v.mem_ptr;
626
627 ret = bch2_key_trigger_old(trans, as->btree_id, level, bkey_i_to_s_c(k),
628 BTREE_TRIGGER_TRANSACTIONAL);
629 if (ret)
630 return ret;
631 }
632
633 for_each_keylist_key(&as->new_keys, k) {
634 unsigned level = bkey_i_to_btree_ptr_v2(k)->v.mem_ptr;
635
636 ret = bch2_key_trigger_new(trans, as->btree_id, level, bkey_i_to_s(k),
637 BTREE_TRIGGER_TRANSACTIONAL);
638 if (ret)
639 return ret;
640 }
641
642 return 0;
643 }
644
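/*
 * Runs once every node @as created has been written (see
 * btree_update_set_nodes_written()): wait for any writes still in flight to
 * the old nodes, commit the update's journal entries and triggers, unblock
 * and write the node we did the final insert into (if any), and release the
 * open buckets pinned for the new nodes.
 */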
645 static void btree_update_nodes_written(struct btree_update *as)
646 {
647 struct bch_fs *c = as->c;
648 struct btree *b;
649 struct btree_trans *trans = bch2_trans_get(c);
650 u64 journal_seq = 0;
651 unsigned i;
652 int ret;
653
654 /*
655 * If we're already in an error state, it might be because a btree node
656 * was never written, and we might be trying to free that same btree
657 * node here, but it won't have been marked as allocated and we'll see
658 * spurious disk usage inconsistencies in the transactional part below
659 * if we don't skip it:
660 */
661 ret = bch2_journal_error(&c->journal);
662 if (ret)
663 goto err;
664
665 /*
666 * Wait for any in flight writes to finish before we free the old nodes
667 * on disk:
668 */
669 for (i = 0; i < as->nr_old_nodes; i++) {
670 __le64 seq;
671
672 b = as->old_nodes[i];
673
674 btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read);
675 seq = b->data ? b->data->keys.seq : 0;
676 six_unlock_read(&b->c.lock);
677
678 if (seq == as->old_nodes_seq[i])
679 wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight_inner,
680 TASK_UNINTERRUPTIBLE);
681 }
682
683 /*
684 * We did an update to a parent node where the pointers we added pointed
685 * to child nodes that weren't written yet: now, the child nodes have
686 * been written so we can write out the update to the interior node.
687 */
688
689 /*
690 * We can't call into journal reclaim here: we'd block on the journal
691 * reclaim lock, but we may need to release the open buckets we have
692 * pinned in order for other btree updates to make forward progress, and
693 * journal reclaim does btree updates when flushing bkey_cached entries,
694 * which may require allocations as well.
695 */
696 ret = commit_do(trans, &as->disk_res, &journal_seq,
697 BCH_WATERMARK_interior_updates|
698 BCH_TRANS_COMMIT_no_enospc|
699 BCH_TRANS_COMMIT_no_check_rw|
700 BCH_TRANS_COMMIT_journal_reclaim,
701 btree_update_nodes_written_trans(trans, as));
702 bch2_trans_unlock(trans);
703
704 bch2_fs_fatal_err_on(ret && !bch2_journal_error(&c->journal), c,
705 "%s", bch2_err_str(ret));
706 err:
707 if (as->b) {
708
709 b = as->b;
710 btree_path_idx_t path_idx = get_unlocked_mut_path(trans,
711 as->btree_id, b->c.level, b->key.k.p);
712 struct btree_path *path = trans->paths + path_idx;
713 /*
714 * @b is the node we did the final insert into:
715 *
716 * On failure to get a journal reservation, we still have to
717 * unblock the write and allow most of the write path to happen
718 * so that shutdown works, but the i->journal_seq mechanism
719 * won't work to prevent the btree write from being visible (we
720 * didn't get a journal sequence number) - instead
721 * __bch2_btree_node_write() doesn't do the actual write if
722 * we're in journal error state:
723 */
724
725 /*
726 * Ensure transaction is unlocked before using
727 * btree_node_lock_nopath() (the use of which is always suspect,
728 * we need to work on removing this in the future)
729 *
730 * It should be, but get_unlocked_mut_path() -> bch2_path_get()
731 * calls bch2_path_upgrade(), before we call path_make_mut(), so
732 * we may rarely end up with a locked path besides the one we
733 * have here:
734 */
735 bch2_trans_unlock(trans);
736 btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent);
737 mark_btree_node_locked(trans, path, b->c.level, BTREE_NODE_INTENT_LOCKED);
738 path->l[b->c.level].lock_seq = six_lock_seq(&b->c.lock);
739 path->l[b->c.level].b = b;
740
741 bch2_btree_node_lock_write_nofail(trans, path, &b->c);
742
743 mutex_lock(&c->btree_interior_update_lock);
744
745 list_del(&as->write_blocked_list);
746 if (list_empty(&b->write_blocked))
747 clear_btree_node_write_blocked(b);
748
749 /*
750 * Node might have been freed, recheck under
751 * btree_interior_update_lock:
752 */
753 if (as->b == b) {
754 BUG_ON(!b->c.level);
755 BUG_ON(!btree_node_dirty(b));
756
757 if (!ret) {
758 struct bset *last = btree_bset_last(b);
759
760 last->journal_seq = cpu_to_le64(
761 max(journal_seq,
762 le64_to_cpu(last->journal_seq)));
763
764 bch2_btree_add_journal_pin(c, b, journal_seq);
765 } else {
766 /*
767 * If we didn't get a journal sequence number we
768 * can't write this btree node, because recovery
769 * won't know to ignore this write:
770 */
771 set_btree_node_never_write(b);
772 }
773 }
774
775 mutex_unlock(&c->btree_interior_update_lock);
776
777 mark_btree_node_locked_noreset(path, b->c.level, BTREE_NODE_INTENT_LOCKED);
778 six_unlock_write(&b->c.lock);
779
780 btree_node_write_if_need(c, b, SIX_LOCK_intent);
781 btree_node_unlock(trans, path, b->c.level);
782 bch2_path_put(trans, path_idx, true);
783 }
784
785 bch2_journal_pin_drop(&c->journal, &as->journal);
786
787 mutex_lock(&c->btree_interior_update_lock);
788 for (i = 0; i < as->nr_new_nodes; i++) {
789 b = as->new_nodes[i];
790
791 BUG_ON(b->will_make_reachable != (unsigned long) as);
792 b->will_make_reachable = 0;
793 clear_btree_node_will_make_reachable(b);
794 }
795 mutex_unlock(&c->btree_interior_update_lock);
796
797 for (i = 0; i < as->nr_new_nodes; i++) {
798 b = as->new_nodes[i];
799
800 btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read);
801 btree_node_write_if_need(c, b, SIX_LOCK_read);
802 six_unlock_read(&b->c.lock);
803 }
804
805 for (i = 0; i < as->nr_open_buckets; i++)
806 bch2_open_bucket_put(c, c->open_buckets + as->open_buckets[i]);
807
808 bch2_btree_update_free(as, trans);
809 bch2_trans_put(trans);
810 }
811
812 static void btree_interior_update_work(struct work_struct *work)
813 {
814 struct bch_fs *c =
815 container_of(work, struct bch_fs, btree_interior_update_work);
816 struct btree_update *as;
817
818 while (1) {
819 mutex_lock(&c->btree_interior_update_lock);
820 as = list_first_entry_or_null(&c->btree_interior_updates_unwritten,
821 struct btree_update, unwritten_list);
822 if (as && !as->nodes_written)
823 as = NULL;
824 mutex_unlock(&c->btree_interior_update_lock);
825
826 if (!as)
827 break;
828
829 btree_update_nodes_written(as);
830 }
831 }
832
833 static CLOSURE_CALLBACK(btree_update_set_nodes_written)
834 {
835 closure_type(as, struct btree_update, cl);
836 struct bch_fs *c = as->c;
837
838 mutex_lock(&c->btree_interior_update_lock);
839 as->nodes_written = true;
840 mutex_unlock(&c->btree_interior_update_lock);
841
842 queue_work(c->btree_interior_update_worker, &c->btree_interior_update_work);
843 }
844
845 /*
846 * We're updating @b with pointers to nodes that haven't finished writing yet:
847 * block @b from being written until @as completes
848 */
849 static void btree_update_updated_node(struct btree_update *as, struct btree *b)
850 {
851 struct bch_fs *c = as->c;
852
853 BUG_ON(as->mode != BTREE_UPDATE_none);
854 BUG_ON(as->update_level_end < b->c.level);
855 BUG_ON(!btree_node_dirty(b));
856 BUG_ON(!b->c.level);
857
858 mutex_lock(&c->btree_interior_update_lock);
859 list_add_tail(&as->unwritten_list, &c->btree_interior_updates_unwritten);
860
861 as->mode = BTREE_UPDATE_node;
862 as->b = b;
863 as->update_level_end = b->c.level;
864
865 set_btree_node_write_blocked(b);
866 list_add(&as->write_blocked_list, &b->write_blocked);
867
868 mutex_unlock(&c->btree_interior_update_lock);
869 }
870
871 static int bch2_update_reparent_journal_pin_flush(struct journal *j,
872 struct journal_entry_pin *_pin, u64 seq)
873 {
874 return 0;
875 }
876
877 static void btree_update_reparent(struct btree_update *as,
878 struct btree_update *child)
879 {
880 struct bch_fs *c = as->c;
881
882 lockdep_assert_held(&c->btree_interior_update_lock);
883
884 child->b = NULL;
885 child->mode = BTREE_UPDATE_update;
886
887 bch2_journal_pin_copy(&c->journal, &as->journal, &child->journal,
888 bch2_update_reparent_journal_pin_flush);
889 }
890
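/*
 * Like btree_update_updated_node(), but for an update that makes @b the new
 * root: there's no parent node to block writes on, so just emit a btree_root
 * journal entry for @b's key.
 */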
891 static void btree_update_updated_root(struct btree_update *as, struct btree *b)
892 {
893 struct bkey_i *insert = &b->key;
894 struct bch_fs *c = as->c;
895
896 BUG_ON(as->mode != BTREE_UPDATE_none);
897
898 BUG_ON(as->journal_u64s + jset_u64s(insert->k.u64s) >
899 ARRAY_SIZE(as->journal_entries));
900
901 as->journal_u64s +=
902 journal_entry_set((void *) &as->journal_entries[as->journal_u64s],
903 BCH_JSET_ENTRY_btree_root,
904 b->c.btree_id, b->c.level,
905 insert, insert->k.u64s);
906
907 mutex_lock(&c->btree_interior_update_lock);
908 list_add_tail(&as->unwritten_list, &c->btree_interior_updates_unwritten);
909
910 as->mode = BTREE_UPDATE_root;
911 mutex_unlock(&c->btree_interior_update_lock);
912 }
913
914 /*
915 * bch2_btree_update_add_new_node:
916 *
917 * This causes @as to wait on @b to be written, before it gets to
918 * bch2_btree_update_nodes_written
919 *
920 * Additionally, it sets b->will_make_reachable to prevent any additional writes
921 * to @b from happening besides the first until @b is reachable on disk
922 *
923 * And it adds @b to the list of @as's new nodes, so that we can update sector
924 * counts in bch2_btree_update_nodes_written:
925 */
926 static void bch2_btree_update_add_new_node(struct btree_update *as, struct btree *b)
927 {
928 struct bch_fs *c = as->c;
929
930 closure_get(&as->cl);
931
932 mutex_lock(&c->btree_interior_update_lock);
933 BUG_ON(as->nr_new_nodes >= ARRAY_SIZE(as->new_nodes));
934 BUG_ON(b->will_make_reachable);
935
936 as->new_nodes[as->nr_new_nodes++] = b;
937 b->will_make_reachable = 1UL|(unsigned long) as;
938 set_btree_node_will_make_reachable(b);
939
940 mutex_unlock(&c->btree_interior_update_lock);
941
942 btree_update_add_key(as, &as->new_keys, b);
943
944 if (b->key.k.type == KEY_TYPE_btree_ptr_v2) {
945 unsigned bytes = vstruct_end(&b->data->keys) - (void *) b->data;
946 unsigned sectors = round_up(bytes, block_bytes(c)) >> 9;
947
948 bkey_i_to_btree_ptr_v2(&b->key)->v.sectors_written =
949 cpu_to_le16(sectors);
950 }
951 }
952
953 /*
954 * If @b was a new node, drop it from @as's list of new nodes and drop its ref on @as->cl:
955 */
956 static void btree_update_drop_new_node(struct bch_fs *c, struct btree *b)
957 {
958 struct btree_update *as;
959 unsigned long v;
960 unsigned i;
961
962 mutex_lock(&c->btree_interior_update_lock);
963 /*
964 * When b->will_make_reachable != 0, it owns a ref on as->cl that's
965 * dropped when it gets written by bch2_btree_complete_write - the
966 * xchg() is for synchronization with bch2_btree_complete_write:
967 */
968 v = xchg(&b->will_make_reachable, 0);
969 clear_btree_node_will_make_reachable(b);
970 as = (struct btree_update *) (v & ~1UL);
971
972 if (!as) {
973 mutex_unlock(&c->btree_interior_update_lock);
974 return;
975 }
976
977 for (i = 0; i < as->nr_new_nodes; i++)
978 if (as->new_nodes[i] == b)
979 goto found;
980
981 BUG();
982 found:
983 array_remove_item(as->new_nodes, as->nr_new_nodes, i);
984 mutex_unlock(&c->btree_interior_update_lock);
985
986 if (v & 1)
987 closure_put(&as->cl);
988 }
989
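/*
 * Transfer @b's open bucket references to @as; they're dropped at the end of
 * btree_update_nodes_written().
 */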
990 static void bch2_btree_update_get_open_buckets(struct btree_update *as, struct btree *b)
991 {
992 while (b->ob.nr)
993 as->open_buckets[as->nr_open_buckets++] =
994 b->ob.v[--b->ob.nr];
995 }
996
997 static int bch2_btree_update_will_free_node_journal_pin_flush(struct journal *j,
998 struct journal_entry_pin *_pin, u64 seq)
999 {
1000 return 0;
1001 }
1002
1003 /*
1004 * @b is being split/rewritten: it may have pointers to not-yet-written btree
1005 * nodes and thus outstanding btree_updates - redirect @b's
1006 * btree_updates to point to this btree_update:
1007 */
1008 static void bch2_btree_interior_update_will_free_node(struct btree_update *as,
1009 struct btree *b)
1010 {
1011 struct bch_fs *c = as->c;
1012 struct btree_update *p, *n;
1013 struct btree_write *w;
1014
1015 set_btree_node_dying(b);
1016
1017 if (btree_node_fake(b))
1018 return;
1019
1020 mutex_lock(&c->btree_interior_update_lock);
1021
1022 /*
1023 * Does this node have any btree_update operations preventing
1024 * it from being written?
1025 *
1026 * If so, redirect them to point to this btree_update: we can
1027 * write out our new nodes, but we won't make them visible until those
1028 * operations complete
1029 */
1030 list_for_each_entry_safe(p, n, &b->write_blocked, write_blocked_list) {
1031 list_del_init(&p->write_blocked_list);
1032 btree_update_reparent(as, p);
1033
1034 /*
1035 * for flush_held_btree_writes() waiting on updates to flush or
1036 * nodes to be writeable:
1037 */
1038 closure_wake_up(&c->btree_interior_update_wait);
1039 }
1040
1041 clear_btree_node_dirty_acct(c, b);
1042 clear_btree_node_need_write(b);
1043 clear_btree_node_write_blocked(b);
1044
1045 /*
1046 * Does this node have unwritten data that has a pin on the journal?
1047 *
1048 * If so, transfer that pin to the btree_update operation -
1049 * note that if we're freeing multiple nodes, we only need to keep the
1050 * oldest pin of any of the nodes we're freeing. We'll release the pin
1051 * when the new nodes are persistent and reachable on disk:
1052 */
1053 w = btree_current_write(b);
1054 bch2_journal_pin_copy(&c->journal, &as->journal, &w->journal,
1055 bch2_btree_update_will_free_node_journal_pin_flush);
1056 bch2_journal_pin_drop(&c->journal, &w->journal);
1057
1058 w = btree_prev_write(b);
1059 bch2_journal_pin_copy(&c->journal, &as->journal, &w->journal,
1060 bch2_btree_update_will_free_node_journal_pin_flush);
1061 bch2_journal_pin_drop(&c->journal, &w->journal);
1062
1063 mutex_unlock(&c->btree_interior_update_lock);
1064
1065 /*
1066 * Is this a node that isn't reachable on disk yet?
1067 *
1068 * Nodes that aren't reachable yet have writes blocked until they're
1069 * reachable - now that we've cancelled any pending writes and moved
1070 * things waiting on that write to wait on this update, we can drop this
1071 * node from the list of nodes that the other update is making
1072 * reachable, prior to freeing it:
1073 */
1074 btree_update_drop_new_node(c, b);
1075
1076 btree_update_add_key(as, &as->old_keys, b);
1077
1078 as->old_nodes[as->nr_old_nodes] = b;
1079 as->old_nodes_seq[as->nr_old_nodes] = b->data->keys.seq;
1080 as->nr_old_nodes++;
1081 }
1082
1083 static void bch2_btree_update_done(struct btree_update *as, struct btree_trans *trans)
1084 {
1085 struct bch_fs *c = as->c;
1086 u64 start_time = as->start_time;
1087
1088 BUG_ON(as->mode == BTREE_UPDATE_none);
1089
1090 if (as->took_gc_lock)
1091 up_read(&as->c->gc_lock);
1092 as->took_gc_lock = false;
1093
1094 bch2_btree_reserve_put(as, trans);
1095
1096 continue_at(&as->cl, btree_update_set_nodes_written,
1097 as->c->btree_interior_update_worker);
1098
1099 bch2_time_stats_update(&c->times[BCH_TIME_btree_interior_update_foreground],
1100 start_time);
1101 }
1102
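/*
 * Start an interior btree update: work out how many nodes might be needed at
 * each level, upgrade @path to intent locks on those levels, take the gc
 * lock, and get a disk reservation plus preallocated btree nodes.
 */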
1103 static struct btree_update *
1104 bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
1105 unsigned level_start, bool split, unsigned flags)
1106 {
1107 struct bch_fs *c = trans->c;
1108 struct btree_update *as;
1109 u64 start_time = local_clock();
1110 int disk_res_flags = (flags & BCH_TRANS_COMMIT_no_enospc)
1111 ? BCH_DISK_RESERVATION_NOFAIL : 0;
1112 unsigned nr_nodes[2] = { 0, 0 };
1113 unsigned level_end = level_start;
1114 enum bch_watermark watermark = flags & BCH_WATERMARK_MASK;
1115 int ret = 0;
1116 u32 restart_count = trans->restart_count;
1117
1118 BUG_ON(!path->should_be_locked);
1119
1120 if (watermark == BCH_WATERMARK_copygc)
1121 watermark = BCH_WATERMARK_btree_copygc;
1122 if (watermark < BCH_WATERMARK_btree)
1123 watermark = BCH_WATERMARK_btree;
1124
1125 flags &= ~BCH_WATERMARK_MASK;
1126 flags |= watermark;
1127
1128 if (watermark < BCH_WATERMARK_reclaim &&
1129 test_bit(JOURNAL_SPACE_LOW, &c->journal.flags)) {
1130 if (flags & BCH_TRANS_COMMIT_journal_reclaim)
1131 return ERR_PTR(-BCH_ERR_journal_reclaim_would_deadlock);
1132
1133 bch2_trans_unlock(trans);
1134 wait_event(c->journal.wait, !test_bit(JOURNAL_SPACE_LOW, &c->journal.flags));
1135 ret = bch2_trans_relock(trans);
1136 if (ret)
1137 return ERR_PTR(ret);
1138 }
1139
1140 while (1) {
1141 nr_nodes[!!level_end] += 1 + split;
1142 level_end++;
1143
1144 ret = bch2_btree_path_upgrade(trans, path, level_end + 1);
1145 if (ret)
1146 return ERR_PTR(ret);
1147
1148 if (!btree_path_node(path, level_end)) {
1149 /* Allocating new root? */
1150 nr_nodes[1] += split;
1151 level_end = BTREE_MAX_DEPTH;
1152 break;
1153 }
1154
1155 /*
1156 * Always check for space for two keys, even if we won't have to
1157 * split at prior level - it might have been a merge instead:
1158 */
1159 if (bch2_btree_node_insert_fits(path->l[level_end].b,
1160 BKEY_BTREE_PTR_U64s_MAX * 2))
1161 break;
1162
1163 split = path->l[level_end].b->nr.live_u64s > BTREE_SPLIT_THRESHOLD(c);
1164 }
1165
1166 if (!down_read_trylock(&c->gc_lock)) {
1167 ret = drop_locks_do(trans, (down_read(&c->gc_lock), 0));
1168 if (ret) {
1169 up_read(&c->gc_lock);
1170 return ERR_PTR(ret);
1171 }
1172 }
1173
1174 as = mempool_alloc(&c->btree_interior_update_pool, GFP_NOFS);
1175 memset(as, 0, sizeof(*as));
1176 closure_init(&as->cl, NULL);
1177 as->c = c;
1178 as->start_time = start_time;
1179 as->ip_started = _RET_IP_;
1180 as->mode = BTREE_UPDATE_none;
1181 as->watermark = watermark;
1182 as->took_gc_lock = true;
1183 as->btree_id = path->btree_id;
1184 as->update_level_start = level_start;
1185 as->update_level_end = level_end;
1186 INIT_LIST_HEAD(&as->list);
1187 INIT_LIST_HEAD(&as->unwritten_list);
1188 INIT_LIST_HEAD(&as->write_blocked_list);
1189 bch2_keylist_init(&as->old_keys, as->_old_keys);
1190 bch2_keylist_init(&as->new_keys, as->_new_keys);
1191 bch2_keylist_init(&as->parent_keys, as->inline_keys);
1192
1193 mutex_lock(&c->btree_interior_update_lock);
1194 list_add_tail(&as->list, &c->btree_interior_update_list);
1195 mutex_unlock(&c->btree_interior_update_lock);
1196
1197 /*
1198 * We don't want to allocate if we're in an error state, that can cause
1199 * deadlock on emergency shutdown due to open buckets getting stuck in
1200 * the btree_reserve_cache after allocator shutdown has cleared it out.
1201 * This check needs to come after adding us to the btree_interior_update
1202 * list but before calling bch2_btree_reserve_get, to synchronize with
1203 * __bch2_fs_read_only().
1204 */
1205 ret = bch2_journal_error(&c->journal);
1206 if (ret)
1207 goto err;
1208
1209 ret = bch2_disk_reservation_get(c, &as->disk_res,
1210 (nr_nodes[0] + nr_nodes[1]) * btree_sectors(c),
1211 c->opts.metadata_replicas,
1212 disk_res_flags);
1213 if (ret)
1214 goto err;
1215
1216 ret = bch2_btree_reserve_get(trans, as, nr_nodes, flags, NULL);
1217 if (bch2_err_matches(ret, ENOSPC) ||
1218 bch2_err_matches(ret, ENOMEM)) {
1219 struct closure cl;
1220
1221 /*
1222 * XXX: this should probably be a separate BTREE_INSERT_NONBLOCK
1223 * flag
1224 */
1225 if (bch2_err_matches(ret, ENOSPC) &&
1226 (flags & BCH_TRANS_COMMIT_journal_reclaim) &&
1227 watermark < BCH_WATERMARK_reclaim) {
1228 ret = -BCH_ERR_journal_reclaim_would_deadlock;
1229 goto err;
1230 }
1231
1232 closure_init_stack(&cl);
1233
1234 do {
1235 ret = bch2_btree_reserve_get(trans, as, nr_nodes, flags, &cl);
1236
1237 bch2_trans_unlock(trans);
1238 closure_sync(&cl);
1239 } while (bch2_err_matches(ret, BCH_ERR_operation_blocked));
1240 }
1241
1242 if (ret) {
1243 trace_and_count(c, btree_reserve_get_fail, trans->fn,
1244 _RET_IP_, nr_nodes[0] + nr_nodes[1], ret);
1245 goto err;
1246 }
1247
1248 ret = bch2_trans_relock(trans);
1249 if (ret)
1250 goto err;
1251
1252 bch2_trans_verify_not_restarted(trans, restart_count);
1253 return as;
1254 err:
1255 bch2_btree_update_free(as, trans);
1256 if (!bch2_err_matches(ret, ENOSPC) &&
1257 !bch2_err_matches(ret, EROFS) &&
1258 ret != -BCH_ERR_journal_reclaim_would_deadlock)
1259 bch_err_fn_ratelimited(c, ret);
1260 return ERR_PTR(ret);
1261 }
1262
1263 /* Btree root updates: */
1264
1265 static void bch2_btree_set_root_inmem(struct bch_fs *c, struct btree *b)
1266 {
1267 /* Root nodes cannot be reaped */
1268 mutex_lock(&c->btree_cache.lock);
1269 list_del_init(&b->list);
1270 mutex_unlock(&c->btree_cache.lock);
1271
1272 mutex_lock(&c->btree_root_lock);
1273 bch2_btree_id_root(c, b->c.btree_id)->b = b;
1274 mutex_unlock(&c->btree_root_lock);
1275
1276 bch2_recalc_btree_reserve(c);
1277 }
1278
1279 static void bch2_btree_set_root(struct btree_update *as,
1280 struct btree_trans *trans,
1281 struct btree_path *path,
1282 struct btree *b)
1283 {
1284 struct bch_fs *c = as->c;
1285 struct btree *old;
1286
1287 trace_and_count(c, btree_node_set_root, trans, b);
1288
1289 old = btree_node_root(c, b);
1290
1291 /*
1292 * Ensure no one is using the old root while we switch to the
1293 * new root:
1294 */
1295 bch2_btree_node_lock_write_nofail(trans, path, &old->c);
1296
1297 bch2_btree_set_root_inmem(c, b);
1298
1299 btree_update_updated_root(as, b);
1300
1301 /*
1302 * Unlock old root after new root is visible:
1303 *
1304 * The new root isn't persistent, but that's ok: we still have
1305 * an intent lock on the new root, and any updates that would
1306 * depend on the new root would have to update the new root.
1307 */
1308 bch2_btree_node_unlock_write(trans, path, old);
1309 }
1310
1311 /* Interior node updates: */
1312
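/*
 * Insert a single btree pointer into interior node @b: sanity check the key,
 * add it to @as's journal entries, insert it into the node and flag the node
 * for an interior-type write.
 */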
1313 static void bch2_insert_fixup_btree_ptr(struct btree_update *as,
1314 struct btree_trans *trans,
1315 struct btree_path *path,
1316 struct btree *b,
1317 struct btree_node_iter *node_iter,
1318 struct bkey_i *insert)
1319 {
1320 struct bch_fs *c = as->c;
1321 struct bkey_packed *k;
1322 struct printbuf buf = PRINTBUF;
1323 unsigned long old, new, v;
1324
1325 BUG_ON(insert->k.type == KEY_TYPE_btree_ptr_v2 &&
1326 !btree_ptr_sectors_written(insert));
1327
1328 if (unlikely(!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)))
1329 bch2_journal_key_overwritten(c, b->c.btree_id, b->c.level, insert->k.p);
1330
1331 if (bch2_bkey_invalid(c, bkey_i_to_s_c(insert),
1332 btree_node_type(b), WRITE, &buf) ?:
1333 bch2_bkey_in_btree_node(c, b, bkey_i_to_s_c(insert), &buf)) {
1334 printbuf_reset(&buf);
1335 prt_printf(&buf, "inserting invalid bkey\n ");
1336 bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert));
1337 prt_printf(&buf, "\n ");
1338 bch2_bkey_invalid(c, bkey_i_to_s_c(insert),
1339 btree_node_type(b), WRITE, &buf);
1340 bch2_bkey_in_btree_node(c, b, bkey_i_to_s_c(insert), &buf);
1341
1342 bch2_fs_inconsistent(c, "%s", buf.buf);
1343 dump_stack();
1344 }
1345
1346 BUG_ON(as->journal_u64s + jset_u64s(insert->k.u64s) >
1347 ARRAY_SIZE(as->journal_entries));
1348
1349 as->journal_u64s +=
1350 journal_entry_set((void *) &as->journal_entries[as->journal_u64s],
1351 BCH_JSET_ENTRY_btree_keys,
1352 b->c.btree_id, b->c.level,
1353 insert, insert->k.u64s);
1354
1355 while ((k = bch2_btree_node_iter_peek_all(node_iter, b)) &&
1356 bkey_iter_pos_cmp(b, k, &insert->k.p) < 0)
1357 bch2_btree_node_iter_advance(node_iter, b);
1358
1359 bch2_btree_bset_insert_key(trans, path, b, node_iter, insert);
1360 set_btree_node_dirty_acct(c, b);
1361
1362 v = READ_ONCE(b->flags);
1363 do {
1364 old = new = v;
1365
1366 new &= ~BTREE_WRITE_TYPE_MASK;
1367 new |= BTREE_WRITE_interior;
1368 new |= 1 << BTREE_NODE_need_write;
1369 } while ((v = cmpxchg(&b->flags, old, new)) != old);
1370
1371 printbuf_exit(&buf);
1372 }
1373
1374 static void
1375 bch2_btree_insert_keys_interior(struct btree_update *as,
1376 struct btree_trans *trans,
1377 struct btree_path *path,
1378 struct btree *b,
1379 struct btree_node_iter node_iter,
1380 struct keylist *keys)
1381 {
1382 struct bkey_i *insert = bch2_keylist_front(keys);
1383 struct bkey_packed *k;
1384
1385 BUG_ON(btree_node_type(b) != BKEY_TYPE_btree);
1386
1387 while ((k = bch2_btree_node_iter_prev_all(&node_iter, b)) &&
1388 (bkey_cmp_left_packed(b, k, &insert->k.p) >= 0))
1389 ;
1390
1391 while (!bch2_keylist_empty(keys)) {
1392 insert = bch2_keylist_front(keys);
1393
1394 if (bpos_gt(insert->k.p, b->key.k.p))
1395 break;
1396
1397 bch2_insert_fixup_btree_ptr(as, trans, path, b, &node_iter, insert);
1398 bch2_keylist_pop_front(keys);
1399 }
1400 }
1401
1402 /*
1403 * Distribute @b's keys between the two new nodes n[0] (lower) and n[1]
1404 * (higher), picking the pivot and a packed key format for each:
1405 */
1406 static void __btree_split_node(struct btree_update *as,
1407 struct btree_trans *trans,
1408 struct btree *b,
1409 struct btree *n[2])
1410 {
1411 struct bkey_packed *k;
1412 struct bpos n1_pos = POS_MIN;
1413 struct btree_node_iter iter;
1414 struct bset *bsets[2];
1415 struct bkey_format_state format[2];
1416 struct bkey_packed *out[2];
1417 struct bkey uk;
1418 unsigned u64s, n1_u64s = (b->nr.live_u64s * 3) / 5;
1419 struct { unsigned nr_keys, val_u64s; } nr_keys[2];
1420 int i;
1421
1422 memset(&nr_keys, 0, sizeof(nr_keys));
1423
1424 for (i = 0; i < 2; i++) {
1425 BUG_ON(n[i]->nsets != 1);
1426
1427 bsets[i] = btree_bset_first(n[i]);
1428 out[i] = bsets[i]->start;
1429
1430 SET_BTREE_NODE_SEQ(n[i]->data, BTREE_NODE_SEQ(b->data) + 1);
1431 bch2_bkey_format_init(&format[i]);
1432 }
1433
1434 u64s = 0;
1435 for_each_btree_node_key(b, k, &iter) {
1436 if (bkey_deleted(k))
1437 continue;
1438
1439 uk = bkey_unpack_key(b, k);
1440
1441 if (b->c.level &&
1442 u64s < n1_u64s &&
1443 u64s + k->u64s >= n1_u64s &&
1444 bch2_key_deleted_in_journal(trans, b->c.btree_id, b->c.level, uk.p))
1445 n1_u64s += k->u64s;
1446
1447 i = u64s >= n1_u64s;
1448 u64s += k->u64s;
1449 if (!i)
1450 n1_pos = uk.p;
1451 bch2_bkey_format_add_key(&format[i], &uk);
1452
1453 nr_keys[i].nr_keys++;
1454 nr_keys[i].val_u64s += bkeyp_val_u64s(&b->format, k);
1455 }
1456
1457 btree_set_min(n[0], b->data->min_key);
1458 btree_set_max(n[0], n1_pos);
1459 btree_set_min(n[1], bpos_successor(n1_pos));
1460 btree_set_max(n[1], b->data->max_key);
1461
1462 for (i = 0; i < 2; i++) {
1463 bch2_bkey_format_add_pos(&format[i], n[i]->data->min_key);
1464 bch2_bkey_format_add_pos(&format[i], n[i]->data->max_key);
1465
1466 n[i]->data->format = bch2_bkey_format_done(&format[i]);
1467
1468 unsigned u64s = nr_keys[i].nr_keys * n[i]->data->format.key_u64s +
1469 nr_keys[i].val_u64s;
1470 if (__vstruct_bytes(struct btree_node, u64s) > btree_buf_bytes(b))
1471 n[i]->data->format = b->format;
1472
1473 btree_node_set_format(n[i], n[i]->data->format);
1474 }
1475
1476 u64s = 0;
1477 for_each_btree_node_key(b, k, &iter) {
1478 if (bkey_deleted(k))
1479 continue;
1480
1481 i = u64s >= n1_u64s;
1482 u64s += k->u64s;
1483
1484 if (bch2_bkey_transform(&n[i]->format, out[i], bkey_packed(k)
1485 ? &b->format: &bch2_bkey_format_current, k))
1486 out[i]->format = KEY_FORMAT_LOCAL_BTREE;
1487 else
1488 bch2_bkey_unpack(b, (void *) out[i], k);
1489
1490 out[i]->needs_whiteout = false;
1491
1492 btree_keys_account_key_add(&n[i]->nr, 0, out[i]);
1493 out[i] = bkey_p_next(out[i]);
1494 }
1495
1496 for (i = 0; i < 2; i++) {
1497 bsets[i]->u64s = cpu_to_le16((u64 *) out[i] - bsets[i]->_data);
1498
1499 BUG_ON(!bsets[i]->u64s);
1500
1501 set_btree_bset_end(n[i], n[i]->set);
1502
1503 btree_node_reset_sib_u64s(n[i]);
1504
1505 bch2_verify_btree_nr_keys(n[i]);
1506
1507 BUG_ON(bch2_btree_node_check_topology(trans, n[i]));
1508 }
1509 }
1510
1511 /*
1512 * For updates to interior nodes, we've got to do the insert before we split
1513 * because the stuff we're inserting has to be inserted atomically. Post split,
1514 * the keys might have to go in different nodes and the split would no longer be
1515 * atomic.
1516 *
1517 * Worse, if the insert is from btree node coalescing and we do the insert
1518 * after the split (and pick the pivot) - the pivot we pick might be between
1519 * nodes that were coalesced, and thus in the middle of a child node post
1520 * coalescing:
1521 */
1522 static void btree_split_insert_keys(struct btree_update *as,
1523 struct btree_trans *trans,
1524 btree_path_idx_t path_idx,
1525 struct btree *b,
1526 struct keylist *keys)
1527 {
1528 struct btree_path *path = trans->paths + path_idx;
1529
1530 if (!bch2_keylist_empty(keys) &&
1531 bpos_le(bch2_keylist_front(keys)->k.p, b->data->max_key)) {
1532 struct btree_node_iter node_iter;
1533
1534 bch2_btree_node_iter_init(&node_iter, b, &bch2_keylist_front(keys)->k.p);
1535
1536 bch2_btree_insert_keys_interior(as, trans, path, b, node_iter, keys);
1537
1538 BUG_ON(bch2_btree_node_check_topology(trans, b));
1539 }
1540 }
1541
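/*
 * Split or compact @b: allocate one or two replacement nodes (plus a new root
 * if the depth increases), insert @keys into them, make them visible by
 * updating the parent (or setting the new root), then free @b.
 */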
1542 static int btree_split(struct btree_update *as, struct btree_trans *trans,
1543 btree_path_idx_t path, struct btree *b,
1544 struct keylist *keys)
1545 {
1546 struct bch_fs *c = as->c;
1547 struct btree *parent = btree_node_parent(trans->paths + path, b);
1548 struct btree *n1, *n2 = NULL, *n3 = NULL;
1549 btree_path_idx_t path1 = 0, path2 = 0;
1550 u64 start_time = local_clock();
1551 int ret = 0;
1552
1553 bch2_verify_btree_nr_keys(b);
1554 BUG_ON(!parent && (b != btree_node_root(c, b)));
1555 BUG_ON(parent && !btree_node_intent_locked(trans->paths + path, b->c.level + 1));
1556
1557 ret = bch2_btree_node_check_topology(trans, b);
1558 if (ret)
1559 return ret;
1560
1561 bch2_btree_interior_update_will_free_node(as, b);
1562
1563 if (b->nr.live_u64s > BTREE_SPLIT_THRESHOLD(c)) {
1564 struct btree *n[2];
1565
1566 trace_and_count(c, btree_node_split, trans, b);
1567
1568 n[0] = n1 = bch2_btree_node_alloc(as, trans, b->c.level);
1569 n[1] = n2 = bch2_btree_node_alloc(as, trans, b->c.level);
1570
1571 __btree_split_node(as, trans, b, n);
1572
1573 if (keys) {
1574 btree_split_insert_keys(as, trans, path, n1, keys);
1575 btree_split_insert_keys(as, trans, path, n2, keys);
1576 BUG_ON(!bch2_keylist_empty(keys));
1577 }
1578
1579 bch2_btree_build_aux_trees(n2);
1580 bch2_btree_build_aux_trees(n1);
1581
1582 bch2_btree_update_add_new_node(as, n1);
1583 bch2_btree_update_add_new_node(as, n2);
1584 six_unlock_write(&n2->c.lock);
1585 six_unlock_write(&n1->c.lock);
1586
1587 path1 = get_unlocked_mut_path(trans, as->btree_id, n1->c.level, n1->key.k.p);
1588 six_lock_increment(&n1->c.lock, SIX_LOCK_intent);
1589 mark_btree_node_locked(trans, trans->paths + path1, n1->c.level, BTREE_NODE_INTENT_LOCKED);
1590 bch2_btree_path_level_init(trans, trans->paths + path1, n1);
1591
1592 path2 = get_unlocked_mut_path(trans, as->btree_id, n2->c.level, n2->key.k.p);
1593 six_lock_increment(&n2->c.lock, SIX_LOCK_intent);
1594 mark_btree_node_locked(trans, trans->paths + path2, n2->c.level, BTREE_NODE_INTENT_LOCKED);
1595 bch2_btree_path_level_init(trans, trans->paths + path2, n2);
1596
1597 /*
1598 * Note that on recursion parent_keys == keys, so we
1599 * can't start adding new keys to parent_keys before emptying it
1600 * out (which we did with btree_split_insert_keys() above)
1601 */
1602 bch2_keylist_add(&as->parent_keys, &n1->key);
1603 bch2_keylist_add(&as->parent_keys, &n2->key);
1604
1605 if (!parent) {
1606 /* Depth increases, make a new root */
1607 n3 = __btree_root_alloc(as, trans, b->c.level + 1);
1608
1609 bch2_btree_update_add_new_node(as, n3);
1610 six_unlock_write(&n3->c.lock);
1611
1612 trans->paths[path2].locks_want++;
1613 BUG_ON(btree_node_locked(trans->paths + path2, n3->c.level));
1614 six_lock_increment(&n3->c.lock, SIX_LOCK_intent);
1615 mark_btree_node_locked(trans, trans->paths + path2, n3->c.level, BTREE_NODE_INTENT_LOCKED);
1616 bch2_btree_path_level_init(trans, trans->paths + path2, n3);
1617
1618 n3->sib_u64s[0] = U16_MAX;
1619 n3->sib_u64s[1] = U16_MAX;
1620
1621 btree_split_insert_keys(as, trans, path, n3, &as->parent_keys);
1622 }
1623 } else {
1624 trace_and_count(c, btree_node_compact, trans, b);
1625
1626 n1 = bch2_btree_node_alloc_replacement(as, trans, b);
1627
1628 if (keys) {
1629 btree_split_insert_keys(as, trans, path, n1, keys);
1630 BUG_ON(!bch2_keylist_empty(keys));
1631 }
1632
1633 bch2_btree_build_aux_trees(n1);
1634 bch2_btree_update_add_new_node(as, n1);
1635 six_unlock_write(&n1->c.lock);
1636
1637 path1 = get_unlocked_mut_path(trans, as->btree_id, n1->c.level, n1->key.k.p);
1638 six_lock_increment(&n1->c.lock, SIX_LOCK_intent);
1639 mark_btree_node_locked(trans, trans->paths + path1, n1->c.level, BTREE_NODE_INTENT_LOCKED);
1640 bch2_btree_path_level_init(trans, trans->paths + path1, n1);
1641
1642 if (parent)
1643 bch2_keylist_add(&as->parent_keys, &n1->key);
1644 }
1645
1646 /* New nodes all written, now make them visible: */
1647
1648 if (parent) {
1649 /* Split a non root node */
1650 ret = bch2_btree_insert_node(as, trans, path, parent, &as->parent_keys);
1651 if (ret)
1652 goto err;
1653 } else if (n3) {
1654 bch2_btree_set_root(as, trans, trans->paths + path, n3);
1655 } else {
1656 /* Root filled up but didn't need to be split */
1657 bch2_btree_set_root(as, trans, trans->paths + path, n1);
1658 }
1659
1660 if (n3) {
1661 bch2_btree_update_get_open_buckets(as, n3);
1662 bch2_btree_node_write(c, n3, SIX_LOCK_intent, 0);
1663 }
1664 if (n2) {
1665 bch2_btree_update_get_open_buckets(as, n2);
1666 bch2_btree_node_write(c, n2, SIX_LOCK_intent, 0);
1667 }
1668 bch2_btree_update_get_open_buckets(as, n1);
1669 bch2_btree_node_write(c, n1, SIX_LOCK_intent, 0);
1670
1671 /*
1672 * The old node must be freed (in memory) _before_ unlocking the new
1673 * nodes - else another thread could re-acquire a read lock on the old
1674 * node after another thread has locked and updated the new node, thus
1675 * seeing stale data:
1676 */
1677 bch2_btree_node_free_inmem(trans, trans->paths + path, b);
1678
1679 if (n3)
1680 bch2_trans_node_add(trans, trans->paths + path, n3);
1681 if (n2)
1682 bch2_trans_node_add(trans, trans->paths + path2, n2);
1683 bch2_trans_node_add(trans, trans->paths + path1, n1);
1684
1685 if (n3)
1686 six_unlock_intent(&n3->c.lock);
1687 if (n2)
1688 six_unlock_intent(&n2->c.lock);
1689 six_unlock_intent(&n1->c.lock);
1690 out:
1691 if (path2) {
1692 __bch2_btree_path_unlock(trans, trans->paths + path2);
1693 bch2_path_put(trans, path2, true);
1694 }
1695 if (path1) {
1696 __bch2_btree_path_unlock(trans, trans->paths + path1);
1697 bch2_path_put(trans, path1, true);
1698 }
1699
1700 bch2_trans_verify_locks(trans);
1701
1702 bch2_time_stats_update(&c->times[n2
1703 ? BCH_TIME_btree_node_split
1704 : BCH_TIME_btree_node_compact],
1705 start_time);
1706 return ret;
1707 err:
1708 if (n3)
1709 bch2_btree_node_free_never_used(as, trans, n3);
1710 if (n2)
1711 bch2_btree_node_free_never_used(as, trans, n2);
1712 bch2_btree_node_free_never_used(as, trans, n1);
1713 goto out;
1714 }
1715
1716 /**
1717 * bch2_btree_insert_node - insert bkeys into a given btree node
1718 *
1719 * @as: btree_update object
1720 * @trans: btree_trans object
1721 * @path_idx: path that points to current node
1722 * @b: node to insert keys into
1723 * @keys: list of keys to insert
1724 *
1725 * Returns: 0 on success, typically transaction restart error on failure
1726 *
1727 * Inserts as many keys as it can into a given btree node, splitting it if full.
1728 * If a split occurred, this function will return early. This can only happen
1729 * for leaf nodes -- inserts into interior nodes have to be atomic.
1730 */
1731 static int bch2_btree_insert_node(struct btree_update *as, struct btree_trans *trans,
1732 btree_path_idx_t path_idx, struct btree *b,
1733 struct keylist *keys)
1734 {
1735 struct bch_fs *c = as->c;
1736 struct btree_path *path = trans->paths + path_idx, *linked;
1737 unsigned i;
1738 int old_u64s = le16_to_cpu(btree_bset_last(b)->u64s);
1739 int old_live_u64s = b->nr.live_u64s;
1740 int live_u64s_added, u64s_added;
1741 int ret;
1742
1743 lockdep_assert_held(&c->gc_lock);
1744 BUG_ON(!btree_node_intent_locked(path, b->c.level));
1745 BUG_ON(!b->c.level);
1746 BUG_ON(!as || as->b);
1747 bch2_verify_keylist_sorted(keys);
1748
1749 ret = bch2_btree_node_lock_write(trans, path, &b->c);
1750 if (ret)
1751 return ret;
1752
1753 bch2_btree_node_prep_for_write(trans, path, b);
1754
1755 if (!bch2_btree_node_insert_fits(b, bch2_keylist_u64s(keys))) {
1756 bch2_btree_node_unlock_write(trans, path, b);
1757 goto split;
1758 }
1759
1760 ret = bch2_btree_node_check_topology(trans, b);
1761 if (ret) {
1762 bch2_btree_node_unlock_write(trans, path, b);
1763 return ret;
1764 }
1765
1766 bch2_btree_insert_keys_interior(as, trans, path, b,
1767 path->l[b->c.level].iter, keys);
1768
1769 trans_for_each_path_with_node(trans, b, linked, i)
1770 bch2_btree_node_iter_peek(&linked->l[b->c.level].iter, b);
1771
1772 bch2_trans_verify_paths(trans);
1773
1774 live_u64s_added = (int) b->nr.live_u64s - old_live_u64s;
1775 u64s_added = (int) le16_to_cpu(btree_bset_last(b)->u64s) - old_u64s;
1776
1777 if (b->sib_u64s[0] != U16_MAX && live_u64s_added < 0)
1778 b->sib_u64s[0] = max(0, (int) b->sib_u64s[0] + live_u64s_added);
1779 if (b->sib_u64s[1] != U16_MAX && live_u64s_added < 0)
1780 b->sib_u64s[1] = max(0, (int) b->sib_u64s[1] + live_u64s_added);
1781
1782 if (u64s_added > live_u64s_added &&
1783 bch2_maybe_compact_whiteouts(c, b))
1784 bch2_trans_node_reinit_iter(trans, b);
1785
1786 btree_update_updated_node(as, b);
1787 bch2_btree_node_unlock_write(trans, path, b);
1788
1789 BUG_ON(bch2_btree_node_check_topology(trans, b));
1790 return 0;
1791 split:
1792 /*
1793 * We could attempt to avoid the transaction restart, by calling
1794 * bch2_btree_path_upgrade() and allocating more nodes:
1795 */
1796 if (b->c.level >= as->update_level_end) {
1797 trace_and_count(c, trans_restart_split_race, trans, _THIS_IP_, b);
1798 return btree_trans_restart(trans, BCH_ERR_transaction_restart_split_race);
1799 }
1800
1801 return btree_split(as, trans, path_idx, b, keys);
1802 }
1803
1804 int bch2_btree_split_leaf(struct btree_trans *trans,
1805 btree_path_idx_t path,
1806 unsigned flags)
1807 {
1808 /* btree_split & merge may both cause paths array to be reallocated */
1809 struct btree *b = path_l(trans->paths + path)->b;
1810 struct btree_update *as;
1811 unsigned l;
1812 int ret = 0;
1813
1814 as = bch2_btree_update_start(trans, trans->paths + path,
1815 trans->paths[path].level,
1816 true, flags);
1817 if (IS_ERR(as))
1818 return PTR_ERR(as);
1819
1820 ret = btree_split(as, trans, path, b, NULL);
1821 if (ret) {
1822 bch2_btree_update_free(as, trans);
1823 return ret;
1824 }
1825
1826 bch2_btree_update_done(as, trans);
1827
1828 for (l = trans->paths[path].level + 1;
1829 btree_node_intent_locked(&trans->paths[path], l) && !ret;
1830 l++)
1831 ret = bch2_foreground_maybe_merge(trans, path, l, flags);
1832
1833 return ret;
1834 }
1835
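/*
 * Allocate a new root node one level above the current root, make the old
 * root its only child, and switch the btree over to the new root; the old
 * root is then put back on the btree cache live list.
 */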
1836 static void __btree_increase_depth(struct btree_update *as, struct btree_trans *trans,
1837 btree_path_idx_t path_idx)
1838 {
1839 struct bch_fs *c = as->c;
1840 struct btree_path *path = trans->paths + path_idx;
1841 struct btree *n, *b = bch2_btree_id_root(c, path->btree_id)->b;
1842
1843 BUG_ON(!btree_node_locked(path, b->c.level));
1844
1845 n = __btree_root_alloc(as, trans, b->c.level + 1);
1846
1847 bch2_btree_update_add_new_node(as, n);
1848 six_unlock_write(&n->c.lock);
1849
1850 path->locks_want++;
1851 BUG_ON(btree_node_locked(path, n->c.level));
1852 six_lock_increment(&n->c.lock, SIX_LOCK_intent);
1853 mark_btree_node_locked(trans, path, n->c.level, BTREE_NODE_INTENT_LOCKED);
1854 bch2_btree_path_level_init(trans, path, n);
1855
1856 n->sib_u64s[0] = U16_MAX;
1857 n->sib_u64s[1] = U16_MAX;
1858
1859 bch2_keylist_add(&as->parent_keys, &b->key);
1860 btree_split_insert_keys(as, trans, path_idx, n, &as->parent_keys);
1861
1862 bch2_btree_set_root(as, trans, path, n);
1863 bch2_btree_update_get_open_buckets(as, n);
1864 bch2_btree_node_write(c, n, SIX_LOCK_intent, 0);
1865 bch2_trans_node_add(trans, path, n);
1866 six_unlock_intent(&n->c.lock);
1867
1868 mutex_lock(&c->btree_cache.lock);
1869 list_add_tail(&b->list, &c->btree_cache.live);
1870 mutex_unlock(&c->btree_cache.lock);
1871
1872 bch2_trans_verify_locks(trans);
1873 }
1874
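/*
 * If the root is still a fake (empty) node, a plain leaf split is all that's
 * needed; otherwise start an interior update at the root level and add a new
 * root above the current one.
 */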
1875 int bch2_btree_increase_depth(struct btree_trans *trans, btree_path_idx_t path, unsigned flags)
1876 {
1877 struct bch_fs *c = trans->c;
1878 struct btree *b = bch2_btree_id_root(c, trans->paths[path].btree_id)->b;
1879
1880 if (btree_node_fake(b))
1881 return bch2_btree_split_leaf(trans, path, flags);
1882
1883 struct btree_update *as =
1884 bch2_btree_update_start(trans, trans->paths + path, b->c.level, true, flags);
1885 if (IS_ERR(as))
1886 return PTR_ERR(as);
1887
1888 __btree_increase_depth(as, trans, path);
1889 bch2_btree_update_done(as, trans);
1890 return 0;
1891 }
1892
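/*
 * Try to merge @b with its sibling on the @sib side: take a path to the
 * sibling, check that the two nodes share a parent and have contiguous key
 * ranges, and estimate the size of the merged node under a common format.
 * The merge only proceeds if that size is below the foreground merge
 * threshold (with some hysteresis); the new node then replaces both siblings
 * in the parent via a whiteout for the first sibling's key plus the new
 * node's key.
 */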
1893 int __bch2_foreground_maybe_merge(struct btree_trans *trans,
1894 btree_path_idx_t path,
1895 unsigned level,
1896 unsigned flags,
1897 enum btree_node_sibling sib)
1898 {
1899 struct bch_fs *c = trans->c;
1900 struct btree_update *as;
1901 struct bkey_format_state new_s;
1902 struct bkey_format new_f;
1903 struct bkey_i delete;
1904 struct btree *b, *m, *n, *prev, *next, *parent;
1905 struct bpos sib_pos;
1906 size_t sib_u64s;
1907 enum btree_id btree = trans->paths[path].btree_id;
1908 btree_path_idx_t sib_path = 0, new_path = 0;
1909 u64 start_time = local_clock();
1910 int ret = 0;
1911
1912 BUG_ON(!trans->paths[path].should_be_locked);
1913 BUG_ON(!btree_node_locked(&trans->paths[path], level));
1914
1915 b = trans->paths[path].l[level].b;
1916
1917 if ((sib == btree_prev_sib && bpos_eq(b->data->min_key, POS_MIN)) ||
1918 (sib == btree_next_sib && bpos_eq(b->data->max_key, SPOS_MAX))) {
1919 b->sib_u64s[sib] = U16_MAX;
1920 return 0;
1921 }
1922
1923 sib_pos = sib == btree_prev_sib
1924 ? bpos_predecessor(b->data->min_key)
1925 : bpos_successor(b->data->max_key);
1926
1927 sib_path = bch2_path_get(trans, btree, sib_pos,
1928 U8_MAX, level, BTREE_ITER_INTENT, _THIS_IP_);
1929 ret = bch2_btree_path_traverse(trans, sib_path, false);
1930 if (ret)
1931 goto err;
1932
1933 btree_path_set_should_be_locked(trans->paths + sib_path);
1934
1935 m = trans->paths[sib_path].l[level].b;
1936
1937 if (btree_node_parent(trans->paths + path, b) !=
1938 btree_node_parent(trans->paths + sib_path, m)) {
1939 b->sib_u64s[sib] = U16_MAX;
1940 goto out;
1941 }
1942
1943 if (sib == btree_prev_sib) {
1944 prev = m;
1945 next = b;
1946 } else {
1947 prev = b;
1948 next = m;
1949 }
1950
1951 if (!bpos_eq(bpos_successor(prev->data->max_key), next->data->min_key)) {
1952 struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF;
1953
1954 bch2_bpos_to_text(&buf1, prev->data->max_key);
1955 bch2_bpos_to_text(&buf2, next->data->min_key);
1956 bch_err(c,
1957 "%s(): btree topology error:\n"
1958 " prev ends at %s\n"
1959 " next starts at %s",
1960 __func__, buf1.buf, buf2.buf);
1961 printbuf_exit(&buf1);
1962 printbuf_exit(&buf2);
1963 ret = bch2_topology_error(c);
1964 goto err;
1965 }
1966
1967 bch2_bkey_format_init(&new_s);
1968 bch2_bkey_format_add_pos(&new_s, prev->data->min_key);
1969 __bch2_btree_calc_format(&new_s, prev);
1970 __bch2_btree_calc_format(&new_s, next);
1971 bch2_bkey_format_add_pos(&new_s, next->data->max_key);
1972 new_f = bch2_bkey_format_done(&new_s);
1973
1974 sib_u64s = btree_node_u64s_with_format(b->nr, &b->format, &new_f) +
1975 btree_node_u64s_with_format(m->nr, &m->format, &new_f);
1976
1977 if (sib_u64s > BTREE_FOREGROUND_MERGE_HYSTERESIS(c)) {
1978 sib_u64s -= BTREE_FOREGROUND_MERGE_HYSTERESIS(c);
1979 sib_u64s /= 2;
1980 sib_u64s += BTREE_FOREGROUND_MERGE_HYSTERESIS(c);
1981 }
1982
1983 sib_u64s = min(sib_u64s, btree_max_u64s(c));
1984 sib_u64s = min(sib_u64s, (size_t) U16_MAX - 1);
1985 b->sib_u64s[sib] = sib_u64s;
1986
1987 if (b->sib_u64s[sib] > c->btree_foreground_merge_threshold)
1988 goto out;
1989
1990 parent = btree_node_parent(trans->paths + path, b);
1991 as = bch2_btree_update_start(trans, trans->paths + path, level, false,
1992 BCH_TRANS_COMMIT_no_enospc|flags);
1993 ret = PTR_ERR_OR_ZERO(as);
1994 if (ret)
1995 goto err;
1996
1997 trace_and_count(c, btree_node_merge, trans, b);
1998
1999 bch2_btree_interior_update_will_free_node(as, b);
2000 bch2_btree_interior_update_will_free_node(as, m);
2001
2002 n = bch2_btree_node_alloc(as, trans, b->c.level);
2003
2004 SET_BTREE_NODE_SEQ(n->data,
2005 max(BTREE_NODE_SEQ(b->data),
2006 BTREE_NODE_SEQ(m->data)) + 1);
2007
2008 btree_set_min(n, prev->data->min_key);
2009 btree_set_max(n, next->data->max_key);
2010
2011 n->data->format = new_f;
2012 btree_node_set_format(n, new_f);
2013
2014 bch2_btree_sort_into(c, n, prev);
2015 bch2_btree_sort_into(c, n, next);
2016
2017 bch2_btree_build_aux_trees(n);
2018 bch2_btree_update_add_new_node(as, n);
2019 six_unlock_write(&n->c.lock);
2020
2021 new_path = get_unlocked_mut_path(trans, btree, n->c.level, n->key.k.p);
2022 six_lock_increment(&n->c.lock, SIX_LOCK_intent);
2023 mark_btree_node_locked(trans, trans->paths + new_path, n->c.level, BTREE_NODE_INTENT_LOCKED);
2024 bch2_btree_path_level_init(trans, trans->paths + new_path, n);
2025
2026 bkey_init(&delete.k);
2027 delete.k.p = prev->key.k.p;
2028 bch2_keylist_add(&as->parent_keys, &delete);
2029 bch2_keylist_add(&as->parent_keys, &n->key);
2030
2031 bch2_trans_verify_paths(trans);
2032
2033 ret = bch2_btree_insert_node(as, trans, path, parent, &as->parent_keys);
2034 if (ret)
2035 goto err_free_update;
2036
2037 bch2_trans_verify_paths(trans);
2038
2039 bch2_btree_update_get_open_buckets(as, n);
2040 bch2_btree_node_write(c, n, SIX_LOCK_intent, 0);
2041
2042 bch2_btree_node_free_inmem(trans, trans->paths + path, b);
2043 bch2_btree_node_free_inmem(trans, trans->paths + sib_path, m);
2044
2045 bch2_trans_node_add(trans, trans->paths + path, n);
2046
2047 bch2_trans_verify_paths(trans);
2048
2049 six_unlock_intent(&n->c.lock);
2050
2051 bch2_btree_update_done(as, trans);
2052
2053 bch2_time_stats_update(&c->times[BCH_TIME_btree_node_merge], start_time);
2054 out:
2055 err:
2056 if (new_path)
2057 bch2_path_put(trans, new_path, true);
2058 bch2_path_put(trans, sib_path, true);
2059 bch2_trans_verify_locks(trans);
2060 return ret;
2061 err_free_update:
2062 bch2_btree_node_free_never_used(as, trans, n);
2063 bch2_btree_update_free(as, trans);
2064 goto out;
2065 }
2066
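/*
 * Rewrite @b into a freshly allocated replacement node and swap it in, either
 * by updating the pointer in the parent or, for a root node, by setting the
 * new node as root.
 */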
2067 int bch2_btree_node_rewrite(struct btree_trans *trans,
2068 struct btree_iter *iter,
2069 struct btree *b,
2070 unsigned flags)
2071 {
2072 struct bch_fs *c = trans->c;
2073 struct btree *n, *parent;
2074 struct btree_update *as;
2075 btree_path_idx_t new_path = 0;
2076 int ret;
2077
2078 flags |= BCH_TRANS_COMMIT_no_enospc;
2079
2080 struct btree_path *path = btree_iter_path(trans, iter);
2081 parent = btree_node_parent(path, b);
2082 as = bch2_btree_update_start(trans, path, b->c.level, false, flags);
2083 ret = PTR_ERR_OR_ZERO(as);
2084 if (ret)
2085 goto out;
2086
2087 bch2_btree_interior_update_will_free_node(as, b);
2088
2089 n = bch2_btree_node_alloc_replacement(as, trans, b);
2090
2091 bch2_btree_build_aux_trees(n);
2092 bch2_btree_update_add_new_node(as, n);
2093 six_unlock_write(&n->c.lock);
2094
2095 new_path = get_unlocked_mut_path(trans, iter->btree_id, n->c.level, n->key.k.p);
2096 six_lock_increment(&n->c.lock, SIX_LOCK_intent);
2097 mark_btree_node_locked(trans, trans->paths + new_path, n->c.level, BTREE_NODE_INTENT_LOCKED);
2098 bch2_btree_path_level_init(trans, trans->paths + new_path, n);
2099
2100 trace_and_count(c, btree_node_rewrite, trans, b);
2101
2102 if (parent) {
2103 bch2_keylist_add(&as->parent_keys, &n->key);
2104 ret = bch2_btree_insert_node(as, trans, iter->path, parent, &as->parent_keys);
2105 if (ret)
2106 goto err;
2107 } else {
2108 bch2_btree_set_root(as, trans, btree_iter_path(trans, iter), n);
2109 }
2110
2111 bch2_btree_update_get_open_buckets(as, n);
2112 bch2_btree_node_write(c, n, SIX_LOCK_intent, 0);
2113
2114 bch2_btree_node_free_inmem(trans, btree_iter_path(trans, iter), b);
2115
2116 bch2_trans_node_add(trans, trans->paths + iter->path, n);
2117 six_unlock_intent(&n->c.lock);
2118
2119 bch2_btree_update_done(as, trans);
2120 out:
2121 if (new_path)
2122 bch2_path_put(trans, new_path, true);
2123 bch2_trans_downgrade(trans);
2124 return ret;
2125 err:
2126 bch2_btree_node_free_never_used(as, trans, n);
2127 bch2_btree_update_free(as, trans);
2128 goto out;
2129 }
2130
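/*
 * A deferred node rewrite request, run from a workqueue; @seq records the
 * node's sequence number so the rewrite can be skipped if the node has since
 * been freed or rewritten.
 */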
2131 struct async_btree_rewrite {
2132 struct bch_fs *c;
2133 struct work_struct work;
2134 struct list_head list;
2135 enum btree_id btree_id;
2136 unsigned level;
2137 struct bpos pos;
2138 __le64 seq;
2139 };
2140
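/*
 * Look the node up again by btree id, position and level; only rewrite it if
 * it still exists and its sequence number matches the one recorded when the
 * rewrite was queued.
 */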
2141 static int async_btree_node_rewrite_trans(struct btree_trans *trans,
2142 struct async_btree_rewrite *a)
2143 {
2144 struct bch_fs *c = trans->c;
2145 struct btree_iter iter;
2146 struct btree *b;
2147 int ret;
2148
2149 bch2_trans_node_iter_init(trans, &iter, a->btree_id, a->pos,
2150 BTREE_MAX_DEPTH, a->level, 0);
2151 b = bch2_btree_iter_peek_node(&iter);
2152 ret = PTR_ERR_OR_ZERO(b);
2153 if (ret)
2154 goto out;
2155
2156 if (!b || b->data->keys.seq != a->seq) {
2157 struct printbuf buf = PRINTBUF;
2158
2159 if (b)
2160 bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
2161 else
2162 prt_str(&buf, "(null)");
2163 bch_info(c, "%s: node to rewrite not found; searching for seq %llu, got\n%s",
2164 __func__, a->seq, buf.buf);
2165 printbuf_exit(&buf);
2166 goto out;
2167 }
2168
2169 ret = bch2_btree_node_rewrite(trans, &iter, b, 0);
2170 out:
2171 bch2_trans_iter_exit(trans, &iter);
2172
2173 return ret;
2174 }
2175
2176 static void async_btree_node_rewrite_work(struct work_struct *work)
2177 {
2178 struct async_btree_rewrite *a =
2179 container_of(work, struct async_btree_rewrite, work);
2180 struct bch_fs *c = a->c;
2181 int ret;
2182
2183 ret = bch2_trans_do(c, NULL, NULL, 0,
2184 async_btree_node_rewrite_trans(trans, a));
2185 bch_err_fn_ratelimited(c, ret);
2186 bch2_write_ref_put(c, BCH_WRITE_REF_node_rewrite);
2187 kfree(a);
2188 }
2189
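/*
 * Queue an asynchronous rewrite of @b. If the filesystem isn't allowed to go
 * read-write yet, the request is parked on pending_node_rewrites and run
 * later by bch2_do_pending_node_rewrites().
 */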
2190 void bch2_btree_node_rewrite_async(struct bch_fs *c, struct btree *b)
2191 {
2192 struct async_btree_rewrite *a;
2193 int ret;
2194
2195 a = kmalloc(sizeof(*a), GFP_NOFS);
2196 if (!a) {
2197 bch_err(c, "%s: error allocating memory", __func__);
2198 return;
2199 }
2200
2201 a->c = c;
2202 a->btree_id = b->c.btree_id;
2203 a->level = b->c.level;
2204 a->pos = b->key.k.p;
2205 a->seq = b->data->keys.seq;
2206 INIT_WORK(&a->work, async_btree_node_rewrite_work);
2207
2208 if (unlikely(!test_bit(BCH_FS_may_go_rw, &c->flags))) {
2209 mutex_lock(&c->pending_node_rewrites_lock);
2210 list_add(&a->list, &c->pending_node_rewrites);
2211 mutex_unlock(&c->pending_node_rewrites_lock);
2212 return;
2213 }
2214
2215 if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_node_rewrite)) {
2216 if (test_bit(BCH_FS_started, &c->flags)) {
2217 bch_err(c, "%s: error getting c->writes ref", __func__);
2218 kfree(a);
2219 return;
2220 }
2221
2222 ret = bch2_fs_read_write_early(c);
2223 bch_err_msg(c, ret, "going read-write");
2224 if (ret) {
2225 kfree(a);
2226 return;
2227 }
2228
2229 bch2_write_ref_get(c, BCH_WRITE_REF_node_rewrite);
2230 }
2231
2232 queue_work(c->btree_node_rewrite_worker, &a->work);
2233 }
2234
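/* Kick off any node rewrites that were queued before the filesystem could go read-write: */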
2235 void bch2_do_pending_node_rewrites(struct bch_fs *c)
2236 {
2237 struct async_btree_rewrite *a, *n;
2238
2239 mutex_lock(&c->pending_node_rewrites_lock);
2240 list_for_each_entry_safe(a, n, &c->pending_node_rewrites, list) {
2241 list_del(&a->list);
2242
2243 bch2_write_ref_get(c, BCH_WRITE_REF_node_rewrite);
2244 queue_work(c->btree_node_rewrite_worker, &a->work);
2245 }
2246 mutex_unlock(&c->pending_node_rewrites_lock);
2247 }
2248
2249 void bch2_free_pending_node_rewrites(struct bch_fs *c)
2250 {
2251 struct async_btree_rewrite *a, *n;
2252
2253 mutex_lock(&c->pending_node_rewrites_lock);
2254 list_for_each_entry_safe(a, n, &c->pending_node_rewrites, list) {
2255 list_del(&a->list);
2256
2257 kfree(a);
2258 }
2259 mutex_unlock(&c->pending_node_rewrites_lock);
2260 }
2261
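/*
 * Update a btree node's key: run triggers for the old and new keys, update
 * the pointer in the parent (or emit a new btree_root journal entry if @b is
 * a root), then update the in-memory key. If the new key hashes differently,
 * @new_hash is used to hold the new hash table slot until @b can be rehashed
 * under its new key.
 */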
2262 static int __bch2_btree_node_update_key(struct btree_trans *trans,
2263 struct btree_iter *iter,
2264 struct btree *b, struct btree *new_hash,
2265 struct bkey_i *new_key,
2266 unsigned commit_flags,
2267 bool skip_triggers)
2268 {
2269 struct bch_fs *c = trans->c;
2270 struct btree_iter iter2 = { NULL };
2271 struct btree *parent;
2272 int ret;
2273
2274 if (!skip_triggers) {
2275 ret = bch2_key_trigger_old(trans, b->c.btree_id, b->c.level + 1,
2276 bkey_i_to_s_c(&b->key),
2277 BTREE_TRIGGER_TRANSACTIONAL) ?:
2278 bch2_key_trigger_new(trans, b->c.btree_id, b->c.level + 1,
2279 bkey_i_to_s(new_key),
2280 BTREE_TRIGGER_TRANSACTIONAL);
2281 if (ret)
2282 return ret;
2283 }
2284
2285 if (new_hash) {
2286 bkey_copy(&new_hash->key, new_key);
2287 ret = bch2_btree_node_hash_insert(&c->btree_cache,
2288 new_hash, b->c.level, b->c.btree_id);
2289 BUG_ON(ret);
2290 }
2291
2292 parent = btree_node_parent(btree_iter_path(trans, iter), b);
2293 if (parent) {
2294 bch2_trans_copy_iter(&iter2, iter);
2295
2296 iter2.path = bch2_btree_path_make_mut(trans, iter2.path,
2297 iter2.flags & BTREE_ITER_INTENT,
2298 _THIS_IP_);
2299
2300 struct btree_path *path2 = btree_iter_path(trans, &iter2);
2301 BUG_ON(path2->level != b->c.level);
2302 BUG_ON(!bpos_eq(path2->pos, new_key->k.p));
2303
2304 btree_path_set_level_up(trans, path2);
2305
2306 trans->paths_sorted = false;
2307
2308 ret = bch2_btree_iter_traverse(&iter2) ?:
2309 bch2_trans_update(trans, &iter2, new_key, BTREE_TRIGGER_NORUN);
2310 if (ret)
2311 goto err;
2312 } else {
2313 BUG_ON(btree_node_root(c, b) != b);
2314
2315 struct jset_entry *e = bch2_trans_jset_entry_alloc(trans,
2316 jset_u64s(new_key->k.u64s));
2317 ret = PTR_ERR_OR_ZERO(e);
2318 if (ret)
2319 return ret;
2320
2321 journal_entry_set(e,
2322 BCH_JSET_ENTRY_btree_root,
2323 b->c.btree_id, b->c.level,
2324 new_key, new_key->k.u64s);
2325 }
2326
2327 ret = bch2_trans_commit(trans, NULL, NULL, commit_flags);
2328 if (ret)
2329 goto err;
2330
2331 bch2_btree_node_lock_write_nofail(trans, btree_iter_path(trans, iter), &b->c);
2332
2333 if (new_hash) {
2334 mutex_lock(&c->btree_cache.lock);
2335 bch2_btree_node_hash_remove(&c->btree_cache, new_hash);
2336 bch2_btree_node_hash_remove(&c->btree_cache, b);
2337
2338 bkey_copy(&b->key, new_key);
2339 ret = __bch2_btree_node_hash_insert(&c->btree_cache, b);
2340 BUG_ON(ret);
2341 mutex_unlock(&c->btree_cache.lock);
2342 } else {
2343 bkey_copy(&b->key, new_key);
2344 }
2345
2346 bch2_btree_node_unlock_write(trans, btree_iter_path(trans, iter), b);
2347 out:
2348 bch2_trans_iter_exit(trans, &iter2);
2349 return ret;
2350 err:
2351 if (new_hash) {
2352 mutex_lock(&c->btree_cache.lock);
2353 bch2_btree_node_hash_remove(&c->btree_cache, b);
2354 mutex_unlock(&c->btree_cache.lock);
2355 }
2356 goto out;
2357 }
2358
2359 int bch2_btree_node_update_key(struct btree_trans *trans, struct btree_iter *iter,
2360 struct btree *b, struct bkey_i *new_key,
2361 unsigned commit_flags, bool skip_triggers)
2362 {
2363 struct bch_fs *c = trans->c;
2364 struct btree *new_hash = NULL;
2365 struct btree_path *path = btree_iter_path(trans, iter);
2366 struct closure cl;
2367 int ret = 0;
2368
2369 ret = bch2_btree_path_upgrade(trans, path, b->c.level + 1);
2370 if (ret)
2371 return ret;
2372
2373 closure_init_stack(&cl);
2374
2375 /*
2376 * check btree_ptr_hash_val() after @b is locked by
2377 * btree_iter_traverse():
2378 */
2379 if (btree_ptr_hash_val(new_key) != b->hash_val) {
2380 ret = bch2_btree_cache_cannibalize_lock(trans, &cl);
2381 if (ret) {
2382 ret = drop_locks_do(trans, (closure_sync(&cl), 0));
2383 if (ret)
2384 return ret;
2385 }
2386
2387 new_hash = bch2_btree_node_mem_alloc(trans, false);
2388 }
2389
2390 path->intent_ref++;
2391 ret = __bch2_btree_node_update_key(trans, iter, b, new_hash, new_key,
2392 commit_flags, skip_triggers);
2393 --path->intent_ref;
2394
2395 if (new_hash) {
2396 mutex_lock(&c->btree_cache.lock);
2397 list_move(&new_hash->list, &c->btree_cache.freeable);
2398 mutex_unlock(&c->btree_cache.lock);
2399
2400 six_unlock_write(&new_hash->c.lock);
2401 six_unlock_intent(&new_hash->c.lock);
2402 }
2403 closure_sync(&cl);
2404 bch2_btree_cache_cannibalize_unlock(trans);
2405 return ret;
2406 }
2407
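/*
 * As bch2_btree_node_update_key(), but looks up the node via its current key
 * first; pointers in @new_key to devices not present in the existing key are
 * dropped before the update.
 */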
2408 int bch2_btree_node_update_key_get_iter(struct btree_trans *trans,
2409 struct btree *b, struct bkey_i *new_key,
2410 unsigned commit_flags, bool skip_triggers)
2411 {
2412 struct btree_iter iter;
2413 int ret;
2414
2415 bch2_trans_node_iter_init(trans, &iter, b->c.btree_id, b->key.k.p,
2416 BTREE_MAX_DEPTH, b->c.level,
2417 BTREE_ITER_INTENT);
2418 ret = bch2_btree_iter_traverse(&iter);
2419 if (ret)
2420 goto out;
2421
2422 /* has the node been freed in the meantime? */
2423 if (btree_iter_path(trans, &iter)->l[b->c.level].b != b) {
2424 /* yes - it had better be dying: */
2425 BUG_ON(!btree_node_dying(b));
2426 goto out;
2427 }
2428
2429 BUG_ON(!btree_node_hashed(b));
2430
2431 struct bch_extent_ptr *ptr;
2432 bch2_bkey_drop_ptrs(bkey_i_to_s(new_key), ptr,
2433 !bch2_bkey_has_device(bkey_i_to_s(&b->key), ptr->dev));
2434
2435 ret = bch2_btree_node_update_key(trans, &iter, b, new_key,
2436 commit_flags, skip_triggers);
2437 out:
2438 bch2_trans_iter_exit(trans, &iter);
2439 return ret;
2440 }
2441
2442 /* Init code: */
2443
2444 /*
2445 * Only for filesystem bringup: used when first reading the btree roots, or
2446 * when allocating them while initializing a new filesystem.
2447 */
2448 void bch2_btree_set_root_for_read(struct bch_fs *c, struct btree *b)
2449 {
2450 BUG_ON(btree_node_root(c, b));
2451
2452 bch2_btree_set_root_inmem(c, b);
2453 }
2454
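/*
 * Allocate an in-memory placeholder root for an empty btree: the node is
 * marked fake and need_rewrite, and is hashed and set as the root without
 * being written to disk here.
 */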
2455 static int __bch2_btree_root_alloc_fake(struct btree_trans *trans, enum btree_id id, unsigned level)
2456 {
2457 struct bch_fs *c = trans->c;
2458 struct closure cl;
2459 struct btree *b;
2460 int ret;
2461
2462 closure_init_stack(&cl);
2463
2464 do {
2465 ret = bch2_btree_cache_cannibalize_lock(trans, &cl);
2466 closure_sync(&cl);
2467 } while (ret);
2468
2469 b = bch2_btree_node_mem_alloc(trans, false);
2470 bch2_btree_cache_cannibalize_unlock(trans);
2471
2472 set_btree_node_fake(b);
2473 set_btree_node_need_rewrite(b);
2474 b->c.level = level;
2475 b->c.btree_id = id;
2476
2477 bkey_btree_ptr_init(&b->key);
2478 b->key.k.p = SPOS_MAX;
2479 *((u64 *) bkey_i_to_btree_ptr(&b->key)->v.start) = U64_MAX - id;
2480
2481 bch2_bset_init_first(b, &b->data->keys);
2482 bch2_btree_build_aux_trees(b);
2483
2484 b->data->flags = 0;
2485 btree_set_min(b, POS_MIN);
2486 btree_set_max(b, SPOS_MAX);
2487 b->data->format = bch2_btree_calc_format(b);
2488 btree_node_set_format(b, b->data->format);
2489
2490 ret = bch2_btree_node_hash_insert(&c->btree_cache, b,
2491 b->c.level, b->c.btree_id);
2492 BUG_ON(ret);
2493
2494 bch2_btree_set_root_inmem(c, b);
2495
2496 six_unlock_write(&b->c.lock);
2497 six_unlock_intent(&b->c.lock);
2498 return 0;
2499 }
2500
2501 void bch2_btree_root_alloc_fake(struct bch_fs *c, enum btree_id id, unsigned level)
2502 {
2503 bch2_trans_run(c, __bch2_btree_root_alloc_fake(trans, id, level));
2504 }
2505
2506 static void bch2_btree_update_to_text(struct printbuf *out, struct btree_update *as)
2507 {
2508 prt_printf(out, "%ps: btree=%s l=%u-%u watermark=%s mode=%s nodes_written=%u cl.remaining=%u journal_seq=%llu\n",
2509 (void *) as->ip_started,
2510 bch2_btree_id_str(as->btree_id),
2511 as->update_level_start,
2512 as->update_level_end,
2513 bch2_watermarks[as->watermark],
2514 bch2_btree_update_modes[as->mode],
2515 as->nodes_written,
2516 closure_nr_remaining(&as->cl),
2517 as->journal.seq);
2518 }
2519
2520 void bch2_btree_updates_to_text(struct printbuf *out, struct bch_fs *c)
2521 {
2522 struct btree_update *as;
2523
2524 mutex_lock(&c->btree_interior_update_lock);
2525 list_for_each_entry(as, &c->btree_interior_update_list, list)
2526 bch2_btree_update_to_text(out, as);
2527 mutex_unlock(&c->btree_interior_update_lock);
2528 }
2529
2530 static bool bch2_btree_interior_updates_pending(struct bch_fs *c)
2531 {
2532 bool ret;
2533
2534 mutex_lock(&c->btree_interior_update_lock);
2535 ret = !list_empty(&c->btree_interior_update_list);
2536 mutex_unlock(&c->btree_interior_update_lock);
2537
2538 return ret;
2539 }
2540
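/*
 * Wait for all in-flight interior updates to complete; returns true if there
 * were any to wait for.
 */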
2541 bool bch2_btree_interior_updates_flush(struct bch_fs *c)
2542 {
2543 bool ret = bch2_btree_interior_updates_pending(c);
2544
2545 if (ret)
2546 closure_wait_event(&c->btree_interior_update_wait,
2547 !bch2_btree_interior_updates_pending(c));
2548 return ret;
2549 }
2550
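/* Set the in-memory root for a btree from a btree_root journal entry: */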
2551 void bch2_journal_entry_to_btree_root(struct bch_fs *c, struct jset_entry *entry)
2552 {
2553 struct btree_root *r = bch2_btree_id_root(c, entry->btree_id);
2554
2555 mutex_lock(&c->btree_root_lock);
2556
2557 r->level = entry->level;
2558 r->alive = true;
2559 bkey_copy(&r->key, (struct bkey_i *) entry->start);
2560
2561 mutex_unlock(&c->btree_root_lock);
2562 }
2563
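/*
 * Emit a btree_root journal entry for every live btree root whose id isn't
 * set in @skip; returns a pointer just past the last entry written.
 */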
2564 struct jset_entry *
2565 bch2_btree_roots_to_journal_entries(struct bch_fs *c,
2566 struct jset_entry *end,
2567 unsigned long skip)
2568 {
2569 unsigned i;
2570
2571 mutex_lock(&c->btree_root_lock);
2572
2573 for (i = 0; i < btree_id_nr_alive(c); i++) {
2574 struct btree_root *r = bch2_btree_id_root(c, i);
2575
2576 if (r->alive && !test_bit(i, &skip)) {
2577 journal_entry_set(end, BCH_JSET_ENTRY_btree_root,
2578 i, r->level, &r->key, r->key.k.u64s);
2579 end = vstruct_next(end);
2580 }
2581 }
2582
2583 mutex_unlock(&c->btree_root_lock);
2584
2585 return end;
2586 }
2587
2588 void bch2_fs_btree_interior_update_exit(struct bch_fs *c)
2589 {
2590 if (c->btree_node_rewrite_worker)
2591 destroy_workqueue(c->btree_node_rewrite_worker);
2592 if (c->btree_interior_update_worker)
2593 destroy_workqueue(c->btree_interior_update_worker);
2594 mempool_exit(&c->btree_interior_update_pool);
2595 }
2596
2597 void bch2_fs_btree_interior_update_init_early(struct bch_fs *c)
2598 {
2599 mutex_init(&c->btree_reserve_cache_lock);
2600 INIT_LIST_HEAD(&c->btree_interior_update_list);
2601 INIT_LIST_HEAD(&c->btree_interior_updates_unwritten);
2602 mutex_init(&c->btree_interior_update_lock);
2603 INIT_WORK(&c->btree_interior_update_work, btree_interior_update_work);
2604
2605 INIT_LIST_HEAD(&c->pending_node_rewrites);
2606 mutex_init(&c->pending_node_rewrites_lock);
2607 }
2608
2609 int bch2_fs_btree_interior_update_init(struct bch_fs *c)
2610 {
2611 c->btree_interior_update_worker =
2612 alloc_workqueue("btree_update", WQ_UNBOUND|WQ_MEM_RECLAIM, 8);
2613 if (!c->btree_interior_update_worker)
2614 return -BCH_ERR_ENOMEM_btree_interior_update_worker_init;
2615
2616 c->btree_node_rewrite_worker =
2617 alloc_ordered_workqueue("btree_node_rewrite", WQ_UNBOUND);
2618 if (!c->btree_node_rewrite_worker)
2619 return -BCH_ERR_ENOMEM_btree_interior_update_worker_init;
2620
2621 if (mempool_init_kmalloc_pool(&c->btree_interior_update_pool, 1,
2622 sizeof(struct btree_update)))
2623 return -BCH_ERR_ENOMEM_btree_interior_update_pool_init;
2624
2625 return 0;
2626 }