// SPDX-License-Identifier: GPL-2.0

#include "bcachefs.h"
#include "alloc_foreground.h"
#include "bkey_buf.h"
#include "bkey_methods.h"
#include "btree_cache.h"
#include "btree_gc.h"
#include "btree_journal_iter.h"
#include "btree_update.h"
#include "btree_update_interior.h"
#include "btree_io.h"
#include "btree_iter.h"
#include "btree_locking.h"
#include "buckets.h"
#include "clock.h"
#include "error.h"
#include "extents.h"
#include "journal.h"
#include "journal_reclaim.h"
#include "keylist.h"
#include "recovery_passes.h"
#include "replicas.h"
#include "super-io.h"
#include "trace.h"

#include <linux/random.h>
static const char * const bch2_btree_update_modes[] = {
#define x(t) #t,
	BTREE_UPDATE_MODES()
#undef x
	NULL
};
static int bch2_btree_insert_node(struct btree_update *, struct btree_trans *,
				  btree_path_idx_t, struct btree *, struct keylist *);
static void bch2_btree_update_add_new_node(struct btree_update *, struct btree *);
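
/*
 * Get a new btree_path pointing at @pos at @level, made private to the caller
 * (BTREE_ITER_NOPRESERVE + bch2_btree_path_make_mut()), then downgrade it and
 * drop all of its locks: callers re-take the node locks by hand, in contexts
 * where the usual lock ordering doesn't apply.
 */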
static btree_path_idx_t get_unlocked_mut_path(struct btree_trans *trans,
					      enum btree_id btree_id,
					      unsigned level,
					      struct bpos pos)
{
	btree_path_idx_t path_idx = bch2_path_get(trans, btree_id, pos, level + 1, level,
			     BTREE_ITER_NOPRESERVE|
			     BTREE_ITER_INTENT, _RET_IP_);
	path_idx = bch2_btree_path_make_mut(trans, path_idx, true, _RET_IP_);

	struct btree_path *path = trans->paths + path_idx;
	bch2_btree_path_downgrade(trans, path);
	__bch2_btree_path_unlock(trans, path);
	return path_idx;
}
/*
 * Verify that child nodes correctly span parent node's range:
 */
int bch2_btree_node_check_topology(struct btree_trans *trans, struct btree *b)
{
	struct bch_fs *c = trans->c;
	struct bpos node_min = b->key.k.type == KEY_TYPE_btree_ptr_v2
		? bkey_i_to_btree_ptr_v2(&b->key)->v.min_key
		: b->data->min_key;
	struct btree_and_journal_iter iter;
	struct bkey_s_c k;
	struct printbuf buf = PRINTBUF;
	struct bkey_buf prev;
	int ret = 0;

	BUG_ON(b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
	       !bpos_eq(bkey_i_to_btree_ptr_v2(&b->key)->v.min_key,
			b->data->min_key));

	if (!b->c.level)
		return 0;

	bch2_bkey_buf_init(&prev);
	bkey_init(&prev.k->k);
	bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b);

	while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
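		/*
		 * Walk the child pointers in order. btree_ptr_v2 keys record
		 * the range each child covers as [min_key, k.p]; children must
		 * tile the parent's range exactly, so each child's min_key has
		 * to be the successor of the previous child's max key.
		 */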
		if (k.k->type != KEY_TYPE_btree_ptr_v2)
			goto out;

		struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k);

		struct bpos expected_min = bkey_deleted(&prev.k->k)
			? node_min
			: bpos_successor(prev.k->k.p);

		if (!bpos_eq(expected_min, bp.v->min_key)) {
			bch2_topology_error(c);

			printbuf_reset(&buf);
			prt_str(&buf, "end of prev node doesn't match start of next node\n");
			prt_printf(&buf, "  in btree %s level %u node ",
				   bch2_btree_id_str(b->c.btree_id), b->c.level);
			bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
			prt_str(&buf, "\n  prev ");
			bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(prev.k));
			prt_str(&buf, "\n  next ");
			bch2_bkey_val_to_text(&buf, c, k);

			need_fsck_err(c, btree_node_topology_bad_min_key, "%s", buf.buf);
			goto topology_repair;
		}

		bch2_bkey_buf_reassemble(&prev, c, k);
		bch2_btree_and_journal_iter_advance(&iter);
	}

	if (bkey_deleted(&prev.k->k)) {
		bch2_topology_error(c);

		printbuf_reset(&buf);
		prt_str(&buf, "empty interior node\n");
		prt_printf(&buf, "  in btree %s level %u node ",
			   bch2_btree_id_str(b->c.btree_id), b->c.level);
		bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));

		need_fsck_err(c, btree_node_topology_empty_interior_node, "%s", buf.buf);
		goto topology_repair;
	} else if (!bpos_eq(prev.k->k.p, b->key.k.p)) {
		bch2_topology_error(c);

		printbuf_reset(&buf);
		prt_str(&buf, "last child node doesn't end at end of parent node\n");
		prt_printf(&buf, "  in btree %s level %u node ",
			   bch2_btree_id_str(b->c.btree_id), b->c.level);
		bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
		prt_str(&buf, "\n  last key ");
		bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(prev.k));

		need_fsck_err(c, btree_node_topology_bad_max_key, "%s", buf.buf);
		goto topology_repair;
	}
out:
fsck_err:
	bch2_btree_and_journal_iter_exit(&iter);
	bch2_bkey_buf_exit(&prev, c);
	printbuf_exit(&buf);
	return ret;
topology_repair:
	if ((c->recovery_passes_explicit & BIT_ULL(BCH_RECOVERY_PASS_check_topology)) &&
	    c->curr_recovery_pass > BCH_RECOVERY_PASS_check_topology) {
		bch2_inconsistent_error(c);
		ret = -BCH_ERR_btree_need_topology_repair;
	} else {
		ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology);
	}
	goto out;
}
/* Calculate ideal packed bkey format for new btree nodes: */

static void __bch2_btree_calc_format(struct bkey_format_state *s, struct btree *b)
{
	struct bkey_packed *k;
	struct bset_tree *t;
	struct bkey uk;

	for_each_bset(b, t)
		bset_tree_for_each_key(b, t, k)
			if (!bkey_deleted(k)) {
				uk = bkey_unpack_key(b, k);
				bch2_bkey_format_add_key(s, &uk);
			}
}
static struct bkey_format bch2_btree_calc_format(struct btree *b)
{
	struct bkey_format_state s;

	bch2_bkey_format_init(&s);
	bch2_bkey_format_add_pos(&s, b->data->min_key);
	bch2_bkey_format_add_pos(&s, b->data->max_key);
	__bch2_btree_calc_format(&s, b);

	return bch2_bkey_format_done(&s);
}
static size_t btree_node_u64s_with_format(struct btree_nr_keys nr,
					  struct bkey_format *old_f,
					  struct bkey_format *new_f)
{
	/* stupid integer promotion rules */
	ssize_t delta =
	    (((int) new_f->key_u64s - old_f->key_u64s) *
	     (int) nr.packed_keys) +
	    (((int) new_f->key_u64s - BKEY_U64s) *
	     (int) nr.unpacked_keys);

	BUG_ON(delta + nr.live_u64s < 0);

	return nr.live_u64s + delta;
}
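
/*
 * Example (illustrative numbers): repacking from a format with key_u64s == 3
 * to one with key_u64s == 2, a node with 100 packed and 4 unpacked keys
 * changes by (2 - 3) * 100 + (2 - BKEY_U64s) * 4 u64s: every packed key
 * shrinks by one u64, and every formerly-unpacked key (stored as a full
 * BKEY_U64s bkey) now takes new_f->key_u64s.
 */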
/**
 * bch2_btree_node_format_fits - check if we could rewrite node with a new format
 *
 * @c:		filesystem handle
 * @b:		btree node to rewrite
 * @nr:		number of keys for new node (i.e. b->nr)
 * @new_f:	bkey format to translate keys to
 *
 * Returns: true if all re-packed keys will be able to fit in a new node.
 *
 * Assumes all keys will successfully pack with the new format.
 */
static bool bch2_btree_node_format_fits(struct bch_fs *c, struct btree *b,
					struct btree_nr_keys nr,
					struct bkey_format *new_f)
{
	size_t u64s = btree_node_u64s_with_format(nr, &b->format, new_f);

	return __vstruct_bytes(struct btree_node, u64s) < btree_buf_bytes(b);
}
/* Btree node freeing/allocation: */

static void __btree_node_free(struct btree_trans *trans, struct btree *b)
{
	struct bch_fs *c = trans->c;

	trace_and_count(c, btree_node_free, trans, b);

	BUG_ON(btree_node_write_blocked(b));
	BUG_ON(btree_node_dirty(b));
	BUG_ON(btree_node_need_write(b));
	BUG_ON(b == btree_node_root(c, b));
	BUG_ON(b->ob.nr);
	BUG_ON(!list_empty(&b->write_blocked));
	BUG_ON(b->will_make_reachable);

	clear_btree_node_noevict(b);

	mutex_lock(&c->btree_cache.lock);
	list_move(&b->list, &c->btree_cache.freeable);
	mutex_unlock(&c->btree_cache.lock);
}
static void bch2_btree_node_free_inmem(struct btree_trans *trans,
				       struct btree_path *path,
				       struct btree *b)
{
	struct bch_fs *c = trans->c;
	unsigned i, level = b->c.level;

	bch2_btree_node_lock_write_nofail(trans, path, &b->c);
	bch2_btree_node_hash_remove(&c->btree_cache, b);
	__btree_node_free(trans, b);
	six_unlock_write(&b->c.lock);
	mark_btree_node_locked_noreset(path, level, BTREE_NODE_INTENT_LOCKED);

	trans_for_each_path(trans, path, i)
		if (path->l[level].b == b) {
			btree_node_unlock(trans, path, level);
			path->l[level].b = ERR_PTR(-BCH_ERR_no_btree_node_init);
		}
}
static void bch2_btree_node_free_never_used(struct btree_update *as,
					    struct btree_trans *trans,
					    struct btree *b)
{
	struct bch_fs *c = as->c;
	struct prealloc_nodes *p = &as->prealloc_nodes[b->c.lock.readers != NULL];
	struct btree_path *path;
	unsigned i, level = b->c.level;

	BUG_ON(!list_empty(&b->write_blocked));
	BUG_ON(b->will_make_reachable != (1UL|(unsigned long) as));

	b->will_make_reachable = 0;
	closure_put(&as->cl);

	clear_btree_node_will_make_reachable(b);
	clear_btree_node_accessed(b);
	clear_btree_node_dirty_acct(c, b);
	clear_btree_node_need_write(b);

	mutex_lock(&c->btree_cache.lock);
	list_del_init(&b->list);
	bch2_btree_node_hash_remove(&c->btree_cache, b);
	mutex_unlock(&c->btree_cache.lock);

	BUG_ON(p->nr >= ARRAY_SIZE(p->b));
	p->b[p->nr++] = b;

	six_unlock_intent(&b->c.lock);

	trans_for_each_path(trans, path, i)
		if (path->l[level].b == b) {
			btree_node_unlock(trans, path, level);
			path->l[level].b = ERR_PTR(-BCH_ERR_no_btree_node_init);
		}
}
static struct btree *__bch2_btree_node_alloc(struct btree_trans *trans,
					     struct disk_reservation *res,
					     struct closure *cl,
					     bool interior_node,
					     unsigned flags)
{
	struct bch_fs *c = trans->c;
	struct write_point *wp;
	struct btree *b;
	BKEY_PADDED_ONSTACK(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp;
	struct open_buckets obs = { .nr = 0 };
	struct bch_devs_list devs_have = (struct bch_devs_list) { 0 };
	enum bch_watermark watermark = flags & BCH_WATERMARK_MASK;
	unsigned nr_reserve = watermark < BCH_WATERMARK_reclaim
		? BTREE_NODE_RESERVE
		: 0;
	int ret;

	mutex_lock(&c->btree_reserve_cache_lock);
	if (c->btree_reserve_cache_nr > nr_reserve) {
		struct btree_alloc *a =
			&c->btree_reserve_cache[--c->btree_reserve_cache_nr];

		obs = a->ob;
		bkey_copy(&tmp.k, &a->k);
		mutex_unlock(&c->btree_reserve_cache_lock);
		goto mem_alloc;
	}
	mutex_unlock(&c->btree_reserve_cache_lock);
retry:
	ret = bch2_alloc_sectors_start_trans(trans,
				      c->opts.metadata_target ?:
				      c->opts.foreground_target,
				      0,
				      writepoint_ptr(&c->btree_write_point),
				      &devs_have,
				      res->nr_replicas,
				      min(res->nr_replicas,
					  c->opts.metadata_replicas_required),
				      watermark, 0, cl, &wp);
	if (unlikely(ret))
		return ERR_PTR(ret);

	if (wp->sectors_free < btree_sectors(c)) {
		struct open_bucket *ob;
		unsigned i;

		open_bucket_for_each(c, &wp->ptrs, ob, i)
			if (ob->sectors_free < btree_sectors(c))
				ob->sectors_free = 0;

		bch2_alloc_sectors_done(c, wp);
		goto retry;
	}

	bkey_btree_ptr_v2_init(&tmp.k);
	bch2_alloc_sectors_append_ptrs(c, wp, &tmp.k, btree_sectors(c), false);

	bch2_open_bucket_get(c, wp, &obs);
	bch2_alloc_sectors_done(c, wp);
mem_alloc:
	b = bch2_btree_node_mem_alloc(trans, interior_node);
	six_unlock_write(&b->c.lock);
	six_unlock_intent(&b->c.lock);

	/* we hold cannibalize_lock: */
	BUG_ON(IS_ERR(b));
	BUG_ON(b->ob.nr);

	bkey_copy(&b->key, &tmp.k);
	b->ob = obs;

	return b;
}
static struct btree *bch2_btree_node_alloc(struct btree_update *as,
					   struct btree_trans *trans,
					   unsigned level)
{
	struct bch_fs *c = as->c;
	struct btree *b;
	struct prealloc_nodes *p = &as->prealloc_nodes[!!level];
	int ret;

	BUG_ON(level >= BTREE_MAX_DEPTH);
	BUG_ON(!p->nr);

	b = p->b[--p->nr];

	btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent);
	btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_write);

	set_btree_node_accessed(b);
	set_btree_node_dirty_acct(c, b);
	set_btree_node_need_write(b);

	bch2_bset_init_first(b, &b->data->keys);
	b->c.level	= level;
	b->c.btree_id	= as->btree_id;
	b->version_ondisk = c->sb.version;

	memset(&b->nr, 0, sizeof(b->nr));
	b->data->magic = cpu_to_le64(bset_magic(c));
	memset(&b->data->_ptr, 0, sizeof(b->data->_ptr));
	b->data->flags = 0;
	SET_BTREE_NODE_ID(b->data, as->btree_id);
	SET_BTREE_NODE_LEVEL(b->data, level);

	if (b->key.k.type == KEY_TYPE_btree_ptr_v2) {
		struct bkey_i_btree_ptr_v2 *bp = bkey_i_to_btree_ptr_v2(&b->key);

		bp->v.mem_ptr		= 0;
		bp->v.seq		= b->data->keys.seq;
		bp->v.sectors_written	= 0;
	}

	SET_BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data, true);

	bch2_btree_build_aux_trees(b);

	ret = bch2_btree_node_hash_insert(&c->btree_cache, b, level, as->btree_id);
	BUG_ON(ret);

	trace_and_count(c, btree_node_alloc, trans, b);
	bch2_increment_clock(c, btree_sectors(c), WRITE);
	return b;
}
static void btree_set_min(struct btree *b, struct bpos pos)
{
	if (b->key.k.type == KEY_TYPE_btree_ptr_v2)
		bkey_i_to_btree_ptr_v2(&b->key)->v.min_key = pos;
	b->data->min_key = pos;
}

static void btree_set_max(struct btree *b, struct bpos pos)
{
	b->key.k.p = pos;
	b->data->max_key = pos;
}
static struct btree *bch2_btree_node_alloc_replacement(struct btree_update *as,
						       struct btree_trans *trans,
						       struct btree *b)
{
	struct btree *n = bch2_btree_node_alloc(as, trans, b->c.level);
	struct bkey_format format = bch2_btree_calc_format(b);

	/*
	 * The keys might expand with the new format - if they wouldn't fit in
	 * the btree node anymore, use the old format for now:
	 */
	if (!bch2_btree_node_format_fits(as->c, b, b->nr, &format))
		format = b->format;

	SET_BTREE_NODE_SEQ(n->data, BTREE_NODE_SEQ(b->data) + 1);

	btree_set_min(n, b->data->min_key);
	btree_set_max(n, b->data->max_key);

	n->data->format		= format;
	btree_node_set_format(n, format);

	bch2_btree_sort_into(as->c, n, b);

	btree_node_reset_sib_u64s(n);
	return n;
}
static struct btree *__btree_root_alloc(struct btree_update *as,
				struct btree_trans *trans, unsigned level)
{
	struct btree *b = bch2_btree_node_alloc(as, trans, level);

	btree_set_min(b, POS_MIN);
	btree_set_max(b, SPOS_MAX);
	b->data->format = bch2_btree_calc_format(b);

	btree_node_set_format(b, b->data->format);
	bch2_btree_build_aux_trees(b);

	return b;
}
static void bch2_btree_reserve_put(struct btree_update *as, struct btree_trans *trans)
{
	struct bch_fs *c = as->c;
	struct prealloc_nodes *p;

	for (p = as->prealloc_nodes;
	     p < as->prealloc_nodes + ARRAY_SIZE(as->prealloc_nodes);
	     p++) {
		while (p->nr) {
			struct btree *b = p->b[--p->nr];

			mutex_lock(&c->btree_reserve_cache_lock);

			if (c->btree_reserve_cache_nr <
			    ARRAY_SIZE(c->btree_reserve_cache)) {
				struct btree_alloc *a =
					&c->btree_reserve_cache[c->btree_reserve_cache_nr++];

				a->ob = b->ob;
				b->ob.nr = 0;
				bkey_copy(&a->k, &b->key);
			} else {
				bch2_open_buckets_put(c, &b->ob);
			}

			mutex_unlock(&c->btree_reserve_cache_lock);

			btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent);
			btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_write);
			__btree_node_free(trans, b);
			six_unlock_write(&b->c.lock);
			six_unlock_intent(&b->c.lock);
		}
	}
}
static int bch2_btree_reserve_get(struct btree_trans *trans,
				  struct btree_update *as,
				  unsigned nr_nodes[2],
				  unsigned flags,
				  struct closure *cl)
{
	struct btree *b;
	unsigned interior;
	int ret = 0;

	BUG_ON(nr_nodes[0] + nr_nodes[1] > BTREE_RESERVE_MAX);

	/*
	 * Protects reaping from the btree node cache and using the btree node
	 * open bucket reserve:
	 */
	ret = bch2_btree_cache_cannibalize_lock(trans, cl);
	if (ret)
		return ret;

	for (interior = 0; interior < 2; interior++) {
		struct prealloc_nodes *p = as->prealloc_nodes + interior;

		while (p->nr < nr_nodes[interior]) {
			b = __bch2_btree_node_alloc(trans, &as->disk_res, cl,
						    interior, flags);
			if (IS_ERR(b)) {
				ret = PTR_ERR(b);
				goto err;
			}

			p->b[p->nr++] = b;
		}
	}
err:
	bch2_btree_cache_cannibalize_unlock(trans);
	return ret;
}
/* Asynchronous interior node update machinery */

static void bch2_btree_update_free(struct btree_update *as, struct btree_trans *trans)
{
	struct bch_fs *c = as->c;

	if (as->took_gc_lock)
		up_read(&c->gc_lock);
	as->took_gc_lock = false;

	bch2_journal_pin_drop(&c->journal, &as->journal);
	bch2_journal_pin_flush(&c->journal, &as->journal);
	bch2_disk_reservation_put(c, &as->disk_res);
	bch2_btree_reserve_put(as, trans);

	bch2_time_stats_update(&c->times[BCH_TIME_btree_interior_update_total],
			       as->start_time);

	mutex_lock(&c->btree_interior_update_lock);
	list_del(&as->unwritten_list);
	list_del(&as->list);

	closure_debug_destroy(&as->cl);
	mempool_free(as, &c->btree_interior_update_pool);

	/*
	 * Have to do the wakeup with btree_interior_update_lock still held,
	 * since being on btree_interior_update_list is our ref on @c:
	 */
	closure_wake_up(&c->btree_interior_update_wait);

	mutex_unlock(&c->btree_interior_update_lock);
}
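
/*
 * Note: btree_ptr_v2's mem_ptr field isn't used on disk; btree_update_add_key()
 * borrows it to remember the level the pointer key lives at (the node's level
 * + 1), which btree_update_nodes_written_trans() later passes to the triggers.
 */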
static void btree_update_add_key(struct btree_update *as,
				 struct keylist *keys, struct btree *b)
{
	struct bkey_i *k = &b->key;

	BUG_ON(bch2_keylist_u64s(keys) + k->k.u64s >
	       ARRAY_SIZE(as->_old_keys));

	bkey_copy(keys->top, k);
	bkey_i_to_btree_ptr_v2(keys->top)->v.mem_ptr = b->c.level + 1;

	bch2_keylist_push(keys);
}
/*
 * The transactional part of an interior btree node update, where we journal the
 * update we did to the interior node and update alloc info:
 */
static int btree_update_nodes_written_trans(struct btree_trans *trans,
					    struct btree_update *as)
{
	struct jset_entry *e = bch2_trans_jset_entry_alloc(trans, as->journal_u64s);
	int ret = PTR_ERR_OR_ZERO(e);
	if (ret)
		return ret;

	memcpy(e, as->journal_entries, as->journal_u64s * sizeof(u64));

	trans->journal_pin = &as->journal;

	for_each_keylist_key(&as->old_keys, k) {
		unsigned level = bkey_i_to_btree_ptr_v2(k)->v.mem_ptr;

		ret = bch2_key_trigger_old(trans, as->btree_id, level, bkey_i_to_s_c(k),
					   BTREE_TRIGGER_TRANSACTIONAL);
		if (ret)
			return ret;
	}

	for_each_keylist_key(&as->new_keys, k) {
		unsigned level = bkey_i_to_btree_ptr_v2(k)->v.mem_ptr;

		ret = bch2_key_trigger_new(trans, as->btree_id, level, bkey_i_to_s(k),
					   BTREE_TRIGGER_TRANSACTIONAL);
		if (ret)
			return ret;
	}

	return 0;
}
static void btree_update_nodes_written(struct btree_update *as)
{
	struct bch_fs *c = as->c;
	struct btree *b;
	struct btree_trans *trans = bch2_trans_get(c);
	u64 journal_seq = 0;
	unsigned i;
	int ret;

	/*
	 * If we're already in an error state, it might be because a btree node
	 * was never written, and we might be trying to free that same btree
	 * node here, but it won't have been marked as allocated and we'll see
	 * spurious disk usage inconsistencies in the transactional part below
	 * if we don't skip it:
	 */
	ret = bch2_journal_error(&c->journal);
	if (ret)
		goto err;

	/*
	 * Wait for any in flight writes to finish before we free the old nodes
	 * on disk:
	 */
	for (i = 0; i < as->nr_old_nodes; i++) {
		__le64 seq;

		b = as->old_nodes[i];

		btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read);
		seq = b->data ? b->data->keys.seq : 0;
		six_unlock_read(&b->c.lock);

		if (seq == as->old_nodes_seq[i])
			wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight_inner,
				       TASK_UNINTERRUPTIBLE);
	}

	/*
	 * We did an update to a parent node where the pointers we added pointed
	 * to child nodes that weren't written yet: now, the child nodes have
	 * been written so we can write out the update to the interior node.
	 */

	/*
	 * We can't call into journal reclaim here: we'd block on the journal
	 * reclaim lock, but we may need to release the open buckets we have
	 * pinned in order for other btree updates to make forward progress, and
	 * journal reclaim does btree updates when flushing bkey_cached entries,
	 * which may require allocations as well.
	 */
	ret = commit_do(trans, &as->disk_res, &journal_seq,
			BCH_WATERMARK_interior_updates|
			BCH_TRANS_COMMIT_no_enospc|
			BCH_TRANS_COMMIT_no_check_rw|
			BCH_TRANS_COMMIT_journal_reclaim,
			btree_update_nodes_written_trans(trans, as));
	bch2_trans_unlock(trans);

	bch2_fs_fatal_err_on(ret && !bch2_journal_error(&c->journal), c,
			     "%s", bch2_err_str(ret));
err:
	if (as->b) {
		b = as->b;
		btree_path_idx_t path_idx = get_unlocked_mut_path(trans,
						as->btree_id, b->c.level, b->key.k.p);
		struct btree_path *path = trans->paths + path_idx;
		/*
		 * @b is the node we did the final insert into:
		 *
		 * On failure to get a journal reservation, we still have to
		 * unblock the write and allow most of the write path to happen
		 * so that shutdown works, but the i->journal_seq mechanism
		 * won't work to prevent the btree write from being visible (we
		 * didn't get a journal sequence number) - instead
		 * __bch2_btree_node_write() doesn't do the actual write if
		 * we're in journal error state:
		 */

		/*
		 * Ensure transaction is unlocked before using
		 * btree_node_lock_nopath() (the use of which is always suspect,
		 * we need to work on removing this in the future)
		 *
		 * It should be, but get_unlocked_mut_path() -> bch2_path_get()
		 * calls bch2_path_upgrade(), before we call path_make_mut(), so
		 * we may rarely end up with a locked path besides the one we
		 * have here:
		 */
		bch2_trans_unlock(trans);
		btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent);
		mark_btree_node_locked(trans, path, b->c.level, BTREE_NODE_INTENT_LOCKED);
		path->l[b->c.level].lock_seq = six_lock_seq(&b->c.lock);
		path->l[b->c.level].b = b;

		bch2_btree_node_lock_write_nofail(trans, path, &b->c);

		mutex_lock(&c->btree_interior_update_lock);

		list_del(&as->write_blocked_list);
		if (list_empty(&b->write_blocked))
			clear_btree_node_write_blocked(b);

		/*
		 * Node might have been freed, recheck under
		 * btree_interior_update_lock:
		 */
		if (as->b == b) {
			BUG_ON(!b->c.level);
			BUG_ON(!btree_node_dirty(b));

			if (!ret) {
				struct bset *last = btree_bset_last(b);

				last->journal_seq = cpu_to_le64(
							     max(journal_seq,
								 le64_to_cpu(last->journal_seq)));

				bch2_btree_add_journal_pin(c, b, journal_seq);
			} else {
				/*
				 * If we didn't get a journal sequence number we
				 * can't write this btree node, because recovery
				 * won't know to ignore this write:
				 */
				set_btree_node_never_write(b);
			}
		}

		mutex_unlock(&c->btree_interior_update_lock);

		mark_btree_node_locked_noreset(path, b->c.level, BTREE_NODE_INTENT_LOCKED);
		six_unlock_write(&b->c.lock);

		btree_node_write_if_need(c, b, SIX_LOCK_intent);
		btree_node_unlock(trans, path, b->c.level);
		bch2_path_put(trans, path_idx, true);
	}

	bch2_journal_pin_drop(&c->journal, &as->journal);

	mutex_lock(&c->btree_interior_update_lock);
	for (i = 0; i < as->nr_new_nodes; i++) {
		b = as->new_nodes[i];

		BUG_ON(b->will_make_reachable != (unsigned long) as);
		b->will_make_reachable = 0;
		clear_btree_node_will_make_reachable(b);
	}
	mutex_unlock(&c->btree_interior_update_lock);

	for (i = 0; i < as->nr_new_nodes; i++) {
		b = as->new_nodes[i];

		btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read);
		btree_node_write_if_need(c, b, SIX_LOCK_read);
		six_unlock_read(&b->c.lock);
	}

	for (i = 0; i < as->nr_open_buckets; i++)
		bch2_open_bucket_put(c, c->open_buckets + as->open_buckets[i]);

	bch2_btree_update_free(as, trans);
	bch2_trans_put(trans);
}
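
/*
 * Work item that runs completed interior updates: pull updates off the head of
 * the unwritten list, in order, and finish them via
 * btree_update_nodes_written().
 */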
static void btree_interior_update_work(struct work_struct *work)
{
	struct bch_fs *c =
		container_of(work, struct bch_fs, btree_interior_update_work);
	struct btree_update *as;

	while (1) {
		mutex_lock(&c->btree_interior_update_lock);
		as = list_first_entry_or_null(&c->btree_interior_updates_unwritten,
					      struct btree_update, unwritten_list);
		if (as && !as->nodes_written)
			as = NULL;
		mutex_unlock(&c->btree_interior_update_lock);

		if (!as)
			break;

		btree_update_nodes_written(as);
	}
}
static CLOSURE_CALLBACK(btree_update_set_nodes_written)
{
	closure_type(as, struct btree_update, cl);
	struct bch_fs *c = as->c;

	mutex_lock(&c->btree_interior_update_lock);
	as->nodes_written = true;
	mutex_unlock(&c->btree_interior_update_lock);

	queue_work(c->btree_interior_update_worker, &c->btree_interior_update_work);
}
/*
 * We're updating @b with pointers to nodes that haven't finished writing yet:
 * block @b from being written until @as completes
 */
static void btree_update_updated_node(struct btree_update *as, struct btree *b)
{
	struct bch_fs *c = as->c;

	BUG_ON(as->mode != BTREE_UPDATE_none);
	BUG_ON(as->update_level_end < b->c.level);
	BUG_ON(!btree_node_dirty(b));
	BUG_ON(!b->c.level);

	mutex_lock(&c->btree_interior_update_lock);
	list_add_tail(&as->unwritten_list, &c->btree_interior_updates_unwritten);

	as->mode	= BTREE_UPDATE_node;
	as->b		= b;
	as->update_level_end = b->c.level;

	set_btree_node_write_blocked(b);
	list_add(&as->write_blocked_list, &b->write_blocked);

	mutex_unlock(&c->btree_interior_update_lock);
}
static int bch2_update_reparent_journal_pin_flush(struct journal *j,
				struct journal_entry_pin *_pin, u64 seq)
{
	return 0;
}

static void btree_update_reparent(struct btree_update *as,
				  struct btree_update *child)
{
	struct bch_fs *c = as->c;

	lockdep_assert_held(&c->btree_interior_update_lock);

	child->b = NULL;
	child->mode = BTREE_UPDATE_update;

	bch2_journal_pin_copy(&c->journal, &as->journal, &child->journal,
			      bch2_update_reparent_journal_pin_flush);
}
static void btree_update_updated_root(struct btree_update *as, struct btree *b)
{
	struct bkey_i *insert = &b->key;
	struct bch_fs *c = as->c;

	BUG_ON(as->mode != BTREE_UPDATE_none);

	BUG_ON(as->journal_u64s + jset_u64s(insert->k.u64s) >
	       ARRAY_SIZE(as->journal_entries));

	as->journal_u64s +=
		journal_entry_set((void *) &as->journal_entries[as->journal_u64s],
				  BCH_JSET_ENTRY_btree_root,
				  b->c.btree_id, b->c.level,
				  insert, insert->k.u64s);

	mutex_lock(&c->btree_interior_update_lock);
	list_add_tail(&as->unwritten_list, &c->btree_interior_updates_unwritten);

	as->mode	= BTREE_UPDATE_root;
	mutex_unlock(&c->btree_interior_update_lock);
}
/*
 * bch2_btree_update_add_new_node:
 *
 * This causes @as to wait on @b to be written, before it gets to
 * bch2_btree_update_nodes_written
 *
 * Additionally, it sets b->will_make_reachable to prevent any additional writes
 * to @b from happening besides the first until @b is reachable on disk
 *
 * And it adds @b to the list of @as's new nodes, so that we can update sector
 * counts in bch2_btree_update_nodes_written:
 */
static void bch2_btree_update_add_new_node(struct btree_update *as, struct btree *b)
{
	struct bch_fs *c = as->c;

	closure_get(&as->cl);

	mutex_lock(&c->btree_interior_update_lock);
	BUG_ON(as->nr_new_nodes >= ARRAY_SIZE(as->new_nodes));
	BUG_ON(b->will_make_reachable);

	as->new_nodes[as->nr_new_nodes++] = b;
	b->will_make_reachable = 1UL|(unsigned long) as;
	set_btree_node_will_make_reachable(b);

	mutex_unlock(&c->btree_interior_update_lock);

	btree_update_add_key(as, &as->new_keys, b);

	if (b->key.k.type == KEY_TYPE_btree_ptr_v2) {
		unsigned bytes = vstruct_end(&b->data->keys) - (void *) b->data;
		unsigned sectors = round_up(bytes, block_bytes(c)) >> 9;

		bkey_i_to_btree_ptr_v2(&b->key)->v.sectors_written =
			cpu_to_le16(sectors);
	}
}
/*
 * Drop @b from the btree_update that was going to make it reachable, if any:
 */
static void btree_update_drop_new_node(struct bch_fs *c, struct btree *b)
{
	struct btree_update *as;
	unsigned long v;
	unsigned i;

	mutex_lock(&c->btree_interior_update_lock);
	/*
	 * When b->will_make_reachable != 0, it owns a ref on as->cl that's
	 * dropped when it gets written by bch2_btree_complete_write - the
	 * xchg() is for synchronization with bch2_btree_complete_write:
	 */
	v = xchg(&b->will_make_reachable, 0);
	clear_btree_node_will_make_reachable(b);
	as = (struct btree_update *) (v & ~1UL);

	if (!as) {
		mutex_unlock(&c->btree_interior_update_lock);
		return;
	}

	for (i = 0; i < as->nr_new_nodes; i++)
		if (as->new_nodes[i] == b)
			goto found;

	BUG();
found:
	array_remove_item(as->new_nodes, as->nr_new_nodes, i);
	mutex_unlock(&c->btree_interior_update_lock);

	if (v & 1)
		closure_put(&as->cl);
}
static void bch2_btree_update_get_open_buckets(struct btree_update *as, struct btree *b)
{
	while (b->ob.nr)
		as->open_buckets[as->nr_open_buckets++] =
			b->ob.v[--b->ob.nr];
}
static int bch2_btree_update_will_free_node_journal_pin_flush(struct journal *j,
				struct journal_entry_pin *_pin, u64 seq)
{
	return 0;
}

/*
 * @b is being split/rewritten: it may have pointers to not-yet-written btree
 * nodes and thus outstanding btree_updates - redirect @b's
 * btree_updates to point to this btree_update:
 */
static void bch2_btree_interior_update_will_free_node(struct btree_update *as,
						      struct btree *b)
{
	struct bch_fs *c = as->c;
	struct btree_update *p, *n;
	struct btree_write *w;

	set_btree_node_dying(b);

	if (btree_node_fake(b))
		return;

	mutex_lock(&c->btree_interior_update_lock);

	/*
	 * Does this node have any btree_update operations preventing
	 * it from being written?
	 *
	 * If so, redirect them to point to this btree_update: we can
	 * write out our new nodes, but we won't make them visible until those
	 * operations complete
	 */
	list_for_each_entry_safe(p, n, &b->write_blocked, write_blocked_list) {
		list_del_init(&p->write_blocked_list);
		btree_update_reparent(as, p);

		/*
		 * for flush_held_btree_writes() waiting on updates to flush or
		 * nodes to be writeable:
		 */
		closure_wake_up(&c->btree_interior_update_wait);
	}

	clear_btree_node_dirty_acct(c, b);
	clear_btree_node_need_write(b);
	clear_btree_node_write_blocked(b);

	/*
	 * Does this node have unwritten data that has a pin on the journal?
	 *
	 * If so, transfer that pin to the btree_update operation -
	 * note that if we're freeing multiple nodes, we only need to keep the
	 * oldest pin of any of the nodes we're freeing. We'll release the pin
	 * when the new nodes are persistent and reachable on disk:
	 */
	w = btree_current_write(b);
	bch2_journal_pin_copy(&c->journal, &as->journal, &w->journal,
			      bch2_btree_update_will_free_node_journal_pin_flush);
	bch2_journal_pin_drop(&c->journal, &w->journal);

	w = btree_prev_write(b);
	bch2_journal_pin_copy(&c->journal, &as->journal, &w->journal,
			      bch2_btree_update_will_free_node_journal_pin_flush);
	bch2_journal_pin_drop(&c->journal, &w->journal);

	mutex_unlock(&c->btree_interior_update_lock);

	/*
	 * Is this a node that isn't reachable on disk yet?
	 *
	 * Nodes that aren't reachable yet have writes blocked until they're
	 * reachable - now that we've cancelled any pending writes and moved
	 * things waiting on that write to wait on this update, we can drop this
	 * node from the list of nodes that the other update is making
	 * reachable, prior to freeing it:
	 */
	btree_update_drop_new_node(c, b);

	btree_update_add_key(as, &as->old_keys, b);

	as->old_nodes[as->nr_old_nodes] = b;
	as->old_nodes_seq[as->nr_old_nodes] = b->data->keys.seq;
	as->nr_old_nodes++;
}
static void bch2_btree_update_done(struct btree_update *as, struct btree_trans *trans)
{
	struct bch_fs *c = as->c;
	u64 start_time = as->start_time;

	BUG_ON(as->mode == BTREE_UPDATE_none);

	if (as->took_gc_lock)
		up_read(&as->c->gc_lock);
	as->took_gc_lock = false;

	bch2_btree_reserve_put(as, trans);

	continue_at(&as->cl, btree_update_set_nodes_written,
		    as->c->btree_interior_update_worker);

	bch2_time_stats_update(&c->times[BCH_TIME_btree_interior_update_foreground],
			       start_time);
}
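
/*
 * Start an interior node update: walk up from @level_start, reserving enough
 * new nodes (and disk space) for the worst case number of splits at each
 * level, take gc_lock, and allocate the btree_update that will track the
 * operation until the new nodes are written and reachable.
 */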
static struct btree_update *
bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
			unsigned level_start, bool split, unsigned flags)
{
	struct bch_fs *c = trans->c;
	struct btree_update *as;
	u64 start_time = local_clock();
	int disk_res_flags = (flags & BCH_TRANS_COMMIT_no_enospc)
		? BCH_DISK_RESERVATION_NOFAIL : 0;
	unsigned nr_nodes[2] = { 0, 0 };
	unsigned level_end = level_start;
	enum bch_watermark watermark = flags & BCH_WATERMARK_MASK;
	int ret = 0;
	u32 restart_count = trans->restart_count;

	BUG_ON(!path->should_be_locked);

	if (watermark == BCH_WATERMARK_copygc)
		watermark = BCH_WATERMARK_btree_copygc;
	if (watermark < BCH_WATERMARK_btree)
		watermark = BCH_WATERMARK_btree;

	flags &= ~BCH_WATERMARK_MASK;
	flags |= watermark;

	if (watermark < BCH_WATERMARK_reclaim &&
	    test_bit(JOURNAL_SPACE_LOW, &c->journal.flags)) {
		if (flags & BCH_TRANS_COMMIT_journal_reclaim)
			return ERR_PTR(-BCH_ERR_journal_reclaim_would_deadlock);

		bch2_trans_unlock(trans);
		wait_event(c->journal.wait, !test_bit(JOURNAL_SPACE_LOW, &c->journal.flags));
		ret = bch2_trans_relock(trans);
		if (ret)
			return ERR_PTR(ret);
	}

	while (1) {
		nr_nodes[!!level_end] += 1 + split;
		level_end++;

		ret = bch2_btree_path_upgrade(trans, path, level_end + 1);
		if (ret)
			return ERR_PTR(ret);

		if (!btree_path_node(path, level_end)) {
			/* Allocating new root? */
			nr_nodes[1] += split;
			level_end = BTREE_MAX_DEPTH;
			break;
		}

		/*
		 * Always check for space for two keys, even if we won't have to
		 * split at prior level - it might have been a merge instead:
		 */
		if (bch2_btree_node_insert_fits(path->l[level_end].b,
						BKEY_BTREE_PTR_U64s_MAX * 2))
			break;

		split = path->l[level_end].b->nr.live_u64s > BTREE_SPLIT_THRESHOLD(c);
	}

	if (!down_read_trylock(&c->gc_lock)) {
		ret = drop_locks_do(trans, (down_read(&c->gc_lock), 0));
		if (ret) {
			up_read(&c->gc_lock);
			return ERR_PTR(ret);
		}
	}

	as = mempool_alloc(&c->btree_interior_update_pool, GFP_NOFS);
	memset(as, 0, sizeof(*as));
	closure_init(&as->cl, NULL);
	as->c		= c;
	as->start_time	= start_time;
	as->ip_started	= _RET_IP_;
	as->mode	= BTREE_UPDATE_none;
	as->watermark	= watermark;
	as->took_gc_lock = true;
	as->btree_id	= path->btree_id;
	as->update_level_start = level_start;
	as->update_level_end = level_end;
	INIT_LIST_HEAD(&as->list);
	INIT_LIST_HEAD(&as->unwritten_list);
	INIT_LIST_HEAD(&as->write_blocked_list);
	bch2_keylist_init(&as->old_keys, as->_old_keys);
	bch2_keylist_init(&as->new_keys, as->_new_keys);
	bch2_keylist_init(&as->parent_keys, as->inline_keys);

	mutex_lock(&c->btree_interior_update_lock);
	list_add_tail(&as->list, &c->btree_interior_update_list);
	mutex_unlock(&c->btree_interior_update_lock);

	/*
	 * We don't want to allocate if we're in an error state, that can cause
	 * deadlock on emergency shutdown due to open buckets getting stuck in
	 * the btree_reserve_cache after allocator shutdown has cleared it out.
	 * This check needs to come after adding us to the btree_interior_update
	 * list but before calling bch2_btree_reserve_get, to synchronize with
	 * __bch2_fs_read_only().
	 */
	ret = bch2_journal_error(&c->journal);
	if (ret)
		goto err;

	ret = bch2_disk_reservation_get(c, &as->disk_res,
			(nr_nodes[0] + nr_nodes[1]) * btree_sectors(c),
			c->opts.metadata_replicas,
			disk_res_flags);
	if (ret)
		goto err;

	ret = bch2_btree_reserve_get(trans, as, nr_nodes, flags, NULL);
	if (bch2_err_matches(ret, ENOSPC) ||
	    bch2_err_matches(ret, ENOMEM)) {
		struct closure cl;

		/*
		 * XXX: this should probably be a separate BTREE_INSERT_NONBLOCK
		 * flag
		 */
		if (bch2_err_matches(ret, ENOSPC) &&
		    (flags & BCH_TRANS_COMMIT_journal_reclaim) &&
		    watermark < BCH_WATERMARK_reclaim) {
			ret = -BCH_ERR_journal_reclaim_would_deadlock;
			goto err;
		}

		closure_init_stack(&cl);

		do {
			ret = bch2_btree_reserve_get(trans, as, nr_nodes, flags, &cl);

			bch2_trans_unlock(trans);
			closure_sync(&cl);
		} while (bch2_err_matches(ret, BCH_ERR_operation_blocked));
	}

	if (ret) {
		trace_and_count(c, btree_reserve_get_fail, trans->fn,
				_RET_IP_, nr_nodes[0] + nr_nodes[1], ret);
		goto err;
	}

	ret = bch2_trans_relock(trans);
	if (ret)
		goto err;

	bch2_trans_verify_not_restarted(trans, restart_count);
	return as;
err:
	bch2_btree_update_free(as, trans);
	if (!bch2_err_matches(ret, ENOSPC) &&
	    !bch2_err_matches(ret, EROFS) &&
	    ret != -BCH_ERR_journal_reclaim_would_deadlock)
		bch_err_fn_ratelimited(c, ret);
	return ERR_PTR(ret);
}
/* Btree root updates: */

static void bch2_btree_set_root_inmem(struct bch_fs *c, struct btree *b)
{
	/* Root nodes cannot be reaped */
	mutex_lock(&c->btree_cache.lock);
	list_del_init(&b->list);
	mutex_unlock(&c->btree_cache.lock);

	mutex_lock(&c->btree_root_lock);
	bch2_btree_id_root(c, b->c.btree_id)->b = b;
	mutex_unlock(&c->btree_root_lock);

	bch2_recalc_btree_reserve(c);
}

static void bch2_btree_set_root(struct btree_update *as,
				struct btree_trans *trans,
				struct btree_path *path,
				struct btree *b)
{
	struct bch_fs *c = as->c;
	struct btree *old;

	trace_and_count(c, btree_node_set_root, trans, b);

	old = btree_node_root(c, b);

	/*
	 * Ensure no one is using the old root while we switch to the
	 * new root:
	 */
	bch2_btree_node_lock_write_nofail(trans, path, &old->c);

	bch2_btree_set_root_inmem(c, b);

	btree_update_updated_root(as, b);

	/*
	 * Unlock old root after new root is visible:
	 *
	 * The new root isn't persistent, but that's ok: we still have
	 * an intent lock on the new root, and any updates that would
	 * depend on the new root would have to update the new root.
	 */
	bch2_btree_node_unlock_write(trans, path, old);
}
/* Interior node updates: */

static void bch2_insert_fixup_btree_ptr(struct btree_update *as,
					struct btree_trans *trans,
					struct btree_path *path,
					struct btree *b,
					struct btree_node_iter *node_iter,
					struct bkey_i *insert)
{
	struct bch_fs *c = as->c;
	struct bkey_packed *k;
	struct printbuf buf = PRINTBUF;
	unsigned long old, new, v;

	BUG_ON(insert->k.type == KEY_TYPE_btree_ptr_v2 &&
	       !btree_ptr_sectors_written(insert));

	if (unlikely(!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)))
		bch2_journal_key_overwritten(c, b->c.btree_id, b->c.level, insert->k.p);

	if (bch2_bkey_invalid(c, bkey_i_to_s_c(insert),
			      btree_node_type(b), WRITE, &buf) ?:
	    bch2_bkey_in_btree_node(c, b, bkey_i_to_s_c(insert), &buf)) {
		printbuf_reset(&buf);
		prt_printf(&buf, "inserting invalid bkey\n  ");
		bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert));
		prt_printf(&buf, "\n  ");
		bch2_bkey_invalid(c, bkey_i_to_s_c(insert),
				  btree_node_type(b), WRITE, &buf);
		bch2_bkey_in_btree_node(c, b, bkey_i_to_s_c(insert), &buf);

		bch2_fs_inconsistent(c, "%s", buf.buf);
		dump_stack();
	}

	BUG_ON(as->journal_u64s + jset_u64s(insert->k.u64s) >
	       ARRAY_SIZE(as->journal_entries));

	as->journal_u64s +=
		journal_entry_set((void *) &as->journal_entries[as->journal_u64s],
				  BCH_JSET_ENTRY_btree_keys,
				  b->c.btree_id, b->c.level,
				  insert, insert->k.u64s);

	while ((k = bch2_btree_node_iter_peek_all(node_iter, b)) &&
	       bkey_iter_pos_cmp(b, k, &insert->k.p) < 0)
		bch2_btree_node_iter_advance(node_iter, b);

	bch2_btree_bset_insert_key(trans, path, b, node_iter, insert);
	set_btree_node_dirty_acct(c, b);

	v = READ_ONCE(b->flags);
	do {
		old = new = v;

		new &= ~BTREE_WRITE_TYPE_MASK;
		new |= BTREE_WRITE_interior;
		new |= 1 << BTREE_NODE_need_write;
	} while ((v = cmpxchg(&b->flags, old, new)) != old);

	printbuf_exit(&buf);
}
static void
bch2_btree_insert_keys_interior(struct btree_update *as,
				struct btree_trans *trans,
				struct btree_path *path,
				struct btree *b,
				struct btree_node_iter node_iter,
				struct keylist *keys)
{
	struct bkey_i *insert = bch2_keylist_front(keys);
	struct bkey_packed *k;

	BUG_ON(btree_node_type(b) != BKEY_TYPE_btree);

	while ((k = bch2_btree_node_iter_prev_all(&node_iter, b)) &&
	       (bkey_cmp_left_packed(b, k, &insert->k.p) >= 0))
		;

	while (!bch2_keylist_empty(keys)) {
		insert = bch2_keylist_front(keys);

		if (bpos_gt(insert->k.p, b->key.k.p))
			break;

		bch2_insert_fixup_btree_ptr(as, trans, path, b, &node_iter, insert);
		bch2_keylist_pop_front(keys);
	}
}
/*
 * Move keys from n1 (original replacement node, now lower node) to n2 (higher
 * node)
 */
static void __btree_split_node(struct btree_update *as,
			       struct btree_trans *trans,
			       struct btree *b,
			       struct btree *n[2])
{
	struct bkey_packed *k;
	struct bpos n1_pos = POS_MIN;
	struct btree_node_iter iter;
	struct bset *bsets[2];
	struct bkey_format_state format[2];
	struct bkey_packed *out[2];
	struct bkey uk;
	unsigned u64s, n1_u64s = (b->nr.live_u64s * 3) / 5;
	struct { unsigned nr_keys, val_u64s; } nr_keys[2];
	int i;

	memset(&nr_keys, 0, sizeof(nr_keys));
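
	/*
	 * The split point is chosen so that roughly 3/5 of the keys (by u64s)
	 * land in the lower node - n1_u64s, computed above - rather than an
	 * even half/half split.
	 */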
	for (i = 0; i < 2; i++) {
		BUG_ON(n[i]->nsets != 1);

		bsets[i] = btree_bset_first(n[i]);
		out[i] = bsets[i]->start;

		SET_BTREE_NODE_SEQ(n[i]->data, BTREE_NODE_SEQ(b->data) + 1);
		bch2_bkey_format_init(&format[i]);
	}

	u64s = 0;
	for_each_btree_node_key(b, k, &iter) {
		if (bkey_deleted(k))
			continue;

		uk = bkey_unpack_key(b, k);

		if (b->c.level &&
		    u64s < n1_u64s &&
		    u64s + k->u64s >= n1_u64s &&
		    bch2_key_deleted_in_journal(trans, b->c.btree_id, b->c.level, uk.p))
			n1_u64s += k->u64s;

		i = u64s >= n1_u64s;
		u64s += k->u64s;
		if (!i)
			n1_pos = uk.p;
		bch2_bkey_format_add_key(&format[i], &uk);

		nr_keys[i].nr_keys++;
		nr_keys[i].val_u64s += bkeyp_val_u64s(&b->format, k);
	}

	btree_set_min(n[0], b->data->min_key);
	btree_set_max(n[0], n1_pos);
	btree_set_min(n[1], bpos_successor(n1_pos));
	btree_set_max(n[1], b->data->max_key);

	for (i = 0; i < 2; i++) {
		bch2_bkey_format_add_pos(&format[i], n[i]->data->min_key);
		bch2_bkey_format_add_pos(&format[i], n[i]->data->max_key);

		n[i]->data->format = bch2_bkey_format_done(&format[i]);

		unsigned u64s = nr_keys[i].nr_keys * n[i]->data->format.key_u64s +
			nr_keys[i].val_u64s;
		if (__vstruct_bytes(struct btree_node, u64s) > btree_buf_bytes(b))
			n[i]->data->format = b->format;

		btree_node_set_format(n[i], n[i]->data->format);
	}

	u64s = 0;
	for_each_btree_node_key(b, k, &iter) {
		if (bkey_deleted(k))
			continue;

		i = u64s >= n1_u64s;
		u64s += k->u64s;

		if (bch2_bkey_transform(&n[i]->format, out[i], bkey_packed(k)
					? &b->format : &bch2_bkey_format_current, k))
			out[i]->format = KEY_FORMAT_LOCAL_BTREE;
		else
			bch2_bkey_unpack(b, (void *) out[i], k);

		out[i]->needs_whiteout = false;

		btree_keys_account_key_add(&n[i]->nr, 0, out[i]);
		out[i] = bkey_p_next(out[i]);
	}

	for (i = 0; i < 2; i++) {
		bsets[i]->u64s = cpu_to_le16((u64 *) out[i] - bsets[i]->_data);

		BUG_ON(!bsets[i]->u64s);

		set_btree_bset_end(n[i], n[i]->set);

		btree_node_reset_sib_u64s(n[i]);

		bch2_verify_btree_nr_keys(n[i]);

		BUG_ON(bch2_btree_node_check_topology(trans, n[i]));
	}
}
/*
 * For updates to interior nodes, we've got to do the insert before we split
 * because the stuff we're inserting has to be inserted atomically. Post split,
 * the keys might have to go in different nodes and the split would no longer be
 * atomic.
 *
 * Worse, if the insert is from btree node coalescing, if we do the insert after
 * we do the split (and pick the pivot) - the pivot we pick might be between
 * nodes that were coalesced, and thus in the middle of a child node post
 * coalescing:
 */
static void btree_split_insert_keys(struct btree_update *as,
				    struct btree_trans *trans,
				    btree_path_idx_t path_idx,
				    struct btree *b,
				    struct keylist *keys)
{
	struct btree_path *path = trans->paths + path_idx;

	if (!bch2_keylist_empty(keys) &&
	    bpos_le(bch2_keylist_front(keys)->k.p, b->data->max_key)) {
		struct btree_node_iter node_iter;

		bch2_btree_node_iter_init(&node_iter, b, &bch2_keylist_front(keys)->k.p);

		bch2_btree_insert_keys_interior(as, trans, path, b, node_iter, keys);

		BUG_ON(bch2_btree_node_check_topology(trans, b));
	}
}
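
/*
 * Split @b if it's oversized (more than BTREE_SPLIT_THRESHOLD live u64s) into
 * two new nodes - plus a new root, if @b was the root - or else compact it
 * into a single replacement node; then insert the new pointers into the
 * parent, recursively splitting it if necessary.
 */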
static int btree_split(struct btree_update *as, struct btree_trans *trans,
		       btree_path_idx_t path, struct btree *b,
		       struct keylist *keys)
{
	struct bch_fs *c = as->c;
	struct btree *parent = btree_node_parent(trans->paths + path, b);
	struct btree *n1, *n2 = NULL, *n3 = NULL;
	btree_path_idx_t path1 = 0, path2 = 0;
	u64 start_time = local_clock();
	int ret = 0;

	bch2_verify_btree_nr_keys(b);
	BUG_ON(!parent && (b != btree_node_root(c, b)));
	BUG_ON(parent && !btree_node_intent_locked(trans->paths + path, b->c.level + 1));

	ret = bch2_btree_node_check_topology(trans, b);
	if (ret)
		return ret;

	bch2_btree_interior_update_will_free_node(as, b);

	if (b->nr.live_u64s > BTREE_SPLIT_THRESHOLD(c)) {
		struct btree *n[2];

		trace_and_count(c, btree_node_split, trans, b);

		n[0] = n1 = bch2_btree_node_alloc(as, trans, b->c.level);
		n[1] = n2 = bch2_btree_node_alloc(as, trans, b->c.level);

		__btree_split_node(as, trans, b, n);

		if (keys) {
			btree_split_insert_keys(as, trans, path, n1, keys);
			btree_split_insert_keys(as, trans, path, n2, keys);
			BUG_ON(!bch2_keylist_empty(keys));
		}

		bch2_btree_build_aux_trees(n2);
		bch2_btree_build_aux_trees(n1);

		bch2_btree_update_add_new_node(as, n1);
		bch2_btree_update_add_new_node(as, n2);
		six_unlock_write(&n2->c.lock);
		six_unlock_write(&n1->c.lock);

		path1 = get_unlocked_mut_path(trans, as->btree_id, n1->c.level, n1->key.k.p);
		six_lock_increment(&n1->c.lock, SIX_LOCK_intent);
		mark_btree_node_locked(trans, trans->paths + path1, n1->c.level, BTREE_NODE_INTENT_LOCKED);
		bch2_btree_path_level_init(trans, trans->paths + path1, n1);

		path2 = get_unlocked_mut_path(trans, as->btree_id, n2->c.level, n2->key.k.p);
		six_lock_increment(&n2->c.lock, SIX_LOCK_intent);
		mark_btree_node_locked(trans, trans->paths + path2, n2->c.level, BTREE_NODE_INTENT_LOCKED);
		bch2_btree_path_level_init(trans, trans->paths + path2, n2);

		/*
		 * Note that on recursive parent_keys == keys, so we
		 * can't start adding new keys to parent_keys before emptying it
		 * out (which we did with btree_split_insert_keys() above)
		 */
		bch2_keylist_add(&as->parent_keys, &n1->key);
		bch2_keylist_add(&as->parent_keys, &n2->key);

		if (!parent) {
			/* Depth increases, make a new root */
			n3 = __btree_root_alloc(as, trans, b->c.level + 1);

			bch2_btree_update_add_new_node(as, n3);
			six_unlock_write(&n3->c.lock);

			trans->paths[path2].locks_want++;
			BUG_ON(btree_node_locked(trans->paths + path2, n3->c.level));
			six_lock_increment(&n3->c.lock, SIX_LOCK_intent);
			mark_btree_node_locked(trans, trans->paths + path2, n3->c.level, BTREE_NODE_INTENT_LOCKED);
			bch2_btree_path_level_init(trans, trans->paths + path2, n3);

			n3->sib_u64s[0] = U16_MAX;
			n3->sib_u64s[1] = U16_MAX;

			btree_split_insert_keys(as, trans, path, n3, &as->parent_keys);
		}
	} else {
		trace_and_count(c, btree_node_compact, trans, b);

		n1 = bch2_btree_node_alloc_replacement(as, trans, b);

		if (keys) {
			btree_split_insert_keys(as, trans, path, n1, keys);
			BUG_ON(!bch2_keylist_empty(keys));
		}

		bch2_btree_build_aux_trees(n1);
		bch2_btree_update_add_new_node(as, n1);
		six_unlock_write(&n1->c.lock);

		path1 = get_unlocked_mut_path(trans, as->btree_id, n1->c.level, n1->key.k.p);
		six_lock_increment(&n1->c.lock, SIX_LOCK_intent);
		mark_btree_node_locked(trans, trans->paths + path1, n1->c.level, BTREE_NODE_INTENT_LOCKED);
		bch2_btree_path_level_init(trans, trans->paths + path1, n1);

		if (parent)
			bch2_keylist_add(&as->parent_keys, &n1->key);
	}

	/* New nodes all written, now make them visible: */

	if (parent) {
		/* Split a non root node */
		ret = bch2_btree_insert_node(as, trans, path, parent, &as->parent_keys);
		if (ret)
			goto err;
	} else if (n3) {
		bch2_btree_set_root(as, trans, trans->paths + path, n3);
	} else {
		/* Root filled up but didn't need to be split */
		bch2_btree_set_root(as, trans, trans->paths + path, n1);
	}

	if (n3) {
		bch2_btree_update_get_open_buckets(as, n3);
		bch2_btree_node_write(c, n3, SIX_LOCK_intent, 0);
	}
	if (n2) {
		bch2_btree_update_get_open_buckets(as, n2);
		bch2_btree_node_write(c, n2, SIX_LOCK_intent, 0);
	}
	bch2_btree_update_get_open_buckets(as, n1);
	bch2_btree_node_write(c, n1, SIX_LOCK_intent, 0);

	/*
	 * The old node must be freed (in memory) _before_ unlocking the new
	 * nodes - else another thread could re-acquire a read lock on the old
	 * node after another thread has locked and updated the new node, thus
	 * seeing stale data:
	 */
	bch2_btree_node_free_inmem(trans, trans->paths + path, b);

	if (n3)
		bch2_trans_node_add(trans, trans->paths + path, n3);
	if (n2)
		bch2_trans_node_add(trans, trans->paths + path2, n2);
	bch2_trans_node_add(trans, trans->paths + path1, n1);

	if (n3)
		six_unlock_intent(&n3->c.lock);
	if (n2)
		six_unlock_intent(&n2->c.lock);
	six_unlock_intent(&n1->c.lock);
out:
	if (path2) {
		__bch2_btree_path_unlock(trans, trans->paths + path2);
		bch2_path_put(trans, path2, true);
	}
	if (path1) {
		__bch2_btree_path_unlock(trans, trans->paths + path1);
		bch2_path_put(trans, path1, true);
	}

	bch2_trans_verify_locks(trans);

	bch2_time_stats_update(&c->times[n2
			       ? BCH_TIME_btree_node_split
			       : BCH_TIME_btree_node_compact],
			       start_time);
	return ret;
err:
	if (n3)
		bch2_btree_node_free_never_used(as, trans, n3);
	if (n2)
		bch2_btree_node_free_never_used(as, trans, n2);
	bch2_btree_node_free_never_used(as, trans, n1);
	goto out;
}
/**
 * bch2_btree_insert_node - insert bkeys into a given btree node
 *
 * @as:			btree_update object
 * @trans:		btree_trans object
 * @path_idx:		path that points to current node
 * @b:			node to insert keys into
 * @keys:		list of keys to insert
 *
 * Returns: 0 on success, typically transaction restart error on failure
 *
 * Inserts as many keys as it can into a given btree node, splitting it if full.
 * If a split occurred, this function will return early. This can only happen
 * for leaf nodes -- inserts into interior nodes have to be atomic.
 */
static int bch2_btree_insert_node(struct btree_update *as, struct btree_trans *trans,
				  btree_path_idx_t path_idx, struct btree *b,
				  struct keylist *keys)
{
	struct bch_fs *c = as->c;
	struct btree_path *path = trans->paths + path_idx, *linked;
	unsigned i;
	int old_u64s = le16_to_cpu(btree_bset_last(b)->u64s);
	int old_live_u64s = b->nr.live_u64s;
	int live_u64s_added, u64s_added;
	int ret;

	lockdep_assert_held(&c->gc_lock);
	BUG_ON(!btree_node_intent_locked(path, b->c.level));
	BUG_ON(!b->c.level);
	BUG_ON(!as || as->b);
	bch2_verify_keylist_sorted(keys);

	ret = bch2_btree_node_lock_write(trans, path, &b->c);
	if (ret)
		return ret;

	bch2_btree_node_prep_for_write(trans, path, b);

	if (!bch2_btree_node_insert_fits(b, bch2_keylist_u64s(keys))) {
		bch2_btree_node_unlock_write(trans, path, b);
		goto split;
	}

	ret = bch2_btree_node_check_topology(trans, b);
	if (ret) {
		bch2_btree_node_unlock_write(trans, path, b);
		return ret;
	}

	bch2_btree_insert_keys_interior(as, trans, path, b,
					path->l[b->c.level].iter, keys);

	trans_for_each_path_with_node(trans, b, linked, i)
		bch2_btree_node_iter_peek(&linked->l[b->c.level].iter, b);

	bch2_trans_verify_paths(trans);

	live_u64s_added = (int) b->nr.live_u64s - old_live_u64s;
	u64s_added = (int) le16_to_cpu(btree_bset_last(b)->u64s) - old_u64s;

	if (b->sib_u64s[0] != U16_MAX && live_u64s_added < 0)
		b->sib_u64s[0] = max(0, (int) b->sib_u64s[0] + live_u64s_added);
	if (b->sib_u64s[1] != U16_MAX && live_u64s_added < 0)
		b->sib_u64s[1] = max(0, (int) b->sib_u64s[1] + live_u64s_added);

	if (u64s_added > live_u64s_added &&
	    bch2_maybe_compact_whiteouts(c, b))
		bch2_trans_node_reinit_iter(trans, b);

	btree_update_updated_node(as, b);
	bch2_btree_node_unlock_write(trans, path, b);

	BUG_ON(bch2_btree_node_check_topology(trans, b));
	return 0;
split:
	/*
	 * We could attempt to avoid the transaction restart, by calling
	 * bch2_btree_path_upgrade() and allocating more nodes:
	 */
	if (b->c.level >= as->update_level_end) {
		trace_and_count(c, trans_restart_split_race, trans, _THIS_IP_, b);
		return btree_trans_restart(trans, BCH_ERR_transaction_restart_split_race);
	}

	return btree_split(as, trans, path_idx, b, keys);
}
int bch2_btree_split_leaf(struct btree_trans *trans,
			  btree_path_idx_t path,
			  unsigned flags)
{
	/* btree_split & merge may both cause paths array to be reallocated */
	struct btree *b = path_l(trans->paths + path)->b;
	struct btree_update *as;
	unsigned l;
	int ret = 0;

	as = bch2_btree_update_start(trans, trans->paths + path,
				     trans->paths[path].level,
				     true, flags);
	if (IS_ERR(as))
		return PTR_ERR(as);

	ret = btree_split(as, trans, path, b, NULL);
	if (ret) {
		bch2_btree_update_free(as, trans);
		return ret;
	}

	bch2_btree_update_done(as, trans);

	for (l = trans->paths[path].level + 1;
	     btree_node_intent_locked(&trans->paths[path], l) && !ret;
	     l++)
		ret = bch2_foreground_maybe_merge(trans, path, l, flags);

	return ret;
}
static void __btree_increase_depth(struct btree_update *as, struct btree_trans *trans,
				   btree_path_idx_t path_idx)
{
	struct bch_fs *c = as->c;
	struct btree_path *path = trans->paths + path_idx;
	struct btree *n, *b = bch2_btree_id_root(c, path->btree_id)->b;

	BUG_ON(!btree_node_locked(path, b->c.level));

	n = __btree_root_alloc(as, trans, b->c.level + 1);

	bch2_btree_update_add_new_node(as, n);
	six_unlock_write(&n->c.lock);

	path->locks_want++;
	BUG_ON(btree_node_locked(path, n->c.level));
	six_lock_increment(&n->c.lock, SIX_LOCK_intent);
	mark_btree_node_locked(trans, path, n->c.level, BTREE_NODE_INTENT_LOCKED);
	bch2_btree_path_level_init(trans, path, n);

	n->sib_u64s[0] = U16_MAX;
	n->sib_u64s[1] = U16_MAX;

	bch2_keylist_add(&as->parent_keys, &b->key);
	btree_split_insert_keys(as, trans, path_idx, n, &as->parent_keys);

	bch2_btree_set_root(as, trans, path, n);
	bch2_btree_update_get_open_buckets(as, n);
	bch2_btree_node_write(c, n, SIX_LOCK_intent, 0);
	bch2_trans_node_add(trans, path, n);
	six_unlock_intent(&n->c.lock);

	mutex_lock(&c->btree_cache.lock);
	list_add_tail(&b->list, &c->btree_cache.live);
	mutex_unlock(&c->btree_cache.lock);

	bch2_trans_verify_locks(trans);
}
int bch2_btree_increase_depth(struct btree_trans *trans, btree_path_idx_t path, unsigned flags)
{
	struct bch_fs *c = trans->c;
	struct btree *b = bch2_btree_id_root(c, trans->paths[path].btree_id)->b;

	if (btree_node_fake(b))
		return bch2_btree_split_leaf(trans, path, flags);

	struct btree_update *as =
		bch2_btree_update_start(trans, trans->paths + path, b->c.level, true, flags);
	if (IS_ERR(as))
		return PTR_ERR(as);

	__btree_increase_depth(as, trans, path);
	bch2_btree_update_done(as, trans);
	return 0;
}
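
/*
 * Maybe merge @b with the sibling on the @sib side: estimate the size of the
 * would-be merged node in a combined format, cache that in b->sib_u64s[], and
 * only do the merge if the estimate is under
 * c->btree_foreground_merge_threshold.
 */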
1893 int __bch2_foreground_maybe_merge(struct btree_trans
*trans
,
1894 btree_path_idx_t path
,
1897 enum btree_node_sibling sib
)
1899 struct bch_fs
*c
= trans
->c
;
1900 struct btree_update
*as
;
1901 struct bkey_format_state new_s
;
1902 struct bkey_format new_f
;
1903 struct bkey_i
delete;
1904 struct btree
*b
, *m
, *n
, *prev
, *next
, *parent
;
1905 struct bpos sib_pos
;
1907 enum btree_id btree
= trans
->paths
[path
].btree_id
;
1908 btree_path_idx_t sib_path
= 0, new_path
= 0;
1909 u64 start_time
= local_clock();
1912 BUG_ON(!trans
->paths
[path
].should_be_locked
);
1913 BUG_ON(!btree_node_locked(&trans
->paths
[path
], level
));
1915 b
= trans
->paths
[path
].l
[level
].b
;
1917 if ((sib
== btree_prev_sib
&& bpos_eq(b
->data
->min_key
, POS_MIN
)) ||
1918 (sib
== btree_next_sib
&& bpos_eq(b
->data
->max_key
, SPOS_MAX
))) {
1919 b
->sib_u64s
[sib
] = U16_MAX
;
1923 sib_pos
= sib
== btree_prev_sib
1924 ? bpos_predecessor(b
->data
->min_key
)
1925 : bpos_successor(b
->data
->max_key
);
1927 sib_path
= bch2_path_get(trans
, btree
, sib_pos
,
1928 U8_MAX
, level
, BTREE_ITER_INTENT
, _THIS_IP_
);
1929 ret
= bch2_btree_path_traverse(trans
, sib_path
, false);
1933 btree_path_set_should_be_locked(trans
->paths
+ sib_path
);
1935 m
= trans
->paths
[sib_path
].l
[level
].b
;
1937 if (btree_node_parent(trans
->paths
+ path
, b
) !=
1938 btree_node_parent(trans
->paths
+ sib_path
, m
)) {
1939 b
->sib_u64s
[sib
] = U16_MAX
;
1943 if (sib
== btree_prev_sib
) {
1951 if (!bpos_eq(bpos_successor(prev
->data
->max_key
), next
->data
->min_key
)) {
1952 struct printbuf buf1
= PRINTBUF
, buf2
= PRINTBUF
;
1954 bch2_bpos_to_text(&buf1
, prev
->data
->max_key
);
1955 bch2_bpos_to_text(&buf2
, next
->data
->min_key
);
1957 "%s(): btree topology error:\n"
1958 " prev ends at %s\n"
1959 " next starts at %s",
1960 __func__
, buf1
.buf
, buf2
.buf
);
1961 printbuf_exit(&buf1
);
1962 printbuf_exit(&buf2
);
1963 ret
= bch2_topology_error(c
);
1967 bch2_bkey_format_init(&new_s
);
1968 bch2_bkey_format_add_pos(&new_s
, prev
->data
->min_key
);
1969 __bch2_btree_calc_format(&new_s
, prev
);
1970 __bch2_btree_calc_format(&new_s
, next
);
1971 bch2_bkey_format_add_pos(&new_s
, next
->data
->max_key
);
1972 new_f
= bch2_bkey_format_done(&new_s
);
1974 sib_u64s
= btree_node_u64s_with_format(b
->nr
, &b
->format
, &new_f
) +
1975 btree_node_u64s_with_format(m
->nr
, &m
->format
, &new_f
);
1977 if (sib_u64s
> BTREE_FOREGROUND_MERGE_HYSTERESIS(c
)) {
1978 sib_u64s
-= BTREE_FOREGROUND_MERGE_HYSTERESIS(c
);
1980 sib_u64s
+= BTREE_FOREGROUND_MERGE_HYSTERESIS(c
);
1983 sib_u64s
= min(sib_u64s
, btree_max_u64s(c
));
1984 sib_u64s
= min(sib_u64s
, (size_t) U16_MAX
- 1);
1985 b
->sib_u64s
[sib
] = sib_u64s
;
	if (b->sib_u64s[sib] > c->btree_foreground_merge_threshold)
		goto out;

	parent = btree_node_parent(trans->paths + path, b);
	as = bch2_btree_update_start(trans, trans->paths + path, level, false,
				     BCH_TRANS_COMMIT_no_enospc|flags);
	ret = PTR_ERR_OR_ZERO(as);
	if (ret)
		goto err;

	trace_and_count(c, btree_node_merge, trans, b);

	bch2_btree_interior_update_will_free_node(as, b);
	bch2_btree_interior_update_will_free_node(as, m);

	n = bch2_btree_node_alloc(as, trans, b->c.level);

	SET_BTREE_NODE_SEQ(n->data,
			   max(BTREE_NODE_SEQ(b->data),
			       BTREE_NODE_SEQ(m->data)) + 1);

	btree_set_min(n, prev->data->min_key);
	btree_set_max(n, next->data->max_key);

	n->data->format	= new_f;
	btree_node_set_format(n, new_f);

	bch2_btree_sort_into(c, n, prev);
	bch2_btree_sort_into(c, n, next);

	bch2_btree_build_aux_trees(n);
	bch2_btree_update_add_new_node(as, n);
	six_unlock_write(&n->c.lock);

	new_path = get_unlocked_mut_path(trans, btree, n->c.level, n->key.k.p);
	six_lock_increment(&n->c.lock, SIX_LOCK_intent);
	mark_btree_node_locked(trans, trans->paths + new_path, n->c.level, BTREE_NODE_INTENT_LOCKED);
	bch2_btree_path_level_init(trans, trans->paths + new_path, n);

	bkey_init(&delete.k);
	delete.k.p = prev->key.k.p;
	bch2_keylist_add(&as->parent_keys, &delete);
	bch2_keylist_add(&as->parent_keys, &n->key);

	bch2_trans_verify_paths(trans);

	ret = bch2_btree_insert_node(as, trans, path, parent, &as->parent_keys);
	if (ret)
		goto err_free_update;

	bch2_trans_verify_paths(trans);

	bch2_btree_update_get_open_buckets(as, n);
	bch2_btree_node_write(c, n, SIX_LOCK_intent, 0);

	bch2_btree_node_free_inmem(trans, trans->paths + path, b);
	bch2_btree_node_free_inmem(trans, trans->paths + sib_path, m);

	bch2_trans_node_add(trans, trans->paths + path, n);

	bch2_trans_verify_paths(trans);

	six_unlock_intent(&n->c.lock);

	bch2_btree_update_done(as, trans);

	bch2_time_stats_update(&c->times[BCH_TIME_btree_node_merge], start_time);
out:
err:
	if (new_path)
		bch2_path_put(trans, new_path, true);
	bch2_path_put(trans, sib_path, true);
	bch2_trans_verify_locks(trans);
	return ret;
err_free_update:
	bch2_btree_node_free_never_used(as, trans, n);
	bch2_btree_update_free(as, trans);
	goto out;
}
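
/*
 * Rewrite a btree node in place: allocate a replacement, copy @b into it,
 * insert the new node's key into the parent (or make it the new root), then
 * free @b. Used e.g. by the async rewrite machinery below.
 */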
int bch2_btree_node_rewrite(struct btree_trans *trans,
			    struct btree_iter *iter,
			    struct btree *b,
			    unsigned flags)
{
	struct bch_fs *c = trans->c;
	struct btree *n, *parent;
	struct btree_update *as;
	btree_path_idx_t new_path = 0;
	int ret;

	flags |= BCH_TRANS_COMMIT_no_enospc;

	struct btree_path *path = btree_iter_path(trans, iter);
	parent = btree_node_parent(path, b);
	as = bch2_btree_update_start(trans, path, b->c.level, false, flags);
	ret = PTR_ERR_OR_ZERO(as);
	if (ret)
		goto out;

	bch2_btree_interior_update_will_free_node(as, b);

	n = bch2_btree_node_alloc_replacement(as, trans, b);

	bch2_btree_build_aux_trees(n);
	bch2_btree_update_add_new_node(as, n);
	six_unlock_write(&n->c.lock);

	new_path = get_unlocked_mut_path(trans, iter->btree_id, n->c.level, n->key.k.p);
	six_lock_increment(&n->c.lock, SIX_LOCK_intent);
	mark_btree_node_locked(trans, trans->paths + new_path, n->c.level, BTREE_NODE_INTENT_LOCKED);
	bch2_btree_path_level_init(trans, trans->paths + new_path, n);

	trace_and_count(c, btree_node_rewrite, trans, b);

	if (parent) {
		bch2_keylist_add(&as->parent_keys, &n->key);
		ret = bch2_btree_insert_node(as, trans, iter->path, parent, &as->parent_keys);
		if (ret)
			goto err;
	} else {
		bch2_btree_set_root(as, trans, btree_iter_path(trans, iter), n);
	}

	bch2_btree_update_get_open_buckets(as, n);
	bch2_btree_node_write(c, n, SIX_LOCK_intent, 0);

	bch2_btree_node_free_inmem(trans, btree_iter_path(trans, iter), b);

	bch2_trans_node_add(trans, trans->paths + iter->path, n);
	six_unlock_intent(&n->c.lock);

	bch2_btree_update_done(as, trans);
out:
	if (new_path)
		bch2_path_put(trans, new_path, true);
	bch2_trans_downgrade(trans);
	return ret;
err:
	bch2_btree_node_free_never_used(as, trans, n);
	bch2_btree_update_free(as, trans);
	goto out;
}
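
/*
 * Deferred node rewrites: remember the btree, position and sequence number
 * of the node so the rewrite can be queued to a workqueue and run later:
 */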
struct async_btree_rewrite {
	struct bch_fs		*c;
	struct work_struct	work;
	struct list_head	list;
	enum btree_id		btree_id;
	unsigned		level;
	struct bpos		pos;
	__le64			seq;
};
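
/*
 * Look the node up again by position; if it's gone or its sequence number
 * no longer matches, it was already freed or rewritten and there's nothing
 * left to do:
 */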
static int async_btree_node_rewrite_trans(struct btree_trans *trans,
					  struct async_btree_rewrite *a)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct btree *b;
	int ret;

	bch2_trans_node_iter_init(trans, &iter, a->btree_id, a->pos,
				  BTREE_MAX_DEPTH, a->level, 0);
	b = bch2_btree_iter_peek_node(&iter);
	ret = PTR_ERR_OR_ZERO(b);
	if (ret)
		goto out;

	if (!b || b->data->keys.seq != a->seq) {
		struct printbuf buf = PRINTBUF;

		if (b)
			bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
		else
			prt_str(&buf, "(null)");
		bch_info(c, "%s: node to rewrite not found, searching for seq %llu, got\n%s",
			 __func__, a->seq, buf.buf);
		printbuf_exit(&buf);
		goto out;
	}

	ret = bch2_btree_node_rewrite(trans, &iter, b, 0);
out:
	bch2_trans_iter_exit(trans, &iter);

	return ret;
}
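
/* Workqueue entry point: run the rewrite, then drop the ref taken when it was queued: */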
static void async_btree_node_rewrite_work(struct work_struct *work)
{
	struct async_btree_rewrite *a =
		container_of(work, struct async_btree_rewrite, work);
	struct bch_fs *c = a->c;
	int ret;

	ret = bch2_trans_do(c, NULL, NULL, 0,
			    async_btree_node_rewrite_trans(trans, a));
	bch_err_fn_ratelimited(c, ret);
	bch2_write_ref_put(c, BCH_WRITE_REF_node_rewrite);
	kfree(a);
}
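
/*
 * Queue a node rewrite to run asynchronously. Before the filesystem is
 * allowed to go read-write the rewrite is stashed on a list instead, to be
 * queued later by bch2_do_pending_node_rewrites():
 */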
void bch2_btree_node_rewrite_async(struct bch_fs *c, struct btree *b)
{
	struct async_btree_rewrite *a;
	int ret;

	a = kmalloc(sizeof(*a), GFP_NOFS);
	if (!a) {
		bch_err(c, "%s: error allocating memory", __func__);
		return;
	}

	a->c		= c;
	a->btree_id	= b->c.btree_id;
	a->level	= b->c.level;
	a->pos		= b->key.k.p;
	a->seq		= b->data->keys.seq;
	INIT_WORK(&a->work, async_btree_node_rewrite_work);

	if (unlikely(!test_bit(BCH_FS_may_go_rw, &c->flags))) {
		mutex_lock(&c->pending_node_rewrites_lock);
		list_add(&a->list, &c->pending_node_rewrites);
		mutex_unlock(&c->pending_node_rewrites_lock);
		return;
	}

	if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_node_rewrite)) {
		if (test_bit(BCH_FS_started, &c->flags)) {
			bch_err(c, "%s: error getting c->writes ref", __func__);
			kfree(a);
			return;
		}

		ret = bch2_fs_read_write_early(c);
		bch_err_msg(c, ret, "going read-write");
		if (ret) {
			kfree(a);
			return;
		}

		bch2_write_ref_get(c, BCH_WRITE_REF_node_rewrite);
	}

	queue_work(c->btree_node_rewrite_worker, &a->work);
}
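
/* Called once the filesystem may go read-write: kick off stashed rewrites: */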
void bch2_do_pending_node_rewrites(struct bch_fs *c)
{
	struct async_btree_rewrite *a, *n;

	mutex_lock(&c->pending_node_rewrites_lock);
	list_for_each_entry_safe(a, n, &c->pending_node_rewrites, list) {
		list_del(&a->list);

		bch2_write_ref_get(c, BCH_WRITE_REF_node_rewrite);
		queue_work(c->btree_node_rewrite_worker, &a->work);
	}
	mutex_unlock(&c->pending_node_rewrites_lock);
}
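
/* Shutdown path: free stashed rewrites that were never queued: */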
void bch2_free_pending_node_rewrites(struct bch_fs *c)
{
	struct async_btree_rewrite *a, *n;

	mutex_lock(&c->pending_node_rewrites_lock);
	list_for_each_entry_safe(a, n, &c->pending_node_rewrites, list) {
		list_del(&a->list);

		kfree(a);
	}
	mutex_unlock(&c->pending_node_rewrites_lock);
}
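
/*
 * Change a btree node's key: run triggers for the old and new keys, update
 * the parent (or, for the root, emit a new btree_root journal entry), then
 * rehash the node under its write lock if the new key hashes differently:
 */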
static int __bch2_btree_node_update_key(struct btree_trans *trans,
					struct btree_iter *iter,
					struct btree *b, struct btree *new_hash,
					struct bkey_i *new_key,
					unsigned commit_flags,
					bool skip_triggers)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter2 = { NULL };
	struct btree *parent;
	int ret;

	if (!skip_triggers) {
		ret = bch2_key_trigger_old(trans, b->c.btree_id, b->c.level + 1,
					   bkey_i_to_s_c(&b->key),
					   BTREE_TRIGGER_TRANSACTIONAL) ?:
			bch2_key_trigger_new(trans, b->c.btree_id, b->c.level + 1,
					     bkey_i_to_s(new_key),
					     BTREE_TRIGGER_TRANSACTIONAL);
		if (ret)
			return ret;
	}

	if (new_hash) {
		bkey_copy(&new_hash->key, new_key);
		ret = bch2_btree_node_hash_insert(&c->btree_cache,
				new_hash, b->c.level, b->c.btree_id);
		BUG_ON(ret);
	}

	parent = btree_node_parent(btree_iter_path(trans, iter), b);
	if (parent) {
		bch2_trans_copy_iter(&iter2, iter);

		iter2.path = bch2_btree_path_make_mut(trans, iter2.path,
				iter2.flags & BTREE_ITER_INTENT,
				_THIS_IP_);

		struct btree_path *path2 = btree_iter_path(trans, &iter2);
		BUG_ON(path2->level != b->c.level);
		BUG_ON(!bpos_eq(path2->pos, new_key->k.p));

		btree_path_set_level_up(trans, path2);

		trans->paths_sorted = false;

		ret   = bch2_btree_iter_traverse(&iter2) ?:
			bch2_trans_update(trans, &iter2, new_key, BTREE_TRIGGER_NORUN);
		if (ret)
			goto err;
	} else {
		BUG_ON(btree_node_root(c, b) != b);

		struct jset_entry *e = bch2_trans_jset_entry_alloc(trans,
				       jset_u64s(new_key->k.u64s));
		ret = PTR_ERR_OR_ZERO(e);
		if (ret)
			return ret;

		journal_entry_set(e,
				  BCH_JSET_ENTRY_btree_root,
				  b->c.btree_id, b->c.level,
				  new_key, new_key->k.u64s);
	}

	ret = bch2_trans_commit(trans, NULL, NULL, commit_flags);
	if (ret)
		goto err;

	bch2_btree_node_lock_write_nofail(trans, btree_iter_path(trans, iter), &b->c);

	if (new_hash) {
		mutex_lock(&c->btree_cache.lock);
		bch2_btree_node_hash_remove(&c->btree_cache, new_hash);
		bch2_btree_node_hash_remove(&c->btree_cache, b);

		bkey_copy(&b->key, new_key);
		ret = __bch2_btree_node_hash_insert(&c->btree_cache, b);
		BUG_ON(ret);
		mutex_unlock(&c->btree_cache.lock);
	} else {
		bkey_copy(&b->key, new_key);
	}

	bch2_btree_node_unlock_write(trans, btree_iter_path(trans, iter), b);
out:
	bch2_trans_iter_exit(trans, &iter2);
	return ret;
err:
	if (new_hash) {
		mutex_lock(&c->btree_cache.lock);
		bch2_btree_node_hash_remove(&c->btree_cache, b);
		mutex_unlock(&c->btree_cache.lock);
	}
	goto out;
}
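
/*
 * If the new key will hash differently we need a spare node to rehash into;
 * allocate it up front, cannibalizing another cached node if necessary:
 */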
int bch2_btree_node_update_key(struct btree_trans *trans, struct btree_iter *iter,
			       struct btree *b, struct bkey_i *new_key,
			       unsigned commit_flags, bool skip_triggers)
{
	struct bch_fs *c = trans->c;
	struct btree *new_hash = NULL;
	struct btree_path *path = btree_iter_path(trans, iter);
	struct closure cl;
	int ret = 0;

	ret = bch2_btree_path_upgrade(trans, path, b->c.level + 1);
	if (ret)
		return ret;

	closure_init_stack(&cl);

	/*
	 * check btree_ptr_hash_val() after @b is locked by
	 * btree_iter_traverse():
	 */
	if (btree_ptr_hash_val(new_key) != b->hash_val) {
		ret = bch2_btree_cache_cannibalize_lock(trans, &cl);
		if (ret) {
			ret = drop_locks_do(trans, (closure_sync(&cl), 0));
			if (ret)
				return ret;
		}

		new_hash = bch2_btree_node_mem_alloc(trans, false);
	}

	path->intent_ref++;
	ret = __bch2_btree_node_update_key(trans, iter, b, new_hash, new_key,
					   commit_flags, skip_triggers);
	--path->intent_ref;

	if (new_hash) {
		mutex_lock(&c->btree_cache.lock);
		list_move(&new_hash->list, &c->btree_cache.freeable);
		mutex_unlock(&c->btree_cache.lock);

		six_unlock_write(&new_hash->c.lock);
		six_unlock_intent(&new_hash->c.lock);
	}
	closure_sync(&cl);
	bch2_btree_cache_cannibalize_unlock(trans);
	return ret;
}
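
/*
 * As bch2_btree_node_update_key(), for callers without an iterator to @b;
 * also drops pointers to devices the existing key doesn't have:
 */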
int bch2_btree_node_update_key_get_iter(struct btree_trans *trans,
					struct btree *b, struct bkey_i *new_key,
					unsigned commit_flags, bool skip_triggers)
{
	struct btree_iter iter;
	int ret;

	bch2_trans_node_iter_init(trans, &iter, b->c.btree_id, b->key.k.p,
				  BTREE_MAX_DEPTH, b->c.level,
				  BTREE_ITER_INTENT);
	ret = bch2_btree_iter_traverse(&iter);
	if (ret)
		goto out;

	/* has node been freed? */
	if (btree_iter_path(trans, &iter)->l[b->c.level].b != b) {
		BUG_ON(!btree_node_dying(b));
		goto out;
	}

	BUG_ON(!btree_node_hashed(b));

	struct bch_extent_ptr *ptr;
	bch2_bkey_drop_ptrs(bkey_i_to_s(new_key), ptr,
			    !bch2_bkey_has_device(bkey_i_to_s(&b->key), ptr->dev));

	ret = bch2_btree_node_update_key(trans, &iter, b, new_key,
					 commit_flags, skip_triggers);
out:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}
/*
 * Only for filesystem bringup, when first reading the btree roots or allocating
 * btree roots when initializing a new filesystem:
 */
void bch2_btree_set_root_for_read(struct bch_fs *c, struct btree *b)
{
	BUG_ON(btree_node_root(c, b));

	bch2_btree_set_root_inmem(c, b);
}
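
/*
 * Allocate an empty fake root, flagged as needing a rewrite - used for a
 * btree that doesn't yet have a root on disk:
 */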
static int __bch2_btree_root_alloc_fake(struct btree_trans *trans, enum btree_id id, unsigned level)
{
	struct bch_fs *c = trans->c;
	struct closure cl;
	struct btree *b;
	int ret;

	closure_init_stack(&cl);

	do {
		ret = bch2_btree_cache_cannibalize_lock(trans, &cl);
		closure_sync(&cl);
	} while (ret);

	b = bch2_btree_node_mem_alloc(trans, false);
	bch2_btree_cache_cannibalize_unlock(trans);

	set_btree_node_fake(b);
	set_btree_node_need_rewrite(b);
	b->c.level	= level;
	b->c.btree_id	= id;

	bkey_btree_ptr_init(&b->key);
	b->key.k.p = SPOS_MAX;
	*((u64 *) bkey_i_to_btree_ptr(&b->key)->v.start) = U64_MAX - id;

	bch2_bset_init_first(b, &b->data->keys);
	bch2_btree_build_aux_trees(b);

	btree_set_min(b, POS_MIN);
	btree_set_max(b, SPOS_MAX);
	b->data->format = bch2_btree_calc_format(b);
	btree_node_set_format(b, b->data->format);

	ret = bch2_btree_node_hash_insert(&c->btree_cache, b,
					  b->c.level, b->c.btree_id);
	BUG_ON(ret);

	bch2_btree_set_root_inmem(c, b);

	six_unlock_write(&b->c.lock);
	six_unlock_intent(&b->c.lock);
	return 0;
}
void bch2_btree_root_alloc_fake(struct bch_fs *c, enum btree_id id, unsigned level)
{
	bch2_trans_run(c, __bch2_btree_root_alloc_fake(trans, id, level));
}
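
/* Debug output for in-flight interior updates (exposed e.g. via sysfs): */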
static void bch2_btree_update_to_text(struct printbuf *out, struct btree_update *as)
{
	prt_printf(out, "%ps: btree=%s l=%u-%u watermark=%s mode=%s nodes_written=%u cl.remaining=%u journal_seq=%llu\n",
		   (void *) as->ip_started,
		   bch2_btree_id_str(as->btree_id),
		   as->update_level_start,
		   as->update_level_end,
		   bch2_watermarks[as->watermark],
		   bch2_btree_update_modes[as->mode],
		   as->nodes_written,
		   closure_nr_remaining(&as->cl),
		   as->journal.seq);
}
void bch2_btree_updates_to_text(struct printbuf *out, struct bch_fs *c)
{
	struct btree_update *as;

	mutex_lock(&c->btree_interior_update_lock);
	list_for_each_entry(as, &c->btree_interior_update_list, list)
		bch2_btree_update_to_text(out, as);
	mutex_unlock(&c->btree_interior_update_lock);
}
static bool bch2_btree_interior_updates_pending(struct bch_fs *c)
{
	bool ret;

	mutex_lock(&c->btree_interior_update_lock);
	ret = !list_empty(&c->btree_interior_update_list);
	mutex_unlock(&c->btree_interior_update_lock);

	return ret;
}
bool bch2_btree_interior_updates_flush(struct bch_fs *c)
{
	bool ret = bch2_btree_interior_updates_pending(c);

	if (ret)
		closure_wait_event(&c->btree_interior_update_wait,
				   !bch2_btree_interior_updates_pending(c));
	return ret;
}
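
/* Set an in-memory btree root from a btree_root journal entry: */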
void bch2_journal_entry_to_btree_root(struct bch_fs *c, struct jset_entry *entry)
{
	struct btree_root *r = bch2_btree_id_root(c, entry->btree_id);

	mutex_lock(&c->btree_root_lock);

	r->level = entry->level;
	r->alive = true;
	bkey_copy(&r->key, (struct bkey_i *) entry->start);

	mutex_unlock(&c->btree_root_lock);
}
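
/*
 * Emit a btree_root journal entry for every live btree root not in @skip;
 * returns the new end of the journal entry buffer:
 */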
struct jset_entry *
bch2_btree_roots_to_journal_entries(struct bch_fs *c,
				    struct jset_entry *end,
				    unsigned long skip)
{
	unsigned i;

	mutex_lock(&c->btree_root_lock);

	for (i = 0; i < btree_id_nr_alive(c); i++) {
		struct btree_root *r = bch2_btree_id_root(c, i);

		if (r->alive && !test_bit(i, &skip)) {
			journal_entry_set(end, BCH_JSET_ENTRY_btree_root,
					  i, r->level, &r->key, r->key.k.u64s);
			end = vstruct_next(end);
		}
	}

	mutex_unlock(&c->btree_root_lock);

	return end;
}
void bch2_fs_btree_interior_update_exit(struct bch_fs *c)
{
	if (c->btree_node_rewrite_worker)
		destroy_workqueue(c->btree_node_rewrite_worker);
	if (c->btree_interior_update_worker)
		destroy_workqueue(c->btree_interior_update_worker);
	mempool_exit(&c->btree_interior_update_pool);
}
void bch2_fs_btree_interior_update_init_early(struct bch_fs *c)
{
	mutex_init(&c->btree_reserve_cache_lock);
	INIT_LIST_HEAD(&c->btree_interior_update_list);
	INIT_LIST_HEAD(&c->btree_interior_updates_unwritten);
	mutex_init(&c->btree_interior_update_lock);
	INIT_WORK(&c->btree_interior_update_work, btree_interior_update_work);

	INIT_LIST_HEAD(&c->pending_node_rewrites);
	mutex_init(&c->pending_node_rewrites_lock);
}
int bch2_fs_btree_interior_update_init(struct bch_fs *c)
{
	c->btree_interior_update_worker =
		alloc_workqueue("btree_update", WQ_UNBOUND|WQ_MEM_RECLAIM, 8);
	if (!c->btree_interior_update_worker)
		return -BCH_ERR_ENOMEM_btree_interior_update_worker_init;

	c->btree_node_rewrite_worker =
		alloc_ordered_workqueue("btree_node_rewrite", WQ_UNBOUND);
	if (!c->btree_node_rewrite_worker)
		return -BCH_ERR_ENOMEM_btree_interior_update_worker_init;

	if (mempool_init_kmalloc_pool(&c->btree_interior_update_pool, 1,
				      sizeof(struct btree_update)))
		return -BCH_ERR_ENOMEM_btree_interior_update_pool_init;

	return 0;
}