// SPDX-License-Identifier: GPL-2.0

#include "bcachefs.h"
#include "alloc_foreground.h"
#include "bkey_methods.h"
#include "btree_cache.h"
#include "btree_journal_iter.h"
#include "btree_update.h"
#include "btree_update_interior.h"
#include "btree_iter.h"
#include "btree_locking.h"
#include "journal_reclaim.h"

#include <linux/random.h>

static int bch2_btree_insert_node(struct btree_update *, struct btree_trans *,
				  struct btree_path *, struct btree *,
				  struct keylist *, unsigned);
static void bch2_btree_update_add_new_node(struct btree_update *, struct btree *);
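/*
 * Allocate a new, uniquely owned path pointing at @pos at @level, then return
 * it unlocked: the node it will point at doesn't exist yet, so callers lock
 * and initialize the path themselves once the node has been created.
 */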
static struct btree_path *get_unlocked_mut_path(struct btree_trans *trans,
						enum btree_id btree_id,
						unsigned level,
						struct bpos pos)
{
	struct btree_path *path;

	path = bch2_path_get(trans, btree_id, pos, level + 1, level,
			     BTREE_ITER_NOPRESERVE|
			     BTREE_ITER_INTENT, _RET_IP_);
	path = bch2_btree_path_make_mut(trans, path, true, _RET_IP_);
	bch2_btree_path_downgrade(trans, path);
	__bch2_btree_path_unlock(trans, path);
	return path;
}

/*
 * Verify that child nodes correctly span parent node's range:
 */
static void btree_node_interior_verify(struct bch_fs *c, struct btree *b)
{
#ifdef CONFIG_BCACHEFS_DEBUG
	struct bpos next_node = b->data->min_key;
	struct btree_node_iter iter;
	struct bkey_s_c k;
	struct bkey_s_c_btree_ptr_v2 bp;
	struct bkey unpacked;
	struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF;

	BUG_ON(!b->c.level);

	if (!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags))
		return;

	bch2_btree_node_iter_init_from_start(&iter, b);

	while (1) {
		k = bch2_btree_node_iter_peek_unpack(&iter, b, &unpacked);
		if (k.k->type != KEY_TYPE_btree_ptr_v2)
			break;
		bp = bkey_s_c_to_btree_ptr_v2(k);

		if (!bpos_eq(next_node, bp.v->min_key)) {
			bch2_dump_btree_node(c, b);
			bch2_bpos_to_text(&buf1, next_node);
			bch2_bpos_to_text(&buf2, bp.v->min_key);
			panic("expected next min_key %s got %s\n", buf1.buf, buf2.buf);
		}

		bch2_btree_node_iter_advance(&iter, b);

		if (bch2_btree_node_iter_end(&iter)) {
			if (!bpos_eq(k.k->p, b->key.k.p)) {
				bch2_dump_btree_node(c, b);
				bch2_bpos_to_text(&buf1, b->key.k.p);
				bch2_bpos_to_text(&buf2, k.k->p);
				panic("expected end %s got %s\n", buf1.buf, buf2.buf);
			}
			break;
		}

		next_node = bpos_successor(k.k->p);
	}
#endif
}

/* Calculate ideal packed bkey format for new btree nodes: */

void __bch2_btree_calc_format(struct bkey_format_state *s, struct btree *b)
{
	struct bkey_packed *k;
	struct bset_tree *t;
	struct bkey uk;

	for_each_bset(b, t)
		bset_tree_for_each_key(b, t, k)
			if (!bkey_deleted(k)) {
				uk = bkey_unpack_key(b, k);
				bch2_bkey_format_add_key(s, &uk);
			}
}

static struct bkey_format bch2_btree_calc_format(struct btree *b)
{
	struct bkey_format_state s;

	bch2_bkey_format_init(&s);
	bch2_bkey_format_add_pos(&s, b->data->min_key);
	bch2_bkey_format_add_pos(&s, b->data->max_key);
	__bch2_btree_calc_format(&s, b);

	return bch2_bkey_format_done(&s);
}
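/*
 * Estimate how many u64s @b would occupy repacked with @new_f: each packed
 * key grows or shrinks by the change in key_u64s, each unpacked key by the
 * difference from BKEY_U64s.
 */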
static size_t btree_node_u64s_with_format(struct btree *b,
					  struct bkey_format *new_f)
{
	struct bkey_format *old_f = &b->format;

	/* stupid integer promotion rules */
	ssize_t delta =
	    (((int) new_f->key_u64s - old_f->key_u64s) *
	     (int) b->nr.packed_keys) +
	    (((int) new_f->key_u64s - BKEY_U64s) *
	     (int) b->nr.unpacked_keys);

	BUG_ON(delta + b->nr.live_u64s < 0);

	return b->nr.live_u64s + delta;
}

/**
 * bch2_btree_node_format_fits - check if we could rewrite node with a new format
 *
 * @c:		filesystem handle
 * @b:		btree node to rewrite
 * @new_f:	bkey format to translate keys to
 *
 * Returns: true if all re-packed keys will be able to fit in a new node.
 *
 * Assumes all keys will successfully pack with the new format.
 */
bool bch2_btree_node_format_fits(struct bch_fs *c, struct btree *b,
				 struct bkey_format *new_f)
{
	size_t u64s = btree_node_u64s_with_format(b, new_f);

	return __vstruct_bytes(struct btree_node, u64s) < btree_bytes(c);
}

/* Btree node freeing/allocation: */

static void __btree_node_free(struct bch_fs *c, struct btree *b)
{
	trace_and_count(c, btree_node_free, c, b);

	BUG_ON(btree_node_write_blocked(b));
	BUG_ON(btree_node_dirty(b));
	BUG_ON(btree_node_need_write(b));
	BUG_ON(b == btree_node_root(c, b));
	BUG_ON(!list_empty(&b->write_blocked));
	BUG_ON(b->will_make_reachable);

	clear_btree_node_noevict(b);

	mutex_lock(&c->btree_cache.lock);
	list_move(&b->list, &c->btree_cache.freeable);
	mutex_unlock(&c->btree_cache.lock);
}

static void bch2_btree_node_free_inmem(struct btree_trans *trans,
				       struct btree_path *path,
				       struct btree *b)
{
	struct bch_fs *c = trans->c;
	unsigned level = b->c.level;

	bch2_btree_node_lock_write_nofail(trans, path, &b->c);
	bch2_btree_node_hash_remove(&c->btree_cache, b);
	__btree_node_free(c, b);
	six_unlock_write(&b->c.lock);
	mark_btree_node_locked_noreset(path, level, BTREE_NODE_INTENT_LOCKED);

	trans_for_each_path(trans, path)
		if (path->l[level].b == b) {
			btree_node_unlock(trans, path, level);
			path->l[level].b = ERR_PTR(-BCH_ERR_no_btree_node_init);
		}
}

static void bch2_btree_node_free_never_used(struct btree_update *as,
					    struct btree_trans *trans,
					    struct btree *b)
{
	struct bch_fs *c = as->c;
	struct prealloc_nodes *p = &as->prealloc_nodes[b->c.lock.readers != NULL];
	struct btree_path *path;
	unsigned level = b->c.level;

	BUG_ON(!list_empty(&b->write_blocked));
	BUG_ON(b->will_make_reachable != (1UL|(unsigned long) as));

	b->will_make_reachable = 0;
	closure_put(&as->cl);

	clear_btree_node_will_make_reachable(b);
	clear_btree_node_accessed(b);
	clear_btree_node_dirty_acct(c, b);
	clear_btree_node_need_write(b);

	mutex_lock(&c->btree_cache.lock);
	list_del_init(&b->list);
	bch2_btree_node_hash_remove(&c->btree_cache, b);
	mutex_unlock(&c->btree_cache.lock);

	BUG_ON(p->nr >= ARRAY_SIZE(p->b));
	p->b[p->nr++] = b;

	six_unlock_intent(&b->c.lock);

	trans_for_each_path(trans, path)
		if (path->l[level].b == b) {
			btree_node_unlock(trans, path, level);
			path->l[level].b = ERR_PTR(-BCH_ERR_no_btree_node_init);
		}
}
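/*
 * Low level btree node allocation: the btree_reserve_cache of already
 * allocated-on-disk nodes is consulted first; failing that, fresh sectors are
 * allocated from the btree write point.
 */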
static struct btree *__bch2_btree_node_alloc(struct btree_trans *trans,
					     struct disk_reservation *res,
					     struct closure *cl,
					     bool interior_node,
					     unsigned flags)
{
	struct bch_fs *c = trans->c;
	struct write_point *wp;
	struct btree *b;
	BKEY_PADDED_ONSTACK(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp;
	struct open_buckets obs = { .nr = 0 };
	struct bch_devs_list devs_have = (struct bch_devs_list) { 0 };
	enum bch_watermark watermark = flags & BCH_WATERMARK_MASK;
	unsigned nr_reserve = watermark > BCH_WATERMARK_reclaim
		? BTREE_NODE_RESERVE
		: 0;
	int ret;

	mutex_lock(&c->btree_reserve_cache_lock);
	if (c->btree_reserve_cache_nr > nr_reserve) {
		struct btree_alloc *a =
			&c->btree_reserve_cache[--c->btree_reserve_cache_nr];

		obs = a->ob;
		bkey_copy(&tmp.k, &a->k);
		mutex_unlock(&c->btree_reserve_cache_lock);
		goto mem_alloc;
	}
	mutex_unlock(&c->btree_reserve_cache_lock);

retry:
	ret = bch2_alloc_sectors_start_trans(trans,
				      c->opts.metadata_target ?:
				      c->opts.foreground_target,
				      0,
				      writepoint_ptr(&c->btree_write_point),
				      &devs_have,
				      res->nr_replicas,
				      c->opts.metadata_replicas_required,
				      watermark, 0, cl, &wp);
	if (unlikely(ret))
		return ERR_PTR(ret);

	if (wp->sectors_free < btree_sectors(c)) {
		struct open_bucket *ob;
		unsigned i;

		open_bucket_for_each(c, &wp->ptrs, ob, i)
			if (ob->sectors_free < btree_sectors(c))
				ob->sectors_free = 0;

		bch2_alloc_sectors_done(c, wp);
		goto retry;
	}

	bkey_btree_ptr_v2_init(&tmp.k);
	bch2_alloc_sectors_append_ptrs(c, wp, &tmp.k, btree_sectors(c), false);

	bch2_open_bucket_get(c, wp, &obs);
	bch2_alloc_sectors_done(c, wp);
mem_alloc:
	b = bch2_btree_node_mem_alloc(trans, interior_node);
	six_unlock_write(&b->c.lock);
	six_unlock_intent(&b->c.lock);

	/* we hold cannibalize_lock: */
	BUG_ON(IS_ERR(b));

	bkey_copy(&b->key, &tmp.k);
	b->ob = obs;

	return b;
}
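/*
 * Take a node preallocated by bch2_btree_reserve_get() off @as's prealloc
 * list and initialize it for use at @level:
 */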
static struct btree *bch2_btree_node_alloc(struct btree_update *as,
					   struct btree_trans *trans,
					   unsigned level)
{
	struct bch_fs *c = as->c;
	struct btree *b;
	struct prealloc_nodes *p = &as->prealloc_nodes[!!level];
	int ret;

	BUG_ON(level >= BTREE_MAX_DEPTH);
	BUG_ON(!p->nr);

	b = p->b[--p->nr];

	btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent);
	btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_write);

	set_btree_node_accessed(b);
	set_btree_node_dirty_acct(c, b);
	set_btree_node_need_write(b);

	bch2_bset_init_first(b, &b->data->keys);
	b->c.level	= level;
	b->c.btree_id	= as->btree_id;
	b->version_ondisk = c->sb.version;

	memset(&b->nr, 0, sizeof(b->nr));
	b->data->magic = cpu_to_le64(bset_magic(c));
	memset(&b->data->_ptr, 0, sizeof(b->data->_ptr));
	SET_BTREE_NODE_ID(b->data, as->btree_id);
	SET_BTREE_NODE_LEVEL(b->data, level);

	if (b->key.k.type == KEY_TYPE_btree_ptr_v2) {
		struct bkey_i_btree_ptr_v2 *bp = bkey_i_to_btree_ptr_v2(&b->key);

		bp->v.mem_ptr = 0;
		bp->v.seq = b->data->keys.seq;
		bp->v.sectors_written = 0;
	}

	SET_BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data, true);

	bch2_btree_build_aux_trees(b);

	ret = bch2_btree_node_hash_insert(&c->btree_cache, b, level, as->btree_id);
	BUG_ON(ret);

	trace_and_count(c, btree_node_alloc, c, b);
	bch2_increment_clock(c, btree_sectors(c), WRITE);
	return b;
}

static void btree_set_min(struct btree *b, struct bpos pos)
{
	if (b->key.k.type == KEY_TYPE_btree_ptr_v2)
		bkey_i_to_btree_ptr_v2(&b->key)->v.min_key = pos;
	b->data->min_key = pos;
}

static void btree_set_max(struct btree *b, struct bpos pos)
{
	b->key.k.p = pos;
	b->data->max_key = pos;
}

static struct btree *bch2_btree_node_alloc_replacement(struct btree_update *as,
						       struct btree_trans *trans,
						       struct btree *b)
{
	struct btree *n = bch2_btree_node_alloc(as, trans, b->c.level);
	struct bkey_format format = bch2_btree_calc_format(b);

	/*
	 * The keys might expand with the new format - if they wouldn't fit in
	 * the btree node anymore, use the old format for now:
	 */
	if (!bch2_btree_node_format_fits(as->c, b, &format))
		format = b->format;

	SET_BTREE_NODE_SEQ(n->data, BTREE_NODE_SEQ(b->data) + 1);

	btree_set_min(n, b->data->min_key);
	btree_set_max(n, b->data->max_key);

	n->data->format	= format;
	btree_node_set_format(n, format);

	bch2_btree_sort_into(as->c, n, b);

	btree_node_reset_sib_u64s(n);
	return n;
}

static struct btree *__btree_root_alloc(struct btree_update *as,
					struct btree_trans *trans, unsigned level)
{
	struct btree *b = bch2_btree_node_alloc(as, trans, level);

	btree_set_min(b, POS_MIN);
	btree_set_max(b, SPOS_MAX);
	b->data->format = bch2_btree_calc_format(b);

	btree_node_set_format(b, b->data->format);
	bch2_btree_build_aux_trees(b);

	return b;
}

static void bch2_btree_reserve_put(struct btree_update *as, struct btree_trans *trans)
{
	struct bch_fs *c = as->c;
	struct prealloc_nodes *p;

	for (p = as->prealloc_nodes;
	     p < as->prealloc_nodes + ARRAY_SIZE(as->prealloc_nodes);
	     p++) {
		while (p->nr) {
			struct btree *b = p->b[--p->nr];

			mutex_lock(&c->btree_reserve_cache_lock);

			if (c->btree_reserve_cache_nr <
			    ARRAY_SIZE(c->btree_reserve_cache)) {
				struct btree_alloc *a =
					&c->btree_reserve_cache[c->btree_reserve_cache_nr++];

				a->ob = b->ob;
				b->ob.nr = 0;
				bkey_copy(&a->k, &b->key);
			} else {
				bch2_open_buckets_put(c, &b->ob);
			}

			mutex_unlock(&c->btree_reserve_cache_lock);

			btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent);
			btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_write);
			__btree_node_free(c, b);
			six_unlock_write(&b->c.lock);
			six_unlock_intent(&b->c.lock);
		}
	}
}
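/*
 * Fill @as's prealloc lists: nr_nodes[0] leaf and nr_nodes[1] interior nodes
 * are allocated up front, so the update itself never fails partway through
 * with only some of the nodes it needs.
 */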
static int bch2_btree_reserve_get(struct btree_trans *trans,
				  struct btree_update *as,
				  unsigned nr_nodes[2],
				  unsigned flags,
				  struct closure *cl)
{
	struct bch_fs *c = as->c;
	struct btree *b;
	unsigned interior;
	int ret = 0;

	BUG_ON(nr_nodes[0] + nr_nodes[1] > BTREE_RESERVE_MAX);

	/*
	 * Protects reaping from the btree node cache and using the btree node
	 * open bucket reserve:
	 *
	 * BTREE_INSERT_NOWAIT only applies to btree node allocation, not
	 * blocking on this lock:
	 */
	ret = bch2_btree_cache_cannibalize_lock(c, cl);
	if (ret)
		return ret;

	for (interior = 0; interior < 2; interior++) {
		struct prealloc_nodes *p = as->prealloc_nodes + interior;

		while (p->nr < nr_nodes[interior]) {
			b = __bch2_btree_node_alloc(trans, &as->disk_res,
					flags & BTREE_INSERT_NOWAIT ? NULL : cl,
					interior, flags);
			if (IS_ERR(b)) {
				ret = PTR_ERR(b);
				goto err;
			}

			p->b[p->nr++] = b;
		}
	}
err:
	bch2_btree_cache_cannibalize_unlock(c);
	return ret;
}

/* Asynchronous interior node update machinery */

static void bch2_btree_update_free(struct btree_update *as, struct btree_trans *trans)
{
	struct bch_fs *c = as->c;

	if (as->took_gc_lock)
		up_read(&c->gc_lock);
	as->took_gc_lock = false;

	bch2_journal_preres_put(&c->journal, &as->journal_preres);

	bch2_journal_pin_drop(&c->journal, &as->journal);
	bch2_journal_pin_flush(&c->journal, &as->journal);
	bch2_disk_reservation_put(c, &as->disk_res);
	bch2_btree_reserve_put(as, trans);

	bch2_time_stats_update(&c->times[BCH_TIME_btree_interior_update_total],
			       as->start_time);

	mutex_lock(&c->btree_interior_update_lock);
	list_del(&as->unwritten_list);
	list_del(&as->list);

	closure_debug_destroy(&as->cl);
	mempool_free(as, &c->btree_interior_update_pool);

	/*
	 * Have to do the wakeup with btree_interior_update_lock still held,
	 * since being on btree_interior_update_list is our ref on @c:
	 */
	closure_wake_up(&c->btree_interior_update_wait);

	mutex_unlock(&c->btree_interior_update_lock);
}

static void btree_update_add_key(struct btree_update *as,
				 struct keylist *keys, struct btree *b)
{
	struct bkey_i *k = &b->key;

	BUG_ON(bch2_keylist_u64s(keys) + k->k.u64s >
	       ARRAY_SIZE(as->_old_keys));

	bkey_copy(keys->top, k);
	bkey_i_to_btree_ptr_v2(keys->top)->v.mem_ptr = b->c.level + 1;

	bch2_keylist_push(keys);
}

/*
 * The transactional part of an interior btree node update, where we journal the
 * update we did to the interior node and update alloc info:
 */
static int btree_update_nodes_written_trans(struct btree_trans *trans,
					    struct btree_update *as)
{
	struct bkey_i *k;
	int ret;

	ret = darray_make_room(&trans->extra_journal_entries, as->journal_u64s);
	if (ret)
		return ret;

	memcpy(&darray_top(trans->extra_journal_entries),
	       as->journal_entries,
	       as->journal_u64s * sizeof(u64));
	trans->extra_journal_entries.nr += as->journal_u64s;

	trans->journal_pin = &as->journal;

	for_each_keylist_key(&as->old_keys, k) {
		unsigned level = bkey_i_to_btree_ptr_v2(k)->v.mem_ptr;

		ret = bch2_trans_mark_old(trans, as->btree_id, level, bkey_i_to_s_c(k), 0);
		if (ret)
			return ret;
	}

	for_each_keylist_key(&as->new_keys, k) {
		unsigned level = bkey_i_to_btree_ptr_v2(k)->v.mem_ptr;

		ret = bch2_trans_mark_new(trans, as->btree_id, level, k, 0);
		if (ret)
			return ret;
	}

	return 0;
}
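/*
 * Completion path for a btree_update: runs once every new node the update
 * created has been written, and makes the update visible by committing the
 * transactional part and unblocking the parent node's write.
 */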
static void btree_update_nodes_written(struct btree_update *as)
{
	struct bch_fs *c = as->c;
	struct btree *b;
	struct btree_trans *trans = bch2_trans_get(c);
	u64 journal_seq = 0;
	unsigned i;
	int ret;

	/*
	 * If we're already in an error state, it might be because a btree node
	 * was never written, and we might be trying to free that same btree
	 * node here, but it won't have been marked as allocated and we'll see
	 * spurious disk usage inconsistencies in the transactional part below
	 * if we don't skip it:
	 */
	ret = bch2_journal_error(&c->journal);
	if (ret)
		goto err;

	/*
	 * Wait for any in flight writes to finish before we free the old nodes
	 * on disk:
	 */
	for (i = 0; i < as->nr_old_nodes; i++) {
		__le64 seq;

		b = as->old_nodes[i];

		btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read);
		seq = b->data ? b->data->keys.seq : 0;
		six_unlock_read(&b->c.lock);

		if (seq == as->old_nodes_seq[i])
			wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight_inner,
				       TASK_UNINTERRUPTIBLE);
	}

	/*
	 * We did an update to a parent node where the pointers we added pointed
	 * to child nodes that weren't written yet: now, the child nodes have
	 * been written so we can write out the update to the interior node.
	 */

	/*
	 * We can't call into journal reclaim here: we'd block on the journal
	 * reclaim lock, but we may need to release the open buckets we have
	 * pinned in order for other btree updates to make forward progress, and
	 * journal reclaim does btree updates when flushing bkey_cached entries,
	 * which may require allocations as well.
	 */
	ret = commit_do(trans, &as->disk_res, &journal_seq,
			BCH_WATERMARK_reclaim|
			BTREE_INSERT_NOFAIL|
			BTREE_INSERT_NOCHECK_RW|
			BTREE_INSERT_JOURNAL_RECLAIM,
			btree_update_nodes_written_trans(trans, as));
	bch2_trans_unlock(trans);

	bch2_fs_fatal_err_on(ret && !bch2_journal_error(&c->journal), c,
			     "%s(): error %s", __func__, bch2_err_str(ret));
err:
	if (as->b) {
		struct btree_path *path;

		b = as->b;
		path = get_unlocked_mut_path(trans, as->btree_id, b->c.level, b->key.k.p);
		/*
		 * @b is the node we did the final insert into:
		 *
		 * On failure to get a journal reservation, we still have to
		 * unblock the write and allow most of the write path to happen
		 * so that shutdown works, but the i->journal_seq mechanism
		 * won't work to prevent the btree write from being visible (we
		 * didn't get a journal sequence number) - instead
		 * __bch2_btree_node_write() doesn't do the actual write if
		 * we're in journal error state:
		 */

		/*
		 * Ensure transaction is unlocked before using
		 * btree_node_lock_nopath() (the use of which is always suspect,
		 * we need to work on removing this in the future)
		 *
		 * It should be, but get_unlocked_mut_path() -> bch2_path_get()
		 * calls bch2_path_upgrade(), before we call path_make_mut(), so
		 * we may rarely end up with a locked path besides the one we
		 * have here:
		 */
		bch2_trans_unlock(trans);
		btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent);
		mark_btree_node_locked(trans, path, b->c.level, BTREE_NODE_INTENT_LOCKED);
		path->l[b->c.level].lock_seq = six_lock_seq(&b->c.lock);
		path->l[b->c.level].b = b;

		bch2_btree_node_lock_write_nofail(trans, path, &b->c);

		mutex_lock(&c->btree_interior_update_lock);

		list_del(&as->write_blocked_list);
		if (list_empty(&b->write_blocked))
			clear_btree_node_write_blocked(b);

		/*
		 * Node might have been freed, recheck under
		 * btree_interior_update_lock:
		 */
		if (as->b == b) {
			BUG_ON(!b->c.level);
			BUG_ON(!btree_node_dirty(b));

			if (!ret) {
				struct bset *last = btree_bset_last(b);

				last->journal_seq = cpu_to_le64(
						max(journal_seq,
						    le64_to_cpu(last->journal_seq)));

				bch2_btree_add_journal_pin(c, b, journal_seq);
			} else {
				/*
				 * If we didn't get a journal sequence number we
				 * can't write this btree node, because recovery
				 * won't know to ignore this write:
				 */
				set_btree_node_never_write(b);
			}
		}

		mutex_unlock(&c->btree_interior_update_lock);

		mark_btree_node_locked_noreset(path, b->c.level, BTREE_NODE_INTENT_LOCKED);
		six_unlock_write(&b->c.lock);

		btree_node_write_if_need(c, b, SIX_LOCK_intent);
		btree_node_unlock(trans, path, b->c.level);
		bch2_path_put(trans, path, true);
	}

	bch2_journal_pin_drop(&c->journal, &as->journal);

	bch2_journal_preres_put(&c->journal, &as->journal_preres);

	mutex_lock(&c->btree_interior_update_lock);
	for (i = 0; i < as->nr_new_nodes; i++) {
		b = as->new_nodes[i];

		BUG_ON(b->will_make_reachable != (unsigned long) as);
		b->will_make_reachable = 0;
		clear_btree_node_will_make_reachable(b);
	}
	mutex_unlock(&c->btree_interior_update_lock);

	for (i = 0; i < as->nr_new_nodes; i++) {
		b = as->new_nodes[i];

		btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read);
		btree_node_write_if_need(c, b, SIX_LOCK_read);
		six_unlock_read(&b->c.lock);
	}

	for (i = 0; i < as->nr_open_buckets; i++)
		bch2_open_bucket_put(c, c->open_buckets + as->open_buckets[i]);

	bch2_btree_update_free(as, trans);
	bch2_trans_put(trans);
}
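/*
 * Workqueue entry point: drains c->btree_interior_updates_unwritten, running
 * the completion path for each update whose nodes have all been written:
 */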
static void btree_interior_update_work(struct work_struct *work)
{
	struct bch_fs *c =
		container_of(work, struct bch_fs, btree_interior_update_work);
	struct btree_update *as;

	while (1) {
		mutex_lock(&c->btree_interior_update_lock);
		as = list_first_entry_or_null(&c->btree_interior_updates_unwritten,
					      struct btree_update, unwritten_list);
		if (as && !as->nodes_written)
			as = NULL;
		mutex_unlock(&c->btree_interior_update_lock);

		if (!as)
			break;

		btree_update_nodes_written(as);
	}
}

static void btree_update_set_nodes_written(struct closure *cl)
{
	struct btree_update *as = container_of(cl, struct btree_update, cl);
	struct bch_fs *c = as->c;

	mutex_lock(&c->btree_interior_update_lock);
	as->nodes_written = true;
	mutex_unlock(&c->btree_interior_update_lock);

	queue_work(c->btree_interior_update_worker, &c->btree_interior_update_work);
}

/*
 * We're updating @b with pointers to nodes that haven't finished writing yet:
 * block @b from being written until @as completes
 */
static void btree_update_updated_node(struct btree_update *as, struct btree *b)
{
	struct bch_fs *c = as->c;

	mutex_lock(&c->btree_interior_update_lock);
	list_add_tail(&as->unwritten_list, &c->btree_interior_updates_unwritten);

	BUG_ON(as->mode != BTREE_INTERIOR_NO_UPDATE);
	BUG_ON(!btree_node_dirty(b));

	as->mode	= BTREE_INTERIOR_UPDATING_NODE;
	as->b		= b;

	set_btree_node_write_blocked(b);
	list_add(&as->write_blocked_list, &b->write_blocked);

	mutex_unlock(&c->btree_interior_update_lock);
}

static void btree_update_reparent(struct btree_update *as,
				  struct btree_update *child)
{
	struct bch_fs *c = as->c;

	lockdep_assert_held(&c->btree_interior_update_lock);

	child->b = NULL;
	child->mode = BTREE_INTERIOR_UPDATING_AS;

	bch2_journal_pin_copy(&c->journal, &as->journal, &child->journal, NULL);
}
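/*
 * When an update replaces the btree root there's no parent node to point at
 * the new nodes, so instead the new root key is journalled directly as a
 * btree_root entry:
 */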
static void btree_update_updated_root(struct btree_update *as, struct btree *b)
{
	struct bkey_i *insert = &b->key;
	struct bch_fs *c = as->c;

	BUG_ON(as->mode != BTREE_INTERIOR_NO_UPDATE);

	BUG_ON(as->journal_u64s + jset_u64s(insert->k.u64s) >
	       ARRAY_SIZE(as->journal_entries));

	as->journal_u64s +=
		journal_entry_set((void *) &as->journal_entries[as->journal_u64s],
				  BCH_JSET_ENTRY_btree_root,
				  b->c.btree_id, b->c.level,
				  insert, insert->k.u64s);

	mutex_lock(&c->btree_interior_update_lock);
	list_add_tail(&as->unwritten_list, &c->btree_interior_updates_unwritten);

	as->mode	= BTREE_INTERIOR_UPDATING_ROOT;
	mutex_unlock(&c->btree_interior_update_lock);
}

/*
 * bch2_btree_update_add_new_node:
 *
 * This causes @as to wait on @b to be written, before it gets to
 * bch2_btree_update_nodes_written
 *
 * Additionally, it sets b->will_make_reachable to prevent any additional writes
 * to @b from happening besides the first until @b is reachable on disk
 *
 * And it adds @b to the list of @as's new nodes, so that we can update sector
 * counts in bch2_btree_update_nodes_written:
 */
static void bch2_btree_update_add_new_node(struct btree_update *as, struct btree *b)
{
	struct bch_fs *c = as->c;

	closure_get(&as->cl);

	mutex_lock(&c->btree_interior_update_lock);
	BUG_ON(as->nr_new_nodes >= ARRAY_SIZE(as->new_nodes));
	BUG_ON(b->will_make_reachable);

	as->new_nodes[as->nr_new_nodes++] = b;
	b->will_make_reachable = 1UL|(unsigned long) as;
	set_btree_node_will_make_reachable(b);

	mutex_unlock(&c->btree_interior_update_lock);

	btree_update_add_key(as, &as->new_keys, b);

	if (b->key.k.type == KEY_TYPE_btree_ptr_v2) {
		unsigned bytes = vstruct_end(&b->data->keys) - (void *) b->data;
		unsigned sectors = round_up(bytes, block_bytes(c)) >> 9;

		bkey_i_to_btree_ptr_v2(&b->key)->v.sectors_written =
			cpu_to_le16(sectors);
	}
}
/*
 * Drop @b from the btree_update that was making it reachable, if @b was a
 * new node:
 */
static void btree_update_drop_new_node(struct bch_fs *c, struct btree *b)
{
	struct btree_update *as;
	unsigned long v;
	unsigned i;

	mutex_lock(&c->btree_interior_update_lock);
	/*
	 * When b->will_make_reachable != 0, it owns a ref on as->cl that's
	 * dropped when it gets written by bch2_btree_complete_write - the
	 * xchg() is for synchronization with bch2_btree_complete_write:
	 */
	v = xchg(&b->will_make_reachable, 0);
	clear_btree_node_will_make_reachable(b);
	as = (struct btree_update *) (v & ~1UL);

	if (!as) {
		mutex_unlock(&c->btree_interior_update_lock);
		return;
	}

	for (i = 0; i < as->nr_new_nodes; i++)
		if (as->new_nodes[i] == b)
			goto found;

	BUG();
found:
	array_remove_item(as->new_nodes, as->nr_new_nodes, i);
	mutex_unlock(&c->btree_interior_update_lock);

	if (v & 1)
		closure_put(&as->cl);
}
static void bch2_btree_update_get_open_buckets(struct btree_update *as, struct btree *b)
{
	while (b->ob.nr)
		as->open_buckets[as->nr_open_buckets++] =
			b->ob.v[--b->ob.nr];
}

/*
 * @b is being split/rewritten: it may have pointers to not-yet-written btree
 * nodes and thus outstanding btree_updates - redirect @b's
 * btree_updates to point to this btree_update:
 */
static void bch2_btree_interior_update_will_free_node(struct btree_update *as,
						      struct btree *b)
{
	struct bch_fs *c = as->c;
	struct btree_update *p, *n;
	struct btree_write *w;

	set_btree_node_dying(b);

	if (btree_node_fake(b))
		return;

	mutex_lock(&c->btree_interior_update_lock);

	/*
	 * Does this node have any btree_update operations preventing
	 * it from being written?
	 *
	 * If so, redirect them to point to this btree_update: we can
	 * write out our new nodes, but we won't make them visible until those
	 * operations complete
	 */
	list_for_each_entry_safe(p, n, &b->write_blocked, write_blocked_list) {
		list_del_init(&p->write_blocked_list);
		btree_update_reparent(as, p);

		/*
		 * for flush_held_btree_writes() waiting on updates to flush or
		 * nodes to be writeable:
		 */
		closure_wake_up(&c->btree_interior_update_wait);
	}

	clear_btree_node_dirty_acct(c, b);
	clear_btree_node_need_write(b);
	clear_btree_node_write_blocked(b);

	/*
	 * Does this node have unwritten data that has a pin on the journal?
	 *
	 * If so, transfer that pin to the btree_update operation -
	 * note that if we're freeing multiple nodes, we only need to keep the
	 * oldest pin of any of the nodes we're freeing. We'll release the pin
	 * when the new nodes are persistent and reachable on disk:
	 */
	w = btree_current_write(b);
	bch2_journal_pin_copy(&c->journal, &as->journal, &w->journal, NULL);
	bch2_journal_pin_drop(&c->journal, &w->journal);

	w = btree_prev_write(b);
	bch2_journal_pin_copy(&c->journal, &as->journal, &w->journal, NULL);
	bch2_journal_pin_drop(&c->journal, &w->journal);

	mutex_unlock(&c->btree_interior_update_lock);

	/*
	 * Is this a node that isn't reachable on disk yet?
	 *
	 * Nodes that aren't reachable yet have writes blocked until they're
	 * reachable - now that we've cancelled any pending writes and moved
	 * things waiting on that write to wait on this update, we can drop this
	 * node from the list of nodes that the other update is making
	 * reachable, prior to freeing it:
	 */
	btree_update_drop_new_node(c, b);

	btree_update_add_key(as, &as->old_keys, b);

	as->old_nodes[as->nr_old_nodes] = b;
	as->old_nodes_seq[as->nr_old_nodes] = b->data->keys.seq;
	as->nr_old_nodes++;
}

static void bch2_btree_update_done(struct btree_update *as, struct btree_trans *trans)
{
	struct bch_fs *c = as->c;
	u64 start_time = as->start_time;

	BUG_ON(as->mode == BTREE_INTERIOR_NO_UPDATE);

	if (as->took_gc_lock)
		up_read(&as->c->gc_lock);
	as->took_gc_lock = false;

	bch2_btree_reserve_put(as, trans);

	continue_at(&as->cl, btree_update_set_nodes_written,
		    as->c->btree_interior_update_worker);

	bch2_time_stats_update(&c->times[BCH_TIME_btree_interior_update_foreground],
			       start_time);
}
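/*
 * Start a new interior update: walks up from @level working out how many new
 * nodes might be needed, upgrades locks accordingly, then takes the journal
 * prereservation, disk reservation and btree node reservations the update
 * will need:
 */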
static struct btree_update *
bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
			unsigned level, bool split, unsigned flags)
{
	struct bch_fs *c = trans->c;
	struct btree_update *as;
	u64 start_time = local_clock();
	int disk_res_flags = (flags & BTREE_INSERT_NOFAIL)
		? BCH_DISK_RESERVATION_NOFAIL : 0;
	unsigned nr_nodes[2] = { 0, 0 };
	unsigned update_level = level;
	enum bch_watermark watermark = flags & BCH_WATERMARK_MASK;
	unsigned journal_flags = 0;
	int ret = 0;
	u32 restart_count = trans->restart_count;

	BUG_ON(!path->should_be_locked);

	if (watermark == BCH_WATERMARK_copygc)
		watermark = BCH_WATERMARK_btree_copygc;
	if (watermark < BCH_WATERMARK_btree)
		watermark = BCH_WATERMARK_btree;

	flags &= ~BCH_WATERMARK_MASK;
	flags |= watermark;

	if (flags & BTREE_INSERT_JOURNAL_RECLAIM)
		journal_flags |= JOURNAL_RES_GET_NONBLOCK;
	journal_flags |= watermark;

	while (1) {
		nr_nodes[!!update_level] += 1 + split;
		update_level++;

		ret = bch2_btree_path_upgrade(trans, path, update_level + 1);
		if (ret)
			return ERR_PTR(ret);

		if (!btree_path_node(path, update_level)) {
			/* Allocating new root? */
			nr_nodes[1] += split;
			update_level = BTREE_MAX_DEPTH;
			break;
		}

		if (bch2_btree_node_insert_fits(c, path->l[update_level].b,
					BKEY_BTREE_PTR_U64s_MAX * (1 + split)))
			break;

		split = path->l[update_level].b->nr.live_u64s > BTREE_SPLIT_THRESHOLD(c);
	}

	if (flags & BTREE_INSERT_GC_LOCK_HELD)
		lockdep_assert_held(&c->gc_lock);
	else if (!down_read_trylock(&c->gc_lock)) {
		ret = drop_locks_do(trans, (down_read(&c->gc_lock), 0));
		if (ret) {
			up_read(&c->gc_lock);
			return ERR_PTR(ret);
		}
	}

	as = mempool_alloc(&c->btree_interior_update_pool, GFP_NOFS);
	memset(as, 0, sizeof(*as));
	closure_init(&as->cl, NULL);
	as->c		= c;
	as->start_time	= start_time;
	as->mode	= BTREE_INTERIOR_NO_UPDATE;
	as->took_gc_lock = !(flags & BTREE_INSERT_GC_LOCK_HELD);
	as->btree_id	= path->btree_id;
	as->update_level = update_level;
	INIT_LIST_HEAD(&as->list);
	INIT_LIST_HEAD(&as->unwritten_list);
	INIT_LIST_HEAD(&as->write_blocked_list);
	bch2_keylist_init(&as->old_keys, as->_old_keys);
	bch2_keylist_init(&as->new_keys, as->_new_keys);
	bch2_keylist_init(&as->parent_keys, as->inline_keys);

	mutex_lock(&c->btree_interior_update_lock);
	list_add_tail(&as->list, &c->btree_interior_update_list);
	mutex_unlock(&c->btree_interior_update_lock);

	/*
	 * We don't want to allocate if we're in an error state, that can cause
	 * deadlock on emergency shutdown due to open buckets getting stuck in
	 * the btree_reserve_cache after allocator shutdown has cleared it out.
	 * This check needs to come after adding us to the btree_interior_update
	 * list but before calling bch2_btree_reserve_get, to synchronize with
	 * __bch2_fs_read_only().
	 */
	ret = bch2_journal_error(&c->journal);
	if (ret)
		goto err;

	ret = bch2_journal_preres_get(&c->journal, &as->journal_preres,
				      BTREE_UPDATE_JOURNAL_RES,
				      journal_flags|JOURNAL_RES_GET_NONBLOCK);
	if (ret) {
		if (flags & BTREE_INSERT_JOURNAL_RECLAIM) {
			ret = -BCH_ERR_journal_reclaim_would_deadlock;
			goto err;
		}

		ret = drop_locks_do(trans,
			bch2_journal_preres_get(&c->journal, &as->journal_preres,
					      BTREE_UPDATE_JOURNAL_RES,
					      journal_flags));
		if (ret == -BCH_ERR_journal_preres_get_blocked) {
			trace_and_count(c, trans_restart_journal_preres_get, trans, _RET_IP_, journal_flags);
			ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_journal_preres_get);
		}
		if (ret)
			goto err;
	}

	ret = bch2_disk_reservation_get(c, &as->disk_res,
			(nr_nodes[0] + nr_nodes[1]) * btree_sectors(c),
			c->opts.metadata_replicas,
			disk_res_flags);
	if (ret)
		goto err;

	ret = bch2_btree_reserve_get(trans, as, nr_nodes, flags, NULL);
	if (bch2_err_matches(ret, ENOSPC) ||
	    bch2_err_matches(ret, ENOMEM)) {
		struct closure cl;

		/*
		 * XXX: this should probably be a separate BTREE_INSERT_NONBLOCK
		 * flag
		 */
		if (bch2_err_matches(ret, ENOSPC) &&
		    (flags & BTREE_INSERT_JOURNAL_RECLAIM) &&
		    watermark != BCH_WATERMARK_reclaim) {
			ret = -BCH_ERR_journal_reclaim_would_deadlock;
			goto err;
		}

		closure_init_stack(&cl);

		do {
			ret = bch2_btree_reserve_get(trans, as, nr_nodes, flags, &cl);

			bch2_trans_unlock(trans);
			closure_sync(&cl);
		} while (bch2_err_matches(ret, BCH_ERR_operation_blocked));
	}

	if (ret) {
		trace_and_count(c, btree_reserve_get_fail, trans->fn,
				_RET_IP_, nr_nodes[0] + nr_nodes[1], ret);
		goto err;
	}

	ret = bch2_trans_relock(trans);
	if (ret)
		goto err;

	bch2_trans_verify_not_restarted(trans, restart_count);
	return as;
err:
	bch2_btree_update_free(as, trans);
	return ERR_PTR(ret);
}

/* Btree root updates: */

static void bch2_btree_set_root_inmem(struct bch_fs *c, struct btree *b)
{
	/* Root nodes cannot be reaped */
	mutex_lock(&c->btree_cache.lock);
	list_del_init(&b->list);
	mutex_unlock(&c->btree_cache.lock);

	mutex_lock(&c->btree_root_lock);
	BUG_ON(btree_node_root(c, b) &&
	       (b->c.level < btree_node_root(c, b)->c.level ||
		!btree_node_dying(btree_node_root(c, b))));

	bch2_btree_id_root(c, b->c.btree_id)->b = b;
	mutex_unlock(&c->btree_root_lock);

	bch2_recalc_btree_reserve(c);
}

static void bch2_btree_set_root(struct btree_update *as,
				struct btree_trans *trans,
				struct btree_path *path,
				struct btree *b)
{
	struct bch_fs *c = as->c;
	struct btree *old;

	trace_and_count(c, btree_node_set_root, c, b);

	old = btree_node_root(c, b);

	/*
	 * Ensure no one is using the old root while we switch to the
	 * new root:
	 */
	bch2_btree_node_lock_write_nofail(trans, path, &old->c);

	bch2_btree_set_root_inmem(c, b);

	btree_update_updated_root(as, b);

	/*
	 * Unlock old root after new root is visible:
	 *
	 * The new root isn't persistent, but that's ok: we still have
	 * an intent lock on the new root, and any updates that would
	 * depend on the new root would have to update the new root.
	 */
	bch2_btree_node_unlock_write(trans, path, old);
}

/* Interior node updates: */

static void bch2_insert_fixup_btree_ptr(struct btree_update *as,
					struct btree_trans *trans,
					struct btree_path *path,
					struct btree *b,
					struct btree_node_iter *node_iter,
					struct bkey_i *insert)
{
	struct bch_fs *c = as->c;
	struct bkey_packed *k;
	struct printbuf buf = PRINTBUF;
	unsigned long old, new, v;

	BUG_ON(insert->k.type == KEY_TYPE_btree_ptr_v2 &&
	       !btree_ptr_sectors_written(insert));

	if (unlikely(!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)))
		bch2_journal_key_overwritten(c, b->c.btree_id, b->c.level, insert->k.p);

	if (bch2_bkey_invalid(c, bkey_i_to_s_c(insert),
			      btree_node_type(b), WRITE, &buf) ?:
	    bch2_bkey_in_btree_node(c, b, bkey_i_to_s_c(insert), &buf)) {
		printbuf_reset(&buf);
		prt_printf(&buf, "inserting invalid bkey\n  ");
		bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert));
		prt_printf(&buf, "\n  ");
		bch2_bkey_invalid(c, bkey_i_to_s_c(insert),
				  btree_node_type(b), WRITE, &buf);
		bch2_bkey_in_btree_node(c, b, bkey_i_to_s_c(insert), &buf);

		bch2_fs_inconsistent(c, "%s", buf.buf);
		dump_stack();
	}

	BUG_ON(as->journal_u64s + jset_u64s(insert->k.u64s) >
	       ARRAY_SIZE(as->journal_entries));

	as->journal_u64s +=
		journal_entry_set((void *) &as->journal_entries[as->journal_u64s],
				  BCH_JSET_ENTRY_btree_keys,
				  b->c.btree_id, b->c.level,
				  insert, insert->k.u64s);

	while ((k = bch2_btree_node_iter_peek_all(node_iter, b)) &&
	       bkey_iter_pos_cmp(b, k, &insert->k.p) < 0)
		bch2_btree_node_iter_advance(node_iter, b);

	bch2_btree_bset_insert_key(trans, path, b, node_iter, insert);
	set_btree_node_dirty_acct(c, b);

	v = READ_ONCE(b->flags);
	do {
		old = new = v;

		new &= ~BTREE_WRITE_TYPE_MASK;
		new |= BTREE_WRITE_interior;
		new |= 1 << BTREE_NODE_need_write;
	} while ((v = cmpxchg(&b->flags, old, new)) != old);

	printbuf_exit(&buf);
}

static void
__bch2_btree_insert_keys_interior(struct btree_update *as,
				  struct btree_trans *trans,
				  struct btree_path *path,
				  struct btree *b,
				  struct btree_node_iter node_iter,
				  struct keylist *keys)
{
	struct bkey_i *insert = bch2_keylist_front(keys);
	struct bkey_packed *k;

	BUG_ON(btree_node_type(b) != BKEY_TYPE_btree);

	while ((k = bch2_btree_node_iter_prev_all(&node_iter, b)) &&
	       (bkey_cmp_left_packed(b, k, &insert->k.p) >= 0))
		;

	while (!bch2_keylist_empty(keys)) {
		insert = bch2_keylist_front(keys);

		if (bpos_gt(insert->k.p, b->key.k.p))
			break;

		bch2_insert_fixup_btree_ptr(as, trans, path, b, &node_iter, insert);
		bch2_keylist_pop_front(keys);
	}
}
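/*
 * Two passes over @b's keys below: the first picks the pivot (roughly 3/5 of
 * the live u64s go to the lower node - see n1_u64s) and accumulates each
 * half's key format; the second repacks the keys into n[0] and n[1] with
 * those formats.
 */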
/*
 * Move keys from n1 (original replacement node, now lower node) to n2 (higher
 * node):
 */
static void __btree_split_node(struct btree_update *as,
			       struct btree_trans *trans,
			       struct btree *b,
			       struct btree *n[2])
{
	struct bkey_packed *k;
	struct bpos n1_pos = POS_MIN;
	struct btree_node_iter iter;
	struct bset *bsets[2];
	struct bkey_format_state format[2];
	struct bkey_packed *out[2];
	struct bkey uk;
	unsigned u64s, n1_u64s = (b->nr.live_u64s * 3) / 5;
	int i;

	for (i = 0; i < 2; i++) {
		BUG_ON(n[i]->nsets != 1);

		bsets[i] = btree_bset_first(n[i]);
		out[i] = bsets[i]->start;

		SET_BTREE_NODE_SEQ(n[i]->data, BTREE_NODE_SEQ(b->data) + 1);
		bch2_bkey_format_init(&format[i]);
	}

	u64s = 0;
	for_each_btree_node_key(b, k, &iter) {
		if (bkey_deleted(k))
			continue;

		i = u64s >= n1_u64s;
		u64s += k->u64s;
		uk = bkey_unpack_key(b, k);
		if (!i)
			n1_pos = uk.p;
		bch2_bkey_format_add_key(&format[i], &uk);
	}

	btree_set_min(n[0], b->data->min_key);
	btree_set_max(n[0], n1_pos);
	btree_set_min(n[1], bpos_successor(n1_pos));
	btree_set_max(n[1], b->data->max_key);

	for (i = 0; i < 2; i++) {
		bch2_bkey_format_add_pos(&format[i], n[i]->data->min_key);
		bch2_bkey_format_add_pos(&format[i], n[i]->data->max_key);

		n[i]->data->format = bch2_bkey_format_done(&format[i]);
		btree_node_set_format(n[i], n[i]->data->format);
	}

	u64s = 0;
	for_each_btree_node_key(b, k, &iter) {
		if (bkey_deleted(k))
			continue;

		i = u64s >= n1_u64s;
		u64s += k->u64s;

		if (bch2_bkey_transform(&n[i]->format, out[i], bkey_packed(k)
					? &b->format : &bch2_bkey_format_current, k))
			out[i]->format = KEY_FORMAT_LOCAL_BTREE;
		else
			bch2_bkey_unpack(b, (void *) out[i], k);

		out[i]->needs_whiteout = false;

		btree_keys_account_key_add(&n[i]->nr, 0, out[i]);
		out[i] = bkey_p_next(out[i]);
	}

	for (i = 0; i < 2; i++) {
		bsets[i]->u64s = cpu_to_le16((u64 *) out[i] - bsets[i]->_data);

		BUG_ON(!bsets[i]->u64s);

		set_btree_bset_end(n[i], n[i]->set);

		btree_node_reset_sib_u64s(n[i]);

		bch2_verify_btree_nr_keys(n[i]);

		if (b->c.level)
			btree_node_interior_verify(as->c, n[i]);
	}
}

/*
 * For updates to interior nodes, we've got to do the insert before we split
 * because the stuff we're inserting has to be inserted atomically. Post split,
 * the keys might have to go in different nodes and the split would no longer be
 * atomic.
 *
 * Worse, if the insert is from btree node coalescing, if we do the insert after
 * we do the split (and pick the pivot) - the pivot we pick might be between
 * nodes that were coalesced, and thus in the middle of a child node post
 * coalescing:
 */
static void btree_split_insert_keys(struct btree_update *as,
				    struct btree_trans *trans,
				    struct btree_path *path,
				    struct btree *b,
				    struct keylist *keys)
{
	if (!bch2_keylist_empty(keys) &&
	    bpos_le(bch2_keylist_front(keys)->k.p, b->data->max_key)) {
		struct btree_node_iter node_iter;

		bch2_btree_node_iter_init(&node_iter, b, &bch2_keylist_front(keys)->k.p);

		__bch2_btree_insert_keys_interior(as, trans, path, b, node_iter, keys);

		btree_node_interior_verify(as->c, b);
	}
}

static int btree_split(struct btree_update *as, struct btree_trans *trans,
		       struct btree_path *path, struct btree *b,
		       struct keylist *keys, unsigned flags)
{
	struct bch_fs *c = as->c;
	struct btree *parent = btree_node_parent(path, b);
	struct btree *n1, *n2 = NULL, *n3 = NULL;
	struct btree_path *path1 = NULL, *path2 = NULL;
	u64 start_time = local_clock();
	int ret = 0;

	BUG_ON(!parent && (b != btree_node_root(c, b)));
	BUG_ON(parent && !btree_node_intent_locked(path, b->c.level + 1));

	bch2_btree_interior_update_will_free_node(as, b);

	if (b->nr.live_u64s > BTREE_SPLIT_THRESHOLD(c)) {
		struct btree *n[2];

		trace_and_count(c, btree_node_split, c, b);

		n[0] = n1 = bch2_btree_node_alloc(as, trans, b->c.level);
		n[1] = n2 = bch2_btree_node_alloc(as, trans, b->c.level);

		__btree_split_node(as, trans, b, n);

		if (keys) {
			btree_split_insert_keys(as, trans, path, n1, keys);
			btree_split_insert_keys(as, trans, path, n2, keys);
			BUG_ON(!bch2_keylist_empty(keys));
		}

		bch2_btree_build_aux_trees(n2);
		bch2_btree_build_aux_trees(n1);

		bch2_btree_update_add_new_node(as, n1);
		bch2_btree_update_add_new_node(as, n2);
		six_unlock_write(&n2->c.lock);
		six_unlock_write(&n1->c.lock);

		path1 = get_unlocked_mut_path(trans, path->btree_id, n1->c.level, n1->key.k.p);
		six_lock_increment(&n1->c.lock, SIX_LOCK_intent);
		mark_btree_node_locked(trans, path1, n1->c.level, BTREE_NODE_INTENT_LOCKED);
		bch2_btree_path_level_init(trans, path1, n1);

		path2 = get_unlocked_mut_path(trans, path->btree_id, n2->c.level, n2->key.k.p);
		six_lock_increment(&n2->c.lock, SIX_LOCK_intent);
		mark_btree_node_locked(trans, path2, n2->c.level, BTREE_NODE_INTENT_LOCKED);
		bch2_btree_path_level_init(trans, path2, n2);

		/*
		 * Note that on recursive parent_keys == keys, so we
		 * can't start adding new keys to parent_keys before emptying it
		 * out (which we did with btree_split_insert_keys() above)
		 */
		bch2_keylist_add(&as->parent_keys, &n1->key);
		bch2_keylist_add(&as->parent_keys, &n2->key);

		if (!parent) {
			/* Depth increases, make a new root */
			n3 = __btree_root_alloc(as, trans, b->c.level + 1);

			bch2_btree_update_add_new_node(as, n3);
			six_unlock_write(&n3->c.lock);

			path2->locks_want++;
			BUG_ON(btree_node_locked(path2, n3->c.level));
			six_lock_increment(&n3->c.lock, SIX_LOCK_intent);
			mark_btree_node_locked(trans, path2, n3->c.level, BTREE_NODE_INTENT_LOCKED);
			bch2_btree_path_level_init(trans, path2, n3);

			n3->sib_u64s[0] = U16_MAX;
			n3->sib_u64s[1] = U16_MAX;

			btree_split_insert_keys(as, trans, path, n3, &as->parent_keys);
		}
	} else {
		trace_and_count(c, btree_node_compact, c, b);

		n1 = bch2_btree_node_alloc_replacement(as, trans, b);

		if (keys) {
			btree_split_insert_keys(as, trans, path, n1, keys);
			BUG_ON(!bch2_keylist_empty(keys));
		}

		bch2_btree_build_aux_trees(n1);
		bch2_btree_update_add_new_node(as, n1);
		six_unlock_write(&n1->c.lock);

		path1 = get_unlocked_mut_path(trans, path->btree_id, n1->c.level, n1->key.k.p);
		six_lock_increment(&n1->c.lock, SIX_LOCK_intent);
		mark_btree_node_locked(trans, path1, n1->c.level, BTREE_NODE_INTENT_LOCKED);
		bch2_btree_path_level_init(trans, path1, n1);

		if (parent)
			bch2_keylist_add(&as->parent_keys, &n1->key);
	}

	/* New nodes all written, now make them visible: */

	if (parent) {
		/* Split a non root node */
		ret = bch2_btree_insert_node(as, trans, path, parent, &as->parent_keys, flags);
		if (ret)
			goto err;
	} else if (n3) {
		bch2_btree_set_root(as, trans, path, n3);
	} else {
		/* Root filled up but didn't need to be split */
		bch2_btree_set_root(as, trans, path, n1);
	}

	if (n3) {
		bch2_btree_update_get_open_buckets(as, n3);
		bch2_btree_node_write(c, n3, SIX_LOCK_intent, 0);
	}
	if (n2) {
		bch2_btree_update_get_open_buckets(as, n2);
		bch2_btree_node_write(c, n2, SIX_LOCK_intent, 0);
	}
	bch2_btree_update_get_open_buckets(as, n1);
	bch2_btree_node_write(c, n1, SIX_LOCK_intent, 0);

	/*
	 * The old node must be freed (in memory) _before_ unlocking the new
	 * nodes - else another thread could re-acquire a read lock on the old
	 * node after another thread has locked and updated the new node, thus
	 * seeing stale data:
	 */
	bch2_btree_node_free_inmem(trans, path, b);

	if (n3)
		bch2_trans_node_add(trans, n3);
	if (n2)
		bch2_trans_node_add(trans, n2);
	bch2_trans_node_add(trans, n1);

	if (n3)
		six_unlock_intent(&n3->c.lock);
	if (n2)
		six_unlock_intent(&n2->c.lock);
	six_unlock_intent(&n1->c.lock);
out:
	if (path2) {
		__bch2_btree_path_unlock(trans, path2);
		bch2_path_put(trans, path2, true);
	}
	if (path1) {
		__bch2_btree_path_unlock(trans, path1);
		bch2_path_put(trans, path1, true);
	}

	bch2_trans_verify_locks(trans);

	bch2_time_stats_update(&c->times[n2
			       ? BCH_TIME_btree_node_split
			       : BCH_TIME_btree_node_compact],
			       start_time);
	return ret;
err:
	if (n3)
		bch2_btree_node_free_never_used(as, trans, n3);
	if (n2)
		bch2_btree_node_free_never_used(as, trans, n2);
	bch2_btree_node_free_never_used(as, trans, n1);
	goto out;
}

static void
bch2_btree_insert_keys_interior(struct btree_update *as,
				struct btree_trans *trans,
				struct btree_path *path,
				struct btree *b,
				struct keylist *keys)
{
	struct btree_path *linked;

	__bch2_btree_insert_keys_interior(as, trans, path, b,
					  path->l[b->c.level].iter, keys);

	btree_update_updated_node(as, b);

	trans_for_each_path_with_node(trans, b, linked)
		bch2_btree_node_iter_peek(&linked->l[b->c.level].iter, b);

	bch2_trans_verify_paths(trans);
}

/**
 * bch2_btree_insert_node - insert bkeys into a given btree node
 *
 * @as:			btree_update object
 * @trans:		btree_trans object
 * @path:		path that points to current node
 * @b:			node to insert keys into
 * @keys:		list of keys to insert
 * @flags:		transaction commit flags
 *
 * Returns: 0 on success, typically transaction restart error on failure
 *
 * Inserts as many keys as it can into a given btree node, splitting it if full.
 * If a split occurred, this function will return early. This can only happen
 * for leaf nodes -- inserts into interior nodes have to be atomic.
 */
static int bch2_btree_insert_node(struct btree_update *as, struct btree_trans *trans,
				  struct btree_path *path, struct btree *b,
				  struct keylist *keys, unsigned flags)
{
	struct bch_fs *c = as->c;
	int old_u64s = le16_to_cpu(btree_bset_last(b)->u64s);
	int old_live_u64s = b->nr.live_u64s;
	int live_u64s_added, u64s_added;
	int ret;

	lockdep_assert_held(&c->gc_lock);
	BUG_ON(!btree_node_intent_locked(path, b->c.level));
	BUG_ON(!b->c.level);
	BUG_ON(!as || as->b);
	bch2_verify_keylist_sorted(keys);

	ret = bch2_btree_node_lock_write(trans, path, &b->c);
	if (ret)
		return ret;

	bch2_btree_node_prep_for_write(trans, path, b);

	if (!bch2_btree_node_insert_fits(c, b, bch2_keylist_u64s(keys))) {
		bch2_btree_node_unlock_write(trans, path, b);
		goto split;
	}

	btree_node_interior_verify(c, b);

	bch2_btree_insert_keys_interior(as, trans, path, b, keys);

	live_u64s_added = (int) b->nr.live_u64s - old_live_u64s;
	u64s_added = (int) le16_to_cpu(btree_bset_last(b)->u64s) - old_u64s;

	if (b->sib_u64s[0] != U16_MAX && live_u64s_added < 0)
		b->sib_u64s[0] = max(0, (int) b->sib_u64s[0] + live_u64s_added);
	if (b->sib_u64s[1] != U16_MAX && live_u64s_added < 0)
		b->sib_u64s[1] = max(0, (int) b->sib_u64s[1] + live_u64s_added);

	if (u64s_added > live_u64s_added &&
	    bch2_maybe_compact_whiteouts(c, b))
		bch2_trans_node_reinit_iter(trans, b);

	bch2_btree_node_unlock_write(trans, path, b);

	btree_node_interior_verify(c, b);
	return 0;
split:
	/*
	 * We could attempt to avoid the transaction restart, by calling
	 * bch2_btree_path_upgrade() and allocating more nodes:
	 */
	if (b->c.level >= as->update_level) {
		trace_and_count(c, trans_restart_split_race, trans, _THIS_IP_, b);
		return btree_trans_restart(trans, BCH_ERR_transaction_restart_split_race);
	}

	return btree_split(as, trans, path, b, keys, flags);
}
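/*
 * Entry point for splitting a leaf when an insert doesn't fit; after the
 * split, opportunistically check whether the newly locked ancestors want to
 * be merged:
 */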
int bch2_btree_split_leaf(struct btree_trans *trans,
			  struct btree_path *path,
			  unsigned flags)
{
	struct btree *b = path_l(path)->b;
	struct btree_update *as;
	unsigned l;
	int ret = 0;

	as = bch2_btree_update_start(trans, path, path->level,
				     true, flags);
	if (IS_ERR(as))
		return PTR_ERR(as);

	ret = btree_split(as, trans, path, b, NULL, flags);
	if (ret) {
		bch2_btree_update_free(as, trans);
		return ret;
	}

	bch2_btree_update_done(as, trans);

	for (l = path->level + 1; btree_node_intent_locked(path, l) && !ret; l++)
		ret = bch2_foreground_maybe_merge(trans, path, l, flags);

	return ret;
}
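/*
 * Merge @b with its sibling: recalculates b->sib_u64s[sib] (the estimated
 * combined size with the merged format) and only merges if it's under the
 * foreground merge threshold, with hysteresis to avoid merge/split
 * ping-ponging:
 */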
int __bch2_foreground_maybe_merge(struct btree_trans *trans,
				  struct btree_path *path,
				  unsigned level,
				  unsigned flags,
				  enum btree_node_sibling sib)
{
	struct bch_fs *c = trans->c;
	struct btree_path *sib_path = NULL, *new_path = NULL;
	struct btree_update *as;
	struct bkey_format_state new_s;
	struct bkey_format new_f;
	struct bkey_i delete;
	struct btree *b, *m, *n, *prev, *next, *parent;
	struct bpos sib_pos;
	size_t sib_u64s;
	u64 start_time = local_clock();
	int ret = 0;

	BUG_ON(!path->should_be_locked);
	BUG_ON(!btree_node_locked(path, level));

	b = path->l[level].b;

	if ((sib == btree_prev_sib && bpos_eq(b->data->min_key, POS_MIN)) ||
	    (sib == btree_next_sib && bpos_eq(b->data->max_key, SPOS_MAX))) {
		b->sib_u64s[sib] = U16_MAX;
		return 0;
	}

	sib_pos = sib == btree_prev_sib
		? bpos_predecessor(b->data->min_key)
		: bpos_successor(b->data->max_key);

	sib_path = bch2_path_get(trans, path->btree_id, sib_pos,
				 U8_MAX, level, BTREE_ITER_INTENT, _THIS_IP_);
	ret = bch2_btree_path_traverse(trans, sib_path, false);
	if (ret)
		goto err;

	btree_path_set_should_be_locked(sib_path);

	m = sib_path->l[level].b;

	if (btree_node_parent(path, b) !=
	    btree_node_parent(sib_path, m)) {
		b->sib_u64s[sib] = U16_MAX;
		goto out;
	}

	if (sib == btree_prev_sib) {
		prev = m;
		next = b;
	} else {
		prev = b;
		next = m;
	}

	if (!bpos_eq(bpos_successor(prev->data->max_key), next->data->min_key)) {
		struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF;

		bch2_bpos_to_text(&buf1, prev->data->max_key);
		bch2_bpos_to_text(&buf2, next->data->min_key);
		bch_err(c,
			"%s(): btree topology error:\n"
			"  prev ends at   %s\n"
			"  next starts at %s",
			__func__, buf1.buf, buf2.buf);
		printbuf_exit(&buf1);
		printbuf_exit(&buf2);
		bch2_topology_error(c);
		ret = -EIO;
		goto err;
	}

	bch2_bkey_format_init(&new_s);
	bch2_bkey_format_add_pos(&new_s, prev->data->min_key);
	__bch2_btree_calc_format(&new_s, prev);
	__bch2_btree_calc_format(&new_s, next);
	bch2_bkey_format_add_pos(&new_s, next->data->max_key);
	new_f = bch2_bkey_format_done(&new_s);

	sib_u64s = btree_node_u64s_with_format(b, &new_f) +
		btree_node_u64s_with_format(m, &new_f);

	if (sib_u64s > BTREE_FOREGROUND_MERGE_HYSTERESIS(c)) {
		sib_u64s -= BTREE_FOREGROUND_MERGE_HYSTERESIS(c);
		sib_u64s /= 2;
		sib_u64s += BTREE_FOREGROUND_MERGE_HYSTERESIS(c);
	}

	sib_u64s = min(sib_u64s, btree_max_u64s(c));
	sib_u64s = min(sib_u64s, (size_t) U16_MAX - 1);
	b->sib_u64s[sib] = sib_u64s;

	if (b->sib_u64s[sib] > c->btree_foreground_merge_threshold)
		goto out;

	parent = btree_node_parent(path, b);
	as = bch2_btree_update_start(trans, path, level, false,
				     BTREE_INSERT_NOFAIL|flags);
	ret = PTR_ERR_OR_ZERO(as);
	if (ret)
		goto err;

	trace_and_count(c, btree_node_merge, c, b);

	bch2_btree_interior_update_will_free_node(as, b);
	bch2_btree_interior_update_will_free_node(as, m);

	n = bch2_btree_node_alloc(as, trans, b->c.level);

	SET_BTREE_NODE_SEQ(n->data,
			   max(BTREE_NODE_SEQ(b->data),
			       BTREE_NODE_SEQ(m->data)) + 1);

	btree_set_min(n, prev->data->min_key);
	btree_set_max(n, next->data->max_key);

	n->data->format	= new_f;
	btree_node_set_format(n, new_f);

	bch2_btree_sort_into(c, n, prev);
	bch2_btree_sort_into(c, n, next);

	bch2_btree_build_aux_trees(n);
	bch2_btree_update_add_new_node(as, n);
	six_unlock_write(&n->c.lock);

	new_path = get_unlocked_mut_path(trans, path->btree_id, n->c.level, n->key.k.p);
	six_lock_increment(&n->c.lock, SIX_LOCK_intent);
	mark_btree_node_locked(trans, new_path, n->c.level, BTREE_NODE_INTENT_LOCKED);
	bch2_btree_path_level_init(trans, new_path, n);

	bkey_init(&delete.k);
	delete.k.p = prev->key.k.p;
	bch2_keylist_add(&as->parent_keys, &delete);
	bch2_keylist_add(&as->parent_keys, &n->key);

	bch2_trans_verify_paths(trans);

	ret = bch2_btree_insert_node(as, trans, path, parent, &as->parent_keys, flags);
	if (ret)
		goto err_free_update;

	bch2_trans_verify_paths(trans);

	bch2_btree_update_get_open_buckets(as, n);
	bch2_btree_node_write(c, n, SIX_LOCK_intent, 0);

	bch2_btree_node_free_inmem(trans, path, b);
	bch2_btree_node_free_inmem(trans, sib_path, m);

	bch2_trans_node_add(trans, n);

	bch2_trans_verify_paths(trans);

	six_unlock_intent(&n->c.lock);

	bch2_btree_update_done(as, trans);

	bch2_time_stats_update(&c->times[BCH_TIME_btree_node_merge], start_time);
out:
err:
	if (new_path)
		bch2_path_put(trans, new_path, true);
	bch2_path_put(trans, sib_path, true);
	bch2_trans_verify_locks(trans);
	return ret;
err_free_update:
	bch2_btree_node_free_never_used(as, trans, n);
	bch2_btree_update_free(as, trans);
	goto out;
}

int bch2_btree_node_rewrite(struct btree_trans *trans,
			    struct btree_iter *iter,
			    struct btree *b,
			    unsigned flags)
{
	struct bch_fs *c = trans->c;
	struct btree_path *new_path = NULL;
	struct btree *n, *parent;
	struct btree_update *as;
	int ret;

	flags |= BTREE_INSERT_NOFAIL;

	parent = btree_node_parent(iter->path, b);
	as = bch2_btree_update_start(trans, iter->path, b->c.level,
				     false, flags);
	ret = PTR_ERR_OR_ZERO(as);
	if (ret)
		goto out;

	bch2_btree_interior_update_will_free_node(as, b);

	n = bch2_btree_node_alloc_replacement(as, trans, b);

	bch2_btree_build_aux_trees(n);
	bch2_btree_update_add_new_node(as, n);
	six_unlock_write(&n->c.lock);

	new_path = get_unlocked_mut_path(trans, iter->btree_id, n->c.level, n->key.k.p);
	six_lock_increment(&n->c.lock, SIX_LOCK_intent);
	mark_btree_node_locked(trans, new_path, n->c.level, BTREE_NODE_INTENT_LOCKED);
	bch2_btree_path_level_init(trans, new_path, n);

	trace_and_count(c, btree_node_rewrite, c, b);

	if (parent) {
		bch2_keylist_add(&as->parent_keys, &n->key);
		ret = bch2_btree_insert_node(as, trans, iter->path, parent,
					     &as->parent_keys, flags);
		if (ret)
			goto err;
	} else {
		bch2_btree_set_root(as, trans, iter->path, n);
	}

	bch2_btree_update_get_open_buckets(as, n);
	bch2_btree_node_write(c, n, SIX_LOCK_intent, 0);

	bch2_btree_node_free_inmem(trans, iter->path, b);

	bch2_trans_node_add(trans, n);
	six_unlock_intent(&n->c.lock);

	bch2_btree_update_done(as, trans);
out:
	if (new_path)
		bch2_path_put(trans, new_path, true);
	bch2_trans_downgrade(trans);
	return ret;
err:
	bch2_btree_node_free_never_used(as, trans, n);
	bch2_btree_update_free(as, trans);
	goto out;
}
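/*
 * Machinery for rewriting a btree node asynchronously from a workqueue;
 * rewrites requested before the filesystem may go read-write are queued on
 * c->pending_node_rewrites and kicked off later:
 */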
1998 struct async_btree_rewrite
{
2000 struct work_struct work
;
2001 struct list_head list
;
2002 enum btree_id btree_id
;
2008 static int async_btree_node_rewrite_trans(struct btree_trans
*trans
,
2009 struct async_btree_rewrite
*a
)
2011 struct bch_fs
*c
= trans
->c
;
2012 struct btree_iter iter
;
2016 bch2_trans_node_iter_init(trans
, &iter
, a
->btree_id
, a
->pos
,
2017 BTREE_MAX_DEPTH
, a
->level
, 0);
2018 b
= bch2_btree_iter_peek_node(&iter
);
2019 ret
= PTR_ERR_OR_ZERO(b
);
2023 if (!b
|| b
->data
->keys
.seq
!= a
->seq
) {
2024 struct printbuf buf
= PRINTBUF
;
2027 bch2_bkey_val_to_text(&buf
, c
, bkey_i_to_s_c(&b
->key
));
2029 prt_str(&buf
, "(null");
2030 bch_info(c
, "%s: node to rewrite not found:, searching for seq %llu, got\n%s",
2031 __func__
, a
->seq
, buf
.buf
);
2032 printbuf_exit(&buf
);
2036 ret
= bch2_btree_node_rewrite(trans
, &iter
, b
, 0);
2038 bch2_trans_iter_exit(trans
, &iter
);
2043 static void async_btree_node_rewrite_work(struct work_struct
*work
)
2045 struct async_btree_rewrite
*a
=
2046 container_of(work
, struct async_btree_rewrite
, work
);
2047 struct bch_fs
*c
= a
->c
;
2050 ret
= bch2_trans_do(c
, NULL
, NULL
, 0,
2051 async_btree_node_rewrite_trans(trans
, a
));
2054 bch2_write_ref_put(c
, BCH_WRITE_REF_node_rewrite
);
void bch2_btree_node_rewrite_async(struct bch_fs *c, struct btree *b)
{
	struct async_btree_rewrite *a;
	int ret;

	a = kmalloc(sizeof(*a), GFP_NOFS);
	if (!a) {
		bch_err(c, "%s: error allocating memory", __func__);
		return;
	}

	a->c		= c;
	a->btree_id	= b->c.btree_id;
	a->level	= b->c.level;
	a->pos		= b->key.k.p;
	a->seq		= b->data->keys.seq;
	INIT_WORK(&a->work, async_btree_node_rewrite_work);

	if (unlikely(!test_bit(BCH_FS_MAY_GO_RW, &c->flags))) {
		mutex_lock(&c->pending_node_rewrites_lock);
		list_add(&a->list, &c->pending_node_rewrites);
		mutex_unlock(&c->pending_node_rewrites_lock);
		return;
	}

	if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_node_rewrite)) {
		if (test_bit(BCH_FS_STARTED, &c->flags)) {
			bch_err(c, "%s: error getting c->writes ref", __func__);
			kfree(a);
			return;
		}

		ret = bch2_fs_read_write_early(c);
		if (ret) {
			bch_err_msg(c, ret, "going read-write");
			kfree(a);
			return;
		}

		bch2_write_ref_get(c, BCH_WRITE_REF_node_rewrite);
	}

	queue_work(c->btree_interior_update_worker, &a->work);
}
void bch2_do_pending_node_rewrites(struct bch_fs *c)
{
	struct async_btree_rewrite *a, *n;

	mutex_lock(&c->pending_node_rewrites_lock);
	list_for_each_entry_safe(a, n, &c->pending_node_rewrites, list) {
		list_del(&a->list);

		bch2_write_ref_get(c, BCH_WRITE_REF_node_rewrite);
		queue_work(c->btree_interior_update_worker, &a->work);
	}
	mutex_unlock(&c->pending_node_rewrites_lock);
}
void bch2_free_pending_node_rewrites(struct bch_fs *c)
{
	struct async_btree_rewrite *a, *n;

	mutex_lock(&c->pending_node_rewrites_lock);
	list_for_each_entry_safe(a, n, &c->pending_node_rewrites, list) {
		list_del(&a->list);

		kfree(a);
	}
	mutex_unlock(&c->pending_node_rewrites_lock);
}
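
/*
 * Update the key that points to a btree node, in place. Btree nodes are
 * indexed in the node hash table by the hash of their key, so if the new key
 * hashes differently the caller passes @new_hash, a pre-allocated node used
 * to reserve the new hash table slot while the update is in flight. For
 * non-root nodes the new key is written via a normal (trigger-less) update
 * to the parent; for the root we emit a btree_root journal entry directly.
 */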
static int __bch2_btree_node_update_key(struct btree_trans *trans,
					struct btree_iter *iter,
					struct btree *b, struct btree *new_hash,
					struct bkey_i *new_key,
					unsigned commit_flags,
					bool skip_triggers)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter2 = { NULL };
	struct btree *parent;
	int ret;

	if (!skip_triggers) {
		ret = bch2_trans_mark_old(trans, b->c.btree_id, b->c.level + 1,
					  bkey_i_to_s_c(&b->key), 0);
		if (ret)
			return ret;

		ret = bch2_trans_mark_new(trans, b->c.btree_id, b->c.level + 1,
					  new_key, 0);
		if (ret)
			return ret;
	}

	if (new_hash) {
		bkey_copy(&new_hash->key, new_key);
		ret = bch2_btree_node_hash_insert(&c->btree_cache,
				new_hash, b->c.level, b->c.btree_id);
		BUG_ON(ret);
	}

	parent = btree_node_parent(iter->path, b);
	if (parent) {
		bch2_trans_copy_iter(&iter2, iter);

		iter2.path = bch2_btree_path_make_mut(trans, iter2.path,
				iter2.flags & BTREE_ITER_INTENT,
				_THIS_IP_);

		BUG_ON(iter2.path->level != b->c.level);
		BUG_ON(!bpos_eq(iter2.path->pos, new_key->k.p));

		btree_path_set_level_up(trans, iter2.path);

		trans->paths_sorted = false;

		ret   = bch2_btree_iter_traverse(&iter2) ?:
			bch2_trans_update(trans, &iter2, new_key, BTREE_TRIGGER_NORUN);
		if (ret)
			goto err;
	} else {
		BUG_ON(btree_node_root(c, b) != b);

		ret = darray_make_room(&trans->extra_journal_entries,
				       jset_u64s(new_key->k.u64s));
		if (ret)
			return ret;

		journal_entry_set((void *) &darray_top(trans->extra_journal_entries),
				  BCH_JSET_ENTRY_btree_root,
				  b->c.btree_id, b->c.level,
				  new_key, new_key->k.u64s);
		trans->extra_journal_entries.nr += jset_u64s(new_key->k.u64s);
	}

	ret = bch2_trans_commit(trans, NULL, NULL, commit_flags);
	if (ret)
		goto err;

	bch2_btree_node_lock_write_nofail(trans, iter->path, &b->c);

	if (new_hash) {
		mutex_lock(&c->btree_cache.lock);
		bch2_btree_node_hash_remove(&c->btree_cache, new_hash);
		bch2_btree_node_hash_remove(&c->btree_cache, b);

		bkey_copy(&b->key, new_key);
		ret = __bch2_btree_node_hash_insert(&c->btree_cache, b);
		BUG_ON(ret);
		mutex_unlock(&c->btree_cache.lock);
	} else {
		bkey_copy(&b->key, new_key);
	}

	bch2_btree_node_unlock_write(trans, iter->path, b);
out:
	bch2_trans_iter_exit(trans, &iter2);
	return ret;
err:
	if (new_hash) {
		mutex_lock(&c->btree_cache.lock);
		bch2_btree_node_hash_remove(&c->btree_cache, b);
		mutex_unlock(&c->btree_cache.lock);
	}
	goto out;
}
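
/*
 * Wrapper that handles allocation for __bch2_btree_node_update_key(): if the
 * new key hashes differently than the node's current key, we need a spare
 * node, allocated under the btree cache cannibalize lock so the allocation
 * can always make forward progress.
 */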
int bch2_btree_node_update_key(struct btree_trans *trans, struct btree_iter *iter,
			       struct btree *b, struct bkey_i *new_key,
			       unsigned commit_flags, bool skip_triggers)
{
	struct bch_fs *c = trans->c;
	struct btree *new_hash = NULL;
	struct btree_path *path = iter->path;
	struct closure cl;
	int ret = 0;

	ret = bch2_btree_path_upgrade(trans, path, b->c.level + 1);
	if (ret)
		return ret;

	closure_init_stack(&cl);

	/*
	 * check btree_ptr_hash_val() after @b is locked by
	 * btree_iter_traverse():
	 */
	if (btree_ptr_hash_val(new_key) != b->hash_val) {
		ret = bch2_btree_cache_cannibalize_lock(c, &cl);
		if (ret) {
			ret = drop_locks_do(trans, (closure_sync(&cl), 0));
			if (ret)
				return ret;
		}

		new_hash = bch2_btree_node_mem_alloc(trans, false);
	}

	path->intent_ref++;
	ret = __bch2_btree_node_update_key(trans, iter, b, new_hash, new_key,
					   commit_flags, skip_triggers);
	--path->intent_ref;

	if (new_hash) {
		mutex_lock(&c->btree_cache.lock);
		list_move(&new_hash->list, &c->btree_cache.freeable);
		mutex_unlock(&c->btree_cache.lock);

		six_unlock_write(&new_hash->c.lock);
		six_unlock_intent(&new_hash->c.lock);
	}
	closure_sync(&cl);
	bch2_btree_cache_cannibalize_unlock(c);
	return ret;
}
int bch2_btree_node_update_key_get_iter(struct btree_trans *trans,
					struct btree *b, struct bkey_i *new_key,
					unsigned commit_flags, bool skip_triggers)
{
	struct btree_iter iter;
	int ret;

	bch2_trans_node_iter_init(trans, &iter, b->c.btree_id, b->key.k.p,
				  BTREE_MAX_DEPTH, b->c.level,
				  BTREE_ITER_INTENT);
	ret = bch2_btree_iter_traverse(&iter);
	if (ret)
		goto out;

	/* has the node been freed? */
	if (iter.path->l[b->c.level].b != b) {
		BUG_ON(!btree_node_dying(b));
		goto out;
	}

	BUG_ON(!btree_node_hashed(b));

	ret = bch2_btree_node_update_key(trans, &iter, b, new_key,
					 commit_flags, skip_triggers);
out:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}
/*
 * Only for filesystem bringup, when first reading the btree roots or
 * allocating btree roots when initializing a new filesystem:
 */
void bch2_btree_set_root_for_read(struct bch_fs *c, struct btree *b)
{
	BUG_ON(btree_node_root(c, b));

	bch2_btree_set_root_inmem(c, b);
}
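
/*
 * Create an empty, in-memory-only root for a btree: the node is marked fake
 * and as needing a rewrite, and is given a synthetic key (U64_MAX - id) so
 * it can live in the btree node hash table despite not existing on disk.
 */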
static int __bch2_btree_root_alloc(struct btree_trans *trans, enum btree_id id)
{
	struct bch_fs *c = trans->c;
	struct closure cl;
	struct btree *b;
	int ret;

	closure_init_stack(&cl);

	do {
		ret = bch2_btree_cache_cannibalize_lock(c, &cl);
		closure_sync(&cl);
	} while (ret);

	b = bch2_btree_node_mem_alloc(trans, false);
	bch2_btree_cache_cannibalize_unlock(c);

	set_btree_node_fake(b);
	set_btree_node_need_rewrite(b);
	b->c.level	= 0;
	b->c.btree_id	= id;

	bkey_btree_ptr_init(&b->key);
	b->key.k.p = SPOS_MAX;
	*((u64 *) bkey_i_to_btree_ptr(&b->key)->v.start) = U64_MAX - id;

	bch2_bset_init_first(b, &b->data->keys);
	bch2_btree_build_aux_trees(b);

	b->data->flags = 0;
	btree_set_min(b, POS_MIN);
	btree_set_max(b, SPOS_MAX);
	b->data->format = bch2_btree_calc_format(b);
	btree_node_set_format(b, b->data->format);

	ret = bch2_btree_node_hash_insert(&c->btree_cache, b,
					  b->c.level, b->c.btree_id);
	BUG_ON(ret);

	bch2_btree_set_root_inmem(c, b);

	six_unlock_write(&b->c.lock);
	six_unlock_intent(&b->c.lock);

	return 0;
}
void bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id)
{
	bch2_trans_run(c, __bch2_btree_root_alloc(trans, id));
}
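
/*
 * One line of debug output per in-flight interior update: pointer, mode (m),
 * nodes written (w), remaining closure refs (r), and the journal seq it is
 * holding (j).
 */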
void bch2_btree_updates_to_text(struct printbuf *out, struct bch_fs *c)
{
	struct btree_update *as;

	mutex_lock(&c->btree_interior_update_lock);
	list_for_each_entry(as, &c->btree_interior_update_list, list)
		prt_printf(out, "%p m %u w %u r %u j %llu\n",
			   as,
			   as->mode,
			   as->nodes_written,
			   closure_nr_remaining(&as->cl),
			   as->journal.seq);
	mutex_unlock(&c->btree_interior_update_lock);
}
static bool bch2_btree_interior_updates_pending(struct bch_fs *c)
{
	bool ret;

	mutex_lock(&c->btree_interior_update_lock);
	ret = !list_empty(&c->btree_interior_update_list);
	mutex_unlock(&c->btree_interior_update_lock);

	return ret;
}

bool bch2_btree_interior_updates_flush(struct bch_fs *c)
{
	bool ret = bch2_btree_interior_updates_pending(c);

	if (ret)
		closure_wait_event(&c->btree_interior_update_wait,
				   !bch2_btree_interior_updates_pending(c));
	return ret;
}
void bch2_journal_entry_to_btree_root(struct bch_fs *c, struct jset_entry *entry)
{
	struct btree_root *r = bch2_btree_id_root(c, entry->btree_id);

	mutex_lock(&c->btree_root_lock);

	r->level = entry->level;
	r->alive = true;
	bkey_copy(&r->key, (struct bkey_i *) entry->start);

	mutex_unlock(&c->btree_root_lock);
}
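
/*
 * Append a BCH_JSET_ENTRY_btree_root entry for every live btree root, except
 * those whose ids are set in @skip; returns a pointer just past the last
 * entry written.
 */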
struct jset_entry *
bch2_btree_roots_to_journal_entries(struct bch_fs *c,
				    struct jset_entry *end,
				    unsigned long skip)
{
	unsigned i;

	mutex_lock(&c->btree_root_lock);

	for (i = 0; i < btree_id_nr_alive(c); i++) {
		struct btree_root *r = bch2_btree_id_root(c, i);

		if (r->alive && !test_bit(i, &skip)) {
			journal_entry_set(end, BCH_JSET_ENTRY_btree_root,
					  i, r->level, &r->key, r->key.k.u64s);
			end = vstruct_next(end);
		}
	}

	mutex_unlock(&c->btree_root_lock);

	return end;
}
void bch2_fs_btree_interior_update_exit(struct bch_fs *c)
{
	if (c->btree_interior_update_worker)
		destroy_workqueue(c->btree_interior_update_worker);
	mempool_exit(&c->btree_interior_update_pool);
}

void bch2_fs_btree_interior_update_init_early(struct bch_fs *c)
{
	mutex_init(&c->btree_reserve_cache_lock);
	INIT_LIST_HEAD(&c->btree_interior_update_list);
	INIT_LIST_HEAD(&c->btree_interior_updates_unwritten);
	mutex_init(&c->btree_interior_update_lock);
	INIT_WORK(&c->btree_interior_update_work, btree_interior_update_work);

	INIT_LIST_HEAD(&c->pending_node_rewrites);
	mutex_init(&c->pending_node_rewrites_lock);
}

int bch2_fs_btree_interior_update_init(struct bch_fs *c)
{
	c->btree_interior_update_worker =
		alloc_workqueue("btree_update", WQ_UNBOUND|WQ_MEM_RECLAIM, 1);
	if (!c->btree_interior_update_worker)
		return -BCH_ERR_ENOMEM_btree_interior_update_worker_init;

	if (mempool_init_kmalloc_pool(&c->btree_interior_update_pool, 1,
				      sizeof(struct btree_update)))
		return -BCH_ERR_ENOMEM_btree_interior_update_pool_init;

	return 0;
}