// SPDX-License-Identifier: GPL-2.0

#include "bcachefs.h"
#include "bkey_methods.h"
#include "bkey_buf.h"
#include "btree_cache.h"
#include "btree_iter.h"
#include "btree_journal_iter.h"
#include "btree_key_cache.h"
#include "btree_locking.h"
#include "btree_update.h"
#include "debug.h"
#include "error.h"
#include "extents.h"
#include "journal.h"
#include "snapshot.h"
#include "trace.h"

#include <linux/random.h>
#include <linux/prefetch.h>

static inline void btree_path_list_remove(struct btree_trans *, struct btree_path *);
static inline void btree_path_list_add(struct btree_trans *, struct btree_path *,
				       struct btree_path *);

static inline unsigned long btree_iter_ip_allocated(struct btree_iter *iter)
{
#ifdef TRACK_PATH_ALLOCATED
	return iter->ip_allocated;
#else
	return 0;
#endif
}

static struct btree_path *btree_path_alloc(struct btree_trans *, struct btree_path *);
static inline int __btree_path_cmp(const struct btree_path *l,
				   enum btree_id	r_btree_id,
				   bool			r_cached,
				   struct bpos		r_pos,
				   unsigned		r_level)
{
	/*
	 * Must match lock ordering as defined by __bch2_btree_node_lock:
	 */
	return   cmp_int(l->btree_id,	r_btree_id) ?:
		 cmp_int((int) l->cached,	(int) r_cached) ?:
		 bpos_cmp(l->pos,	r_pos) ?:
		-cmp_int(l->level,	r_level);
}

static inline int btree_path_cmp(const struct btree_path *l,
				 const struct btree_path *r)
{
	return __btree_path_cmp(l, r->btree_id, r->cached, r->pos, r->level);
}
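
/*
 * Illustrative sketch (editor's note, not in the original source): because
 * paths sort by (btree_id, cached, pos, -level) - the same order in which
 * node locks are taken - a transaction can walk its sorted path list and
 * relock each path in turn without lock-order inversions:
 *
 *	trans_for_each_path_inorder(trans, path, i)
 *		bch2_btree_path_relock(trans, path, _THIS_IP_);
 */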
static inline struct bpos bkey_successor(struct btree_iter *iter, struct bpos p)
{
	/* Are we iterating over keys in all snapshots? */
	if (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) {
		p = bpos_successor(p);
	} else {
		p = bpos_nosnap_successor(p);
		p.snapshot = iter->snapshot;
	}

	return p;
}

static inline struct bpos bkey_predecessor(struct btree_iter *iter, struct bpos p)
{
	/* Are we iterating over keys in all snapshots? */
	if (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) {
		p = bpos_predecessor(p);
	} else {
		p = bpos_nosnap_predecessor(p);
		p.snapshot = iter->snapshot;
	}

	return p;
}
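
/*
 * Worked example (editor's sketch, not in the original): with
 * BTREE_ITER_ALL_SNAPSHOTS the snapshot field is the least significant part
 * of the key, so SPOS(1, 2, 3) steps to SPOS(1, 2, 4). Without it, the
 * (inode, offset) part is stepped and the iterator's snapshot stamped back
 * on: for iter->snapshot == 5, SPOS(1, 2, 3) steps to SPOS(1, 3, 5).
 */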
static inline struct bpos btree_iter_search_key(struct btree_iter *iter)
{
	struct bpos pos = iter->pos;

	if ((iter->flags & BTREE_ITER_IS_EXTENTS) &&
	    !bkey_eq(pos, POS_MAX))
		pos = bkey_successor(iter, pos);
	return pos;
}
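
/*
 * Example (sketch): extents are indexed by their end position, so for an
 * extents iterator at pos 4096 the search key is bkey_successor(pos) - the
 * extent covering offset 4096 is the first key with k.p > 4096.
 */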
static inline bool btree_path_pos_before_node(struct btree_path *path,
					      struct btree *b)
{
	return bpos_lt(path->pos, b->data->min_key);
}

static inline bool btree_path_pos_after_node(struct btree_path *path,
					     struct btree *b)
{
	return bpos_gt(path->pos, b->key.k.p);
}

static inline bool btree_path_pos_in_node(struct btree_path *path,
					  struct btree *b)
{
	return path->btree_id == b->c.btree_id &&
		!btree_path_pos_before_node(path, b) &&
		!btree_path_pos_after_node(path, b);
}
/* Btree iterator: */

#ifdef CONFIG_BCACHEFS_DEBUG

static void bch2_btree_path_verify_cached(struct btree_trans *trans,
					  struct btree_path *path)
{
	struct bkey_cached *ck;
	bool locked = btree_node_locked(path, 0);

	if (!bch2_btree_node_relock(trans, path, 0))
		return;

	ck = (void *) path->l[0].b;
	BUG_ON(ck->key.btree_id != path->btree_id ||
	       !bkey_eq(ck->key.pos, path->pos));

	if (!locked)
		btree_node_unlock(trans, path, 0);
}
static void bch2_btree_path_verify_level(struct btree_trans *trans,
				struct btree_path *path, unsigned level)
{
	struct btree_path_level *l;
	struct btree_node_iter tmp;
	bool locked;
	struct bkey_packed *p, *k;
	struct printbuf buf1 = PRINTBUF;
	struct printbuf buf2 = PRINTBUF;
	struct printbuf buf3 = PRINTBUF;
	const char *msg;

	if (!bch2_debug_check_iterators)
		return;

	l	= &path->l[level];
	tmp	= l->iter;
	locked	= btree_node_locked(path, level);

	if (path->cached) {
		if (!level)
			bch2_btree_path_verify_cached(trans, path);
		return;
	}

	if (!btree_path_node(path, level))
		return;

	if (!bch2_btree_node_relock_notrace(trans, path, level))
		return;

	BUG_ON(!btree_path_pos_in_node(path, l->b));

	bch2_btree_node_iter_verify(&l->iter, l->b);

	/*
	 * For interior nodes, the iterator will have skipped past deleted keys:
	 */
	p = level
		? bch2_btree_node_iter_prev(&tmp, l->b)
		: bch2_btree_node_iter_prev_all(&tmp, l->b);
	k = bch2_btree_node_iter_peek_all(&l->iter, l->b);

	if (p && bkey_iter_pos_cmp(l->b, p, &path->pos) >= 0) {
		msg = "before";
		goto err;
	}

	if (k && bkey_iter_pos_cmp(l->b, k, &path->pos) < 0) {
		msg = "after";
		goto err;
	}

	if (!locked)
		btree_node_unlock(trans, path, level);
	return;
err:
	bch2_bpos_to_text(&buf1, path->pos);

	if (p) {
		struct bkey uk = bkey_unpack_key(l->b, p);

		bch2_bkey_to_text(&buf2, &uk);
	} else {
		prt_printf(&buf2, "(none)");
	}

	if (k) {
		struct bkey uk = bkey_unpack_key(l->b, k);

		bch2_bkey_to_text(&buf3, &uk);
	} else {
		prt_printf(&buf3, "(none)");
	}

	panic("path should be %s key at level %u:\n"
	      "path pos %s\n"
	      "prev key %s\n"
	      "cur  key %s\n",
	      msg, level, buf1.buf, buf2.buf, buf3.buf);
}
static void bch2_btree_path_verify(struct btree_trans *trans,
				   struct btree_path *path)
{
	struct bch_fs *c = trans->c;
	unsigned i;

	EBUG_ON(path->btree_id >= BTREE_ID_NR);

	for (i = 0; i < (!path->cached ? BTREE_MAX_DEPTH : 1); i++) {
		if (!path->l[i].b) {
			BUG_ON(!path->cached &&
			       bch2_btree_id_root(c, path->btree_id)->b->c.level > i);
			break;
		}

		bch2_btree_path_verify_level(trans, path, i);
	}

	bch2_btree_path_verify_locks(path);
}

void bch2_trans_verify_paths(struct btree_trans *trans)
{
	struct btree_path *path;

	trans_for_each_path(trans, path)
		bch2_btree_path_verify(trans, path);
}
static void bch2_btree_iter_verify(struct btree_iter *iter)
{
	struct btree_trans *trans = iter->trans;

	BUG_ON(iter->btree_id >= BTREE_ID_NR);

	BUG_ON(!!(iter->flags & BTREE_ITER_CACHED) != iter->path->cached);

	BUG_ON((iter->flags & BTREE_ITER_IS_EXTENTS) &&
	       (iter->flags & BTREE_ITER_ALL_SNAPSHOTS));

	BUG_ON(!(iter->flags & __BTREE_ITER_ALL_SNAPSHOTS) &&
	       (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) &&
	       !btree_type_has_snapshot_field(iter->btree_id));

	if (iter->update_path)
		bch2_btree_path_verify(trans, iter->update_path);
	bch2_btree_path_verify(trans, iter->path);
}

static void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter)
{
	BUG_ON((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) &&
	       !iter->pos.snapshot);

	BUG_ON(!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS) &&
	       iter->pos.snapshot != iter->snapshot);

	BUG_ON(bkey_lt(iter->pos, bkey_start_pos(&iter->k)) ||
	       bkey_gt(iter->pos, iter->k.p));
}
static int bch2_btree_iter_verify_ret(struct btree_iter *iter, struct bkey_s_c k)
{
	struct btree_trans *trans = iter->trans;
	struct btree_iter copy;
	struct bkey_s_c prev;
	int ret = 0;

	if (!bch2_debug_check_iterators)
		return 0;

	if (!(iter->flags & BTREE_ITER_FILTER_SNAPSHOTS))
		return 0;

	if (bkey_err(k) || !k.k)
		return 0;

	BUG_ON(!bch2_snapshot_is_ancestor(trans->c,
					  iter->snapshot,
					  k.k->p.snapshot));

	bch2_trans_iter_init(trans, &copy, iter->btree_id, iter->pos,
			     BTREE_ITER_NOPRESERVE|
			     BTREE_ITER_ALL_SNAPSHOTS);
	prev = bch2_btree_iter_prev(&copy);
	if (!prev.k)
		goto out;

	ret = bkey_err(prev);
	if (ret)
		goto out;

	if (bkey_eq(prev.k->p, k.k->p) &&
	    bch2_snapshot_is_ancestor(trans->c, iter->snapshot,
				      prev.k->p.snapshot) > 0) {
		struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF;

		bch2_bkey_to_text(&buf1, k.k);
		bch2_bkey_to_text(&buf2, prev.k);

		panic("iter snap %u\n"
		      "k    %s\n"
		      "prev %s\n",
		      iter->snapshot,
		      buf1.buf, buf2.buf);
	}
out:
	bch2_trans_iter_exit(trans, &copy);
	return ret;
}
void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id,
			    struct bpos pos, bool key_cache)
{
	struct btree_path *path;
	unsigned idx;
	struct printbuf buf = PRINTBUF;

	btree_trans_sort_paths(trans);

	trans_for_each_path_inorder(trans, path, idx) {
		int cmp = cmp_int(path->btree_id, id) ?:
			cmp_int(path->cached, key_cache);

		if (cmp > 0)
			break;
		if (cmp < 0)
			continue;

		if (!btree_node_locked(path, 0) ||
		    !path->should_be_locked)
			continue;

		if (!key_cache) {
			if (bkey_ge(pos, path->l[0].b->data->min_key) &&
			    bkey_le(pos, path->l[0].b->key.k.p))
				return;
		} else {
			if (bkey_eq(pos, path->pos))
				return;
		}
	}

	bch2_dump_trans_paths_updates(trans);
	bch2_bpos_to_text(&buf, pos);

	panic("not locked: %s %s%s\n",
	      bch2_btree_id_str(id), buf.buf,
	      key_cache ? " cached" : "");
}

#else

static inline void bch2_btree_path_verify_level(struct btree_trans *trans,
						struct btree_path *path, unsigned l) {}
static inline void bch2_btree_path_verify(struct btree_trans *trans,
					  struct btree_path *path) {}
static inline void bch2_btree_iter_verify(struct btree_iter *iter) {}
static inline void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter) {}
static inline int bch2_btree_iter_verify_ret(struct btree_iter *iter, struct bkey_s_c k) { return 0; }

#endif
/* Btree path: fixups after btree updates */

static void btree_node_iter_set_set_pos(struct btree_node_iter *iter,
					struct btree *b,
					struct bset_tree *t,
					struct bkey_packed *k)
{
	struct btree_node_iter_set *set;

	btree_node_iter_for_each(iter, set)
		if (set->end == t->end_offset) {
			set->k = __btree_node_key_to_offset(b, k);
			bch2_btree_node_iter_sort(iter, b);
			return;
		}

	bch2_btree_node_iter_push(iter, b, k, btree_bkey_last(b, t));
}

static void __bch2_btree_path_fix_key_modified(struct btree_path *path,
					       struct btree *b,
					       struct bkey_packed *where)
{
	struct btree_path_level *l = &path->l[b->c.level];

	if (where != bch2_btree_node_iter_peek_all(&l->iter, l->b))
		return;

	if (bkey_iter_pos_cmp(l->b, where, &path->pos) < 0)
		bch2_btree_node_iter_advance(&l->iter, l->b);
}

void bch2_btree_path_fix_key_modified(struct btree_trans *trans,
				      struct btree *b,
				      struct bkey_packed *where)
{
	struct btree_path *path;

	trans_for_each_path_with_node(trans, b, path) {
		__bch2_btree_path_fix_key_modified(path, b, where);
		bch2_btree_path_verify_level(trans, path, b->c.level);
	}
}
static void __bch2_btree_node_iter_fix(struct btree_path *path,
				       struct btree *b,
				       struct btree_node_iter *node_iter,
				       struct bset_tree *t,
				       struct bkey_packed *where,
				       unsigned clobber_u64s,
				       unsigned new_u64s)
{
	const struct bkey_packed *end = btree_bkey_last(b, t);
	struct btree_node_iter_set *set;
	unsigned offset = __btree_node_key_to_offset(b, where);
	int shift = new_u64s - clobber_u64s;
	unsigned old_end = t->end_offset - shift;
	unsigned orig_iter_pos = node_iter->data[0].k;
	bool iter_current_key_modified =
		orig_iter_pos >= offset &&
		orig_iter_pos <= offset + clobber_u64s;

	btree_node_iter_for_each(node_iter, set)
		if (set->end == old_end)
			goto found;

	/* didn't find the bset in the iterator - might have to readd it: */
	if (new_u64s &&
	    bkey_iter_pos_cmp(b, where, &path->pos) >= 0) {
		bch2_btree_node_iter_push(node_iter, b, where, end);
		goto fixup_done;
	} else {
		/* Iterator is after key that changed */
		return;
	}
found:
	set->end = t->end_offset;

	/* Iterator hasn't gotten to the key that changed yet: */
	if (set->k < offset)
		return;

	if (new_u64s &&
	    bkey_iter_pos_cmp(b, where, &path->pos) >= 0) {
		set->k = offset;
	} else if (set->k < offset + clobber_u64s) {
		set->k = offset + new_u64s;
		if (set->k == set->end)
			bch2_btree_node_iter_set_drop(node_iter, set);
	} else {
		/* Iterator is after key that changed */
		set->k = (int) set->k + shift;
		return;
	}

	bch2_btree_node_iter_sort(node_iter, b);
fixup_done:
	if (node_iter->data[0].k != orig_iter_pos)
		iter_current_key_modified = true;

	/*
	 * When a new key is added, and the node iterator now points to that
	 * key, the iterator might have skipped past deleted keys that should
	 * come after the key the iterator now points to. We have to rewind to
	 * before those deleted keys - otherwise
	 * bch2_btree_node_iter_prev_all() breaks:
	 */
	if (!bch2_btree_node_iter_end(node_iter) &&
	    iter_current_key_modified &&
	    b->c.level) {
		struct bkey_packed *k, *k2, *p;

		k = bch2_btree_node_iter_peek_all(node_iter, b);

		for_each_bset(b, t) {
			bool set_pos = false;

			if (node_iter->data[0].end == t->end_offset)
				continue;

			k2 = bch2_btree_node_iter_bset_pos(node_iter, b, t);

			while ((p = bch2_bkey_prev_all(b, t, k2)) &&
			       bkey_iter_cmp(b, k, p) < 0) {
				k2 = p;
				set_pos = true;
			}

			if (set_pos)
				btree_node_iter_set_set_pos(node_iter,
							    b, t, k2);
		}
	}
}

void bch2_btree_node_iter_fix(struct btree_trans *trans,
			      struct btree_path *path,
			      struct btree *b,
			      struct btree_node_iter *node_iter,
			      struct bkey_packed *where,
			      unsigned clobber_u64s,
			      unsigned new_u64s)
{
	struct bset_tree *t = bch2_bkey_to_bset_inlined(b, where);
	struct btree_path *linked;

	if (node_iter != &path->l[b->c.level].iter) {
		__bch2_btree_node_iter_fix(path, b, node_iter, t,
					   where, clobber_u64s, new_u64s);

		if (bch2_debug_check_iterators)
			bch2_btree_node_iter_verify(node_iter, b);
	}

	trans_for_each_path_with_node(trans, b, linked) {
		__bch2_btree_node_iter_fix(linked, b,
					   &linked->l[b->c.level].iter, t,
					   where, clobber_u64s, new_u64s);
		bch2_btree_path_verify_level(trans, linked, b->c.level);
	}
}
/* Btree path level: pointer to a particular btree node and node iter */

static inline struct bkey_s_c __btree_iter_unpack(struct bch_fs *c,
						  struct btree_path_level *l,
						  struct bkey *u,
						  struct bkey_packed *k)
{
	if (unlikely(!k)) {
		/*
		 * signal to bch2_btree_iter_peek_slot() that we're currently at
		 * a hole
		 */
		u->type = KEY_TYPE_deleted;
		return bkey_s_c_null;
	}

	return bkey_disassemble(l->b, k, u);
}

static inline struct bkey_s_c btree_path_level_peek_all(struct bch_fs *c,
							struct btree_path_level *l,
							struct bkey *u)
{
	return __btree_iter_unpack(c, l, u,
			bch2_btree_node_iter_peek_all(&l->iter, l->b));
}

static inline struct bkey_s_c btree_path_level_peek(struct btree_trans *trans,
						    struct btree_path *path,
						    struct btree_path_level *l,
						    struct bkey *u)
{
	struct bkey_s_c k = __btree_iter_unpack(trans->c, l, u,
			bch2_btree_node_iter_peek(&l->iter, l->b));

	path->pos = k.k ? k.k->p : l->b->key.k.p;
	trans->paths_sorted = false;
	bch2_btree_path_verify_level(trans, path, l - path->l);
	return k;
}

static inline struct bkey_s_c btree_path_level_prev(struct btree_trans *trans,
						    struct btree_path *path,
						    struct btree_path_level *l,
						    struct bkey *u)
{
	struct bkey_s_c k = __btree_iter_unpack(trans->c, l, u,
			bch2_btree_node_iter_prev(&l->iter, l->b));

	path->pos = k.k ? k.k->p : l->b->data->min_key;
	trans->paths_sorted = false;
	bch2_btree_path_verify_level(trans, path, l - path->l);
	return k;
}
static inline bool btree_path_advance_to_pos(struct btree_path *path,
					     struct btree_path_level *l,
					     int max_advance)
{
	struct bkey_packed *k;
	int nr_advanced = 0;

	while ((k = bch2_btree_node_iter_peek_all(&l->iter, l->b)) &&
	       bkey_iter_pos_cmp(l->b, k, &path->pos) < 0) {
		if (max_advance > 0 && nr_advanced >= max_advance)
			return false;

		bch2_btree_node_iter_advance(&l->iter, l->b);
		nr_advanced++;
	}

	return true;
}
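
/*
 * Note: callers pass a small max_advance (e.g. 8, from
 * __bch2_btree_path_set_pos()) so that moving a path a short distance reuses
 * the existing node iterator, while a long jump falls back to a full
 * bch2_btree_node_iter_init() search.
 */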
static inline void __btree_path_level_init(struct btree_path *path,
					   unsigned level)
{
	struct btree_path_level *l = &path->l[level];

	bch2_btree_node_iter_init(&l->iter, l->b, &path->pos);

	/*
	 * Iterators to interior nodes should always be pointed at the first non
	 * whiteout:
	 */
	if (level)
		bch2_btree_node_iter_peek(&l->iter, l->b);
}

void bch2_btree_path_level_init(struct btree_trans *trans,
				struct btree_path *path,
				struct btree *b)
{
	BUG_ON(path->cached);

	EBUG_ON(!btree_path_pos_in_node(path, b));

	path->l[b->c.level].lock_seq = six_lock_seq(&b->c.lock);
	path->l[b->c.level].b = b;
	__btree_path_level_init(path, b->c.level);
}
/* Btree path: fixups after btree node updates: */

static void bch2_trans_revalidate_updates_in_node(struct btree_trans *trans, struct btree *b)
{
	struct bch_fs *c = trans->c;
	struct btree_insert_entry *i;

	trans_for_each_update(trans, i)
		if (!i->cached &&
		    i->level	== b->c.level &&
		    i->btree_id	== b->c.btree_id &&
		    bpos_cmp(i->k->k.p, b->data->min_key) >= 0 &&
		    bpos_cmp(i->k->k.p, b->data->max_key) <= 0) {
			i->old_v = bch2_btree_path_peek_slot(i->path, &i->old_k).v;

			if (unlikely(trans->journal_replay_not_finished)) {
				struct bkey_i *j_k =
					bch2_journal_keys_peek_slot(c, i->btree_id, i->level,
								    i->k->k.p);

				if (j_k) {
					i->old_k = j_k->k;
					i->old_v = &j_k->v;
				}
			}
		}
}

/*
 * A btree node is being replaced - update the iterator to point to the new
 * node:
 */
void bch2_trans_node_add(struct btree_trans *trans, struct btree *b)
{
	struct btree_path *path;

	trans_for_each_path(trans, path)
		if (path->uptodate == BTREE_ITER_UPTODATE &&
		    !path->cached &&
		    btree_path_pos_in_node(path, b)) {
			enum btree_node_locked_type t =
				btree_lock_want(path, b->c.level);

			if (t != BTREE_NODE_UNLOCKED) {
				btree_node_unlock(trans, path, b->c.level);
				six_lock_increment(&b->c.lock, (enum six_lock_type) t);
				mark_btree_node_locked(trans, path, b->c.level, t);
			}

			bch2_btree_path_level_init(trans, path, b);
		}

	bch2_trans_revalidate_updates_in_node(trans, b);
}

/*
 * A btree node has been modified in such a way as to invalidate iterators - fix
 * them:
 */
void bch2_trans_node_reinit_iter(struct btree_trans *trans, struct btree *b)
{
	struct btree_path *path;

	trans_for_each_path_with_node(trans, b, path)
		__btree_path_level_init(path, b->c.level);

	bch2_trans_revalidate_updates_in_node(trans, b);
}
/* Btree path: traverse, set_pos: */

static inline int btree_path_lock_root(struct btree_trans *trans,
				       struct btree_path *path,
				       unsigned depth_want,
				       unsigned long trace_ip)
{
	struct bch_fs *c = trans->c;
	struct btree *b, **rootp = &bch2_btree_id_root(c, path->btree_id)->b;
	enum six_lock_type lock_type;
	unsigned i;
	int ret;

	EBUG_ON(path->nodes_locked);

	while (1) {
		b = READ_ONCE(*rootp);
		path->level = READ_ONCE(b->c.level);

		if (unlikely(path->level < depth_want)) {
			/*
			 * the root is at a lower depth than the depth we want:
			 * got to the end of the btree, or we're walking nodes
			 * greater than some depth and there are no nodes >=
			 * that depth
			 */
			path->level = depth_want;
			for (i = path->level; i < BTREE_MAX_DEPTH; i++)
				path->l[i].b = NULL;
			return 1;
		}

		lock_type = __btree_lock_want(path, path->level);
		ret = btree_node_lock(trans, path, &b->c,
				      path->level, lock_type, trace_ip);
		if (unlikely(ret)) {
			if (bch2_err_matches(ret, BCH_ERR_lock_fail_root_changed))
				continue;
			if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
				return ret;
			BUG();
		}

		if (likely(b == READ_ONCE(*rootp) &&
			   b->c.level == path->level &&
			   !race_fault())) {
			for (i = 0; i < path->level; i++)
				path->l[i].b = ERR_PTR(-BCH_ERR_no_btree_node_lock_root);
			path->l[path->level].b = b;
			for (i = path->level + 1; i < BTREE_MAX_DEPTH; i++)
				path->l[i].b = NULL;

			mark_btree_node_locked(trans, path, path->level,
					       (enum btree_node_locked_type) lock_type);
			bch2_btree_path_level_init(trans, path, b);
			return 0;
		}

		six_unlock_type(&b->c.lock, lock_type);
	}
}
static int btree_path_prefetch(struct btree_trans *trans, struct btree_path *path)
{
	struct bch_fs *c = trans->c;
	struct btree_path_level *l = path_l(path);
	struct btree_node_iter node_iter = l->iter;
	struct bkey_packed *k;
	struct bkey_buf tmp;
	unsigned nr = test_bit(BCH_FS_STARTED, &c->flags)
		? (path->level > 1 ? 0 :  2)
		: (path->level > 1 ? 1 : 16);
	bool was_locked = btree_node_locked(path, path->level);
	int ret = 0;

	bch2_bkey_buf_init(&tmp);

	while (nr-- && !ret) {
		if (!bch2_btree_node_relock(trans, path, path->level))
			break;

		bch2_btree_node_iter_advance(&node_iter, l->b);
		k = bch2_btree_node_iter_peek(&node_iter, l->b);
		if (!k)
			break;

		bch2_bkey_buf_unpack(&tmp, c, l->b, k);
		ret = bch2_btree_node_prefetch(trans, path, tmp.k, path->btree_id,
					       path->level - 1);
	}

	if (!was_locked)
		btree_node_unlock(trans, path, path->level);

	bch2_bkey_buf_exit(&tmp, c);
	return ret;
}

static int btree_path_prefetch_j(struct btree_trans *trans, struct btree_path *path,
				 struct btree_and_journal_iter *jiter)
{
	struct bch_fs *c = trans->c;
	struct bkey_s_c k;
	struct bkey_buf tmp;
	unsigned nr = test_bit(BCH_FS_STARTED, &c->flags)
		? (path->level > 1 ? 0 :  2)
		: (path->level > 1 ? 1 : 16);
	bool was_locked = btree_node_locked(path, path->level);
	int ret = 0;

	bch2_bkey_buf_init(&tmp);

	while (nr-- && !ret) {
		if (!bch2_btree_node_relock(trans, path, path->level))
			break;

		bch2_btree_and_journal_iter_advance(jiter);
		k = bch2_btree_and_journal_iter_peek(jiter);
		if (!k.k)
			break;

		bch2_bkey_buf_reassemble(&tmp, c, k);
		ret = bch2_btree_node_prefetch(trans, path, tmp.k, path->btree_id,
					       path->level - 1);
	}

	if (!was_locked)
		btree_node_unlock(trans, path, path->level);

	bch2_bkey_buf_exit(&tmp, c);
	return ret;
}
static noinline void btree_node_mem_ptr_set(struct btree_trans *trans,
					    struct btree_path *path,
					    unsigned plevel, struct btree *b)
{
	struct btree_path_level *l = &path->l[plevel];
	bool locked = btree_node_locked(path, plevel);
	struct bkey_packed *k;
	struct bch_btree_ptr_v2 *bp;

	if (!bch2_btree_node_relock(trans, path, plevel))
		return;

	k = bch2_btree_node_iter_peek_all(&l->iter, l->b);
	BUG_ON(k->type != KEY_TYPE_btree_ptr_v2);

	bp = (void *) bkeyp_val(&l->b->format, k);
	bp->mem_ptr = (unsigned long)b;

	if (!locked)
		btree_node_unlock(trans, path, plevel);
}

static noinline int btree_node_iter_and_journal_peek(struct btree_trans *trans,
						     struct btree_path *path,
						     unsigned flags,
						     struct bkey_buf *out)
{
	struct bch_fs *c = trans->c;
	struct btree_path_level *l = path_l(path);
	struct btree_and_journal_iter jiter;
	struct bkey_s_c k;
	int ret = 0;

	__bch2_btree_and_journal_iter_init_node_iter(&jiter, c, l->b, l->iter, path->pos);

	k = bch2_btree_and_journal_iter_peek(&jiter);

	bch2_bkey_buf_reassemble(out, c, k);

	if (flags & BTREE_ITER_PREFETCH)
		ret = btree_path_prefetch_j(trans, path, &jiter);

	bch2_btree_and_journal_iter_exit(&jiter);
	return ret;
}
static __always_inline int btree_path_down(struct btree_trans *trans,
					   struct btree_path *path,
					   unsigned flags,
					   unsigned long trace_ip)
{
	struct bch_fs *c = trans->c;
	struct btree_path_level *l = path_l(path);
	struct btree *b;
	unsigned level = path->level - 1;
	enum six_lock_type lock_type = __btree_lock_want(path, level);
	struct bkey_buf tmp;
	int ret;

	EBUG_ON(!btree_node_locked(path, path->level));

	bch2_bkey_buf_init(&tmp);

	if (unlikely(trans->journal_replay_not_finished)) {
		ret = btree_node_iter_and_journal_peek(trans, path, flags, &tmp);
		if (ret)
			goto err;
	} else {
		bch2_bkey_buf_unpack(&tmp, c, l->b,
				 bch2_btree_node_iter_peek(&l->iter, l->b));

		if (flags & BTREE_ITER_PREFETCH) {
			ret = btree_path_prefetch(trans, path);
			if (ret)
				goto err;
		}
	}

	b = bch2_btree_node_get(trans, path, tmp.k, level, lock_type, trace_ip);
	ret = PTR_ERR_OR_ZERO(b);
	if (unlikely(ret))
		goto err;

	if (likely(!trans->journal_replay_not_finished &&
		   tmp.k->k.type == KEY_TYPE_btree_ptr_v2) &&
	    unlikely(b != btree_node_mem_ptr(tmp.k)))
		btree_node_mem_ptr_set(trans, path, level + 1, b);

	if (btree_node_read_locked(path, level + 1))
		btree_node_unlock(trans, path, level + 1);

	mark_btree_node_locked(trans, path, level,
			       (enum btree_node_locked_type) lock_type);
	path->level = level;
	bch2_btree_path_level_init(trans, path, b);

	bch2_btree_path_verify_locks(path);
err:
	bch2_bkey_buf_exit(&tmp, c);
	return ret;
}
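
/*
 * Sketch of the descent step above: this is lock coupling - the parent at
 * level + 1 must already be locked, the child pointer is read from it, the
 * child's lock is acquired, and only then is a read lock still held on the
 * parent dropped.
 */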
static int bch2_btree_path_traverse_all(struct btree_trans *trans)
{
	struct bch_fs *c = trans->c;
	struct btree_path *path;
	unsigned long trace_ip = _RET_IP_;
	int i, ret = 0;

	if (trans->in_traverse_all)
		return -BCH_ERR_transaction_restart_in_traverse_all;

	trans->in_traverse_all = true;
retry_all:
	trans->restarted = 0;
	trans->last_restarted_ip = 0;

	trans_for_each_path(trans, path)
		path->should_be_locked = false;

	btree_trans_sort_paths(trans);

	bch2_trans_unlock(trans);
	cond_resched();

	if (unlikely(trans->memory_allocation_failure)) {
		struct closure cl;

		closure_init_stack(&cl);

		do {
			ret = bch2_btree_cache_cannibalize_lock(c, &cl);
			closure_sync(&cl);
		} while (ret);
	}

	/* Now, redo traversals in correct order: */
	i = 0;
	while (i < trans->nr_sorted) {
		path = trans->paths + trans->sorted[i];

		/*
		 * Traversing a path can cause another path to be added at about
		 * the same position:
		 */
		if (path->uptodate) {
			__btree_path_get(path, false);
			ret = bch2_btree_path_traverse_one(trans, path, 0, _THIS_IP_);
			__btree_path_put(path, false);

			if (bch2_err_matches(ret, BCH_ERR_transaction_restart) ||
			    bch2_err_matches(ret, ENOMEM))
				goto retry_all;
			if (ret)
				goto err;
		} else {
			i++;
		}
	}

	/*
	 * We used to assert that all paths had been traversed here
	 * (path->uptodate < BTREE_ITER_NEED_TRAVERSE); however, since
	 * path->should_be_locked is not set yet, we might have unlocked and
	 * then failed to relock a path - that's fine.
	 */
err:
	bch2_btree_cache_cannibalize_unlock(c);

	trans->in_traverse_all = false;

	trace_and_count(c, trans_traverse_all, trans, trace_ip);
	return ret;
}
static inline bool btree_path_check_pos_in_node(struct btree_path *path,
						unsigned l, int check_pos)
{
	if (check_pos < 0 && btree_path_pos_before_node(path, path->l[l].b))
		return false;
	if (check_pos > 0 && btree_path_pos_after_node(path, path->l[l].b))
		return false;
	return true;
}

static inline bool btree_path_good_node(struct btree_trans *trans,
					struct btree_path *path,
					unsigned l, int check_pos)
{
	return is_btree_node(path, l) &&
		bch2_btree_node_relock(trans, path, l) &&
		btree_path_check_pos_in_node(path, l, check_pos);
}

static void btree_path_set_level_down(struct btree_trans *trans,
				      struct btree_path *path,
				      unsigned new_level)
{
	unsigned l;

	path->level = new_level;

	for (l = path->level + 1; l < BTREE_MAX_DEPTH; l++)
		if (btree_lock_want(path, l) == BTREE_NODE_UNLOCKED)
			btree_node_unlock(trans, path, l);

	btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
	bch2_btree_path_verify(trans, path);
}

static noinline unsigned __btree_path_up_until_good_node(struct btree_trans *trans,
							 struct btree_path *path,
							 int check_pos)
{
	unsigned i, l = path->level;
again:
	while (btree_path_node(path, l) &&
	       !btree_path_good_node(trans, path, l, check_pos))
		__btree_path_set_level_up(trans, path, l++);

	/* If we need intent locks, take them too: */
	for (i = l + 1;
	     i < path->locks_want && btree_path_node(path, i);
	     i++)
		if (!bch2_btree_node_relock(trans, path, i)) {
			while (l <= i)
				__btree_path_set_level_up(trans, path, l++);
			goto again;
		}

	return l;
}

static inline unsigned btree_path_up_until_good_node(struct btree_trans *trans,
						     struct btree_path *path,
						     int check_pos)
{
	return likely(btree_node_locked(path, path->level) &&
		      btree_path_check_pos_in_node(path, path->level, check_pos))
		? path->level
		: __btree_path_up_until_good_node(trans, path, check_pos);
}
/*
 * This is the main state machine for walking down the btree - walks down to a
 * specified depth
 *
 * Returns 0 on success, -EIO on error (error reading in a btree node).
 *
 * On error, caller (peek_node()/peek_key()) must return NULL; the error is
 * stashed in the iterator and returned from bch2_trans_exit().
 */
int bch2_btree_path_traverse_one(struct btree_trans *trans,
				 struct btree_path *path,
				 unsigned flags,
				 unsigned long trace_ip)
{
	unsigned depth_want = path->level;
	int ret = -((int) trans->restarted);

	if (unlikely(ret))
		goto out;

	if (unlikely(!trans->srcu_held))
		bch2_trans_srcu_lock(trans);

	/*
	 * Ensure we obey path->should_be_locked: if it's set, we can't unlock
	 * and re-traverse the path without a transaction restart:
	 */
	if (path->should_be_locked) {
		ret = bch2_btree_path_relock(trans, path, trace_ip);
		goto out;
	}

	if (path->cached) {
		ret = bch2_btree_path_traverse_cached(trans, path, flags);
		goto out;
	}

	if (unlikely(path->level >= BTREE_MAX_DEPTH))
		goto out;

	path->level = btree_path_up_until_good_node(trans, path, 0);

	EBUG_ON(btree_path_node(path, path->level) &&
		!btree_node_locked(path, path->level));

	/*
	 * Note: path->nodes[path->level] may be temporarily NULL here - that
	 * would indicate to other code that we got to the end of the btree,
	 * here it indicates that relocking the root failed - it's critical that
	 * btree_path_lock_root() comes next and that it can't fail
	 */
	while (path->level > depth_want) {
		ret = btree_path_node(path, path->level)
			? btree_path_down(trans, path, flags, trace_ip)
			: btree_path_lock_root(trans, path, depth_want, trace_ip);
		if (unlikely(ret)) {
			if (ret == 1) {
				/*
				 * No nodes at this level - got to the end of
				 * the btree:
				 */
				ret = 0;
				goto out;
			}

			__bch2_btree_path_unlock(trans, path);
			path->level = depth_want;
			path->l[path->level].b = ERR_PTR(ret);
			goto out;
		}
	}

	path->uptodate = BTREE_ITER_UPTODATE;
out:
	if (bch2_err_matches(ret, BCH_ERR_transaction_restart) != !!trans->restarted)
		panic("ret %s (%i) trans->restarted %s (%i)\n",
		      bch2_err_str(ret), ret,
		      bch2_err_str(trans->restarted), trans->restarted);
	bch2_btree_path_verify(trans, path);
	return ret;
}
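
/*
 * Usage sketch (illustrative): callers normally go through
 * bch2_btree_path_traverse(), which wraps this function, and handle a
 * transaction restart error by restarting the whole transaction:
 *
 *	ret = bch2_btree_path_traverse(trans, path, flags);
 *	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
 *		goto retry;	// after bch2_trans_begin(trans)
 */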
static inline void btree_path_copy(struct btree_trans *trans, struct btree_path *dst,
			    struct btree_path *src)
{
	unsigned i, offset = offsetof(struct btree_path, pos);

	memcpy((void *) dst + offset,
	       (void *) src + offset,
	       sizeof(struct btree_path) - offset);

	for (i = 0; i < BTREE_MAX_DEPTH; i++) {
		unsigned t = btree_node_locked_type(dst, i);

		if (t != BTREE_NODE_UNLOCKED)
			six_lock_increment(&dst->l[i].b->c.lock, t);
	}
}

static struct btree_path *btree_path_clone(struct btree_trans *trans, struct btree_path *src,
					   bool intent)
{
	struct btree_path *new = btree_path_alloc(trans, src);

	btree_path_copy(trans, new, src);
	__btree_path_get(new, intent);
	return new;
}

__flatten
struct btree_path *__bch2_btree_path_make_mut(struct btree_trans *trans,
			 struct btree_path *path, bool intent,
			 unsigned long ip)
{
	__btree_path_put(path, intent);
	path = btree_path_clone(trans, path, intent);
	path->preserve = false;
	return path;
}
struct btree_path * __must_check
__bch2_btree_path_set_pos(struct btree_trans *trans,
			  struct btree_path *path, struct bpos new_pos,
			  bool intent, unsigned long ip, int cmp)
{
	unsigned level = path->level;

	bch2_trans_verify_not_in_restart(trans);
	EBUG_ON(!path->ref);

	path = bch2_btree_path_make_mut(trans, path, intent, ip);

	path->pos		= new_pos;
	trans->paths_sorted	= false;

	if (unlikely(path->cached)) {
		btree_node_unlock(trans, path, 0);
		path->l[0].b = ERR_PTR(-BCH_ERR_no_btree_node_up);
		btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
		goto out;
	}

	level = btree_path_up_until_good_node(trans, path, cmp);

	if (btree_path_node(path, level)) {
		struct btree_path_level *l = &path->l[level];

		BUG_ON(!btree_node_locked(path, level));
		/*
		 * We might have to skip over many keys, or just a few: try
		 * advancing the node iterator, and if we have to skip over too
		 * many keys just reinit it (or if we're rewinding, since that
		 * is expensive).
		 */
		if (cmp < 0 ||
		    !btree_path_advance_to_pos(path, l, 8))
			bch2_btree_node_iter_init(&l->iter, l->b, &path->pos);

		/*
		 * Iterators to interior nodes should always be pointed at the first non
		 * whiteout:
		 */
		if (unlikely(level))
			bch2_btree_node_iter_peek(&l->iter, l->b);
	}

	if (unlikely(level != path->level)) {
		btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
		__bch2_btree_path_unlock(trans, path);
	}
out:
	bch2_btree_path_verify(trans, path);
	return path;
}
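
/*
 * Note: cmp is the sign of the position change relative to the path's old
 * position; rewinding (cmp < 0) always reinitializes the node iterator,
 * since btree node iterators can only be advanced cheaply, not rewound.
 */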
/* Btree path: main interface: */

static struct btree_path *have_path_at_pos(struct btree_trans *trans, struct btree_path *path)
{
	struct btree_path *sib;

	sib = prev_btree_path(trans, path);
	if (sib && !btree_path_cmp(sib, path))
		return sib;

	sib = next_btree_path(trans, path);
	if (sib && !btree_path_cmp(sib, path))
		return sib;

	return NULL;
}

static struct btree_path *have_node_at_pos(struct btree_trans *trans, struct btree_path *path)
{
	struct btree_path *sib;

	sib = prev_btree_path(trans, path);
	if (sib && sib->level == path->level && path_l(sib)->b == path_l(path)->b)
		return sib;

	sib = next_btree_path(trans, path);
	if (sib && sib->level == path->level && path_l(sib)->b == path_l(path)->b)
		return sib;

	return NULL;
}

static inline void __bch2_path_free(struct btree_trans *trans, struct btree_path *path)
{
	__bch2_btree_path_unlock(trans, path);
	btree_path_list_remove(trans, path);
	trans->paths_allocated &= ~(1ULL << path->idx);
}
void bch2_path_put(struct btree_trans *trans, struct btree_path *path, bool intent)
{
	struct btree_path *dup;

	EBUG_ON(trans->paths + path->idx != path);
	EBUG_ON(!path->ref);

	if (!__btree_path_put(path, intent))
		return;

	dup = path->preserve
		? have_path_at_pos(trans, path)
		: have_node_at_pos(trans, path);

	if (!dup && !(!path->preserve && !is_btree_node(path, path->level)))
		return;

	if (path->should_be_locked &&
	    !trans->restarted &&
	    (!dup || !bch2_btree_path_relock_norestart(trans, dup, _THIS_IP_)))
		return;

	if (dup) {
		dup->preserve		|= path->preserve;
		dup->should_be_locked	|= path->should_be_locked;
	}

	__bch2_path_free(trans, path);
}

static void bch2_path_put_nokeep(struct btree_trans *trans, struct btree_path *path,
				 bool intent)
{
	EBUG_ON(trans->paths + path->idx != path);
	EBUG_ON(!path->ref);

	if (!__btree_path_put(path, intent))
		return;

	__bch2_path_free(trans, path);
}
void __noreturn bch2_trans_restart_error(struct btree_trans *trans, u32 restart_count)
{
	panic("trans->restart_count %u, should be %u, last restarted by %pS\n",
	      trans->restart_count, restart_count,
	      (void *) trans->last_begin_ip);
}

void __noreturn bch2_trans_in_restart_error(struct btree_trans *trans)
{
	panic("in transaction restart: %s, last restarted by %pS\n",
	      bch2_err_str(trans->restarted),
	      (void *) trans->last_restarted_ip);
}
noinline __cold
void bch2_trans_updates_to_text(struct printbuf *buf, struct btree_trans *trans)
{
	struct btree_insert_entry *i;
	struct btree_write_buffered_key *wb;

	prt_printf(buf, "transaction updates for %s journal seq %llu",
		   trans->fn, trans->journal_res.seq);
	prt_newline(buf);
	printbuf_indent_add(buf, 2);

	trans_for_each_update(trans, i) {
		struct bkey_s_c old = { &i->old_k, i->old_v };

		prt_printf(buf, "update: btree=%s cached=%u %pS",
			   bch2_btree_id_str(i->btree_id),
			   i->cached,
			   (void *) i->ip_allocated);
		prt_newline(buf);

		prt_printf(buf, "  old ");
		bch2_bkey_val_to_text(buf, trans->c, old);
		prt_newline(buf);

		prt_printf(buf, "  new ");
		bch2_bkey_val_to_text(buf, trans->c, bkey_i_to_s_c(i->k));
		prt_newline(buf);
	}

	trans_for_each_wb_update(trans, wb) {
		prt_printf(buf, "update: btree=%s wb=1 %pS",
			   bch2_btree_id_str(wb->btree),
			   (void *) i->ip_allocated);
		prt_newline(buf);

		prt_printf(buf, "  new ");
		bch2_bkey_val_to_text(buf, trans->c, bkey_i_to_s_c(&wb->k));
		prt_newline(buf);
	}

	printbuf_indent_sub(buf, 2);
}

noinline __cold
void bch2_dump_trans_updates(struct btree_trans *trans)
{
	struct printbuf buf = PRINTBUF;

	bch2_trans_updates_to_text(&buf, trans);
	bch2_print_string_as_lines(KERN_ERR, buf.buf);
	printbuf_exit(&buf);
}
noinline __cold
void bch2_btree_path_to_text(struct printbuf *out, struct btree_path *path)
{
	prt_printf(out, "path: idx %2u ref %u:%u %c %c btree=%s l=%u pos ",
		   path->idx, path->ref, path->intent_ref,
		   path->preserve ? 'P' : ' ',
		   path->should_be_locked ? 'S' : ' ',
		   bch2_btree_id_str(path->btree_id),
		   path->level);
	bch2_bpos_to_text(out, path->pos);

	prt_printf(out, " locks %u", path->nodes_locked);
#ifdef TRACK_PATH_ALLOCATED
	prt_printf(out, " %pS", (void *) path->ip_allocated);
#endif
	prt_newline(out);
}

static noinline __cold
void __bch2_trans_paths_to_text(struct printbuf *out, struct btree_trans *trans,
				bool nosort)
{
	struct btree_path *path;
	unsigned idx;

	if (!nosort)
		btree_trans_sort_paths(trans);

	trans_for_each_path_inorder(trans, path, idx)
		bch2_btree_path_to_text(out, path);
}

noinline __cold
void bch2_trans_paths_to_text(struct printbuf *out, struct btree_trans *trans)
{
	__bch2_trans_paths_to_text(out, trans, false);
}

static noinline __cold
void __bch2_dump_trans_paths_updates(struct btree_trans *trans, bool nosort)
{
	struct printbuf buf = PRINTBUF;

	__bch2_trans_paths_to_text(&buf, trans, nosort);
	bch2_trans_updates_to_text(&buf, trans);

	bch2_print_string_as_lines(KERN_ERR, buf.buf);
	printbuf_exit(&buf);
}

noinline __cold
void bch2_dump_trans_paths_updates(struct btree_trans *trans)
{
	__bch2_dump_trans_paths_updates(trans, false);
}
static void bch2_trans_update_max_paths(struct btree_trans *trans)
{
	struct btree_transaction_stats *s = btree_trans_stats(trans);
	struct printbuf buf = PRINTBUF;

	if (!s)
		return;

	bch2_trans_paths_to_text(&buf, trans);

	if (!buf.allocation_failure) {
		mutex_lock(&s->lock);
		if (s->nr_max_paths < hweight64(trans->paths_allocated)) {
			s->nr_max_paths = trans->nr_max_paths =
				hweight64(trans->paths_allocated);
			swap(s->max_paths_text, buf.buf);
		}
		mutex_unlock(&s->lock);
	}

	printbuf_exit(&buf);

	trans->nr_max_paths = hweight64(trans->paths_allocated);
}

static noinline void btree_path_overflow(struct btree_trans *trans)
{
	bch2_dump_trans_paths_updates(trans);
	panic("trans path overflow\n");
}
static inline struct btree_path *btree_path_alloc(struct btree_trans *trans,
						  struct btree_path *pos)
{
	struct btree_path *path;
	unsigned idx;

	if (unlikely(trans->paths_allocated ==
		     ~((~0ULL << 1) << (BTREE_ITER_MAX - 1))))
		btree_path_overflow(trans);

	idx = __ffs64(~trans->paths_allocated);

	/*
	 * Do this before marking the new path as allocated, since it won't be
	 * initialized yet:
	 */
	if (unlikely(idx > trans->nr_max_paths))
		bch2_trans_update_max_paths(trans);

	trans->paths_allocated |= 1ULL << idx;

	path = &trans->paths[idx];
	path->idx		= idx;
	path->ref		= 0;
	path->intent_ref	= 0;
	path->nodes_locked	= 0;

	btree_path_list_add(trans, pos, path);
	trans->paths_sorted = false;
	return path;
}
struct btree_path *bch2_path_get(struct btree_trans *trans,
				 enum btree_id btree_id, struct bpos pos,
				 unsigned locks_want, unsigned level,
				 unsigned flags, unsigned long ip)
{
	struct btree_path *path, *path_pos = NULL;
	bool cached = flags & BTREE_ITER_CACHED;
	bool intent = flags & BTREE_ITER_INTENT;
	int i;

	bch2_trans_verify_not_in_restart(trans);
	bch2_trans_verify_locks(trans);

	btree_trans_sort_paths(trans);

	trans_for_each_path_inorder(trans, path, i) {
		if (__btree_path_cmp(path,
				     btree_id,
				     cached,
				     pos,
				     level) > 0)
			break;

		path_pos = path;
	}

	if (path_pos &&
	    path_pos->cached	== cached &&
	    path_pos->btree_id	== btree_id &&
	    path_pos->level	== level) {
		__btree_path_get(path_pos, intent);
		path = bch2_btree_path_set_pos(trans, path_pos, pos, intent, ip);
	} else {
		path = btree_path_alloc(trans, path_pos);

		__btree_path_get(path, intent);
		path->pos			= pos;
		path->btree_id			= btree_id;
		path->cached			= cached;
		path->uptodate			= BTREE_ITER_NEED_TRAVERSE;
		path->should_be_locked		= false;
		path->level			= level;
		path->locks_want		= locks_want;
		path->nodes_locked		= 0;
		for (i = 0; i < ARRAY_SIZE(path->l); i++)
			path->l[i].b		= ERR_PTR(-BCH_ERR_no_btree_node_init);
#ifdef TRACK_PATH_ALLOCATED
		path->ip_allocated		= ip;
#endif
		trans->paths_sorted		= false;
	}

	if (!(flags & BTREE_ITER_NOPRESERVE))
		path->preserve = true;

	if (path->intent_ref)
		locks_want = max(locks_want, level + 1);

	/*
	 * If the path has locks_want greater than requested, we don't downgrade
	 * it here - on transaction restart because btree node split needs to
	 * upgrade locks, we might be putting/getting the iterator again.
	 * Downgrading iterators only happens via bch2_trans_downgrade(), after
	 * a successful transaction commit.
	 */

	locks_want = min(locks_want, BTREE_MAX_DEPTH);
	if (locks_want > path->locks_want)
		bch2_btree_path_upgrade_noupgrade_sibs(trans, path, locks_want, NULL);

	return path;
}
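
/*
 * Usage sketch (illustrative): bch2_trans_iter_init() is the typical
 * consumer - it uses bch2_path_get() to find or allocate a path for
 * (btree_id, pos). Paths are refcounted and shared, so two iterators at the
 * same position reuse one path.
 */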
struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *path, struct bkey *u)
{
	struct btree_path_level *l = path_l(path);
	struct bkey_packed *_k;
	struct bkey_s_c k;

	if (unlikely(!l->b))
		return bkey_s_c_null;

	EBUG_ON(path->uptodate != BTREE_ITER_UPTODATE);
	EBUG_ON(!btree_node_locked(path, path->level));

	if (!path->cached) {
		_k = bch2_btree_node_iter_peek_all(&l->iter, l->b);
		k = _k ? bkey_disassemble(l->b, _k, u) : bkey_s_c_null;

		EBUG_ON(k.k && bkey_deleted(k.k) && bpos_eq(k.k->p, path->pos));

		if (!k.k || !bpos_eq(path->pos, k.k->p))
			goto hole;
	} else {
		struct bkey_cached *ck = (void *) path->l[0].b;

		EBUG_ON(ck &&
			(path->btree_id != ck->key.btree_id ||
			 !bkey_eq(path->pos, ck->key.pos)));
		if (!ck || !ck->valid)
			return bkey_s_c_null;

		*u = ck->k->k;
		k = bkey_i_to_s_c(ck->k);
	}

	return k;
hole:
	bkey_init(u);
	u->p = path->pos;
	return (struct bkey_s_c) { u, NULL };
}
/* Btree iterators: */

int __must_check
__bch2_btree_iter_traverse(struct btree_iter *iter)
{
	return bch2_btree_path_traverse(iter->trans, iter->path, iter->flags);
}

int __must_check
bch2_btree_iter_traverse(struct btree_iter *iter)
{
	int ret;

	iter->path = bch2_btree_path_set_pos(iter->trans, iter->path,
					btree_iter_search_key(iter),
					iter->flags & BTREE_ITER_INTENT,
					btree_iter_ip_allocated(iter));

	ret = bch2_btree_path_traverse(iter->trans, iter->path, iter->flags);
	if (ret)
		return ret;

	btree_path_set_should_be_locked(iter->path);
	return 0;
}
/* Iterate across nodes (leaf and interior nodes) */

struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter)
{
	struct btree_trans *trans = iter->trans;
	struct btree *b = NULL;
	int ret;

	EBUG_ON(iter->path->cached);
	bch2_btree_iter_verify(iter);

	ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
	if (ret)
		goto err;

	b = btree_path_node(iter->path, iter->path->level);
	if (!b)
		goto out;

	BUG_ON(bpos_lt(b->key.k.p, iter->pos));

	bkey_init(&iter->k);
	iter->k.p = iter->pos = b->key.k.p;

	iter->path = bch2_btree_path_set_pos(trans, iter->path, b->key.k.p,
					iter->flags & BTREE_ITER_INTENT,
					btree_iter_ip_allocated(iter));
	btree_path_set_should_be_locked(iter->path);
out:
	bch2_btree_iter_verify_entry_exit(iter);
	bch2_btree_iter_verify(iter);

	return b;
err:
	b = ERR_PTR(ret);
	goto out;
}

struct btree *bch2_btree_iter_peek_node_and_restart(struct btree_iter *iter)
{
	struct btree *b;

	while (b = bch2_btree_iter_peek_node(iter),
	       bch2_err_matches(PTR_ERR_OR_ZERO(b), BCH_ERR_transaction_restart))
		bch2_trans_begin(iter->trans);

	return b;
}
struct btree *bch2_btree_iter_next_node(struct btree_iter *iter)
{
	struct btree_trans *trans = iter->trans;
	struct btree_path *path = iter->path;
	struct btree *b = NULL;
	int ret;

	bch2_trans_verify_not_in_restart(trans);
	EBUG_ON(iter->path->cached);
	bch2_btree_iter_verify(iter);

	/* already at end? */
	if (!btree_path_node(path, path->level))
		return NULL;

	/* got to end? */
	if (!btree_path_node(path, path->level + 1)) {
		btree_path_set_level_up(trans, path);
		return NULL;
	}

	if (!bch2_btree_node_relock(trans, path, path->level + 1)) {
		__bch2_btree_path_unlock(trans, path);
		path->l[path->level].b		= ERR_PTR(-BCH_ERR_no_btree_node_relock);
		path->l[path->level + 1].b	= ERR_PTR(-BCH_ERR_no_btree_node_relock);
		btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
		trace_and_count(trans->c, trans_restart_relock_next_node, trans, _THIS_IP_, path);
		ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_relock);
		goto err;
	}

	b = btree_path_node(path, path->level + 1);

	if (bpos_eq(iter->pos, b->key.k.p)) {
		__btree_path_set_level_up(trans, path, path->level++);
	} else {
		/*
		 * Haven't gotten to the end of the parent node: go back down to
		 * the next child node
		 */
		path = iter->path =
			bch2_btree_path_set_pos(trans, path, bpos_successor(iter->pos),
					   iter->flags & BTREE_ITER_INTENT,
					   btree_iter_ip_allocated(iter));

		btree_path_set_level_down(trans, path, iter->min_depth);

		ret = bch2_btree_path_traverse(trans, path, iter->flags);
		if (ret)
			goto err;

		b = path->l[path->level].b;
	}

	bkey_init(&iter->k);
	iter->k.p = iter->pos = b->key.k.p;

	iter->path = bch2_btree_path_set_pos(trans, iter->path, b->key.k.p,
					iter->flags & BTREE_ITER_INTENT,
					btree_iter_ip_allocated(iter));
	btree_path_set_should_be_locked(iter->path);
	BUG_ON(iter->path->uptodate);
out:
	bch2_btree_iter_verify_entry_exit(iter);
	bch2_btree_iter_verify(iter);

	return b;
err:
	b = ERR_PTR(ret);
	goto out;
}
/* Iterate across keys (in leaf nodes only) */

inline bool bch2_btree_iter_advance(struct btree_iter *iter)
{
	if (likely(!(iter->flags & BTREE_ITER_ALL_LEVELS))) {
		struct bpos pos = iter->k.p;
		bool ret = !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS
			     ? bpos_eq(pos, SPOS_MAX)
			     : bkey_eq(pos, SPOS_MAX));

		if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS))
			pos = bkey_successor(iter, pos);
		bch2_btree_iter_set_pos(iter, pos);
		return ret;
	} else {
		if (!btree_path_node(iter->path, iter->path->level))
			return true;

		iter->advanced = true;
		return false;
	}
}

inline bool bch2_btree_iter_rewind(struct btree_iter *iter)
{
	struct bpos pos = bkey_start_pos(&iter->k);
	bool ret = !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS
		     ? bpos_eq(pos, POS_MIN)
		     : bkey_eq(pos, POS_MIN));

	if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS))
		pos = bkey_predecessor(iter, pos);
	bch2_btree_iter_set_pos(iter, pos);
	return ret;
}
static noinline
struct bkey_i *__bch2_btree_trans_peek_updates(struct btree_iter *iter)
{
	struct btree_insert_entry *i;
	struct bkey_i *ret = NULL;

	trans_for_each_update(iter->trans, i) {
		if (i->btree_id < iter->btree_id)
			continue;
		if (i->btree_id > iter->btree_id)
			break;
		if (bpos_lt(i->k->k.p, iter->path->pos))
			continue;
		if (i->key_cache_already_flushed)
			continue;
		if (!ret || bpos_lt(i->k->k.p, ret->k.p))
			ret = i->k;
	}

	return ret;
}

static inline struct bkey_i *btree_trans_peek_updates(struct btree_iter *iter)
{
	return iter->flags & BTREE_ITER_WITH_UPDATES
		? __bch2_btree_trans_peek_updates(iter)
		: NULL;
}
*bch2_btree_journal_peek(struct btree_trans
*trans
,
1864 struct btree_iter
*iter
,
1865 struct bpos end_pos
)
1869 if (bpos_lt(iter
->path
->pos
, iter
->journal_pos
))
1870 iter
->journal_idx
= 0;
1872 k
= bch2_journal_keys_peek_upto(trans
->c
, iter
->btree_id
,
1876 &iter
->journal_idx
);
1878 iter
->journal_pos
= k
? k
->k
.p
: end_pos
;
1883 struct bkey_s_c
btree_trans_peek_slot_journal(struct btree_trans
*trans
,
1884 struct btree_iter
*iter
)
1886 struct bkey_i
*k
= bch2_btree_journal_peek(trans
, iter
, iter
->path
->pos
);
1890 return bkey_i_to_s_c(k
);
1892 return bkey_s_c_null
;
1897 struct bkey_s_c
btree_trans_peek_journal(struct btree_trans
*trans
,
1898 struct btree_iter
*iter
,
1901 struct bkey_i
*next_journal
=
1902 bch2_btree_journal_peek(trans
, iter
,
1903 k
.k
? k
.k
->p
: path_l(iter
->path
)->b
->key
.k
.p
);
1906 iter
->k
= next_journal
->k
;
1907 k
= bkey_i_to_s_c(next_journal
);
/*
 * Checks btree key cache for key at iter->pos and returns it if present, or
 * bkey_s_c_null:
 */
static noinline
struct bkey_s_c btree_trans_peek_key_cache(struct btree_iter *iter, struct bpos pos)
{
	struct btree_trans *trans = iter->trans;
	struct bch_fs *c = trans->c;
	struct bkey u;
	struct bkey_s_c k;
	int ret;

	if ((iter->flags & BTREE_ITER_KEY_CACHE_FILL) &&
	    bpos_eq(iter->pos, pos))
		return bkey_s_c_null;

	if (!bch2_btree_key_cache_find(c, iter->btree_id, pos))
		return bkey_s_c_null;

	if (!iter->key_cache_path)
		iter->key_cache_path = bch2_path_get(trans, iter->btree_id, pos,
						     iter->flags & BTREE_ITER_INTENT, 0,
						     iter->flags|BTREE_ITER_CACHED|
						     BTREE_ITER_CACHED_NOFILL,
						     _THIS_IP_);

	iter->key_cache_path = bch2_btree_path_set_pos(trans, iter->key_cache_path, pos,
					iter->flags & BTREE_ITER_INTENT,
					btree_iter_ip_allocated(iter));

	ret =   bch2_btree_path_traverse(trans, iter->key_cache_path,
					 iter->flags|BTREE_ITER_CACHED) ?:
		bch2_btree_path_relock(trans, iter->path, _THIS_IP_);
	if (unlikely(ret))
		return bkey_s_c_err(ret);

	btree_path_set_should_be_locked(iter->key_cache_path);

	k = bch2_btree_path_peek_slot(iter->key_cache_path, &u);
	if (k.k && !bkey_err(k)) {
		iter->k = u;
		k.k = &iter->k;
	}
	return k;
}
static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bpos search_key)
{
	struct btree_trans *trans = iter->trans;
	struct bkey_i *next_update;
	struct bkey_s_c k, k2;
	int ret;

	EBUG_ON(iter->path->cached);
	bch2_btree_iter_verify(iter);

	while (1) {
		struct btree_path_level *l;

		iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key,
					iter->flags & BTREE_ITER_INTENT,
					btree_iter_ip_allocated(iter));

		ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
		if (unlikely(ret)) {
			/* ensure that iter->k is consistent with iter->pos: */
			bch2_btree_iter_set_pos(iter, iter->pos);
			k = bkey_s_c_err(ret);
			goto out;
		}

		l = path_l(iter->path);

		if (unlikely(!l->b)) {
			/* No btree nodes at requested level: */
			bch2_btree_iter_set_pos(iter, SPOS_MAX);
			k = bkey_s_c_null;
			goto out;
		}

		btree_path_set_should_be_locked(iter->path);

		k = btree_path_level_peek_all(trans->c, l, &iter->k);

		if (unlikely(iter->flags & BTREE_ITER_WITH_KEY_CACHE) &&
		    k.k &&
		    (k2 = btree_trans_peek_key_cache(iter, k.k->p)).k) {
			k = k2;
			ret = bkey_err(k);
			if (ret) {
				bch2_btree_iter_set_pos(iter, iter->pos);
				goto out;
			}
		}

		if (unlikely(iter->flags & BTREE_ITER_WITH_JOURNAL))
			k = btree_trans_peek_journal(trans, iter, k);

		next_update = btree_trans_peek_updates(iter);

		if (next_update &&
		    bpos_le(next_update->k.p,
			    k.k ? k.k->p : l->b->key.k.p)) {
			iter->k = next_update->k;
			k = bkey_i_to_s_c(next_update);
		}

		if (k.k && bkey_deleted(k.k)) {
			/*
			 * If we've got a whiteout, and it's after the search
			 * key, advance the search key to the whiteout instead
			 * of just after the whiteout - it might be a btree
			 * whiteout, with a real key at the same position, since
			 * in the btree deleted keys sort before non deleted.
			 */
			search_key = !bpos_eq(search_key, k.k->p)
				? k.k->p
				: bpos_successor(k.k->p);
			continue;
		}

		if (likely(k.k)) {
			break;
		} else if (likely(!bpos_eq(l->b->key.k.p, SPOS_MAX))) {
			/* Advance to next leaf node: */
			search_key = bpos_successor(l->b->key.k.p);
		} else {
			/* End of btree: */
			bch2_btree_iter_set_pos(iter, SPOS_MAX);
			k = bkey_s_c_null;
			goto out;
		}
	}
out:
	bch2_btree_iter_verify(iter);

	return k;
}
/**
 * bch2_btree_iter_peek_upto() - returns first key greater than or equal to
 * iterator's current position
 * @iter:	iterator to peek from
 * @end:	search limit: returns keys less than or equal to @end
 *
 * Returns:	key if found, or an error extractable with bkey_err().
 */
struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos end)
{
	struct btree_trans *trans = iter->trans;
	struct bpos search_key = btree_iter_search_key(iter);
	struct bkey_s_c k;
	struct bpos iter_pos;
	int ret;

	EBUG_ON(iter->flags & BTREE_ITER_ALL_LEVELS);
	EBUG_ON((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) && bkey_eq(end, POS_MAX));

	if (iter->update_path) {
		bch2_path_put_nokeep(trans, iter->update_path,
				     iter->flags & BTREE_ITER_INTENT);
		iter->update_path = NULL;
	}

	bch2_btree_iter_verify_entry_exit(iter);

	while (1) {
		k = __bch2_btree_iter_peek(iter, search_key);
		if (unlikely(!k.k))
			goto end;
		if (unlikely(bkey_err(k)))
			goto out_no_locked;

		/*
		 * iter->pos should be mononotically increasing, and always be
		 * equal to the key we just returned - except extents can
		 * straddle iter->pos:
		 */
		if (!(iter->flags & BTREE_ITER_IS_EXTENTS))
			iter_pos = k.k->p;
		else
			iter_pos = bkey_max(iter->pos, bkey_start_pos(k.k));

		if (unlikely(!(iter->flags & BTREE_ITER_IS_EXTENTS)
			     ? bkey_gt(iter_pos, end)
			     : bkey_ge(iter_pos, end)))
			goto end;

		if (iter->update_path &&
		    !bkey_eq(iter->update_path->pos, k.k->p)) {
			bch2_path_put_nokeep(trans, iter->update_path,
					     iter->flags & BTREE_ITER_INTENT);
			iter->update_path = NULL;
		}

		if ((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) &&
		    (iter->flags & BTREE_ITER_INTENT) &&
		    !(iter->flags & BTREE_ITER_IS_EXTENTS) &&
		    !iter->update_path) {
			struct bpos pos = k.k->p;

			if (pos.snapshot < iter->snapshot) {
				search_key = bpos_successor(k.k->p);
				continue;
			}

			pos.snapshot = iter->snapshot;

			/*
			 * advance, same as on exit for iter->path, but only up
			 * to snapshot
			 */
			__btree_path_get(iter->path, iter->flags & BTREE_ITER_INTENT);
			iter->update_path = iter->path;

			iter->update_path = bch2_btree_path_set_pos(trans,
						iter->update_path, pos,
						iter->flags & BTREE_ITER_INTENT,
						_THIS_IP_);
			ret = bch2_btree_path_traverse(trans, iter->update_path, iter->flags);
			if (unlikely(ret)) {
				k = bkey_s_c_err(ret);
				goto out_no_locked;
			}
		}

		/*
		 * We can never have a key in a leaf node at POS_MAX, so
		 * we don't have to check these successor() calls:
		 */
		if ((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) &&
		    !bch2_snapshot_is_ancestor(trans->c,
					       iter->snapshot,
					       k.k->p.snapshot)) {
			search_key = bpos_successor(k.k->p);
			continue;
		}

		if (bkey_whiteout(k.k) &&
		    !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)) {
			search_key = bkey_successor(iter, k.k->p);
			continue;
		}

		break;
	}

	iter->pos = iter_pos;

	iter->path = bch2_btree_path_set_pos(trans, iter->path, k.k->p,
				iter->flags & BTREE_ITER_INTENT,
				btree_iter_ip_allocated(iter));

	btree_path_set_should_be_locked(iter->path);
out_no_locked:
	if (iter->update_path) {
		ret = bch2_btree_path_relock(trans, iter->update_path, _THIS_IP_);
		if (unlikely(ret))
			k = bkey_s_c_err(ret);
		else
			btree_path_set_should_be_locked(iter->update_path);
	}

	if (!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS))
		iter->pos.snapshot = iter->snapshot;

	ret = bch2_btree_iter_verify_ret(iter, k);
	if (unlikely(ret)) {
		bch2_btree_iter_set_pos(iter, iter->pos);
		k = bkey_s_c_err(ret);
	}

	bch2_btree_iter_verify_entry_exit(iter);

	return k;
end:
	bch2_btree_iter_set_pos(iter, end);
	k = bkey_s_c_null;
	goto out_no_locked;
}
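
/*
 * Usage sketch (illustrative; inum is a hypothetical inode number): typical
 * forward iteration built on this function via the for_each_btree_key_upto()
 * macro:
 *
 *	for_each_btree_key_upto(trans, iter, BTREE_ID_extents,
 *				POS(inum, 0), POS(inum, U64_MAX),
 *				0, k, ret) {
 *		// process k
 *	}
 */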
/**
 * bch2_btree_iter_peek_all_levels() - returns the first key greater than or
 * equal to iterator's current position, returning keys from every level of the
 * btree. For keys at different levels of the btree that compare equal, the key
 * from the lower level (leaf) is returned first.
 * @iter:	iterator to peek from
 *
 * Returns:	key if found, or an error extractable with bkey_err().
 */
struct bkey_s_c bch2_btree_iter_peek_all_levels(struct btree_iter *iter)
{
	struct btree_trans *trans = iter->trans;
	struct bkey_s_c k;
	int ret;

	EBUG_ON(iter->path->cached);
	bch2_btree_iter_verify(iter);
	BUG_ON(iter->path->level < iter->min_depth);
	BUG_ON(!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS));
	EBUG_ON(!(iter->flags & BTREE_ITER_ALL_LEVELS));

	while (1) {
		iter->path = bch2_btree_path_set_pos(trans, iter->path, iter->pos,
					iter->flags & BTREE_ITER_INTENT,
					btree_iter_ip_allocated(iter));

		ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
		if (unlikely(ret)) {
			/* ensure that iter->k is consistent with iter->pos: */
			bch2_btree_iter_set_pos(iter, iter->pos);
			k = bkey_s_c_err(ret);
			goto out_no_locked;
		}

		/* Already at end? */
		if (!btree_path_node(iter->path, iter->path->level)) {
			k = bkey_s_c_null;
			goto out_no_locked;
		}

		k = btree_path_level_peek_all(trans->c,
				&iter->path->l[iter->path->level], &iter->k);

		/* Check if we should go up to the parent node: */
		if (!k.k ||
		    (iter->advanced &&
		     bpos_eq(path_l(iter->path)->b->key.k.p, iter->pos))) {
			iter->pos = path_l(iter->path)->b->key.k.p;
			btree_path_set_level_up(trans, iter->path);
			iter->advanced = false;
			continue;
		}

		/*
		 * Check if we should go back down to a leaf:
		 * If we're not in a leaf node, we only return the current key
		 * if it exactly matches iter->pos - otherwise we first have to
		 * go back to the leaf:
		 */
		if (iter->path->level != iter->min_depth &&
		    (iter->advanced ||
		     !k.k ||
		     !bpos_eq(iter->pos, k.k->p))) {
			btree_path_set_level_down(trans, iter->path, iter->min_depth);
			iter->pos = bpos_successor(iter->pos);
			iter->advanced = false;
			continue;
		}

		/* Check if we should go to the next key: */
		if (iter->path->level == iter->min_depth &&
		    iter->advanced &&
		    k.k &&
		    bpos_eq(iter->pos, k.k->p)) {
			iter->pos = bpos_successor(iter->pos);
			iter->advanced = false;
			continue;
		}

		if (iter->advanced &&
		    iter->path->level == iter->min_depth &&
		    !bpos_eq(k.k->p, iter->pos))
			iter->advanced = false;

		BUG_ON(iter->advanced);
		BUG_ON(!k.k);
		break;
	}

	iter->pos = k.k->p;
	btree_path_set_should_be_locked(iter->path);
out_no_locked:
	bch2_btree_iter_verify(iter);

	return k;
}
/**
 * bch2_btree_iter_next() - returns first key greater than iterator's current
 * position
 * @iter:	iterator to peek from
 *
 * Returns:	key if found, or an error extractable with bkey_err().
 */
struct bkey_s_c bch2_btree_iter_next(struct btree_iter *iter)
{
	if (!bch2_btree_iter_advance(iter))
		return bkey_s_c_null;

	return bch2_btree_iter_peek(iter);
}
/**
 * bch2_btree_iter_peek_prev() - returns first key less than or equal to
 * iterator's current position
 * @iter:	iterator to peek from
 *
 * Returns:	key if found, or an error extractable with bkey_err().
 */
struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter)
{
	struct btree_trans *trans = iter->trans;
	struct bpos search_key = iter->pos;
	struct btree_path *saved_path = NULL;
	struct bkey_s_c k;
	struct bkey saved_k;
	const struct bch_val *saved_v;
	int ret;

	EBUG_ON(iter->path->cached || iter->path->level);
	EBUG_ON(iter->flags & BTREE_ITER_WITH_UPDATES);

	if (iter->flags & BTREE_ITER_WITH_JOURNAL)
		return bkey_s_c_err(-EIO);

	bch2_btree_iter_verify(iter);
	bch2_btree_iter_verify_entry_exit(iter);

	if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)
		search_key.snapshot = U32_MAX;

	while (1) {
		iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key,
						iter->flags & BTREE_ITER_INTENT,
						btree_iter_ip_allocated(iter));

		ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
		if (unlikely(ret)) {
			/* ensure that iter->k is consistent with iter->pos: */
			bch2_btree_iter_set_pos(iter, iter->pos);
			k = bkey_s_c_err(ret);
			goto out_no_locked;
		}

		k = btree_path_level_peek(trans, iter->path,
					  &iter->path->l[0], &iter->k);
		if (!k.k ||
		    ((iter->flags & BTREE_ITER_IS_EXTENTS)
		     ? bpos_ge(bkey_start_pos(k.k), search_key)
		     : bpos_gt(k.k->p, search_key)))
			k = btree_path_level_prev(trans, iter->path,
						  &iter->path->l[0], &iter->k);

		if (likely(k.k)) {
			if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) {
				if (k.k->p.snapshot == iter->snapshot)
					goto got_key;

				/*
				 * If we have a saved candidate, and we're no
				 * longer at the same _key_ (not pos), return
				 * that candidate:
				 */
				if (saved_path && !bkey_eq(k.k->p, saved_k.p)) {
					bch2_path_put_nokeep(trans, iter->path,
						      iter->flags & BTREE_ITER_INTENT);
					iter->path = saved_path;
					saved_path = NULL;
					iter->k	= saved_k;
					k.v	= saved_v;
					goto got_key;
				}

				if (bch2_snapshot_is_ancestor(iter->trans->c,
							      iter->snapshot,
							      k.k->p.snapshot)) {
					if (saved_path)
						bch2_path_put_nokeep(trans, saved_path,
						      iter->flags & BTREE_ITER_INTENT);
					saved_path = btree_path_clone(trans, iter->path,
								iter->flags & BTREE_ITER_INTENT);
					saved_k = *k.k;
					saved_v = k.v;
				}

				search_key = bpos_predecessor(k.k->p);
				continue;
			}
got_key:
			if (bkey_whiteout(k.k) &&
			    !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)) {
				search_key = bkey_predecessor(iter, k.k->p);
				if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)
					search_key.snapshot = U32_MAX;
				continue;
			}

			break;
		} else if (likely(!bpos_eq(iter->path->l[0].b->data->min_key, POS_MIN))) {
			/* Advance to previous leaf node: */
			search_key = bpos_predecessor(iter->path->l[0].b->data->min_key);
		} else {
			/* Start of btree: */
			bch2_btree_iter_set_pos(iter, POS_MIN);
			k = bkey_s_c_null;
			goto out_no_locked;
		}
	}

	EBUG_ON(bkey_gt(bkey_start_pos(k.k), iter->pos));

	/* Extents can straddle iter->pos: */
	if (bkey_lt(k.k->p, iter->pos))
		iter->pos = k.k->p;

	if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)
		iter->pos.snapshot = iter->snapshot;

	btree_path_set_should_be_locked(iter->path);
out_no_locked:
	if (saved_path)
		bch2_path_put_nokeep(trans, saved_path, iter->flags & BTREE_ITER_INTENT);

	bch2_btree_iter_verify_entry_exit(iter);
	bch2_btree_iter_verify(iter);

	return k;
}
/**
 * bch2_btree_iter_prev() - returns first key less than iterator's current
 * position
 * @iter:	iterator to peek from
 *
 * Returns:	key if found, or an error extractable with bkey_err().
 */
struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *iter)
{
	if (!bch2_btree_iter_rewind(iter))
		return bkey_s_c_null;

	return bch2_btree_iter_peek_prev(iter);
}
struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
{
	struct btree_trans *trans = iter->trans;
	struct bpos search_key;
	struct bkey_s_c k;
	int ret;

	bch2_btree_iter_verify(iter);
	bch2_btree_iter_verify_entry_exit(iter);
	EBUG_ON(iter->flags & BTREE_ITER_ALL_LEVELS);
	EBUG_ON(iter->path->level && (iter->flags & BTREE_ITER_WITH_KEY_CACHE));

	/* extents can't span inode numbers: */
	if ((iter->flags & BTREE_ITER_IS_EXTENTS) &&
	    unlikely(iter->pos.offset == KEY_OFFSET_MAX)) {
		if (iter->pos.inode == KEY_INODE_MAX)
			return bkey_s_c_null;

		bch2_btree_iter_set_pos(iter, bpos_nosnap_successor(iter->pos));
	}

	search_key = btree_iter_search_key(iter);
	iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key,
					iter->flags & BTREE_ITER_INTENT,
					btree_iter_ip_allocated(iter));

	ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
	if (unlikely(ret)) {
		k = bkey_s_c_err(ret);
		goto out_no_locked;
	}

	if ((iter->flags & BTREE_ITER_CACHED) ||
	    !(iter->flags & (BTREE_ITER_IS_EXTENTS|BTREE_ITER_FILTER_SNAPSHOTS))) {
		struct bkey_i *next_update;

		if ((next_update = btree_trans_peek_updates(iter)) &&
		    bpos_eq(next_update->k.p, iter->pos)) {
			iter->k = next_update->k;
			k = bkey_i_to_s_c(next_update);
			goto out;
		}

		if (unlikely(iter->flags & BTREE_ITER_WITH_JOURNAL) &&
		    (k = btree_trans_peek_slot_journal(trans, iter)).k)
			goto out;

		if (unlikely(iter->flags & BTREE_ITER_WITH_KEY_CACHE) &&
		    (k = btree_trans_peek_key_cache(iter, iter->pos)).k) {
			if (!bkey_err(k))
				iter->k = *k.k;
			/* We're not returning a key from iter->path: */
			goto out_no_locked;
		}

		k = bch2_btree_path_peek_slot(iter->path, &iter->k);
		if (unlikely(!k.k))
			goto out_no_locked;
	} else {
		struct bpos next;
		struct bpos end = iter->pos;

		if (iter->flags & BTREE_ITER_IS_EXTENTS)
			end.offset = U64_MAX;

		EBUG_ON(iter->path->level);

		if (iter->flags & BTREE_ITER_INTENT) {
			struct btree_iter iter2;

			bch2_trans_copy_iter(&iter2, iter);
			k = bch2_btree_iter_peek_upto(&iter2, end);

			if (k.k && !bkey_err(k)) {
				iter->k = iter2.k;
				k.k = &iter->k;
			}
			bch2_trans_iter_exit(trans, &iter2);
		} else {
			struct bpos pos = iter->pos;

			k = bch2_btree_iter_peek_upto(iter, end);
			if (unlikely(bkey_err(k)))
				bch2_btree_iter_set_pos(iter, pos);
			else
				iter->pos = pos;
		}

		if (unlikely(bkey_err(k)))
			goto out_no_locked;

		next = k.k ? bkey_start_pos(k.k) : POS_MAX;

		if (bkey_lt(iter->pos, next)) {
			bkey_init(&iter->k);
			iter->k.p = iter->pos;

			if (iter->flags & BTREE_ITER_IS_EXTENTS) {
				bch2_key_resize(&iter->k,
						min_t(u64, KEY_SIZE_MAX,
						      (next.inode == iter->pos.inode
						       ? next.offset
						       : KEY_OFFSET_MAX) -
						      iter->pos.offset));
				EBUG_ON(!iter->k.size);
			}

			k = (struct bkey_s_c) { &iter->k, NULL };
		}
	}
out:
	btree_path_set_should_be_locked(iter->path);
out_no_locked:
	bch2_btree_iter_verify_entry_exit(iter);
	bch2_btree_iter_verify(iter);
	ret = bch2_btree_iter_verify_ret(iter, k);
	if (unlikely(ret))
		return bkey_s_c_err(ret);

	return k;
}
struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *iter)
{
	if (!bch2_btree_iter_advance(iter))
		return bkey_s_c_null;

	return bch2_btree_iter_peek_slot(iter);
}

struct bkey_s_c bch2_btree_iter_prev_slot(struct btree_iter *iter)
{
	if (!bch2_btree_iter_rewind(iter))
		return bkey_s_c_null;

	return bch2_btree_iter_peek_slot(iter);
}
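
/*
 * Editor's sketch (not upstream code): a point lookup through the slot
 * interface. Unlike peek, peek_slot also returns a key when nothing is
 * stored at the position (a synthesized deleted/hole key), so for
 * non-extent iterators k.k->p equals the iterator position:
 *
 *	bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes, pos, 0);
 *	k = bch2_btree_iter_peek_slot(&iter);
 *	ret = bkey_err(k);
 *	if (!ret)
 *		... use k ...
 *	bch2_trans_iter_exit(trans, &iter);
 */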
struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_iter *iter)
{
	struct bkey_s_c k;

	while (btree_trans_too_many_iters(iter->trans) ||
	       (k = bch2_btree_iter_peek_type(iter, iter->flags),
		bch2_err_matches(bkey_err(k), BCH_ERR_transaction_restart)))
		bch2_trans_begin(iter->trans);

	return k;
}
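
/*
 * Editor's sketch (not upstream code): the same begin/retry idiom written
 * out by a caller; the lockrestart_do() macro packages this pattern:
 *
 *	do {
 *		bch2_trans_begin(trans);
 *		k = bch2_btree_iter_peek(&iter);
 *		ret = bkey_err(k);
 *	} while (bch2_err_matches(ret, BCH_ERR_transaction_restart));
 */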
/* new transactional stuff: */

#ifdef CONFIG_BCACHEFS_DEBUG
static void btree_trans_verify_sorted_refs(struct btree_trans *trans)
{
	struct btree_path *path;
	unsigned i;

	BUG_ON(trans->nr_sorted != hweight64(trans->paths_allocated));

	trans_for_each_path(trans, path) {
		BUG_ON(path->sorted_idx >= trans->nr_sorted);
		BUG_ON(trans->sorted[path->sorted_idx] != path->idx);
	}

	for (i = 0; i < trans->nr_sorted; i++) {
		unsigned idx = trans->sorted[i];

		EBUG_ON(!(trans->paths_allocated & (1ULL << idx)));
		BUG_ON(trans->paths[idx].sorted_idx != i);
	}
}

static void btree_trans_verify_sorted(struct btree_trans *trans)
{
	struct btree_path *path, *prev = NULL;
	unsigned i;

	if (!bch2_debug_check_iterators)
		return;

	trans_for_each_path_inorder(trans, path, i) {
		if (prev && btree_path_cmp(prev, path) > 0) {
			__bch2_dump_trans_paths_updates(trans, true);
			panic("trans paths out of order!\n");
		}
		prev = path;
	}
}
#else
static inline void btree_trans_verify_sorted_refs(struct btree_trans *trans) {}
static inline void btree_trans_verify_sorted(struct btree_trans *trans) {}
#endif
void __bch2_btree_trans_sort_paths(struct btree_trans *trans)
{
	int i, l = 0, r = trans->nr_sorted, inc = 1;
	bool swapped;

	btree_trans_verify_sorted_refs(trans);

	if (trans->paths_sorted)
		goto out;

	/*
	 * Cocktail shaker sort: this is efficient because iterators will be
	 * mostly sorted.
	 */
	do {
		swapped = false;

		for (i = inc > 0 ? l : r - 2;
		     i + 1 < r && i >= l;
		     i += inc) {
			if (btree_path_cmp(trans->paths + trans->sorted[i],
					   trans->paths + trans->sorted[i + 1]) > 0) {
				swap(trans->sorted[i], trans->sorted[i + 1]);
				trans->paths[trans->sorted[i]].sorted_idx = i;
				trans->paths[trans->sorted[i + 1]].sorted_idx = i + 1;
				swapped = true;
			}
		}

		if (inc > 0)
			--r;
		else
			++l;
		inc = -inc;
	} while (swapped);

	trans->paths_sorted = true;
out:
	btree_trans_verify_sorted(trans);
}
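
/*
 * Editor's note: for reference, the same cocktail shaker (bidirectional
 * bubble) sort on a plain int array - an illustrative sketch, not upstream
 * code. Alternating the scan direction and shrinking the unsorted window
 * from both ends makes nearly-sorted input, the common case for
 * trans->sorted, close to O(n):
 *
 *	bool swapped;
 *	int i, l = 0, r = n, inc = 1;
 *
 *	do {
 *		swapped = false;
 *		for (i = inc > 0 ? l : r - 2; i + 1 < r && i >= l; i += inc)
 *			if (a[i] > a[i + 1]) {
 *				swap(a[i], a[i + 1]);
 *				swapped = true;
 *			}
 *		if (inc > 0)
 *			--r;
 *		else
 *			++l;
 *		inc = -inc;
 *	} while (swapped);
 */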
static inline void btree_path_list_remove(struct btree_trans *trans,
					  struct btree_path *path)
{
	unsigned i;

	EBUG_ON(path->sorted_idx >= trans->nr_sorted);
#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
	trans->nr_sorted--;
	memmove_u64s_down_small(trans->sorted + path->sorted_idx,
				trans->sorted + path->sorted_idx + 1,
				DIV_ROUND_UP(trans->nr_sorted - path->sorted_idx, 8));
#else
	array_remove_item(trans->sorted, trans->nr_sorted, path->sorted_idx);
#endif
	for (i = path->sorted_idx; i < trans->nr_sorted; i++)
		trans->paths[trans->sorted[i]].sorted_idx = i;

	path->sorted_idx = U8_MAX;
}
static inline void btree_path_list_add(struct btree_trans *trans,
				       struct btree_path *pos,
				       struct btree_path *path)
{
	unsigned i;

	path->sorted_idx = pos ? pos->sorted_idx + 1 : trans->nr_sorted;

#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
	memmove_u64s_up_small(trans->sorted + path->sorted_idx + 1,
			      trans->sorted + path->sorted_idx,
			      DIV_ROUND_UP(trans->nr_sorted - path->sorted_idx, 8));
	trans->nr_sorted++;
	trans->sorted[path->sorted_idx] = path->idx;
#else
	array_insert_item(trans->sorted, trans->nr_sorted, path->sorted_idx, path->idx);
#endif

	for (i = path->sorted_idx; i < trans->nr_sorted; i++)
		trans->paths[trans->sorted[i]].sorted_idx = i;

	btree_trans_verify_sorted_refs(trans);
}
void bch2_trans_iter_exit(struct btree_trans *trans, struct btree_iter *iter)
{
	if (iter->update_path)
		bch2_path_put_nokeep(trans, iter->update_path,
			      iter->flags & BTREE_ITER_INTENT);
	if (iter->path)
		bch2_path_put(trans, iter->path,
			      iter->flags & BTREE_ITER_INTENT);
	if (iter->key_cache_path)
		bch2_path_put(trans, iter->key_cache_path,
			      iter->flags & BTREE_ITER_INTENT);
	iter->path = NULL;
	iter->update_path = NULL;
	iter->key_cache_path = NULL;
}
void bch2_trans_iter_init_outlined(struct btree_trans *trans,
			  struct btree_iter *iter,
			  enum btree_id btree_id, struct bpos pos,
			  unsigned flags)
{
	bch2_trans_iter_init_common(trans, iter, btree_id, pos, 0, 0,
			       bch2_btree_iter_flags(trans, btree_id, flags),
			       _RET_IP_);
}
void bch2_trans_node_iter_init(struct btree_trans *trans,
			       struct btree_iter *iter,
			       enum btree_id btree_id,
			       struct bpos pos,
			       unsigned locks_want,
			       unsigned depth,
			       unsigned flags)
{
	flags |= BTREE_ITER_NOT_EXTENTS;
	flags |= __BTREE_ITER_ALL_SNAPSHOTS;
	flags |= BTREE_ITER_ALL_SNAPSHOTS;

	bch2_trans_iter_init_common(trans, iter, btree_id, pos, locks_want, depth,
			       __bch2_btree_iter_flags(trans, btree_id, flags),
			       _RET_IP_);

	iter->min_depth	= depth;

	BUG_ON(iter->path->locks_want	 < min(locks_want, BTREE_MAX_DEPTH));
	BUG_ON(iter->path->level	!= depth);
	BUG_ON(iter->min_depth		!= depth);
}
void bch2_trans_copy_iter(struct btree_iter *dst, struct btree_iter *src)
{
	*dst = *src;

	__btree_path_get(src->path, src->flags & BTREE_ITER_INTENT);
	if (src->update_path)
		__btree_path_get(src->update_path, src->flags & BTREE_ITER_INTENT);
	dst->key_cache_path = NULL;
}
void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size)
{
	unsigned new_top = trans->mem_top + size;
	size_t old_bytes = trans->mem_bytes;
	size_t new_bytes = roundup_pow_of_two(new_top);
	int ret;
	void *new_mem;
	void *p;

	trans->mem_max = max(trans->mem_max, new_top);

	WARN_ON_ONCE(new_bytes > BTREE_TRANS_MEM_MAX);

	new_mem = krealloc(trans->mem, new_bytes, GFP_NOWAIT|__GFP_NOWARN);
	if (unlikely(!new_mem)) {
		bch2_trans_unlock(trans);

		new_mem = krealloc(trans->mem, new_bytes, GFP_KERNEL);
		if (!new_mem && new_bytes <= BTREE_TRANS_MEM_MAX) {
			new_mem = mempool_alloc(&trans->c->btree_trans_mem_pool, GFP_KERNEL);
			new_bytes = BTREE_TRANS_MEM_MAX;
			kfree(trans->mem);
		}

		if (!new_mem)
			return ERR_PTR(-BCH_ERR_ENOMEM_trans_kmalloc);

		trans->mem = new_mem;
		trans->mem_bytes = new_bytes;

		ret = bch2_trans_relock(trans);
		if (ret)
			return ERR_PTR(ret);
	}

	trans->mem = new_mem;
	trans->mem_bytes = new_bytes;

	if (old_bytes) {
		trace_and_count(trans->c, trans_restart_mem_realloced, trans, _RET_IP_, new_bytes);
		return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_mem_realloced));
	}

	p = trans->mem + trans->mem_top;
	trans->mem_top += size;
	memset(p, 0, size);
	return p;
}
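
/*
 * Editor's sketch (not upstream code): typical use via the
 * bch2_trans_kmalloc() wrapper. Memory comes from the transaction's bump
 * allocator and is reclaimed when the transaction is reset, and a
 * reallocation returns a transaction-restart error the caller must honour:
 *
 *	struct bkey_i *n = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
 *	int ret = PTR_ERR_OR_ZERO(n);
 *
 *	if (ret)
 *		return ret;
 *	bkey_reassemble(n, k);
 */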
static inline void check_srcu_held_too_long(struct btree_trans *trans)
{
	WARN(trans->srcu_held && time_after(jiffies, trans->srcu_lock_time + HZ * 10),
	     "btree trans held srcu lock (delaying memory reclaim) for %lu seconds",
	     (jiffies - trans->srcu_lock_time) / HZ);
}

void bch2_trans_srcu_unlock(struct btree_trans *trans)
{
	if (trans->srcu_held) {
		struct bch_fs *c = trans->c;
		struct btree_path *path;

		trans_for_each_path(trans, path)
			if (path->cached && !btree_node_locked(path, 0))
				path->l[0].b = ERR_PTR(-BCH_ERR_no_btree_node_srcu_reset);

		check_srcu_held_too_long(trans);
		srcu_read_unlock(&c->btree_trans_barrier, trans->srcu_idx);
		trans->srcu_held = false;
	}
}

void bch2_trans_srcu_lock(struct btree_trans *trans)
{
	if (!trans->srcu_held) {
		trans->srcu_idx = srcu_read_lock(&trans->c->btree_trans_barrier);
		trans->srcu_lock_time	= jiffies;
		trans->srcu_held = true;
	}
}
/**
 * bch2_trans_begin() - reset a transaction after an interrupted attempt
 * @trans: transaction to reset
 *
 * Returns:	current restart counter, to be used with trans_was_restarted()
 *
 * While iterating over nodes or updating nodes an attempt to lock a btree node
 * may return BCH_ERR_transaction_restart when the trylock fails. When this
 * occurs bch2_trans_begin() should be called and the transaction retried.
 */
u32 bch2_trans_begin(struct btree_trans *trans)
{
	struct btree_path *path;
	u64 now;

	bch2_trans_reset_updates(trans);

	trans->restart_count++;
	trans->mem_top = 0;

	trans_for_each_path(trans, path) {
		path->should_be_locked = false;

		/*
		 * If the transaction wasn't restarted, we're presuming to be
		 * doing something new: don't keep iterators except the ones
		 * that are in use - except for the subvolumes btree:
		 */
		if (!trans->restarted && path->btree_id != BTREE_ID_subvolumes)
			path->preserve = false;

		/*
		 * XXX: we probably shouldn't be doing this if the transaction
		 * was restarted, but currently we still overflow transaction
		 * iterators if we do that
		 */
		if (!path->ref && !path->preserve)
			__bch2_path_free(trans, path);
		else
			path->preserve = false;
	}

	now = local_clock();
	if (!trans->restarted &&
	    (need_resched() ||
	     now - trans->last_begin_time > BTREE_TRANS_MAX_LOCK_HOLD_TIME_NS)) {
		drop_locks_do(trans, (cond_resched(), 0));
		now = local_clock();
	}
	trans->last_begin_time = now;

	if (unlikely(trans->srcu_held &&
		     time_after(jiffies, trans->srcu_lock_time + msecs_to_jiffies(10))))
		bch2_trans_srcu_unlock(trans);

	trans->last_begin_ip = _RET_IP_;
	if (trans->restarted) {
		bch2_btree_path_traverse_all(trans);
		trans->notrace_relock_fail = false;
	}

	return trans->restart_count;
}
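
/*
 * Editor's sketch (not upstream code): the retry loop this contract
 * implies; do_the_work() is a hypothetical caller-supplied function, and
 * the commit_do()/lockrestart_do() macros package the same pattern:
 *
 *	u32 restart_count;
 *	int ret;
 *
 *	do {
 *		restart_count = bch2_trans_begin(trans);
 *		ret = do_the_work(trans);
 *	} while (bch2_err_matches(ret, BCH_ERR_transaction_restart));
 *
 * Code that must notice restarts happening underneath it can later compare
 * with trans_was_restarted(trans, restart_count).
 */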
static struct btree_trans *bch2_trans_alloc(struct bch_fs *c)
{
	struct btree_trans *trans;

	if (IS_ENABLED(__KERNEL__)) {
		trans = this_cpu_xchg(c->btree_trans_bufs->trans, NULL);
		if (trans)
			return trans;
	}

	trans = mempool_alloc(&c->btree_trans_pool, GFP_NOFS);
	/*
	 * paths need to be zeroed, bch2_check_for_deadlock looks at
	 * paths in other threads
	 */
	memset(&trans->paths, 0, sizeof(trans->paths));
	return trans;
}
const char *bch2_btree_transaction_fns[BCH_TRANSACTIONS_NR];

unsigned bch2_trans_get_fn_idx(const char *fn)
{
	unsigned i;

	for (i = 0; i < ARRAY_SIZE(bch2_btree_transaction_fns); i++)
		if (!bch2_btree_transaction_fns[i] ||
		    bch2_btree_transaction_fns[i] == fn) {
			bch2_btree_transaction_fns[i] = fn;
			return i;
		}

	pr_warn_once("BCH_TRANSACTIONS_NR not big enough!");
	return 0;
}
struct btree_trans *__bch2_trans_get(struct bch_fs *c, unsigned fn_idx)
	__acquires(&c->btree_trans_barrier)
{
	struct btree_trans *trans;
	struct btree_transaction_stats *s;

	trans = bch2_trans_alloc(c);

	memset(trans, 0, sizeof(*trans));
	trans->c		= c;
	trans->fn		= fn_idx < ARRAY_SIZE(bch2_btree_transaction_fns)
		? bch2_btree_transaction_fns[fn_idx] : NULL;
	trans->last_begin_time	= local_clock();
	trans->fn_idx		= fn_idx;
	trans->locking_wait.task = current;
	trans->journal_replay_not_finished =
		unlikely(!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)) &&
		atomic_inc_not_zero(&c->journal_keys.ref);
	closure_init_stack(&trans->ref);

	s = btree_trans_stats(trans);
	if (s && s->max_mem) {
		unsigned expected_mem_bytes = roundup_pow_of_two(s->max_mem);

		trans->mem = kmalloc(expected_mem_bytes, GFP_KERNEL);

		if (!unlikely(trans->mem)) {
			trans->mem = mempool_alloc(&c->btree_trans_mem_pool, GFP_KERNEL);
			trans->mem_bytes = BTREE_TRANS_MEM_MAX;
		} else {
			trans->mem_bytes = expected_mem_bytes;
		}
	}

	if (s) {
		trans->nr_max_paths = s->nr_max_paths;
		trans->wb_updates_size = s->wb_updates_size;
	}

	trans->srcu_idx		= srcu_read_lock(&c->btree_trans_barrier);
	trans->srcu_lock_time	= jiffies;
	trans->srcu_held	= true;

	if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG_TRANSACTIONS)) {
		struct btree_trans *pos;

		seqmutex_lock(&c->btree_trans_lock);
		list_for_each_entry(pos, &c->btree_trans_list, list) {
			/*
			 * We'd much prefer to be stricter here and completely
			 * disallow multiple btree_trans in the same thread -
			 * but the data move path calls bch2_write when we
			 * already have a btree_trans initialized.
			 */
			BUG_ON(trans->locking_wait.task->pid == pos->locking_wait.task->pid &&
			       bch2_trans_locked(pos));

			if (trans->locking_wait.task->pid < pos->locking_wait.task->pid) {
				list_add_tail(&trans->list, &pos->list);
				goto list_add_done;
			}
		}
		list_add_tail(&trans->list, &c->btree_trans_list);
list_add_done:
		seqmutex_unlock(&c->btree_trans_lock);
	}

	return trans;
}
static void check_btree_paths_leaked(struct btree_trans *trans)
{
#ifdef CONFIG_BCACHEFS_DEBUG
	struct bch_fs *c = trans->c;
	struct btree_path *path;

	trans_for_each_path(trans, path)
		if (path->ref)
			goto leaked;
	return;
leaked:
	bch_err(c, "btree paths leaked from %s!", trans->fn);
	trans_for_each_path(trans, path)
		if (path->ref)
			printk(KERN_ERR "  btree %s %pS\n",
			       bch2_btree_id_str(path->btree_id),
			       (void *) path->ip_allocated);
	/* Be noisy about this: */
	bch2_fatal_error(c);
#endif
}
void bch2_trans_put(struct btree_trans *trans)
	__releases(&c->btree_trans_barrier)
{
	struct btree_insert_entry *i;
	struct bch_fs *c = trans->c;
	struct btree_transaction_stats *s = btree_trans_stats(trans);

	bch2_trans_unlock(trans);

	if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG_TRANSACTIONS)) {
		seqmutex_lock(&c->btree_trans_lock);
		list_del(&trans->list);
		seqmutex_unlock(&c->btree_trans_lock);
	}

	closure_sync(&trans->ref);

	if (s)
		s->max_mem = max(s->max_mem, trans->mem_max);

	trans_for_each_update(trans, i)
		__btree_path_put(i->path, true);
	trans->nr_updates	= 0;

	check_btree_paths_leaked(trans);

	if (trans->srcu_held) {
		check_srcu_held_too_long(trans);
		srcu_read_unlock(&c->btree_trans_barrier, trans->srcu_idx);
	}

	kfree(trans->extra_journal_entries.data);

	if (trans->fs_usage_deltas) {
		if (trans->fs_usage_deltas->size + sizeof(trans->fs_usage_deltas) ==
		    REPLICAS_DELTA_LIST_MAX)
			mempool_free(trans->fs_usage_deltas,
				     &c->replicas_delta_pool);
		else
			kfree(trans->fs_usage_deltas);
	}

	if (unlikely(trans->journal_replay_not_finished))
		bch2_journal_keys_put(c);

	if (trans->mem_bytes == BTREE_TRANS_MEM_MAX)
		mempool_free(trans->mem, &c->btree_trans_mem_pool);
	else
		kfree(trans->mem);

	/* Userspace doesn't have a real percpu implementation: */
	if (IS_ENABLED(__KERNEL__))
		trans = this_cpu_xchg(c->btree_trans_bufs->trans, trans);
	if (trans)
		mempool_free(trans, &c->btree_trans_pool);
}
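
/*
 * Editor's note (sketch, not upstream code): the expected pairing. The
 * bch2_trans_get() wrapper in btree_iter.h maps __func__ to an fn_idx via
 * bch2_trans_get_fn_idx() and calls __bch2_trans_get():
 *
 *	struct btree_trans *trans = bch2_trans_get(c);
 *
 *	... iterate / update / commit ...
 *
 *	bch2_trans_put(trans);
 */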
static void __maybe_unused
bch2_btree_bkey_cached_common_to_text(struct printbuf *out,
				      struct btree_bkey_cached_common *b)
{
	struct six_lock_count c = six_lock_counts(&b->lock);
	struct task_struct *owner;
	pid_t pid;

	rcu_read_lock();
	owner = READ_ONCE(b->lock.owner);
	pid = owner ? owner->pid : 0;
	rcu_read_unlock();

	prt_printf(out, "%px %c l=%u %s:", b, b->cached ? 'c' : 'b',
		   b->level, bch2_btree_id_str(b->btree_id));
	bch2_bpos_to_text(out, btree_node_pos(b));

	prt_printf(out, " locks %u:%u:%u held by pid %u",
		   c.n[0], c.n[1], c.n[2], pid);
}
void bch2_btree_trans_to_text(struct printbuf *out, struct btree_trans *trans)
{
	struct btree_path *path;
	struct btree_bkey_cached_common *b;
	static char lock_types[] = { 'r', 'i', 'w' };
	unsigned l, idx;

	if (!out->nr_tabstops) {
		printbuf_tabstop_push(out, 16);
		printbuf_tabstop_push(out, 32);
	}

	prt_printf(out, "%i %s\n", trans->locking_wait.task->pid, trans->fn);

	trans_for_each_path_safe(trans, path, idx) {
		if (!path->nodes_locked)
			continue;

		prt_printf(out, "  path %u %c l=%u %s:",
			   path->idx,
			   path->cached ? 'c' : 'b',
			   path->level,
			   bch2_btree_id_str(path->btree_id));
		bch2_bpos_to_text(out, path->pos);
		prt_newline(out);

		for (l = 0; l < BTREE_MAX_DEPTH; l++) {
			if (btree_node_locked(path, l) &&
			    !IS_ERR_OR_NULL(b = (void *) READ_ONCE(path->l[l].b))) {
				prt_printf(out, "    %c l=%u ",
					   lock_types[btree_node_locked_type(path, l)], l);
				bch2_btree_bkey_cached_common_to_text(out, b);
				prt_newline(out);
			}
		}
	}

	b = READ_ONCE(trans->locking);
	if (b) {
		prt_printf(out, "  blocked for %lluus on",
			   div_u64(local_clock() - trans->locking_wait.start_time,
				   1000));
		prt_newline(out);
		prt_printf(out, "    %c", lock_types[trans->locking_wait.lock_want]);
		bch2_btree_bkey_cached_common_to_text(out, b);
		prt_newline(out);
	}
}
void bch2_fs_btree_iter_exit(struct bch_fs *c)
{
	struct btree_transaction_stats *s;
	struct btree_trans *trans;
	int cpu;

	trans = list_first_entry_or_null(&c->btree_trans_list, struct btree_trans, list);
	if (trans)
		panic("%s leaked btree_trans\n", trans->fn);

	if (c->btree_trans_bufs)
		for_each_possible_cpu(cpu)
			kfree(per_cpu_ptr(c->btree_trans_bufs, cpu)->trans);
	free_percpu(c->btree_trans_bufs);

	for (s = c->btree_transaction_stats;
	     s < c->btree_transaction_stats + ARRAY_SIZE(c->btree_transaction_stats);
	     s++) {
		kfree(s->max_paths_text);
		bch2_time_stats_exit(&s->lock_hold_times);
	}

	if (c->btree_trans_barrier_initialized)
		cleanup_srcu_struct(&c->btree_trans_barrier);
	mempool_exit(&c->btree_trans_mem_pool);
	mempool_exit(&c->btree_trans_pool);
}
int bch2_fs_btree_iter_init(struct bch_fs *c)
{
	struct btree_transaction_stats *s;
	int ret;

	for (s = c->btree_transaction_stats;
	     s < c->btree_transaction_stats + ARRAY_SIZE(c->btree_transaction_stats);
	     s++) {
		bch2_time_stats_init(&s->lock_hold_times);
		mutex_init(&s->lock);
	}

	INIT_LIST_HEAD(&c->btree_trans_list);
	seqmutex_init(&c->btree_trans_lock);

	c->btree_trans_bufs = alloc_percpu(struct btree_trans_buf);
	if (!c->btree_trans_bufs)
		return -ENOMEM;

	ret = mempool_init_kmalloc_pool(&c->btree_trans_pool, 1,
					sizeof(struct btree_trans)) ?:
	      mempool_init_kmalloc_pool(&c->btree_trans_mem_pool, 1,
					BTREE_TRANS_MEM_MAX) ?:
	      init_srcu_struct(&c->btree_trans_barrier);
	if (!ret)
		c->btree_trans_barrier_initialized = true;
	return ret;
}