// SPDX-License-Identifier: GPL-2.0

#include "bcachefs.h"
#include "btree_gc.h"
#include "btree_io.h"
#include "btree_iter.h"
#include "btree_journal_iter.h"
#include "btree_key_cache.h"
#include "btree_update_interior.h"
#include "btree_write_buffer.h"
#include "buckets.h"
#include "errcode.h"
#include "error.h"
#include "journal.h"
#include "journal_reclaim.h"
#include "replicas.h"
#include "snapshot.h"

#include <linux/prefetch.h>

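/*
 * Debug-build sanity check: verify that the old key recorded in a
 * btree_insert_entry still matches what's actually in the btree (or in the
 * journal keys, if journal replay hasn't finished). Compiles to an empty
 * function when CONFIG_BCACHEFS_DEBUG is off.
 */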
static void verify_update_old_key(struct btree_trans *trans, struct btree_insert_entry *i)
{
#ifdef CONFIG_BCACHEFS_DEBUG
	struct bch_fs *c = trans->c;
	struct bkey u;
	struct bkey_s_c k = bch2_btree_path_peek_slot_exact(i->path, &u);

	if (unlikely(trans->journal_replay_not_finished)) {
		struct bkey_i *j_k =
			bch2_journal_keys_peek_slot(c, i->btree_id, i->level, i->k->k.p);

		if (j_k)
			k = bkey_i_to_s_c(j_k);
	}

	u = *k.k;
	u.needs_whiteout = i->old_k.needs_whiteout;

	BUG_ON(memcmp(&i->old_k, &u, sizeof(struct bkey)));
	BUG_ON(i->old_v != k.v);
#endif
}

static inline struct btree_path_level *insert_l(struct btree_insert_entry *i)
{
	return i->path->l + i->level;
}

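/*
 * Updates within a transaction are kept sorted, so consecutive entries may
 * land in the same leaf node; these helpers detect that, letting us take
 * locks and account for space once per leaf rather than once per key.
 */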
static inline bool same_leaf_as_prev(struct btree_trans *trans,
				     struct btree_insert_entry *i)
{
	return i != trans->updates &&
		insert_l(&i[0])->b == insert_l(&i[-1])->b;
}

static inline bool same_leaf_as_next(struct btree_trans *trans,
				     struct btree_insert_entry *i)
{
	return i + 1 < trans->updates + trans->nr_updates &&
		insert_l(&i[0])->b == insert_l(&i[1])->b;
}

inline void bch2_btree_node_prep_for_write(struct btree_trans *trans,
					   struct btree_path *path,
					   struct btree *b)
{
	struct bch_fs *c = trans->c;

	if (unlikely(btree_node_just_written(b)) &&
	    bch2_btree_post_write_cleanup(c, b))
		bch2_trans_node_reinit_iter(trans, b);

	/*
	 * If the last bset has been written, or if it's gotten too big - start
	 * a new bset to insert into:
	 */
	if (want_new_bset(c, b))
		bch2_btree_init_next(trans, b);
}

static noinline
int trans_lock_write_fail(struct btree_trans *trans, struct btree_insert_entry *i)
{
	while (--i >= trans->updates) {
		if (same_leaf_as_prev(trans, i))
			continue;

		bch2_btree_node_unlock_write(trans, i->path, insert_l(i)->b);
	}

	trace_and_count(trans->c, trans_restart_would_deadlock_write, trans);
	return btree_trans_restart(trans, BCH_ERR_transaction_restart_would_deadlock_write);
}

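/*
 * Take write locks on every leaf being updated, walking the sorted update
 * list; on lock failure, trans_lock_write_fail() unwinds the locks taken so
 * far and restarts the transaction.
 */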
static inline int bch2_trans_lock_write(struct btree_trans *trans)
{
	struct btree_insert_entry *i;

	EBUG_ON(trans->write_locked);

	trans_for_each_update(trans, i) {
		if (same_leaf_as_prev(trans, i))
			continue;

		if (bch2_btree_node_lock_write(trans, i->path, &insert_l(i)->b->c))
			return trans_lock_write_fail(trans, i);

		if (!i->cached)
			bch2_btree_node_prep_for_write(trans, i->path, insert_l(i)->b);
	}

	trans->write_locked = true;
	return 0;
}

static inline void bch2_trans_unlock_write(struct btree_trans *trans)
{
	if (likely(trans->write_locked)) {
		struct btree_insert_entry *i;

		trans_for_each_update(trans, i)
			if (!same_leaf_as_prev(trans, i))
				bch2_btree_node_unlock_write_inlined(trans, i->path,
								     insert_l(i)->b);
		trans->write_locked = false;
	}
}

/* Inserting into a given leaf node (last stage of insert): */

/* Handle overwrites and do insert, for non extents: */
bool bch2_btree_bset_insert_key(struct btree_trans *trans,
				struct btree_path *path,
				struct btree *b,
				struct btree_node_iter *node_iter,
				struct bkey_i *insert)
{
	struct bkey_packed *k;
	unsigned clobber_u64s = 0, new_u64s = 0;

	EBUG_ON(btree_node_just_written(b));
	EBUG_ON(bset_written(b, btree_bset_last(b)));
	EBUG_ON(bkey_deleted(&insert->k) && bkey_val_u64s(&insert->k));
	EBUG_ON(bpos_lt(insert->k.p, b->data->min_key));
	EBUG_ON(bpos_gt(insert->k.p, b->data->max_key));
	EBUG_ON(insert->k.u64s >
		bch_btree_keys_u64s_remaining(trans->c, b));
	EBUG_ON(!b->c.level && !bpos_eq(insert->k.p, path->pos));

	k = bch2_btree_node_iter_peek_all(node_iter, b);
	if (k && bkey_cmp_left_packed(b, k, &insert->k.p))
		k = NULL;

	/* @k is the key being overwritten/deleted, if any: */
	EBUG_ON(k && bkey_deleted(k));

	/* Deleting, but not found? nothing to do: */
	if (bkey_deleted(&insert->k) && !k)
		return false;

	if (bkey_deleted(&insert->k)) {
		/* Deleting: */
		btree_account_key_drop(b, k);
		k->type = KEY_TYPE_deleted;

		if (k->needs_whiteout)
			push_whiteout(trans->c, b, insert->k.p);
		k->needs_whiteout = false;

		if (k >= btree_bset_last(b)->start) {
			clobber_u64s = k->u64s;
			bch2_bset_delete(b, k, clobber_u64s);
			goto fix_iter;
		} else {
			bch2_btree_path_fix_key_modified(trans, b, k);
		}

		return true;
	}

	if (k) {
		/* Overwriting: */
		btree_account_key_drop(b, k);
		k->type = KEY_TYPE_deleted;

		insert->k.needs_whiteout = k->needs_whiteout;
		k->needs_whiteout = false;

		if (k >= btree_bset_last(b)->start) {
			clobber_u64s = k->u64s;
			goto overwrite;
		} else {
			bch2_btree_path_fix_key_modified(trans, b, k);
		}
	}

	k = bch2_btree_node_iter_bset_pos(node_iter, b, bset_tree_last(b));
overwrite:
	bch2_bset_insert(b, node_iter, k, insert, clobber_u64s);
	new_u64s = k->u64s;
fix_iter:
	if (clobber_u64s != new_u64s)
		bch2_btree_node_iter_fix(trans, path, b, node_iter, k,
					 clobber_u64s, new_u64s);
	return true;
}

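/*
 * Journal pin flush callback: the journal calls this when it wants to reclaim
 * the entry pinned by one of this node's two in-flight write structures. We
 * mark the node as needing a journal-reclaim write and issue it if required.
 */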
static int __btree_node_flush(struct journal *j, struct journal_entry_pin *pin,
			      unsigned i, u64 seq)
{
	struct bch_fs *c = container_of(j, struct bch_fs, journal);
	struct btree_write *w = container_of(pin, struct btree_write, journal);
	struct btree *b = container_of(w, struct btree, writes[i]);
	struct btree_trans *trans = bch2_trans_get(c);
	unsigned long old, new, v;
	unsigned idx = w - b->writes;

	btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read);
	v = READ_ONCE(b->flags);

	do {
		old = new = v;

		if (!(old & (1 << BTREE_NODE_dirty)) ||
		    !!(old & (1 << BTREE_NODE_write_idx)) != idx ||
		    w->journal.seq != seq)
			break;

		new &= ~BTREE_WRITE_TYPE_MASK;
		new |= BTREE_WRITE_journal_reclaim;
		new |= 1 << BTREE_NODE_need_write;
	} while ((v = cmpxchg(&b->flags, old, new)) != old);

	btree_node_write_if_need(c, b, SIX_LOCK_read);
	six_unlock_read(&b->c.lock);

	bch2_trans_put(trans);
	return 0;
}

int bch2_btree_node_flush0(struct journal *j, struct journal_entry_pin *pin, u64 seq)
{
	return __btree_node_flush(j, pin, 0, seq);
}

int bch2_btree_node_flush1(struct journal *j, struct journal_entry_pin *pin, u64 seq)
{
	return __btree_node_flush(j, pin, 1, seq);
}

inline void bch2_btree_add_journal_pin(struct bch_fs *c,
				       struct btree *b, u64 seq)
{
	struct btree_write *w = btree_current_write(b);

	bch2_journal_pin_add(&c->journal, seq, &w->journal,
			     btree_node_write_idx(b) == 0
			     ? bch2_btree_node_flush0
			     : bch2_btree_node_flush1);
}

/**
 * bch2_btree_insert_key_leaf() - insert one key into a leaf node
 * @trans:		btree transaction object
 * @path:		path pointing to @insert's pos
 * @insert:		key to insert
 * @journal_seq:	sequence number of journal reservation
 */
inline void bch2_btree_insert_key_leaf(struct btree_trans *trans,
				       struct btree_path *path,
				       struct bkey_i *insert,
				       u64 journal_seq)
{
	struct bch_fs *c = trans->c;
	struct btree *b = path_l(path)->b;
	struct bset_tree *t = bset_tree_last(b);
	struct bset *i = bset(b, t);
	int old_u64s = bset_u64s(t);
	int old_live_u64s = b->nr.live_u64s;
	int live_u64s_added, u64s_added;

	if (unlikely(!bch2_btree_bset_insert_key(trans, path, b,
					&path_l(path)->iter, insert)))
		return;

	i->journal_seq = cpu_to_le64(max(journal_seq, le64_to_cpu(i->journal_seq)));

	bch2_btree_add_journal_pin(c, b, journal_seq);

	if (unlikely(!btree_node_dirty(b))) {
		EBUG_ON(test_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags));
		set_btree_node_dirty_acct(c, b);
	}

	live_u64s_added = (int) b->nr.live_u64s - old_live_u64s;
	u64s_added = (int) bset_u64s(t) - old_u64s;

	if (b->sib_u64s[0] != U16_MAX && live_u64s_added < 0)
		b->sib_u64s[0] = max(0, (int) b->sib_u64s[0] + live_u64s_added);
	if (b->sib_u64s[1] != U16_MAX && live_u64s_added < 0)
		b->sib_u64s[1] = max(0, (int) b->sib_u64s[1] + live_u64s_added);

	if (u64s_added > live_u64s_added &&
	    bch2_maybe_compact_whiteouts(c, b))
		bch2_trans_node_reinit_iter(trans, b);
}

/* Cached btree updates: */

/* Normal update interface: */

static inline void btree_insert_entry_checks(struct btree_trans *trans,
					     struct btree_insert_entry *i)
{
	BUG_ON(!bpos_eq(i->k->k.p, i->path->pos));
	BUG_ON(i->cached	!= i->path->cached);
	BUG_ON(i->level		!= i->path->level);
	BUG_ON(i->btree_id	!= i->path->btree_id);
	EBUG_ON(!i->level &&
		btree_type_has_snapshots(i->btree_id) &&
		!(i->flags & BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) &&
		test_bit(JOURNAL_REPLAY_DONE, &trans->c->journal.flags) &&
		i->k->k.p.snapshot &&
		bch2_snapshot_is_internal_node(trans->c, i->k->k.p.snapshot));
}

static __always_inline
int bch2_trans_journal_res_get(struct btree_trans *trans,
			       unsigned flags)
{
	return bch2_journal_res_get(&trans->c->journal, &trans->journal_res,
				    trans->journal_u64s, flags);
}

#define JSET_ENTRY_LOG_U64s		4

static noinline
void journal_transaction_name(struct btree_trans *trans)
{
	struct bch_fs *c = trans->c;
	struct journal *j = &c->journal;
	struct jset_entry *entry =
		bch2_journal_add_entry(j, &trans->journal_res,
				       BCH_JSET_ENTRY_log, 0, 0,
				       JSET_ENTRY_LOG_U64s);
	struct jset_entry_log *l =
		container_of(entry, struct jset_entry_log, entry);

	strncpy(l->d, trans->fn, JSET_ENTRY_LOG_U64s * sizeof(u64));
}

static inline int btree_key_can_insert(struct btree_trans *trans,
				       struct btree *b, unsigned u64s)
{
	struct bch_fs *c = trans->c;

	if (!bch2_btree_node_insert_fits(c, b, u64s))
		return -BCH_ERR_btree_insert_btree_node_full;

	return 0;
}

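/*
 * Slowpath for growing a key cache buffer: we have to drop our btree locks
 * (write locks included) to allocate with GFP_KERNEL, then relock and patch
 * up any update entries still pointing at the old buffer.
 */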
noinline static int
btree_key_can_insert_cached_slowpath(struct btree_trans *trans, unsigned flags,
				     struct btree_path *path, unsigned new_u64s)
{
	struct bch_fs *c = trans->c;
	struct btree_insert_entry *i;
	struct bkey_cached *ck = (void *) path->l[0].b;
	struct bkey_i *new_k;
	int ret;

	bch2_trans_unlock_write(trans);
	bch2_trans_unlock(trans);

	new_k = kmalloc(new_u64s * sizeof(u64), GFP_KERNEL);
	if (!new_k) {
		bch_err(c, "error allocating memory for key cache key, btree %s u64s %u",
			bch2_btree_id_str(path->btree_id), new_u64s);
		return -BCH_ERR_ENOMEM_btree_key_cache_insert;
	}

	ret =   bch2_trans_relock(trans) ?:
		bch2_trans_lock_write(trans);
	if (unlikely(ret)) {
		kfree(new_k);
		return ret;
	}

	memcpy(new_k, ck->k, ck->u64s * sizeof(u64));

	trans_for_each_update(trans, i)
		if (i->old_v == &ck->k->v)
			i->old_v = &new_k->v;

	kfree(ck->k);
	ck->u64s	= new_u64s;
	ck->k		= new_k;
	return 0;
}

static int btree_key_can_insert_cached(struct btree_trans *trans, unsigned flags,
				       struct btree_path *path, unsigned u64s)
{
	struct bch_fs *c = trans->c;
	struct bkey_cached *ck = (void *) path->l[0].b;
	struct btree_insert_entry *i;
	unsigned new_u64s;
	struct bkey_i *new_k;

	EBUG_ON(path->level);

	if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags) &&
	    bch2_btree_key_cache_must_wait(c) &&
	    !(flags & BTREE_INSERT_JOURNAL_RECLAIM))
		return -BCH_ERR_btree_insert_need_journal_reclaim;

	/*
	 * bch2_varint_decode can read past the end of the buffer by at most 7
	 * bytes (it won't be used):
	 */
	u64s += 1;

	if (u64s <= ck->u64s)
		return 0;

	new_u64s	= roundup_pow_of_two(u64s);
	new_k		= krealloc(ck->k, new_u64s * sizeof(u64), GFP_NOWAIT);
	if (unlikely(!new_k))
		return btree_key_can_insert_cached_slowpath(trans, flags, path, new_u64s);

	trans_for_each_update(trans, i)
		if (i->old_v == &ck->k->v)
			i->old_v = &new_k->v;

	ck->u64s	= new_u64s;
	ck->k		= new_k;
	return 0;
}

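/* Triggers: */

/*
 * Run the atomic (in-memory accounting) trigger for one update: when the old
 * and new key types share a trigger, insert and overwrite are handled in one
 * call; otherwise each is run separately against a deleted key.
 */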
static int run_one_mem_trigger(struct btree_trans *trans,
			       struct btree_insert_entry *i,
			       unsigned flags)
{
	struct bkey_s_c old = { &i->old_k, i->old_v };
	struct bkey_i *new = i->k;
	const struct bkey_ops *old_ops = bch2_bkey_type_ops(old.k->type);
	const struct bkey_ops *new_ops = bch2_bkey_type_ops(i->k->k.type);
	int ret;

	verify_update_old_key(trans, i);

	if (unlikely(flags & BTREE_TRIGGER_NORUN))
		return 0;

	if (!btree_node_type_needs_gc(__btree_node_type(i->level, i->btree_id)))
		return 0;

	if (old_ops->atomic_trigger == new_ops->atomic_trigger) {
		ret   = bch2_mark_key(trans, i->btree_id, i->level,
				old, bkey_i_to_s_c(new),
				BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE|flags);
	} else {
		struct bkey		_deleted = KEY(0, 0, 0);
		struct bkey_s_c		deleted = (struct bkey_s_c) { &_deleted, NULL };

		_deleted.p = i->path->pos;

		ret   = bch2_mark_key(trans, i->btree_id, i->level,
				deleted, bkey_i_to_s_c(new),
				BTREE_TRIGGER_INSERT|flags) ?:
			bch2_mark_key(trans, i->btree_id, i->level,
				old, deleted,
				BTREE_TRIGGER_OVERWRITE|flags);
	}

	return ret;
}

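/*
 * Returns 1 if a trigger was run, 0 if there was nothing to do, negative on
 * error - run_btree_triggers() loops until no trigger reports progress.
 */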
static int run_one_trans_trigger(struct btree_trans *trans, struct btree_insert_entry *i,
				 bool overwrite)
{
	/*
	 * Transactional triggers create new btree_insert_entries, so we can't
	 * pass them a pointer to a btree_insert_entry, that memory is going to
	 * move:
	 */
	struct bkey old_k = i->old_k;
	struct bkey_s_c old = { &old_k, i->old_v };
	const struct bkey_ops *old_ops = bch2_bkey_type_ops(old.k->type);
	const struct bkey_ops *new_ops = bch2_bkey_type_ops(i->k->k.type);

	verify_update_old_key(trans, i);

	if ((i->flags & BTREE_TRIGGER_NORUN) ||
	    !(BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type)))
		return 0;

	if (!i->insert_trigger_run &&
	    !i->overwrite_trigger_run &&
	    old_ops->trans_trigger == new_ops->trans_trigger) {
		i->overwrite_trigger_run = true;
		i->insert_trigger_run = true;
		return bch2_trans_mark_key(trans, i->btree_id, i->level, old, i->k,
					   BTREE_TRIGGER_INSERT|
					   BTREE_TRIGGER_OVERWRITE|
					   i->flags) ?: 1;
	} else if (overwrite && !i->overwrite_trigger_run) {
		i->overwrite_trigger_run = true;
		return bch2_trans_mark_old(trans, i->btree_id, i->level, old, i->flags) ?: 1;
	} else if (!overwrite && !i->insert_trigger_run) {
		i->insert_trigger_run = true;
		return bch2_trans_mark_new(trans, i->btree_id, i->level, i->k, i->flags) ?: 1;
	} else {
		return 0;
	}
}

static int run_btree_triggers(struct btree_trans *trans, enum btree_id btree_id,
			      struct btree_insert_entry *btree_id_start)
{
	struct btree_insert_entry *i;
	bool trans_trigger_run;
	int ret, overwrite;

	for (overwrite = 1; overwrite >= 0; --overwrite) {
		/*
		 * Running triggers will append more updates to the list of updates as
		 * we're walking it:
		 */
		do {
			trans_trigger_run = false;

			for (i = btree_id_start;
			     i < trans->updates + trans->nr_updates && i->btree_id <= btree_id;
			     i++) {
				if (i->btree_id != btree_id)
					continue;

				ret = run_one_trans_trigger(trans, i, overwrite);
				if (ret < 0)
					return ret;
				if (ret)
					trans_trigger_run = true;
			}
		} while (trans_trigger_run);
	}

	return 0;
}

static int bch2_trans_commit_run_triggers(struct btree_trans *trans)
{
	struct btree_insert_entry *i = NULL, *btree_id_start = trans->updates;
	unsigned btree_id = 0;
	int ret = 0;

	/*
	 * For a given btree, this algorithm runs insert triggers before
	 * overwrite triggers: this is so that when extents are being moved
	 * (e.g. by FALLOCATE_FL_INSERT_RANGE), we don't drop references before
	 * they are re-added.
	 */
	for (btree_id = 0; btree_id < BTREE_ID_NR; btree_id++) {
		if (btree_id == BTREE_ID_alloc)
			continue;

		while (btree_id_start < trans->updates + trans->nr_updates &&
		       btree_id_start->btree_id < btree_id)
			btree_id_start++;

		ret = run_btree_triggers(trans, btree_id, btree_id_start);
		if (ret)
			return ret;
	}

	trans_for_each_update(trans, i) {
		if (i->btree_id > BTREE_ID_alloc)
			break;
		if (i->btree_id == BTREE_ID_alloc) {
			ret = run_btree_triggers(trans, BTREE_ID_alloc, i);
			if (ret)
				return ret;
			break;
		}
	}

#ifdef CONFIG_BCACHEFS_DEBUG
	trans_for_each_update(trans, i)
		BUG_ON(!(i->flags & BTREE_TRIGGER_NORUN) &&
		       (BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type)) &&
		       (!i->insert_trigger_run || !i->overwrite_trigger_run));
#endif

	return 0;
}

static noinline
int bch2_trans_commit_run_gc_triggers(struct btree_trans *trans)
{
	struct bch_fs *c = trans->c;
	struct btree_insert_entry *i;
	int ret = 0;

	trans_for_each_update(trans, i) {
		/*
		 * XXX: synchronization of cached update triggers with gc
		 * XXX: synchronization of interior node updates with gc
		 */
		BUG_ON(i->cached || i->level);

		if (gc_visited(c, gc_pos_btree_node(insert_l(i)->b))) {
			ret = run_one_mem_trigger(trans, i, i->flags|BTREE_TRIGGER_GC);
			if (ret)
				break;
		}
	}

	return ret;
}

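/*
 * The core of the commit path: with write locks held, verify every insert
 * still fits, take a journal reservation, run mem/gc triggers, journal the
 * new keys (and overwrites, if transaction names are enabled), then do the
 * actual btree/key cache inserts. After the journal reservation has been
 * taken we're no longer allowed to fail.
 */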
static inline int
bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
			       struct btree_insert_entry **stopped_at,
			       unsigned long trace_ip)
{
	struct bch_fs *c = trans->c;
	struct btree_insert_entry *i;
	struct btree_write_buffered_key *wb;
	struct btree_trans_commit_hook *h;
	unsigned u64s = 0;
	int ret;

	if (race_fault()) {
		trace_and_count(c, trans_restart_fault_inject, trans, trace_ip);
		return btree_trans_restart_nounlock(trans, BCH_ERR_transaction_restart_fault_inject);
	}

	/*
	 * Check if the insert will fit in the leaf node with the write lock
	 * held, otherwise another thread could write the node changing the
	 * amount of space available:
	 */

	prefetch(&trans->c->journal.flags);

	trans_for_each_update(trans, i) {
		/* Multiple inserts might go to same leaf: */
		if (!same_leaf_as_prev(trans, i))
			u64s = 0;

		u64s += i->k->k.u64s;
		ret = !i->cached
			? btree_key_can_insert(trans, insert_l(i)->b, u64s)
			: btree_key_can_insert_cached(trans, flags, i->path, u64s);
		if (ret) {
			*stopped_at = i;
			return ret;
		}
	}

	if (trans->nr_wb_updates &&
	    trans->nr_wb_updates + c->btree_write_buffer.state.nr > c->btree_write_buffer.size)
		return -BCH_ERR_btree_insert_need_flush_buffer;

	/*
	 * Don't get journal reservation until after we know insert will
	 * succeed:
	 */
	if (likely(!(flags & BTREE_INSERT_JOURNAL_REPLAY))) {
		ret = bch2_trans_journal_res_get(trans,
				(flags & BCH_WATERMARK_MASK)|
				JOURNAL_RES_GET_NONBLOCK);
		if (ret)
			return ret;

		if (unlikely(trans->journal_transaction_names))
			journal_transaction_name(trans);
	} else {
		trans->journal_res.seq = c->journal.replay_journal_seq;
	}

	/*
	 * Not allowed to fail after we've gotten our journal reservation - we
	 * have to use it:
	 */

	if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) &&
	    !(flags & BTREE_INSERT_JOURNAL_REPLAY)) {
		if (bch2_journal_seq_verify)
			trans_for_each_update(trans, i)
				i->k->k.version.lo = trans->journal_res.seq;
		else if (bch2_inject_invalid_keys)
			trans_for_each_update(trans, i)
				i->k->k.version = MAX_VERSION;
	}

	if (trans->fs_usage_deltas &&
	    bch2_trans_fs_usage_apply(trans, trans->fs_usage_deltas))
		return -BCH_ERR_btree_insert_need_mark_replicas;

	if (trans->nr_wb_updates) {
		EBUG_ON(flags & BTREE_INSERT_JOURNAL_REPLAY);

		ret = bch2_btree_insert_keys_write_buffer(trans);
		if (ret)
			goto revert_fs_usage;
	}

	h = trans->hooks;
	while (h) {
		ret = h->fn(trans, h);
		if (ret)
			goto revert_fs_usage;
		h = h->next;
	}

	trans_for_each_update(trans, i)
		if (BTREE_NODE_TYPE_HAS_MEM_TRIGGERS & (1U << i->bkey_type)) {
			ret = run_one_mem_trigger(trans, i, i->flags);
			if (ret)
				goto fatal_err;
		}

	if (unlikely(c->gc_pos.phase)) {
		ret = bch2_trans_commit_run_gc_triggers(trans);
		if (ret)
			goto fatal_err;
	}

	if (unlikely(trans->extra_journal_entries.nr)) {
		memcpy_u64s_small(journal_res_entry(&c->journal, &trans->journal_res),
				  trans->extra_journal_entries.data,
				  trans->extra_journal_entries.nr);

		trans->journal_res.offset	+= trans->extra_journal_entries.nr;
		trans->journal_res.u64s		-= trans->extra_journal_entries.nr;
	}

	if (likely(!(flags & BTREE_INSERT_JOURNAL_REPLAY))) {
		struct journal *j = &c->journal;
		struct jset_entry *entry;

		trans_for_each_update(trans, i) {
			if (i->key_cache_already_flushed)
				continue;

			if (i->flags & BTREE_UPDATE_NOJOURNAL)
				continue;

			verify_update_old_key(trans, i);

			if (trans->journal_transaction_names) {
				entry = bch2_journal_add_entry(j, &trans->journal_res,
						       BCH_JSET_ENTRY_overwrite,
						       i->btree_id, i->level,
						       i->old_k.u64s);
				bkey_reassemble((struct bkey_i *) entry->start,
						(struct bkey_s_c) { &i->old_k, i->old_v });
			}

			entry = bch2_journal_add_entry(j, &trans->journal_res,
					       BCH_JSET_ENTRY_btree_keys,
					       i->btree_id, i->level,
					       i->k->k.u64s);
			bkey_copy((struct bkey_i *) entry->start, i->k);
		}

		trans_for_each_wb_update(trans, wb) {
			entry = bch2_journal_add_entry(j, &trans->journal_res,
					       BCH_JSET_ENTRY_btree_keys,
					       wb->btree, 0,
					       wb->k.k.u64s);
			bkey_copy((struct bkey_i *) entry->start, &wb->k);
		}

		if (trans->journal_seq)
			*trans->journal_seq = trans->journal_res.seq;
	}

	trans_for_each_update(trans, i) {
		i->k->k.needs_whiteout = false;

		if (!i->cached) {
			u64 seq = trans->journal_res.seq;

			if (i->flags & BTREE_UPDATE_PREJOURNAL)
				seq = i->seq;

			bch2_btree_insert_key_leaf(trans, i->path, i->k, seq);
		} else if (!i->key_cache_already_flushed)
			bch2_btree_insert_key_cached(trans, flags, i);
		else {
			bch2_btree_key_cache_drop(trans, i->path);
			btree_path_set_dirty(i->path, BTREE_ITER_NEED_TRAVERSE);
		}
	}

	return 0;
fatal_err:
	bch2_fatal_error(c);
revert_fs_usage:
	if (trans->fs_usage_deltas)
		bch2_trans_fs_usage_revert(trans, trans->fs_usage_deltas);
	return ret;
}

static noinline
void bch2_drop_overwrites_from_journal(struct btree_trans *trans)
{
	struct btree_insert_entry *i;
	struct btree_write_buffered_key *wb;

	trans_for_each_update(trans, i)
		bch2_journal_key_overwritten(trans->c, i->btree_id, i->level, i->k->k.p);

	trans_for_each_wb_update(trans, wb)
		bch2_journal_key_overwritten(trans->c, wb->btree, 0, wb->k.k.p);
}

static noinline
int bch2_trans_commit_bkey_invalid(struct btree_trans *trans,
				   enum bkey_invalid_flags flags,
				   struct btree_insert_entry *i,
				   struct printbuf *err)
{
	struct bch_fs *c = trans->c;

	printbuf_reset(err);
	prt_printf(err, "invalid bkey on insert from %s -> %ps",
		   trans->fn, (void *) i->ip_allocated);
	prt_newline(err);
	printbuf_indent_add(err, 2);

	bch2_bkey_val_to_text(err, c, bkey_i_to_s_c(i->k));
	prt_newline(err);

	bch2_bkey_invalid(c, bkey_i_to_s_c(i->k), i->bkey_type, flags, err);
	bch2_print_string_as_lines(KERN_ERR, err->buf);

	bch2_inconsistent_error(c);
	bch2_dump_trans_updates(trans);

	return -EINVAL;
}

/*
 * Get journal reservation, take write locks, and attempt to do btree update(s):
 */
static inline int do_bch2_trans_commit(struct btree_trans *trans, unsigned flags,
				       struct btree_insert_entry **stopped_at,
				       unsigned long trace_ip)
{
	struct bch_fs *c = trans->c;
	struct btree_insert_entry *i;
	int ret = 0, u64s_delta = 0;

	trans_for_each_update(trans, i) {
		if (i->cached)
			continue;

		u64s_delta += !bkey_deleted(&i->k->k) ? i->k->k.u64s : 0;
		u64s_delta -= i->old_btree_u64s;

		if (!same_leaf_as_next(trans, i)) {
			if (u64s_delta <= 0) {
				ret = bch2_foreground_maybe_merge(trans, i->path,
							i->level, flags);
				if (unlikely(ret))
					return ret;
			}

			u64s_delta = 0;
		}
	}

	ret = bch2_trans_lock_write(trans);
	if (unlikely(ret))
		return ret;

	ret = bch2_trans_commit_write_locked(trans, flags, stopped_at, trace_ip);

	if (!ret && unlikely(trans->journal_replay_not_finished))
		bch2_drop_overwrites_from_journal(trans);

	bch2_trans_unlock_write(trans);

	if (!ret && trans->journal_pin)
		bch2_journal_pin_add(&c->journal, trans->journal_res.seq,
				     trans->journal_pin, NULL);

	/*
	 * Drop journal reservation after dropping write locks, since dropping
	 * the journal reservation may kick off a journal write:
	 */
	bch2_journal_res_put(&c->journal, &trans->journal_res);

	return ret;
}

static int journal_reclaim_wait_done(struct bch_fs *c)
{
	int ret = bch2_journal_error(&c->journal) ?:
		!bch2_btree_key_cache_must_wait(c);

	if (!ret)
		journal_reclaim_kick(&c->journal);
	return ret;
}

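/*
 * Map a commit error to a recovery action - split the leaf, mark replicas,
 * wait on journal reclaim, or flush the write buffer - returning 0 or a
 * transaction restart where the commit can be retried.
 */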
static noinline
int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags,
			    struct btree_insert_entry *i,
			    int ret, unsigned long trace_ip)
{
	struct bch_fs *c = trans->c;

	switch (ret) {
	case -BCH_ERR_btree_insert_btree_node_full:
		ret = bch2_btree_split_leaf(trans, i->path, flags);
		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
			trace_and_count(c, trans_restart_btree_node_split, trans, trace_ip, i->path);
		break;
	case -BCH_ERR_btree_insert_need_mark_replicas:
		ret = drop_locks_do(trans,
			bch2_replicas_delta_list_mark(c, trans->fs_usage_deltas));
		break;
	case -BCH_ERR_journal_res_get_blocked:
		/*
		 * XXX: this should probably be a separate BTREE_INSERT_NONBLOCK
		 * flag
		 */
		if ((flags & BTREE_INSERT_JOURNAL_RECLAIM) &&
		    (flags & BCH_WATERMARK_MASK) != BCH_WATERMARK_reclaim) {
			ret = -BCH_ERR_journal_reclaim_would_deadlock;
			break;
		}

		ret = drop_locks_do(trans,
			bch2_trans_journal_res_get(trans,
					(flags & BCH_WATERMARK_MASK)|
					JOURNAL_RES_GET_CHECK));
		break;
	case -BCH_ERR_btree_insert_need_journal_reclaim:
		bch2_trans_unlock(trans);

		trace_and_count(c, trans_blocked_journal_reclaim, trans, trace_ip);

		wait_event_freezable(c->journal.reclaim_wait,
				     (ret = journal_reclaim_wait_done(c)));
		if (ret < 0)
			break;

		ret = bch2_trans_relock(trans);
		break;
	case -BCH_ERR_btree_insert_need_flush_buffer: {
		struct btree_write_buffer *wb = &c->btree_write_buffer;

		ret = 0;

		if (wb->state.nr > wb->size * 3 / 4) {
			bch2_trans_unlock(trans);
			mutex_lock(&wb->flush_lock);

			if (wb->state.nr > wb->size * 3 / 4) {
				bch2_trans_begin(trans);
				ret = __bch2_btree_write_buffer_flush(trans,
						flags|BTREE_INSERT_NOCHECK_RW, true);
				if (!ret) {
					trace_and_count(c, trans_restart_write_buffer_flush, trans, _THIS_IP_);
					ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_write_buffer_flush);
				}
			} else {
				mutex_unlock(&wb->flush_lock);
				ret = bch2_trans_relock(trans);
			}
		}
		break;
	}
	default:
		BUG_ON(ret >= 0);
		break;
	}

	BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart) != !!trans->restarted);

	bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOSPC) &&
				!(flags & BTREE_INSERT_NOWAIT) &&
				(flags & BTREE_INSERT_NOFAIL), c,
		"%s: incorrectly got %s\n", __func__, bch2_err_str(ret));

	return ret;
}

static noinline int
bch2_trans_commit_get_rw_cold(struct btree_trans *trans, unsigned flags)
{
	struct bch_fs *c = trans->c;
	int ret;

	if (likely(!(flags & BTREE_INSERT_LAZY_RW)) ||
	    test_bit(BCH_FS_STARTED, &c->flags))
		return -BCH_ERR_erofs_trans_commit;

	ret = drop_locks_do(trans, bch2_fs_read_write_early(c));
	if (ret)
		return ret;

	bch2_write_ref_get(c, BCH_WRITE_REF_trans);
	return 0;
}

/*
 * This is for updates done in the early part of fsck - btree_gc - before we've
 * gone RW. We only add the new key to the list of keys for journal replay to
 * do.
 */
static noinline int
do_bch2_trans_commit_to_journal_replay(struct btree_trans *trans)
{
	struct bch_fs *c = trans->c;
	struct btree_insert_entry *i;
	int ret = 0;

	trans_for_each_update(trans, i) {
		ret = bch2_journal_key_insert(c, i->btree_id, i->level, i->k);
		if (ret)
			break;
	}

	return ret;
}

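/*
 * Main commit entry point: runs transactional triggers, validates the keys
 * being inserted, accounts for journal space, then calls
 * do_bch2_trans_commit() in a retry loop, mapping failures to recovery
 * actions via bch2_trans_commit_error().
 */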
int __bch2_trans_commit(struct btree_trans *trans, unsigned flags)
{
	struct bch_fs *c = trans->c;
	struct btree_insert_entry *i = NULL;
	struct btree_write_buffered_key *wb;
	int ret = 0;

	if (!trans->nr_updates &&
	    !trans->nr_wb_updates &&
	    !trans->extra_journal_entries.nr)
		goto out_reset;

	if (flags & BTREE_INSERT_GC_LOCK_HELD)
		lockdep_assert_held(&c->gc_lock);

	ret = bch2_trans_commit_run_triggers(trans);
	if (ret)
		goto out_reset;

	trans_for_each_update(trans, i) {
		struct printbuf buf = PRINTBUF;
		enum bkey_invalid_flags invalid_flags = 0;

		if (!(flags & BTREE_INSERT_JOURNAL_REPLAY))
			invalid_flags |= BKEY_INVALID_WRITE|BKEY_INVALID_COMMIT;

		if (unlikely(bch2_bkey_invalid(c, bkey_i_to_s_c(i->k),
					       i->bkey_type, invalid_flags, &buf)))
			ret = bch2_trans_commit_bkey_invalid(trans, invalid_flags, i, &buf);
		btree_insert_entry_checks(trans, i);
		printbuf_exit(&buf);

		if (ret)
			return ret;
	}

	if (unlikely(!test_bit(BCH_FS_MAY_GO_RW, &c->flags))) {
		ret = do_bch2_trans_commit_to_journal_replay(trans);
		goto out_reset;
	}

	if (!(flags & BTREE_INSERT_NOCHECK_RW) &&
	    unlikely(!bch2_write_ref_tryget(c, BCH_WRITE_REF_trans))) {
		ret = bch2_trans_commit_get_rw_cold(trans, flags);
		if (ret)
			goto out_reset;
	}

	if (c->btree_write_buffer.state.nr > c->btree_write_buffer.size / 2 &&
	    mutex_trylock(&c->btree_write_buffer.flush_lock)) {
		bch2_trans_begin(trans);
		bch2_trans_unlock(trans);

		ret = __bch2_btree_write_buffer_flush(trans,
					flags|BTREE_INSERT_NOCHECK_RW, true);
		if (!ret) {
			trace_and_count(c, trans_restart_write_buffer_flush, trans, _THIS_IP_);
			ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_write_buffer_flush);
		}
		goto out;
	}

	EBUG_ON(test_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags));

	trans->journal_u64s		= trans->extra_journal_entries.nr;
	trans->journal_transaction_names = READ_ONCE(c->opts.journal_transaction_names);
	if (trans->journal_transaction_names)
		trans->journal_u64s += jset_u64s(JSET_ENTRY_LOG_U64s);

	trans_for_each_update(trans, i) {
		EBUG_ON(!i->path->should_be_locked);

		ret = bch2_btree_path_upgrade(trans, i->path, i->level + 1);
		if (unlikely(ret))
			goto out;

		EBUG_ON(!btree_node_intent_locked(i->path, i->level));

		if (i->key_cache_already_flushed)
			continue;

		if (i->flags & BTREE_UPDATE_NOJOURNAL)
			continue;

		/* we're going to journal the key being updated: */
		trans->journal_u64s += jset_u64s(i->k->k.u64s);

		/* and we're also going to log the overwrite: */
		if (trans->journal_transaction_names)
			trans->journal_u64s += jset_u64s(i->old_k.u64s);
	}

	trans_for_each_wb_update(trans, wb)
		trans->journal_u64s += jset_u64s(wb->k.k.u64s);

	if (trans->extra_journal_res) {
		ret = bch2_disk_reservation_add(c, trans->disk_res,
				trans->extra_journal_res,
				(flags & BTREE_INSERT_NOFAIL)
				? BCH_DISK_RESERVATION_NOFAIL : 0);
		if (ret)
			goto err;
	}
retry:
	bch2_trans_verify_not_in_restart(trans);
	memset(&trans->journal_res, 0, sizeof(trans->journal_res));

	ret = do_bch2_trans_commit(trans, flags, &i, _RET_IP_);

	/* make sure we didn't drop or screw up locks: */
	bch2_trans_verify_locks(trans);

	if (ret)
		goto err;

	trace_and_count(c, transaction_commit, trans, _RET_IP_);
out:
	if (likely(!(flags & BTREE_INSERT_NOCHECK_RW)))
		bch2_write_ref_put(c, BCH_WRITE_REF_trans);
out_reset:
	bch2_trans_downgrade(trans);
	bch2_trans_reset_updates(trans);

	return ret;
err:
	ret = bch2_trans_commit_error(trans, flags, i, ret, _RET_IP_);
	if (ret)
		goto out;

	goto retry;
}