/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_BTREE_UPDATE_INTERIOR_H
#define _BCACHEFS_BTREE_UPDATE_INTERIOR_H

#include "btree_cache.h"
#include "btree_locking.h"
#include "btree_update.h"

void __bch2_btree_calc_format(struct bkey_format_state *, struct btree *);
bool bch2_btree_node_format_fits(struct bch_fs *c, struct btree *,
				 struct bkey_format *);

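/*
 * Upper bound on the number of nodes one btree_update may allocate or free -
 * roughly two nodes per level for a split that cascades up towards the root,
 * plus the nodes a gc coalesce can involve:
 */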
#define BTREE_UPDATE_NODES_MAX		((BTREE_MAX_DEPTH - 2) * 2 + GC_MERGE_NODES)

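/*
 * Journal reservation, in u64s, for the keys that make this update's new
 * nodes reachable - one btree pointer key per node, plus (presumably) one u64
 * apiece of jset_entry overhead:
 */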
#define BTREE_UPDATE_JOURNAL_RES	(BTREE_UPDATE_NODES_MAX * (BKEY_BTREE_PTR_U64s_MAX + 1))

/*
 * Tracks an in-progress split/rewrite of a btree node and the update to the
 * parent node:
 *
 * When we split/rewrite a node, we do all the updates in memory without
 * waiting for any writes to complete - we allocate the new node(s) and update
 * the parent node, possibly recursively up to the root.
 *
 * The end result is that we have one or more new nodes being written -
 * possibly several, if there were multiple splits - and then a write (updating
 * an interior node) which will make all these new nodes visible.
 *
 * Additionally, as we split/rewrite nodes we free the old nodes - but the old
 * nodes can't be freed (their space on disk can't be reclaimed) until the
 * update to the interior node that makes the new nodes visible completes -
 * until then, the old nodes are still reachable on disk.
 */
struct btree_update {
	struct closure			cl;
	struct bch_fs			*c;
	u64				start_time;

	struct list_head		list;
	struct list_head		unwritten_list;

	/* What kind of update are we doing? */
	enum {
		BTREE_INTERIOR_NO_UPDATE,
		BTREE_INTERIOR_UPDATING_NODE,
		BTREE_INTERIOR_UPDATING_ROOT,
		BTREE_INTERIOR_UPDATING_AS,
	} mode;

	unsigned			nodes_written:1;
	unsigned			took_gc_lock:1;

	enum btree_id			btree_id;
	unsigned			update_level;

	struct disk_reservation		disk_res;

	/*
	 * BTREE_INTERIOR_UPDATING_NODE:
	 * The update that made the new nodes visible was a regular update to an
	 * existing interior node - @b. We can't write out the update to @b
	 * until the new nodes we created are finished writing, so we block @b
	 * from writing by putting this btree_update on the
	 * @b->write_blocked list with @write_blocked_list:
	 */
	struct btree			*b;
	struct list_head		write_blocked_list;

	/*
	 * We may be freeing nodes that were dirty, and thus had journal entries
	 * pinned: we need to transfer the oldest of those pins to the
	 * btree_update operation, and release it when the new node(s)
	 * are all persistent and reachable:
	 */
	struct journal_entry_pin	journal;

	/* Preallocated nodes we reserve when we start the update: */
	struct prealloc_nodes {
		struct btree		*b[BTREE_UPDATE_NODES_MAX];
		unsigned		nr;
	} prealloc_nodes[2];

	/* Nodes being freed: */
	struct keylist			old_keys;
	u64				_old_keys[BTREE_UPDATE_NODES_MAX *
						  BKEY_BTREE_PTR_U64s_MAX];

	/* Nodes being added: */
	struct keylist			new_keys;
	u64				_new_keys[BTREE_UPDATE_NODES_MAX *
						  BKEY_BTREE_PTR_U64s_MAX];

	/* New nodes that will be made reachable by this update: */
	struct btree			*new_nodes[BTREE_UPDATE_NODES_MAX];
	unsigned			nr_new_nodes;

	struct btree			*old_nodes[BTREE_UPDATE_NODES_MAX];
	__le64				old_nodes_seq[BTREE_UPDATE_NODES_MAX];
	unsigned			nr_old_nodes;

	open_bucket_idx_t		open_buckets[BTREE_UPDATE_NODES_MAX *
						     BCH_REPLICAS_MAX];
	open_bucket_idx_t		nr_open_buckets;

	unsigned			journal_u64s;
	u64				journal_entries[BTREE_UPDATE_JOURNAL_RES];

	/* Only here to reduce stack usage on recursive splits: */
	struct keylist			parent_keys;
	/*
	 * Enough room for btree_split's keys without realloc - btree node
	 * pointers never have crc/compression info, so we only need to account
	 * for the pointers for three keys
	 */
	u64				inline_keys[BKEY_BTREE_PTR_U64s_MAX * 3];
};

struct btree *__bch2_btree_node_alloc_replacement(struct btree_update *,
						  struct btree_trans *,
						  struct btree *,
						  struct bkey_format);

int bch2_btree_split_leaf(struct btree_trans *, struct btree_path *, unsigned);

int __bch2_foreground_maybe_merge(struct btree_trans *, struct btree_path *,
				  unsigned, unsigned, enum btree_node_sibling);

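/*
 * Consider merging the node @path points to at @level with its sibling on the
 * given side: a cheap size check against the merge threshold here, the real
 * work happens in __bch2_foreground_maybe_merge():
 */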
static inline int bch2_foreground_maybe_merge_sibling(struct btree_trans *trans,
						      struct btree_path *path,
						      unsigned level, unsigned flags,
						      enum btree_node_sibling sib)
{
	struct btree *b;

	EBUG_ON(!btree_node_locked(path, level));

	b = path->l[level].b;
	if (b->sib_u64s[sib] > trans->c->btree_foreground_merge_threshold)
		return 0;

	return __bch2_foreground_maybe_merge(trans, path, level, flags, sib);
}

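/* Try a merge with the previous sibling first, then with the next: */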
static inline int bch2_foreground_maybe_merge(struct btree_trans *trans,
					      struct btree_path *path,
					      unsigned level,
					      unsigned flags)
{
	return bch2_foreground_maybe_merge_sibling(trans, path, level, flags,
						   btree_prev_sib) ?:
	       bch2_foreground_maybe_merge_sibling(trans, path, level, flags,
						   btree_next_sib);
}

int bch2_btree_node_rewrite(struct btree_trans *, struct btree_iter *,
			    struct btree *, unsigned);
void bch2_btree_node_rewrite_async(struct bch_fs *, struct btree *);
int bch2_btree_node_update_key(struct btree_trans *, struct btree_iter *,
			       struct btree *, struct bkey_i *,
			       unsigned, bool);
int bch2_btree_node_update_key_get_iter(struct btree_trans *, struct btree *,
					struct bkey_i *, unsigned, bool);

void bch2_btree_set_root_for_read(struct bch_fs *, struct btree *);
void bch2_btree_root_alloc(struct bch_fs *, enum btree_id);

static inline unsigned btree_update_reserve_required(struct bch_fs *c,
						     struct btree *b)
{
	unsigned depth = btree_node_root(c, b)->c.level + 1;

	/*
	 * Number of nodes we might have to allocate in a worst case btree
	 * split operation - we split all the way up to the root, then allocate
	 * a new root, unless we're already at max depth:
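	 *
	 * e.g. with the root at level 2 and @b a leaf, depth is 3 and we
	 * reserve (3 - 0) * 2 + 1 = 7 nodes: two new nodes per level for the
	 * splits at levels 0-2, plus one for the new root.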
	 */
	if (depth < BTREE_MAX_DEPTH)
		return (depth - b->c.level) * 2 + 1;
	else
		return (depth - b->c.level) * 2 - 1;
}

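/*
 * sib_u64s[] is (roughly) an estimate of how many u64s a merge with the
 * prev/next sibling would produce, checked against the threshold in
 * bch2_foreground_maybe_merge_sibling() above: reset both estimates to just
 * this node's live u64s:
 */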
static inline void btree_node_reset_sib_u64s(struct btree *b)
{
	b->sib_u64s[0] = b->nr.live_u64s;
	b->sib_u64s[1] = b->nr.live_u64s;
}

static inline void *btree_data_end(struct bch_fs *c, struct btree *b)
{
	return (void *) b->data + btree_bytes(c);
}

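/*
 * Unwritten whiteouts accumulate in a region at the very end of the node
 * buffer, growing downward from btree_data_end() as they're pushed (see
 * push_whiteout() below):
 */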
static inline struct bkey_packed *unwritten_whiteouts_start(struct bch_fs *c,
							    struct btree *b)
{
	return (void *) ((u64 *) btree_data_end(c, b) - b->whiteout_u64s);
}

static inline struct bkey_packed *unwritten_whiteouts_end(struct bch_fs *c,
							  struct btree *b)
{
	return btree_data_end(c, b);
}

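/*
 * b->written is in units of 512 byte sectors: everything before write_block(b)
 * has been written out, anything at or past it exists only in memory:
 */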
static inline void *write_block(struct btree *b)
{
	return (void *) b->data + (b->written << 9);
}

static inline bool __btree_addr_written(struct btree *b, void *p)
{
	return p < write_block(b);
}

static inline bool bset_written(struct btree *b, struct bset *i)
{
	return __btree_addr_written(b, i);
}

static inline bool bkey_written(struct btree *b, struct bkey_packed *k)
{
	return __btree_addr_written(b, k);
}

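/*
 * Space left for new keys, in u64s: everything between @end and the start of
 * the unwritten whiteouts region, minus one u64 of slack for
 * bch2_varint_decode():
 */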
static inline ssize_t __bch_btree_u64s_remaining(struct bch_fs *c,
						 struct btree *b,
						 void *end)
{
	ssize_t used = bset_byte_offset(b, end) / sizeof(u64) +
		b->whiteout_u64s;
	ssize_t total = c->opts.btree_node_size >> 3;

	/* Always leave one extra u64 for bch2_varint_decode: */
	used++;

	return total - used;
}

static inline size_t bch_btree_keys_u64s_remaining(struct bch_fs *c,
						   struct btree *b)
{
	ssize_t remaining = __bch_btree_u64s_remaining(c, b,
				btree_bkey_last(b, bset_tree_last(b)));

	BUG_ON(remaining < 0);

	if (bset_written(b, btree_bset_last(b)))
		return 0;

	return remaining;
}

#define BTREE_WRITE_SET_U64s_BITS	9

static inline unsigned btree_write_set_buffer(struct btree *b)
{
	/*
	 * Could buffer up larger amounts of keys for btrees with larger keys,
	 * pending benchmarking:
	 */
	return 8 << BTREE_WRITE_SET_U64s_BITS;
}

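/*
 * Where a new bset would go in @b - either the first block not yet written,
 * or just past the current in-memory bset - if starting one now looks
 * worthwhile; NULL otherwise:
 */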
static inline struct btree_node_entry *want_new_bset(struct bch_fs *c,
						     struct btree *b)
{
	struct bset_tree *t = bset_tree_last(b);
	struct btree_node_entry *bne = max(write_block(b),
			(void *) btree_bkey_last(b, bset_tree_last(b)));
	ssize_t remaining_space =
		__bch_btree_u64s_remaining(c, b, bne->keys.start);

	if (unlikely(bset_written(b, bset(b, t)))) {
		if (remaining_space > (ssize_t) (block_bytes(c) >> 3))
			return bne;
	} else {
		if (unlikely(bset_u64s(t) * sizeof(u64) > btree_write_set_buffer(b)) &&
		    remaining_space > (ssize_t) (btree_write_set_buffer(b) >> 3))
			return bne;
	}

	return NULL;
}

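/*
 * Append a whiteout for @pos to the unwritten whiteouts region: pack @pos as
 * a deleted key (falling back to an unpacked key if it won't pack), mark it
 * as needing a whiteout, and prepend it at unwritten_whiteouts_start():
 */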
static inline void push_whiteout(struct bch_fs *c, struct btree *b,
				 struct bpos pos)
{
	struct bkey_packed k;

	BUG_ON(bch_btree_keys_u64s_remaining(c, b) < BKEY_U64s);
	EBUG_ON(btree_node_just_written(b));

	if (!bkey_pack_pos(&k, pos, b)) {
		struct bkey *u = (void *) &k;

		bkey_init(u);
		u->p = pos;
	}

	k.needs_whiteout = true;

	b->whiteout_u64s += k.u64s;
	bkey_p_copy(unwritten_whiteouts_start(c, b), &k);
}

/*
 * write lock must be held on @b (else the dirty bset that we were going to
 * insert into could be written out from under us)
 */
static inline bool bch2_btree_node_insert_fits(struct bch_fs *c,
					       struct btree *b, unsigned u64s)
{
	if (unlikely(btree_node_need_rewrite(b)))
		return false;

	return u64s <= bch_btree_keys_u64s_remaining(c, b);
}

void bch2_btree_updates_to_text(struct printbuf *, struct bch_fs *);

bool bch2_btree_interior_updates_flush(struct bch_fs *);

void bch2_journal_entry_to_btree_root(struct bch_fs *, struct jset_entry *);
struct jset_entry *bch2_btree_roots_to_journal_entries(struct bch_fs *,
					struct jset_entry *, unsigned long);

void bch2_do_pending_node_rewrites(struct bch_fs *);
void bch2_free_pending_node_rewrites(struct bch_fs *);

void bch2_fs_btree_interior_update_exit(struct bch_fs *);
void bch2_fs_btree_interior_update_init_early(struct bch_fs *);
int bch2_fs_btree_interior_update_init(struct bch_fs *);

#endif /* _BCACHEFS_BTREE_UPDATE_INTERIOR_H */