]>
Commit | Line | Data |
---|---|---|
1c6fdbd8 KO |
1 | // SPDX-License-Identifier: GPL-2.0 |
2 | ||
3 | #include "bcachefs.h" | |
07a1006a | 4 | #include "bkey_buf.h" |
1c6fdbd8 KO |
5 | #include "btree_cache.h" |
6 | #include "btree_io.h" | |
7 | #include "btree_iter.h" | |
8 | #include "btree_locking.h" | |
9 | #include "debug.h" | |
549d173c | 10 | #include "errcode.h" |
a0b73c1c | 11 | #include "error.h" |
0117591e | 12 | #include "journal.h" |
1c6fdbd8 KO |
13 | #include "trace.h" |
14 | ||
15 | #include <linux/prefetch.h> | |
e0dfc08b | 16 | #include <linux/sched/mm.h> |
1c6fdbd8 | 17 | |
de517c95 KO |
/*
 * Human-readable names for the BTREE_NODE_* flag bits, generated from the
 * BTREE_FLAGS() x-macro; NULL-terminated for iteration.
 */
const char * const bch2_btree_node_flags[] = {
#define x(f)	#f,
	BTREE_FLAGS()
#undef x
	NULL
};
24 | ||
1c6fdbd8 KO |
25 | void bch2_recalc_btree_reserve(struct bch_fs *c) |
26 | { | |
27 | unsigned i, reserve = 16; | |
28 | ||
faa6cb6c | 29 | if (!c->btree_roots_known[0].b) |
1c6fdbd8 KO |
30 | reserve += 8; |
31 | ||
faa6cb6c KO |
32 | for (i = 0; i < btree_id_nr_alive(c); i++) { |
33 | struct btree_root *r = bch2_btree_id_root(c, i); | |
34 | ||
35 | if (r->b) | |
36 | reserve += min_t(unsigned, 1, r->b->c.level) * 8; | |
37 | } | |
1c6fdbd8 KO |
38 | |
39 | c->btree_cache.reserve = reserve; | |
40 | } | |
41 | ||
42 | static inline unsigned btree_cache_can_free(struct btree_cache *bc) | |
43 | { | |
44 | return max_t(int, 0, bc->used - bc->reserve); | |
45 | } | |
46 | ||
30985537 KO |
47 | static void btree_node_to_freedlist(struct btree_cache *bc, struct btree *b) |
48 | { | |
49 | if (b->c.lock.readers) | |
50 | list_move(&b->list, &bc->freed_pcpu); | |
51 | else | |
52 | list_move(&b->list, &bc->freed_nonpcpu); | |
53 | } | |
54 | ||
/*
 * Free a node's data buffers (but not struct btree itself) and move it to
 * the freed list.  Caller must hold bc->lock and the node's write lock.
 */
static void btree_node_data_free(struct bch_fs *c, struct btree *b)
{
	struct btree_cache *bc = &c->btree_cache;

	EBUG_ON(btree_node_write_in_flight(b));

	clear_btree_node_just_written(b);

	kvpfree(b->data, btree_bytes(c));
	b->data = NULL;
#ifdef __KERNEL__
	kvfree(b->aux_data);
#else
	/* userspace build: aux_data came from mmap(), not kvmalloc() */
	munmap(b->aux_data, btree_aux_data_bytes(b));
#endif
	b->aux_data = NULL;

	bc->used--;

	btree_node_to_freedlist(bc, b);
}
76 | ||
77 | static int bch2_btree_cache_cmp_fn(struct rhashtable_compare_arg *arg, | |
78 | const void *obj) | |
79 | { | |
80 | const struct btree *b = obj; | |
81 | const u64 *v = arg->key; | |
82 | ||
237e8048 | 83 | return b->hash_val == *v ? 0 : 1; |
1c6fdbd8 KO |
84 | } |
85 | ||
/* rhashtable configuration for the btree node cache, keyed on hash_val. */
static const struct rhashtable_params bch_btree_cache_params = {
	.head_offset	= offsetof(struct btree, hash),
	.key_offset	= offsetof(struct btree, hash_val),
	.key_len	= sizeof(u64),
	.obj_cmpfn	= bch2_btree_cache_cmp_fn,
};
92 | ||
/*
 * Allocate the data and aux_data buffers for a node whose struct btree
 * already exists.  On failure both pointers are left NULL so the caller
 * can retry or free the node.  Returns 0 or -BCH_ERR_ENOMEM_*.
 */
static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp)
{
	BUG_ON(b->data || b->aux_data);

	b->data = kvpmalloc(btree_bytes(c), gfp);
	if (!b->data)
		return -BCH_ERR_ENOMEM_btree_node_mem_alloc;
#ifdef __KERNEL__
	b->aux_data = kvmalloc(btree_aux_data_bytes(b), gfp);
#else
	/* userspace build: mmap so it can be munmap()ed in the free path */
	b->aux_data = mmap(NULL, btree_aux_data_bytes(b),
			   PROT_READ|PROT_WRITE|PROT_EXEC,
			   MAP_PRIVATE|MAP_ANONYMOUS, 0, 0);
	if (b->aux_data == MAP_FAILED)
		b->aux_data = NULL;
#endif
	if (!b->aux_data) {
		/* unwind the data allocation so state stays consistent */
		kvpfree(b->data, btree_bytes(c));
		b->data = NULL;
		return -BCH_ERR_ENOMEM_btree_node_mem_alloc;
	}

	return 0;
}
117 | ||
c36ff038 | 118 | static struct btree *__btree_node_mem_alloc(struct bch_fs *c, gfp_t gfp) |
e38821f3 | 119 | { |
a1019576 KO |
120 | struct btree *b; |
121 | ||
122 | b = kzalloc(sizeof(struct btree), gfp); | |
1c6fdbd8 KO |
123 | if (!b) |
124 | return NULL; | |
125 | ||
26609b61 | 126 | bkey_btree_ptr_init(&b->key); |
1c6fdbd8 KO |
127 | INIT_LIST_HEAD(&b->list); |
128 | INIT_LIST_HEAD(&b->write_blocked); | |
4580baec KO |
129 | b->byte_order = ilog2(btree_bytes(c)); |
130 | return b; | |
131 | } | |
132 | ||
/*
 * Allocate a fully-formed cached btree node (struct + data buffers + lock)
 * and put it on the freeable list.  Returns NULL on allocation failure.
 */
struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *c)
{
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b;

	b = __btree_node_mem_alloc(c, GFP_KERNEL);
	if (!b)
		return NULL;

	if (btree_node_data_alloc(c, b, GFP_KERNEL)) {
		kfree(b);
		return NULL;
	}

	bch2_btree_lock_init(&b->c, 0);

	bc->used++;
	list_add(&b->list, &bc->freeable);
	return b;
}
153 | ||
154 | /* Btree in memory cache - hash table */ | |
155 | ||
156 | void bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b) | |
157 | { | |
cab8e233 | 158 | int ret = rhashtable_remove_fast(&bc->table, &b->hash, bch_btree_cache_params); |
a1019576 | 159 | |
cab8e233 | 160 | BUG_ON(ret); |
1c6fdbd8 KO |
161 | |
162 | /* Cause future lookups for this node to fail: */ | |
237e8048 | 163 | b->hash_val = 0; |
1c6fdbd8 KO |
164 | } |
165 | ||
/*
 * Insert a node into the hash table, computing hash_val from its key.
 * Returns the rhashtable error (e.g. -EEXIST if we raced with another
 * insert of the same key).
 */
int __bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b)
{
	BUG_ON(b->hash_val);
	b->hash_val = btree_ptr_hash_val(&b->key);

	return rhashtable_lookup_insert_fast(&bc->table, &b->hash,
					     bch_btree_cache_params);
}
174 | ||
175 | int bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b, | |
176 | unsigned level, enum btree_id id) | |
177 | { | |
178 | int ret; | |
179 | ||
c43a6ef9 KO |
180 | b->c.level = level; |
181 | b->c.btree_id = id; | |
1c6fdbd8 KO |
182 | |
183 | mutex_lock(&bc->lock); | |
184 | ret = __bch2_btree_node_hash_insert(bc, b); | |
185 | if (!ret) | |
b5ac23c4 | 186 | list_add_tail(&b->list, &bc->live); |
1c6fdbd8 KO |
187 | mutex_unlock(&bc->lock); |
188 | ||
189 | return ret; | |
190 | } | |
191 | ||
/* Look up a cached node by the hash of its pointer key; NULL if absent. */
__flatten
static inline struct btree *btree_cache_find(struct btree_cache *bc,
					     const struct bkey_i *k)
{
	u64 v = btree_ptr_hash_val(k);

	return rhashtable_lookup_fast(&bc->table, &v, bch_btree_cache_params);
}
200 | ||
/*
 * this version is for btree nodes that have already been freed (we're not
 * reaping a real btree node)
 *
 * Try to take exclusive ownership of @b for reclaim: trylock intent+write,
 * wait for (or bail on) in-flight IO, optionally flush a dirty node.
 * Returns 0 with both locks held on success, -BCH_ERR_ENOMEM_* otherwise.
 * Caller holds bc->lock throughout (including across IO waits — see XXX).
 */
static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush)
{
	struct btree_cache *bc = &c->btree_cache;
	int ret = 0;

	lockdep_assert_held(&bc->lock);
wait_on_io:
	if (b->flags & ((1U << BTREE_NODE_dirty)|
			(1U << BTREE_NODE_read_in_flight)|
			(1U << BTREE_NODE_write_in_flight))) {
		if (!flush)
			return -BCH_ERR_ENOMEM_btree_node_reclaim;

		/* XXX: waiting on IO with btree cache lock held */
		bch2_btree_node_wait_on_read(b);
		bch2_btree_node_wait_on_write(b);
	}

	if (!six_trylock_intent(&b->c.lock))
		return -BCH_ERR_ENOMEM_btree_node_reclaim;

	if (!six_trylock_write(&b->c.lock))
		goto out_unlock_intent;

	/* recheck under lock */
	if (b->flags & ((1U << BTREE_NODE_read_in_flight)|
			(1U << BTREE_NODE_write_in_flight))) {
		if (!flush)
			goto out_unlock;
		/* drop locks before waiting, then start over */
		six_unlock_write(&b->c.lock);
		six_unlock_intent(&b->c.lock);
		goto wait_on_io;
	}

	if (btree_node_noevict(b) ||
	    btree_node_write_blocked(b) ||
	    btree_node_will_make_reachable(b))
		goto out_unlock;

	if (btree_node_dirty(b)) {
		if (!flush)
			goto out_unlock;
		/*
		 * Using the underscore version because we don't want to compact
		 * bsets after the write, since this node is about to be evicted
		 * - unless btree verify mode is enabled, since it runs out of
		 * the post write cleanup:
		 */
		if (bch2_verify_btree_ondisk)
			bch2_btree_node_write(c, b, SIX_LOCK_intent,
					      BTREE_WRITE_cache_reclaim);
		else
			__bch2_btree_node_write(c, b,
						BTREE_WRITE_cache_reclaim);

		six_unlock_write(&b->c.lock);
		six_unlock_intent(&b->c.lock);
		goto wait_on_io;
	}
out:
	if (b->hash_val && !ret)
		trace_and_count(c, btree_cache_reap, c, b);
	return ret;
out_unlock:
	six_unlock_write(&b->c.lock);
out_unlock_intent:
	six_unlock_intent(&b->c.lock);
	ret = -BCH_ERR_ENOMEM_btree_node_reclaim;
	goto out;
}
275 | ||
/* Non-flushing reclaim: fail rather than wait on IO or write out dirty data. */
static int btree_node_reclaim(struct bch_fs *c, struct btree *b)
{
	return __btree_node_reclaim(c, b, false);
}
280 | ||
/* Flushing reclaim: wait on IO and write out dirty nodes as needed. */
static int btree_node_write_and_reclaim(struct bch_fs *c, struct btree *b)
{
	return __btree_node_reclaim(c, b, true);
}
285 | ||
/*
 * Shrinker scan callback: free up to sc->nr_to_scan cached btree nodes,
 * never dipping below the reserve.  Scans the freeable list first, then
 * the live LRU; may also trigger writeback of dirty nodes when the cache
 * is mostly dirty, temporarily dropping bc->lock to do so.
 * Returns the number of nodes freed (or SHRINK_STOP if disabled).
 */
static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
					   struct shrink_control *sc)
{
	struct bch_fs *c = shrink->private_data;
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b, *t;
	unsigned long nr = sc->nr_to_scan;
	unsigned long can_free = 0;
	unsigned long freed = 0;
	unsigned long touched = 0;
	unsigned i, flags;
	unsigned long ret = SHRINK_STOP;
	/* if most of the cache is dirty, write back instead of just failing reclaim */
	bool trigger_writes = atomic_read(&bc->dirty) + nr >=
		bc->used * 3 / 4;

	if (bch2_btree_shrinker_disabled)
		return SHRINK_STOP;

	mutex_lock(&bc->lock);
	flags = memalloc_nofs_save();

	/*
	 * It's _really_ critical that we don't free too many btree nodes - we
	 * have to always leave ourselves a reserve. The reserve is how we
	 * guarantee that allocating memory for a new btree node can always
	 * succeed, so that inserting keys into the btree can always succeed and
	 * IO can always make forward progress:
	 */
	can_free = btree_cache_can_free(bc);
	nr = min_t(unsigned long, nr, can_free);

	i = 0;
	list_for_each_entry_safe(b, t, &bc->freeable, list) {
		/*
		 * Leave a few nodes on the freeable list, so that a btree split
		 * won't have to hit the system allocator:
		 */
		if (++i <= 3)
			continue;

		touched++;

		if (touched >= nr)
			goto out;

		if (!btree_node_reclaim(c, b)) {
			btree_node_data_free(c, b);
			six_unlock_write(&b->c.lock);
			six_unlock_intent(&b->c.lock);
			freed++;
		}
	}
restart:
	list_for_each_entry_safe(b, t, &bc->live, list) {
		touched++;

		if (btree_node_accessed(b)) {
			/* second-chance LRU: clear the bit, skip this pass */
			clear_btree_node_accessed(b);
		} else if (!btree_node_reclaim(c, b)) {
			freed++;
			btree_node_data_free(c, b);

			bch2_btree_node_hash_remove(bc, b);
			six_unlock_write(&b->c.lock);
			six_unlock_intent(&b->c.lock);

			if (freed == nr)
				goto out_rotate;
		} else if (trigger_writes &&
			   btree_node_dirty(b) &&
			   !btree_node_will_make_reachable(b) &&
			   !btree_node_write_blocked(b) &&
			   six_trylock_read(&b->c.lock)) {
			/*
			 * Move the list head next to b so the restart below
			 * resumes scanning from here; then drop bc->lock to
			 * issue the write.
			 */
			list_move(&bc->live, &b->list);
			mutex_unlock(&bc->lock);
			__bch2_btree_node_write(c, b, BTREE_WRITE_cache_reclaim);
			six_unlock_read(&b->c.lock);
			if (touched >= nr)
				goto out_nounlock;
			mutex_lock(&bc->lock);
			goto restart;
		}

		if (touched >= nr)
			break;
	}
out_rotate:
	/* rotate the LRU head to where we stopped scanning */
	if (&t->list != &bc->live)
		list_move_tail(&bc->live, &t->list);
out:
	mutex_unlock(&bc->lock);
out_nounlock:
	ret = freed;
	memalloc_nofs_restore(flags);
	trace_and_count(c, btree_cache_scan, sc->nr_to_scan, can_free, ret);
	return ret;
}
383 | ||
384 | static unsigned long bch2_btree_cache_count(struct shrinker *shrink, | |
385 | struct shrink_control *sc) | |
386 | { | |
ecae0bd5 | 387 | struct bch_fs *c = shrink->private_data; |
1c6fdbd8 KO |
388 | struct btree_cache *bc = &c->btree_cache; |
389 | ||
29364f34 | 390 | if (bch2_btree_shrinker_disabled) |
1c6fdbd8 KO |
391 | return 0; |
392 | ||
7c7e071d | 393 | return btree_cache_can_free(bc); |
1c6fdbd8 KO |
394 | } |
395 | ||
/*
 * Tear down the btree node cache: unregister the shrinker, gather every
 * outstanding node (verify node, roots, freeable) onto the live list,
 * free their data buffers, then free the struct btrees on the freed lists
 * and destroy the hash table.
 */
void bch2_fs_btree_cache_exit(struct bch_fs *c)
{
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b;
	unsigned i, flags;

	shrinker_free(bc->shrink);

	/* vfree() can allocate memory: */
	flags = memalloc_nofs_save();
	mutex_lock(&bc->lock);

	if (c->verify_data)
		list_move(&c->verify_data->list, &bc->live);

	kvpfree(c->verify_ondisk, btree_bytes(c));

	/* root nodes aren't on the live list; add them so they get freed too */
	for (i = 0; i < btree_id_nr_alive(c); i++) {
		struct btree_root *r = bch2_btree_id_root(c, i);

		if (r->b)
			list_add(&r->b->list, &bc->live);
	}

	list_splice(&bc->freeable, &bc->live);

	while (!list_empty(&bc->live)) {
		b = list_first_entry(&bc->live, struct btree, list);

		BUG_ON(btree_node_read_in_flight(b) ||
		       btree_node_write_in_flight(b));

		btree_node_data_free(c, b);
	}

	/* all dirty nodes should have been written out unless the journal errored */
	BUG_ON(!bch2_journal_error(&c->journal) &&
	       atomic_read(&c->btree_cache.dirty));

	list_splice(&bc->freed_pcpu, &bc->freed_nonpcpu);

	while (!list_empty(&bc->freed_nonpcpu)) {
		b = list_first_entry(&bc->freed_nonpcpu, struct btree, list);
		list_del(&b->list);
		six_lock_exit(&b->c.lock);
		kfree(b);
	}

	mutex_unlock(&bc->lock);
	memalloc_nofs_restore(flags);

	if (bc->table_init_done)
		rhashtable_destroy(&bc->table);
}
449 | ||
/*
 * Set up the btree node cache: hash table, reserve node preallocation,
 * and shrinker registration.  On error, partially allocated state is left
 * for bch2_fs_btree_cache_exit() to clean up.
 * Returns 0 or -BCH_ERR_ENOMEM_fs_btree_cache_init.
 */
int bch2_fs_btree_cache_init(struct bch_fs *c)
{
	struct btree_cache *bc = &c->btree_cache;
	struct shrinker *shrink;
	unsigned i;
	int ret = 0;

	ret = rhashtable_init(&bc->table, &bch_btree_cache_params);
	if (ret)
		goto err;

	bc->table_init_done = true;

	bch2_recalc_btree_reserve(c);

	for (i = 0; i < bc->reserve; i++)
		if (!__bch2_btree_node_mem_alloc(c))
			goto err;

	/* preallocated nodes start out on the freeable list, not live */
	list_splice_init(&bc->live, &bc->freeable);

	mutex_init(&c->verify_lock);

	shrink = shrinker_alloc(0, "%s-btree_cache", c->name);
	if (!shrink)
		goto err;
	bc->shrink = shrink;
	shrink->count_objects	= bch2_btree_cache_count;
	shrink->scan_objects	= bch2_btree_cache_scan;
	shrink->seeks		= 4;
	shrink->private_data	= c;
	shrinker_register(shrink);

	return 0;
err:
	return -BCH_ERR_ENOMEM_fs_btree_cache_init;
}
487 | ||
/* Early init: just the lock and list heads, safe before any allocation. */
void bch2_fs_btree_cache_init_early(struct btree_cache *bc)
{
	mutex_init(&bc->lock);
	INIT_LIST_HEAD(&bc->live);
	INIT_LIST_HEAD(&bc->freeable);
	INIT_LIST_HEAD(&bc->freed_pcpu);
	INIT_LIST_HEAD(&bc->freed_nonpcpu);
}
496 | ||
/*
 * We can only have one thread cannibalizing other cached btree nodes at a time,
 * or we'll deadlock. We use an open coded mutex to ensure that, which a
 * cannibalize_bucket() will take. This means every time we unlock the root of
 * the btree, we need to release this lock if we have it held.
 */
void bch2_btree_cache_cannibalize_unlock(struct bch_fs *c)
{
	struct btree_cache *bc = &c->btree_cache;

	/* only the owner releases; no-op for everyone else */
	if (bc->alloc_lock == current) {
		trace_and_count(c, btree_cache_cannibalize_unlock, c);
		bc->alloc_lock = NULL;
		closure_wake_up(&bc->alloc_wait);
	}
}
513 | ||
/*
 * Take the cannibalize lock (an open-coded mutex via cmpxchg on
 * bc->alloc_lock).  If it's contended and @cl is NULL, fail immediately;
 * otherwise register @cl on the waitlist and re-check to close the race
 * between the first cmpxchg and closure_wait().
 * Returns 0, -BCH_ERR_ENOMEM_btree_cache_cannibalize_lock, or
 * -BCH_ERR_btree_cache_cannibalize_lock_blocked (caller will be woken).
 */
int bch2_btree_cache_cannibalize_lock(struct bch_fs *c, struct closure *cl)
{
	struct btree_cache *bc = &c->btree_cache;
	struct task_struct *old;

	old = cmpxchg(&bc->alloc_lock, NULL, current);
	if (old == NULL || old == current)
		goto success;

	if (!cl) {
		trace_and_count(c, btree_cache_cannibalize_lock_fail, c);
		return -BCH_ERR_ENOMEM_btree_cache_cannibalize_lock;
	}

	closure_wait(&bc->alloc_wait, cl);

	/* Try again, after adding ourselves to waitlist */
	old = cmpxchg(&bc->alloc_lock, NULL, current);
	if (old == NULL || old == current) {
		/* We raced */
		closure_wake_up(&bc->alloc_wait);
		goto success;
	}

	trace_and_count(c, btree_cache_cannibalize_lock_fail, c);
	return -BCH_ERR_btree_cache_cannibalize_lock_blocked;

success:
	trace_and_count(c, btree_cache_cannibalize_lock, c);
	return 0;
}
545 | ||
/*
 * Steal a node from the live list for reuse.  First pass tries cheap
 * (non-flushing) reclaim; then it loops doing flushing reclaim.  Only
 * called with the cannibalize lock held, so eventually a node must come
 * free; if all nodes are intent-locked we busy-wait (rare).
 */
static struct btree *btree_node_cannibalize(struct bch_fs *c)
{
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b;

	list_for_each_entry_reverse(b, &bc->live, list)
		if (!btree_node_reclaim(c, b))
			return b;

	while (1) {
		list_for_each_entry_reverse(b, &bc->live, list)
			if (!btree_node_write_and_reclaim(c, b))
				return b;

		/*
		 * Rare case: all nodes were intent-locked.
		 * Just busy-wait.
		 */
		WARN_ONCE(1, "btree cache cannibalize failed\n");
		cond_resched();
	}
}
568 | ||
/*
 * Allocate a btree node for use, trying in order: a struct from the freed
 * list, then a fresh kzalloc (GFP_NOWAIT first, GFP_KERNEL after dropping
 * trans locks); then data buffers from the freeable list or fresh
 * allocation.  Falls back to cannibalizing a live node if the caller holds
 * the cannibalize lock.  Returns the node locked intent+write, or
 * ERR_PTR(-BCH_ERR_ENOMEM_btree_node_mem_alloc).
 */
struct btree *bch2_btree_node_mem_alloc(struct btree_trans *trans, bool pcpu_read_locks)
{
	struct bch_fs *c = trans->c;
	struct btree_cache *bc = &c->btree_cache;
	struct list_head *freed = pcpu_read_locks
		? &bc->freed_pcpu
		: &bc->freed_nonpcpu;
	struct btree *b, *b2;
	u64 start_time = local_clock();
	unsigned flags;

	flags = memalloc_nofs_save();
	mutex_lock(&bc->lock);

	/*
	 * We never free struct btree itself, just the memory that holds the on
	 * disk node. Check the freed list before allocating a new one:
	 */
	list_for_each_entry(b, freed, list)
		if (!btree_node_reclaim(c, b)) {
			/* reclaim succeeded: b is now intent+write locked */
			list_del_init(&b->list);
			goto got_node;
		}

	b = __btree_node_mem_alloc(c, GFP_NOWAIT|__GFP_NOWARN);
	if (!b) {
		/* drop locks before a blocking GFP_KERNEL allocation */
		mutex_unlock(&bc->lock);
		bch2_trans_unlock(trans);
		b = __btree_node_mem_alloc(c, GFP_KERNEL);
		if (!b)
			goto err;
		mutex_lock(&bc->lock);
	}

	bch2_btree_lock_init(&b->c, pcpu_read_locks ? SIX_LOCK_INIT_PCPU : 0);

	/* freshly initialized lock: trylocks cannot fail */
	BUG_ON(!six_trylock_intent(&b->c.lock));
	BUG_ON(!six_trylock_write(&b->c.lock));
got_node:

	/*
	 * btree_free() doesn't free memory; it sticks the node on the end of
	 * the list. Check if there's any freed nodes there:
	 */
	list_for_each_entry(b2, &bc->freeable, list)
		if (!btree_node_reclaim(c, b2)) {
			/* take b2's buffers, retire its struct to the freed list */
			swap(b->data, b2->data);
			swap(b->aux_data, b2->aux_data);
			btree_node_to_freedlist(bc, b2);
			six_unlock_write(&b2->c.lock);
			six_unlock_intent(&b2->c.lock);
			goto got_mem;
		}

	mutex_unlock(&bc->lock);

	if (btree_node_data_alloc(c, b, GFP_NOWAIT|__GFP_NOWARN)) {
		bch2_trans_unlock(trans);
		if (btree_node_data_alloc(c, b, GFP_KERNEL|__GFP_NOWARN))
			goto err;
	}

	mutex_lock(&bc->lock);
	bc->used++;
got_mem:
	mutex_unlock(&bc->lock);

	BUG_ON(btree_node_hashed(b));
	BUG_ON(btree_node_dirty(b));
	BUG_ON(btree_node_write_in_flight(b));
out:
	/* reset all per-node state before handing the node to the caller */
	b->flags		= 0;
	b->written		= 0;
	b->nsets		= 0;
	b->sib_u64s[0]		= 0;
	b->sib_u64s[1]		= 0;
	b->whiteout_u64s	= 0;
	bch2_btree_keys_init(b);
	set_btree_node_accessed(b);

	bch2_time_stats_update(&c->times[BCH_TIME_btree_node_mem_alloc],
			       start_time);

	memalloc_nofs_restore(flags);
	return b;
err:
	mutex_lock(&bc->lock);

	/* Try to cannibalize another cached btree node: */
	if (bc->alloc_lock == current) {
		b2 = btree_node_cannibalize(c);
		clear_btree_node_just_written(b2);
		bch2_btree_node_hash_remove(bc, b2);

		if (b) {
			/* we have a struct but no buffers: take b2's buffers */
			swap(b->data, b2->data);
			swap(b->aux_data, b2->aux_data);
			btree_node_to_freedlist(bc, b2);
			six_unlock_write(&b2->c.lock);
			six_unlock_intent(&b2->c.lock);
		} else {
			/* no struct either: use the cannibalized node wholesale */
			b = b2;
			list_del_init(&b->list);
		}

		mutex_unlock(&bc->lock);

		trace_and_count(c, btree_cache_cannibalize, c);
		goto out;
	}

	mutex_unlock(&bc->lock);
	memalloc_nofs_restore(flags);
	return ERR_PTR(-BCH_ERR_ENOMEM_btree_node_mem_alloc);
}
684 | ||
/* Slowpath, don't want it inlined into btree_iter_traverse() */
/*
 * Read a btree node that wasn't in the cache: allocate a node, hash it,
 * issue the read, and (if @sync) relock and return it with @lock_type held.
 * Returns NULL if we raced with another fill or @sync is false; otherwise
 * the node or an ERR_PTR (transaction restart or IO/alloc error).
 */
static noinline struct btree *bch2_btree_node_fill(struct btree_trans *trans,
				struct btree_path *path,
				const struct bkey_i *k,
				enum btree_id btree_id,
				unsigned level,
				enum six_lock_type lock_type,
				bool sync)
{
	struct bch_fs *c = trans->c;
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b;
	u32 seq;

	BUG_ON(level + 1 >= BTREE_MAX_DEPTH);
	/*
	 * Parent node must be locked, else we could read in a btree node that's
	 * been freed:
	 */
	if (path && !bch2_btree_node_relock(trans, path, level + 1)) {
		trace_and_count(c, trans_restart_relock_parent_for_fill, trans, _THIS_IP_, path);
		return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_fill_relock));
	}

	b = bch2_btree_node_mem_alloc(trans, level != 0);

	if (bch2_err_matches(PTR_ERR_OR_ZERO(b), ENOMEM)) {
		/* remember the failure so the caller can take the cannibalize lock */
		trans->memory_allocation_failure = true;
		trace_and_count(c, trans_restart_memory_allocation_failure, trans, _THIS_IP_, path);
		return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_fill_mem_alloc_fail));
	}

	if (IS_ERR(b))
		return b;

	/*
	 * Btree nodes read in from disk should not have the accessed bit set
	 * initially, so that linear scans don't thrash the cache:
	 */
	clear_btree_node_accessed(b);

	bkey_copy(&b->key, k);
	if (bch2_btree_node_hash_insert(bc, b, level, btree_id)) {
		/* raced with another fill: */

		/* mark as unhashed... */
		b->hash_val = 0;

		mutex_lock(&bc->lock);
		list_add(&b->list, &bc->freeable);
		mutex_unlock(&bc->lock);

		six_unlock_write(&b->c.lock);
		six_unlock_intent(&b->c.lock);
		return NULL;
	}

	set_btree_node_read_in_flight(b);

	/* capture the lock sequence so we can relock after the read */
	six_unlock_write(&b->c.lock);
	seq = six_lock_seq(&b->c.lock);
	six_unlock_intent(&b->c.lock);

	/* Unlock before doing IO: */
	if (path && sync)
		bch2_trans_unlock_noassert(trans);

	bch2_btree_node_read(c, b, sync);

	if (!sync)
		return NULL;

	if (path) {
		int ret = bch2_trans_relock(trans) ?:
			bch2_btree_path_relock_intent(trans, path);
		if (ret) {
			BUG_ON(!trans->restarted);
			return ERR_PTR(ret);
		}
	}

	if (!six_relock_type(&b->c.lock, lock_type, seq)) {
		if (path)
			trace_and_count(c, trans_restart_relock_after_fill, trans, _THIS_IP_, path);
		return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_after_fill));
	}

	return b;
}
774 | ||
537c32f5 KO |
/*
 * Report a node whose on-disk header disagrees with the pointer we used to
 * read it.  Silent during early recovery passes (up through
 * check_allocations), where such mismatches are expected to be repaired.
 */
static noinline void btree_bad_header(struct bch_fs *c, struct btree *b)
{
	struct printbuf buf = PRINTBUF;

	if (c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_allocations)
		return;

	prt_printf(&buf,
	       "btree node header doesn't match ptr\n"
	       "btree %s level %u\n"
	       "ptr: ",
	       bch2_btree_id_str(b->c.btree_id), b->c.level);
	bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));

	prt_printf(&buf, "\nheader: btree %s level %llu\n"
	       "min ",
	       bch2_btree_id_str(BTREE_NODE_ID(b->data)),
	       BTREE_NODE_LEVEL(b->data));
	bch2_bpos_to_text(&buf, b->data->min_key);

	prt_printf(&buf, "\nmax ");
	bch2_bpos_to_text(&buf, b->data->max_key);

	bch2_fs_inconsistent(c, "%s", buf.buf);
	printbuf_exit(&buf);
}
801 | ||
/*
 * Sanity-check that a node's in-memory identity (btree id, level, key
 * range) matches its on-disk header; report via btree_bad_header() if not.
 * min_key is only checked for btree_ptr_v2 keys, which carry it.
 */
static inline void btree_check_header(struct bch_fs *c, struct btree *b)
{
	if (b->c.btree_id != BTREE_NODE_ID(b->data) ||
	    b->c.level != BTREE_NODE_LEVEL(b->data) ||
	    !bpos_eq(b->data->max_key, b->key.k.p) ||
	    (b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
	     !bpos_eq(b->data->min_key,
		      bkey_i_to_btree_ptr_v2(&b->key)->v.min_key)))
		btree_bad_header(c, b);
}
812 | ||
001783e2 KO |
/*
 * Look up a btree node in the cache (filling from disk if absent), lock it
 * with @lock_type, and verify it still matches @k.  May restart the
 * transaction (returned as ERR_PTR) if locks couldn't be taken or the node
 * was reused underneath us; returns ERR_PTR(-EIO) on read error.
 */
static struct btree *__bch2_btree_node_get(struct btree_trans *trans, struct btree_path *path,
					   const struct bkey_i *k, unsigned level,
					   enum six_lock_type lock_type,
					   unsigned long trace_ip)
{
	struct bch_fs *c = trans->c;
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b;
	struct bset_tree *t;
	bool need_relock = false;
	int ret;

	EBUG_ON(level >= BTREE_MAX_DEPTH);
retry:
	b = btree_cache_find(bc, k);
	if (unlikely(!b)) {
		/*
		 * We must have the parent locked to call bch2_btree_node_fill(),
		 * else we could read in a btree node from disk that's been
		 * freed:
		 */
		b = bch2_btree_node_fill(trans, path, k, path->btree_id,
					 level, lock_type, true);
		need_relock = true;

		/* We raced and found the btree node in the cache */
		if (!b)
			goto retry;

		if (IS_ERR(b))
			return b;
	} else {
		if (btree_node_read_locked(path, level + 1))
			btree_node_unlock(trans, path, level + 1);

		ret = btree_node_lock(trans, path, &b->c, level, lock_type, trace_ip);
		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
			return ERR_PTR(ret);

		BUG_ON(ret);

		/* node may have been freed/reused between lookup and lock */
		if (unlikely(b->hash_val != btree_ptr_hash_val(k) ||
			     b->c.level != level ||
			     race_fault())) {
			six_unlock_type(&b->c.lock, lock_type);
			if (bch2_btree_node_relock(trans, path, level + 1))
				goto retry;

			trace_and_count(c, trans_restart_btree_node_reused, trans, trace_ip, path);
			return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_lock_node_reused));
		}

		/* avoid atomic set bit if it's not needed: */
		if (!btree_node_accessed(b))
			set_btree_node_accessed(b);
	}

	if (unlikely(btree_node_read_in_flight(b))) {
		u32 seq = six_lock_seq(&b->c.lock);

		/* drop all locks while waiting for the read to complete */
		six_unlock_type(&b->c.lock, lock_type);
		bch2_trans_unlock(trans);
		need_relock = true;

		bch2_btree_node_wait_on_read(b);

		/*
		 * should_be_locked is not set on this path yet, so we need to
		 * relock it specifically:
		 */
		if (!six_relock_type(&b->c.lock, lock_type, seq))
			goto retry;
	}

	if (unlikely(need_relock)) {
		ret = bch2_trans_relock(trans) ?:
			bch2_btree_path_relock_intent(trans, path);
		if (ret) {
			six_unlock_type(&b->c.lock, lock_type);
			return ERR_PTR(ret);
		}
	}

	/* warm up the lookup structures before the caller touches them */
	prefetch(b->aux_data);

	for_each_bset(b, t) {
		void *p = (u64 *) b->aux_data + t->aux_data_offset;

		prefetch(p + L1_CACHE_BYTES * 0);
		prefetch(p + L1_CACHE_BYTES * 1);
		prefetch(p + L1_CACHE_BYTES * 2);
	}

	if (unlikely(btree_node_read_error(b))) {
		six_unlock_type(&b->c.lock, lock_type);
		return ERR_PTR(-EIO);
	}

	EBUG_ON(b->c.btree_id != path->btree_id);
	EBUG_ON(BTREE_NODE_LEVEL(b->data) != level);
	btree_check_header(c, b);

	return b;
}
917 | ||
/**
 * bch2_btree_node_get - find a btree node in the cache and lock it, reading it
 * in from disk if necessary.
 *
 * @trans:	btree transaction object
 * @path:	btree_path being traversed
 * @k:		pointer to btree node (generally KEY_TYPE_btree_ptr_v2)
 * @level:	level of btree node being looked up (0 == leaf node)
 * @lock_type:	SIX_LOCK_read or SIX_LOCK_intent
 * @trace_ip:	ip of caller of btree iterator code (i.e. caller of bch2_btree_iter_peek())
 *
 * The btree node will be locked for reading or intent, depending on
 * @lock_type.
 *
 * Returns: btree node or ERR_PTR()
 */
struct btree *bch2_btree_node_get(struct btree_trans *trans, struct btree_path *path,
				  const struct bkey_i *k, unsigned level,
				  enum six_lock_type lock_type,
				  unsigned long trace_ip)
{
	struct bch_fs *c = trans->c;
	struct btree *b;
	struct bset_tree *t;
	int ret;

	EBUG_ON(level >= BTREE_MAX_DEPTH);

	/*
	 * Fast path: the key may cache a pointer to the node it points to, so
	 * we can skip the hash table lookup entirely if it's still valid.
	 */
	b = btree_node_mem_ptr(k);

	/*
	 * Check b->hash_val _before_ calling btree_node_lock() - this might not
	 * be the node we want anymore, and trying to lock the wrong node could
	 * cause an unneccessary transaction restart:
	 */
	if (unlikely(!c->opts.btree_node_mem_ptr_optimization ||
		     !b ||
		     b->hash_val != btree_ptr_hash_val(k)))
		return __bch2_btree_node_get(trans, path, k, level, lock_type, trace_ip);

	/* Drop the parent's read lock before (possibly) blocking on this node: */
	if (btree_node_read_locked(path, level + 1))
		btree_node_unlock(trans, path, level + 1);

	ret = btree_node_lock(trans, path, &b->c, level, lock_type, trace_ip);
	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
		return ERR_PTR(ret);

	/* btree_node_lock() only fails with a transaction restart: */
	BUG_ON(ret);

	/*
	 * Re-check after locking: the node may have been freed/reused while we
	 * were taking the lock (the cached mem ptr is only a hint).
	 */
	if (unlikely(b->hash_val != btree_ptr_hash_val(k) ||
		     b->c.level != level ||
		     race_fault())) {
		six_unlock_type(&b->c.lock, lock_type);
		/* If the parent is still locked, safe to retry via the slow path: */
		if (bch2_btree_node_relock(trans, path, level + 1))
			return __bch2_btree_node_get(trans, path, k, level, lock_type, trace_ip);

		trace_and_count(c, trans_restart_btree_node_reused, trans, trace_ip, path);
		return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_lock_node_reused));
	}

	/* Slow path handles waiting for in-flight reads with locks dropped: */
	if (unlikely(btree_node_read_in_flight(b))) {
		six_unlock_type(&b->c.lock, lock_type);
		return __bch2_btree_node_get(trans, path, k, level, lock_type, trace_ip);
	}

	prefetch(b->aux_data);

	for_each_bset(b, t) {
		void *p = (u64 *) b->aux_data + t->aux_data_offset;

		prefetch(p + L1_CACHE_BYTES * 0);
		prefetch(p + L1_CACHE_BYTES * 1);
		prefetch(p + L1_CACHE_BYTES * 2);
	}

	/* avoid atomic set bit if it's not needed: */
	if (!btree_node_accessed(b))
		set_btree_node_accessed(b);

	if (unlikely(btree_node_read_error(b))) {
		six_unlock_type(&b->c.lock, lock_type);
		return ERR_PTR(-EIO);
	}

	EBUG_ON(b->c.btree_id != path->btree_id);
	EBUG_ON(BTREE_NODE_LEVEL(b->data) != level);
	btree_check_header(c, b);

	return b;
}
1008 | ||
ca7d8fca | 1009 | struct btree *bch2_btree_node_get_noiter(struct btree_trans *trans, |
e62d65f2 KO |
1010 | const struct bkey_i *k, |
1011 | enum btree_id btree_id, | |
a0b73c1c KO |
1012 | unsigned level, |
1013 | bool nofill) | |
e62d65f2 | 1014 | { |
ca7d8fca | 1015 | struct bch_fs *c = trans->c; |
e62d65f2 KO |
1016 | struct btree_cache *bc = &c->btree_cache; |
1017 | struct btree *b; | |
1018 | struct bset_tree *t; | |
bd2bb273 | 1019 | int ret; |
e62d65f2 KO |
1020 | |
1021 | EBUG_ON(level >= BTREE_MAX_DEPTH); | |
1022 | ||
a32b9573 KO |
1023 | if (c->opts.btree_node_mem_ptr_optimization) { |
1024 | b = btree_node_mem_ptr(k); | |
1025 | if (b) | |
1026 | goto lock_node; | |
1027 | } | |
e62d65f2 KO |
1028 | retry: |
1029 | b = btree_cache_find(bc, k); | |
1030 | if (unlikely(!b)) { | |
a0b73c1c | 1031 | if (nofill) |
18a7b972 | 1032 | goto out; |
a0b73c1c | 1033 | |
1306f87d | 1034 | b = bch2_btree_node_fill(trans, NULL, k, btree_id, |
e62d65f2 KO |
1035 | level, SIX_LOCK_read, true); |
1036 | ||
1037 | /* We raced and found the btree node in the cache */ | |
1038 | if (!b) | |
1039 | goto retry; | |
1040 | ||
18a7b972 KO |
1041 | if (IS_ERR(b) && |
1042 | !bch2_btree_cache_cannibalize_lock(c, NULL)) | |
1043 | goto retry; | |
1044 | ||
e62d65f2 | 1045 | if (IS_ERR(b)) |
18a7b972 | 1046 | goto out; |
e62d65f2 KO |
1047 | } else { |
1048 | lock_node: | |
94c69faf | 1049 | ret = btree_node_lock_nopath(trans, &b->c, SIX_LOCK_read, _THIS_IP_); |
0d7009d7 KO |
1050 | if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) |
1051 | return ERR_PTR(ret); | |
1052 | ||
1053 | BUG_ON(ret); | |
e62d65f2 KO |
1054 | |
1055 | if (unlikely(b->hash_val != btree_ptr_hash_val(k) || | |
1056 | b->c.btree_id != btree_id || | |
1057 | b->c.level != level)) { | |
1058 | six_unlock_read(&b->c.lock); | |
1059 | goto retry; | |
1060 | } | |
1061 | } | |
1062 | ||
1063 | /* XXX: waiting on IO with btree locks held: */ | |
19d54324 | 1064 | __bch2_btree_node_wait_on_read(b); |
e62d65f2 KO |
1065 | |
1066 | prefetch(b->aux_data); | |
1067 | ||
1068 | for_each_bset(b, t) { | |
1069 | void *p = (u64 *) b->aux_data + t->aux_data_offset; | |
1070 | ||
1071 | prefetch(p + L1_CACHE_BYTES * 0); | |
1072 | prefetch(p + L1_CACHE_BYTES * 1); | |
1073 | prefetch(p + L1_CACHE_BYTES * 2); | |
1074 | } | |
1075 | ||
1076 | /* avoid atomic set bit if it's not needed: */ | |
1077 | if (!btree_node_accessed(b)) | |
1078 | set_btree_node_accessed(b); | |
1079 | ||
1080 | if (unlikely(btree_node_read_error(b))) { | |
1081 | six_unlock_read(&b->c.lock); | |
18a7b972 KO |
1082 | b = ERR_PTR(-EIO); |
1083 | goto out; | |
e62d65f2 KO |
1084 | } |
1085 | ||
a0b73c1c KO |
1086 | EBUG_ON(b->c.btree_id != btree_id); |
1087 | EBUG_ON(BTREE_NODE_LEVEL(b->data) != level); | |
537c32f5 | 1088 | btree_check_header(c, b); |
18a7b972 KO |
1089 | out: |
1090 | bch2_btree_cache_cannibalize_unlock(c); | |
e62d65f2 KO |
1091 | return b; |
1092 | } | |
1093 | ||
1306f87d | 1094 | int bch2_btree_node_prefetch(struct btree_trans *trans, |
67e0dd8f | 1095 | struct btree_path *path, |
8b3e9bd6 KO |
1096 | const struct bkey_i *k, |
1097 | enum btree_id btree_id, unsigned level) | |
1c6fdbd8 | 1098 | { |
1306f87d | 1099 | struct bch_fs *c = trans->c; |
1c6fdbd8 KO |
1100 | struct btree_cache *bc = &c->btree_cache; |
1101 | struct btree *b; | |
1102 | ||
67e0dd8f | 1103 | BUG_ON(trans && !btree_node_locked(path, level + 1)); |
1c6fdbd8 KO |
1104 | BUG_ON(level >= BTREE_MAX_DEPTH); |
1105 | ||
1c6fdbd8 | 1106 | b = btree_cache_find(bc, k); |
1c6fdbd8 | 1107 | if (b) |
8b3e9bd6 | 1108 | return 0; |
1c6fdbd8 | 1109 | |
1306f87d | 1110 | b = bch2_btree_node_fill(trans, path, k, btree_id, |
78cf784e | 1111 | level, SIX_LOCK_read, false); |
8b3e9bd6 | 1112 | return PTR_ERR_OR_ZERO(b); |
1c6fdbd8 KO |
1113 | } |
1114 | ||
/*
 * Evict the btree node @k points to from the cache: wait for in-flight IO,
 * write it out if dirty, then free its data and remove it from the hash
 * table.  No-op if the node isn't cached.
 */
void bch2_btree_node_evict(struct btree_trans *trans, const struct bkey_i *k)
{
	struct bch_fs *c = trans->c;
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b;

	b = btree_cache_find(bc, k);
	if (!b)
		return;
wait_on_io:
	/* not allowed to wait on io with btree locks held: */

	/* XXX we're called from btree_gc which will be holding other btree
	 * nodes locked
	 */
	__bch2_btree_node_wait_on_read(b);
	__bch2_btree_node_wait_on_write(b);

	btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent);
	btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_write);

	/*
	 * If the node is dirty, write it out, then drop the locks and go back
	 * to waiting: the write we just kicked off is now in-flight IO.
	 */
	if (btree_node_dirty(b)) {
		__bch2_btree_node_write(c, b, BTREE_WRITE_cache_reclaim);
		six_unlock_write(&b->c.lock);
		six_unlock_intent(&b->c.lock);
		goto wait_on_io;
	}

	BUG_ON(btree_node_dirty(b));

	mutex_lock(&bc->lock);
	btree_node_data_free(c, b);
	bch2_btree_node_hash_remove(bc, b);
	mutex_unlock(&bc->lock);

	six_unlock_write(&b->c.lock);
	six_unlock_intent(&b->c.lock);
}
1153 | ||
88dfe193 KO |
1154 | const char *bch2_btree_id_str(enum btree_id btree) |
1155 | { | |
1156 | return btree < BTREE_ID_NR ? __bch2_btree_ids[btree] : "(unknown)"; | |
1157 | } | |
1158 | ||
/*
 * Print a btree node's position: btree name, level (out of the root's
 * level), and the node's key.
 */
void bch2_btree_pos_to_text(struct printbuf *out, struct bch_fs *c, const struct btree *b)
{
	prt_printf(out, "%s level %u/%u\n  ",
		   bch2_btree_id_str(b->c.btree_id),
		   b->c.level,
		   bch2_btree_id_root(c, b->c.btree_id)->level);
	bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(&b->key));
}
1167 | ||
/*
 * Print detailed debug info for a btree node: key range, pointers, bkey
 * format, and in-memory bset statistics (space usage, packed/unpacked key
 * counts, lookup-table float stats).
 */
void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c, const struct btree *b)
{
	struct bset_stats stats;

	memset(&stats, 0, sizeof(stats));

	bch2_btree_keys_stats(b, &stats);

	prt_printf(out, "l %u ", b->c.level);
	bch2_bpos_to_text(out, b->data->min_key);
	prt_printf(out, " - ");
	bch2_bpos_to_text(out, b->data->max_key);
	prt_printf(out, ":\n"
		   "    ptrs: ");
	bch2_val_to_text(out, c, bkey_i_to_s_c(&b->key));
	prt_newline(out);

	prt_printf(out,
		   "    format: ");
	bch2_bkey_format_to_text(out, &b->format);

	prt_printf(out,
		   "    unpack fn len: %u\n"
		   "    bytes used %zu/%zu (%zu%% full)\n"
		   "    sib u64s: %u, %u (merge threshold %u)\n"
		   "    nr packed keys %u\n"
		   "    nr unpacked keys %u\n"
		   "    floats %zu\n"
		   "    failed unpacked %zu\n",
		   b->unpack_fn_len,
		   b->nr.live_u64s * sizeof(u64),
		   btree_bytes(c) - sizeof(struct btree_node),
		   b->nr.live_u64s * 100 / btree_max_u64s(c),
		   b->sib_u64s[0],
		   b->sib_u64s[1],
		   c->btree_foreground_merge_threshold,
		   b->nr.packed_keys,
		   b->nr.unpacked_keys,
		   stats.floats,
		   stats.failed);
}
d8ebed7d | 1209 | |
a345b0f3 | 1210 | void bch2_btree_cache_to_text(struct printbuf *out, const struct bch_fs *c) |
d8ebed7d | 1211 | { |
401ec4db KO |
1212 | prt_printf(out, "nr nodes:\t\t%u\n", c->btree_cache.used); |
1213 | prt_printf(out, "nr dirty:\t\t%u\n", atomic_read(&c->btree_cache.dirty)); | |
1214 | prt_printf(out, "cannibalize lock:\t%p\n", c->btree_cache.alloc_lock); | |
d8ebed7d | 1215 | } |