]>
Commit | Line | Data |
---|---|---|
b2441318 | 1 | // SPDX-License-Identifier: GPL-2.0 |
cafe5635 KO |
2 | /* |
3 | * bcache journalling code, for btree insertions | |
4 | * | |
5 | * Copyright 2012 Google, Inc. | |
6 | */ | |
7 | ||
8 | #include "bcache.h" | |
9 | #include "btree.h" | |
10 | #include "debug.h" | |
9aa61a99 | 11 | #include "extents.h" |
cafe5635 | 12 | |
c37511b8 KO |
13 | #include <trace/events/bcache.h> |
14 | ||
cafe5635 KO |
15 | /* |
16 | * Journal replay/recovery: | |
17 | * | |
18 | * This code is all driven from run_cache_set(); we first read the journal | |
19 | * entries, do some other stuff, then we mark all the keys in the journal | |
20 | * entries (same as garbage collection would), then we replay them - reinserting | |
21 | * them into the cache in precisely the same order as they appear in the | |
22 | * journal. | |
23 | * | |
24 | * We only journal keys that go in leaf nodes, which simplifies things quite a | |
25 | * bit. | |
26 | */ | |
27 | ||
4246a0b6 | 28 | static void journal_read_endio(struct bio *bio) |
cafe5635 KO |
29 | { |
30 | struct closure *cl = bio->bi_private; | |
1fae7cf0 | 31 | |
cafe5635 KO |
32 | closure_put(cl); |
33 | } | |
34 | ||
35 | static int journal_read_bucket(struct cache *ca, struct list_head *list, | |
6f10f7d1 | 36 | unsigned int bucket_index) |
cafe5635 KO |
37 | { |
38 | struct journal_device *ja = &ca->journal; | |
39 | struct bio *bio = &ja->bio; | |
40 | ||
41 | struct journal_replay *i; | |
42 | struct jset *j, *data = ca->set->journal.w[0].data; | |
c18536a7 | 43 | struct closure cl; |
6f10f7d1 | 44 | unsigned int len, left, offset = 0; |
cafe5635 KO |
45 | int ret = 0; |
46 | sector_t bucket = bucket_to_sector(ca->set, ca->sb.d[bucket_index]); | |
47 | ||
c18536a7 KO |
48 | closure_init_stack(&cl); |
49 | ||
46f5aa88 | 50 | pr_debug("reading %u\n", bucket_index); |
cafe5635 KO |
51 | |
52 | while (offset < ca->sb.bucket_size) { | |
53 | reread: left = ca->sb.bucket_size - offset; | |
6f10f7d1 | 54 | len = min_t(unsigned int, left, PAGE_SECTORS << JSET_BITS); |
cafe5635 | 55 | |
a7c50c94 | 56 | bio_reset(bio, ca->bdev, REQ_OP_READ); |
4f024f37 | 57 | bio->bi_iter.bi_sector = bucket + offset; |
4f024f37 | 58 | bio->bi_iter.bi_size = len << 9; |
cafe5635 KO |
59 | |
60 | bio->bi_end_io = journal_read_endio; | |
c18536a7 | 61 | bio->bi_private = &cl; |
169ef1cf | 62 | bch_bio_map(bio, data); |
cafe5635 | 63 | |
771f393e | 64 | closure_bio_submit(ca->set, bio, &cl); |
c18536a7 | 65 | closure_sync(&cl); |
cafe5635 KO |
66 | |
67 | /* This function could be simpler now since we no longer write | |
68 | * journal entries that overlap bucket boundaries; this means | |
69 | * the start of a bucket will always have a valid journal entry | |
70 | * if it has any journal entries at all. | |
71 | */ | |
72 | ||
73 | j = data; | |
74 | while (len) { | |
75 | struct list_head *where; | |
76 | size_t blocks, bytes = set_bytes(j); | |
77 | ||
b3fa7e77 | 78 | if (j->magic != jset_magic(&ca->sb)) { |
46f5aa88 | 79 | pr_debug("%u: bad magic\n", bucket_index); |
cafe5635 | 80 | return ret; |
b3fa7e77 | 81 | } |
cafe5635 | 82 | |
b3fa7e77 KO |
83 | if (bytes > left << 9 || |
84 | bytes > PAGE_SIZE << JSET_BITS) { | |
46f5aa88 | 85 | pr_info("%u: too big, %zu bytes, offset %u\n", |
b3fa7e77 | 86 | bucket_index, bytes, offset); |
cafe5635 | 87 | return ret; |
b3fa7e77 | 88 | } |
cafe5635 KO |
89 | |
90 | if (bytes > len << 9) | |
91 | goto reread; | |
92 | ||
b3fa7e77 | 93 | if (j->csum != csum_set(j)) { |
46f5aa88 | 94 | pr_info("%u: bad csum, %zu bytes, offset %u\n", |
b3fa7e77 | 95 | bucket_index, bytes, offset); |
cafe5635 | 96 | return ret; |
b3fa7e77 | 97 | } |
cafe5635 | 98 | |
4e1ebae3 | 99 | blocks = set_blocks(j, block_bytes(ca)); |
cafe5635 | 100 | |
2464b693 CL |
101 | /* |
102 | * Nodes in 'list' are in linear increasing order of | |
103 | * i->j.seq, the node on head has the smallest (oldest) | |
104 | * journal seq, the node on tail has the biggest | |
105 | * (latest) journal seq. | |
106 | */ | |
107 | ||
108 | /* | |
109 | * Check from the oldest jset for last_seq. If | |
110 | * i->j.seq < j->last_seq, it means the oldest jset | |
111 | * in list is expired and useless, remove it from | |
9c9b81c4 | 112 | * this list. Otherwise, j is a candidate jset for |
2464b693 CL |
113 | * further following checks. |
114 | */ | |
cafe5635 KO |
115 | while (!list_empty(list)) { |
116 | i = list_first_entry(list, | |
117 | struct journal_replay, list); | |
118 | if (i->j.seq >= j->last_seq) | |
119 | break; | |
120 | list_del(&i->list); | |
121 | kfree(i); | |
122 | } | |
123 | ||
2464b693 | 124 | /* iterate list in reverse order (from latest jset) */ |
cafe5635 KO |
125 | list_for_each_entry_reverse(i, list, list) { |
126 | if (j->seq == i->j.seq) | |
127 | goto next_set; | |
128 | ||
2464b693 CL |
129 | /* |
130 | * if j->seq is less than any i->j.last_seq | |
131 | * in list, j is an expired and useless jset. | |
132 | */ | |
cafe5635 KO |
133 | if (j->seq < i->j.last_seq) |
134 | goto next_set; | |
135 | ||
2464b693 CL |
136 | /* |
137 | * 'where' points to first jset in list which | |
138 | * is elder then j. | |
139 | */ | |
cafe5635 KO |
140 | if (j->seq > i->j.seq) { |
141 | where = &i->list; | |
142 | goto add; | |
143 | } | |
144 | } | |
145 | ||
146 | where = list; | |
147 | add: | |
148 | i = kmalloc(offsetof(struct journal_replay, j) + | |
149 | bytes, GFP_KERNEL); | |
150 | if (!i) | |
151 | return -ENOMEM; | |
be0d8f48 KC |
152 | unsafe_memcpy(&i->j, j, bytes, |
153 | /* "bytes" was calculated by set_bytes() above */); | |
2464b693 | 154 | /* Add to the location after 'where' points to */ |
cafe5635 KO |
155 | list_add(&i->list, where); |
156 | ret = 1; | |
157 | ||
a231f07a CL |
158 | if (j->seq > ja->seq[bucket_index]) |
159 | ja->seq[bucket_index] = j->seq; | |
cafe5635 KO |
160 | next_set: |
161 | offset += blocks * ca->sb.block_size; | |
162 | len -= blocks * ca->sb.block_size; | |
163 | j = ((void *) j) + blocks * block_bytes(ca); | |
164 | } | |
165 | } | |
166 | ||
167 | return ret; | |
168 | } | |
169 | ||
c18536a7 | 170 | int bch_journal_read(struct cache_set *c, struct list_head *list) |
cafe5635 KO |
171 | { |
172 | #define read_bucket(b) \ | |
173 | ({ \ | |
14215ee0 | 174 | ret = journal_read_bucket(ca, list, b); \ |
cafe5635 KO |
175 | __set_bit(b, bitmap); \ |
176 | if (ret < 0) \ | |
177 | return ret; \ | |
178 | ret; \ | |
179 | }) | |
180 | ||
08fdb2cd | 181 | struct cache *ca = c->cache; |
14215ee0 | 182 | int ret = 0; |
08fdb2cd CL |
183 | struct journal_device *ja = &ca->journal; |
184 | DECLARE_BITMAP(bitmap, SB_JOURNAL_BUCKETS); | |
185 | unsigned int i, l, r, m; | |
186 | uint64_t seq; | |
cafe5635 | 187 | |
08fdb2cd CL |
188 | bitmap_zero(bitmap, SB_JOURNAL_BUCKETS); |
189 | pr_debug("%u journal buckets\n", ca->sb.njournal_buckets); | |
cafe5635 | 190 | |
08fdb2cd CL |
191 | /* |
192 | * Read journal buckets ordered by golden ratio hash to quickly | |
193 | * find a sequence of buckets with valid journal entries | |
194 | */ | |
195 | for (i = 0; i < ca->sb.njournal_buckets; i++) { | |
c426c4fd | 196 | /* |
08fdb2cd CL |
197 | * We must try the index l with ZERO first for |
198 | * correctness due to the scenario that the journal | |
199 | * bucket is circular buffer which might have wrapped | |
cafe5635 | 200 | */ |
08fdb2cd | 201 | l = (i * 2654435769U) % ca->sb.njournal_buckets; |
cafe5635 | 202 | |
08fdb2cd CL |
203 | if (test_bit(l, bitmap)) |
204 | break; | |
cafe5635 | 205 | |
08fdb2cd CL |
206 | if (read_bucket(l)) |
207 | goto bsearch; | |
208 | } | |
cafe5635 | 209 | |
08fdb2cd CL |
210 | /* |
211 | * If that fails, check all the buckets we haven't checked | |
212 | * already | |
213 | */ | |
214 | pr_debug("falling back to linear search\n"); | |
cafe5635 | 215 | |
08fdb2cd CL |
216 | for_each_clear_bit(l, bitmap, ca->sb.njournal_buckets) |
217 | if (read_bucket(l)) | |
218 | goto bsearch; | |
c426c4fd | 219 | |
08fdb2cd CL |
220 | /* no journal entries on this device? */ |
221 | if (l == ca->sb.njournal_buckets) | |
222 | goto out; | |
cafe5635 | 223 | bsearch: |
08fdb2cd | 224 | BUG_ON(list_empty(list)); |
6b708de6 | 225 | |
08fdb2cd CL |
226 | /* Binary search */ |
227 | m = l; | |
228 | r = find_next_bit(bitmap, ca->sb.njournal_buckets, l + 1); | |
229 | pr_debug("starting binary search, l %u r %u\n", l, r); | |
cafe5635 | 230 | |
08fdb2cd CL |
231 | while (l + 1 < r) { |
232 | seq = list_entry(list->prev, struct journal_replay, | |
233 | list)->j.seq; | |
faa56736 | 234 | |
08fdb2cd CL |
235 | m = (l + r) >> 1; |
236 | read_bucket(m); | |
cafe5635 | 237 | |
08fdb2cd CL |
238 | if (seq != list_entry(list->prev, struct journal_replay, |
239 | list)->j.seq) | |
240 | l = m; | |
241 | else | |
242 | r = m; | |
243 | } | |
cafe5635 | 244 | |
08fdb2cd CL |
245 | /* |
246 | * Read buckets in reverse order until we stop finding more | |
247 | * journal entries | |
248 | */ | |
249 | pr_debug("finishing up: m %u njournal_buckets %u\n", | |
250 | m, ca->sb.njournal_buckets); | |
251 | l = m; | |
cafe5635 | 252 | |
08fdb2cd CL |
253 | while (1) { |
254 | if (!l--) | |
255 | l = ca->sb.njournal_buckets - 1; | |
cafe5635 | 256 | |
08fdb2cd CL |
257 | if (l == m) |
258 | break; | |
cafe5635 | 259 | |
08fdb2cd CL |
260 | if (test_bit(l, bitmap)) |
261 | continue; | |
cafe5635 | 262 | |
08fdb2cd CL |
263 | if (!read_bucket(l)) |
264 | break; | |
265 | } | |
cafe5635 | 266 | |
08fdb2cd | 267 | seq = 0; |
cafe5635 | 268 | |
08fdb2cd CL |
269 | for (i = 0; i < ca->sb.njournal_buckets; i++) |
270 | if (ja->seq[i] > seq) { | |
271 | seq = ja->seq[i]; | |
272 | /* | |
273 | * When journal_reclaim() goes to allocate for | |
274 | * the first time, it'll use the bucket after | |
275 | * ja->cur_idx | |
276 | */ | |
277 | ja->cur_idx = i; | |
278 | ja->last_idx = ja->discard_idx = (i + 1) % | |
279 | ca->sb.njournal_buckets; | |
cafe5635 | 280 | |
08fdb2cd | 281 | } |
cafe5635 | 282 | |
08fdb2cd | 283 | out: |
c426c4fd KO |
284 | if (!list_empty(list)) |
285 | c->journal.seq = list_entry(list->prev, | |
286 | struct journal_replay, | |
287 | list)->j.seq; | |
cafe5635 | 288 | |
0ae49cb7 | 289 | return 0; |
cafe5635 KO |
290 | #undef read_bucket |
291 | } | |
292 | ||
293 | void bch_journal_mark(struct cache_set *c, struct list_head *list) | |
294 | { | |
295 | atomic_t p = { 0 }; | |
296 | struct bkey *k; | |
297 | struct journal_replay *i; | |
298 | struct journal *j = &c->journal; | |
299 | uint64_t last = j->seq; | |
300 | ||
301 | /* | |
302 | * journal.pin should never fill up - we never write a journal | |
303 | * entry when it would fill up. But if for some reason it does, we | |
304 | * iterate over the list in reverse order so that we can just skip that | |
305 | * refcount instead of bugging. | |
306 | */ | |
307 | ||
308 | list_for_each_entry_reverse(i, list, list) { | |
309 | BUG_ON(last < i->j.seq); | |
310 | i->pin = NULL; | |
311 | ||
312 | while (last-- != i->j.seq) | |
313 | if (fifo_free(&j->pin) > 1) { | |
314 | fifo_push_front(&j->pin, p); | |
315 | atomic_set(&fifo_front(&j->pin), 0); | |
316 | } | |
317 | ||
318 | if (fifo_free(&j->pin) > 1) { | |
319 | fifo_push_front(&j->pin, p); | |
320 | i->pin = &fifo_front(&j->pin); | |
321 | atomic_set(i->pin, 1); | |
322 | } | |
323 | ||
324 | for (k = i->j.start; | |
fafff81c | 325 | k < bset_bkey_last(&i->j); |
9aa61a99 KO |
326 | k = bkey_next(k)) |
327 | if (!__bch_extent_invalid(c, k)) { | |
6f10f7d1 | 328 | unsigned int j; |
cafe5635 | 329 | |
9aa61a99 KO |
330 | for (j = 0; j < KEY_PTRS(k); j++) |
331 | if (ptr_available(c, k, j)) | |
332 | atomic_inc(&PTR_BUCKET(c, k, j)->pin); | |
65ddf45a | 333 | |
9aa61a99 KO |
334 | bch_initial_mark_key(c, 0, k); |
335 | } | |
cafe5635 KO |
336 | } |
337 | } | |
338 | ||
2d5abb9a | 339 | static bool is_discard_enabled(struct cache_set *s) |
63120731 | 340 | { |
08fdb2cd | 341 | struct cache *ca = s->cache; |
63120731 | 342 | |
08fdb2cd CL |
343 | if (ca->discard) |
344 | return true; | |
63120731 TJ |
345 | |
346 | return false; | |
347 | } | |
348 | ||
c18536a7 | 349 | int bch_journal_replay(struct cache_set *s, struct list_head *list) |
cafe5635 KO |
350 | { |
351 | int ret = 0, keys = 0, entries = 0; | |
352 | struct bkey *k; | |
353 | struct journal_replay *i = | |
354 | list_entry(list->prev, struct journal_replay, list); | |
355 | ||
356 | uint64_t start = i->j.last_seq, end = i->j.seq, n = start; | |
0b93207a KO |
357 | struct keylist keylist; |
358 | ||
cafe5635 KO |
359 | list_for_each_entry(i, list, list) { |
360 | BUG_ON(i->pin && atomic_read(i->pin) != 1); | |
361 | ||
68d10e69 | 362 | if (n != i->j.seq) { |
63120731 | 363 | if (n == start && is_discard_enabled(s)) |
46f5aa88 | 364 | pr_info("journal entries %llu-%llu may be discarded! (replaying %llu-%llu)\n", |
63120731 TJ |
365 | n, i->j.seq - 1, start, end); |
366 | else { | |
46f5aa88 | 367 | pr_err("journal entries %llu-%llu missing! (replaying %llu-%llu)\n", |
63120731 TJ |
368 | n, i->j.seq - 1, start, end); |
369 | ret = -EIO; | |
370 | goto err; | |
371 | } | |
68d10e69 | 372 | } |
cafe5635 KO |
373 | |
374 | for (k = i->j.start; | |
fafff81c | 375 | k < bset_bkey_last(&i->j); |
cafe5635 | 376 | k = bkey_next(k)) { |
c37511b8 KO |
377 | trace_bcache_journal_replay_key(k); |
378 | ||
c13f3af9 | 379 | bch_keylist_init_single(&keylist, k); |
cafe5635 | 380 | |
cc7b8819 | 381 | ret = bch_btree_insert(s, &keylist, i->pin, NULL); |
cafe5635 KO |
382 | if (ret) |
383 | goto err; | |
384 | ||
0b93207a | 385 | BUG_ON(!bch_keylist_empty(&keylist)); |
cafe5635 KO |
386 | keys++; |
387 | ||
388 | cond_resched(); | |
389 | } | |
390 | ||
391 | if (i->pin) | |
392 | atomic_dec(i->pin); | |
393 | n = i->j.seq + 1; | |
394 | entries++; | |
395 | } | |
396 | ||
46f5aa88 | 397 | pr_info("journal replay done, %i keys in %i entries, seq %llu\n", |
cafe5635 | 398 | keys, entries, end); |
b54d6934 | 399 | err: |
cafe5635 KO |
400 | while (!list_empty(list)) { |
401 | i = list_first_entry(list, struct journal_replay, list); | |
402 | list_del(&i->list); | |
403 | kfree(i); | |
404 | } | |
b54d6934 | 405 | |
cafe5635 KO |
406 | return ret; |
407 | } | |
408 | ||
32feee36 CL |
409 | void bch_journal_space_reserve(struct journal *j) |
410 | { | |
411 | j->do_reserve = true; | |
412 | } | |
413 | ||
cafe5635 KO |
414 | /* Journalling */ |
415 | ||
416 | static void btree_flush_write(struct cache_set *c) | |
417 | { | |
91be66e1 | 418 | struct btree *b, *t, *btree_nodes[BTREE_FLUSH_NR]; |
d1c3cc34 CL |
419 | unsigned int i, nr; |
420 | int ref_nr; | |
2aa8c529 CL |
421 | atomic_t *fifo_front_p, *now_fifo_front_p; |
422 | size_t mask; | |
91be66e1 CL |
423 | |
424 | if (c->journal.btree_flushing) | |
425 | return; | |
426 | ||
427 | spin_lock(&c->journal.flush_write_lock); | |
428 | if (c->journal.btree_flushing) { | |
429 | spin_unlock(&c->journal.flush_write_lock); | |
430 | return; | |
431 | } | |
432 | c->journal.btree_flushing = true; | |
433 | spin_unlock(&c->journal.flush_write_lock); | |
a728eacb | 434 | |
2aa8c529 CL |
435 | /* get the oldest journal entry and check its refcount */ |
436 | spin_lock(&c->journal.lock); | |
437 | fifo_front_p = &fifo_front(&c->journal.pin); | |
438 | ref_nr = atomic_read(fifo_front_p); | |
439 | if (ref_nr <= 0) { | |
440 | /* | |
441 | * do nothing if no btree node references | |
442 | * the oldest journal entry | |
443 | */ | |
444 | spin_unlock(&c->journal.lock); | |
445 | goto out; | |
446 | } | |
447 | spin_unlock(&c->journal.lock); | |
448 | ||
449 | mask = c->journal.pin.mask; | |
450 | nr = 0; | |
a728eacb | 451 | atomic_long_inc(&c->flush_write); |
91be66e1 | 452 | memset(btree_nodes, 0, sizeof(btree_nodes)); |
249a5f6d | 453 | |
50a260e8 | 454 | mutex_lock(&c->bucket_lock); |
91be66e1 | 455 | list_for_each_entry_safe_reverse(b, t, &c->btree_cache, list) { |
2aa8c529 CL |
456 | /* |
457 | * It is safe to get now_fifo_front_p without holding | |
458 | * c->journal.lock here, because we don't need to know | |
459 | * the exactly accurate value, just check whether the | |
460 | * front pointer of c->journal.pin is changed. | |
461 | */ | |
462 | now_fifo_front_p = &fifo_front(&c->journal.pin); | |
463 | /* | |
464 | * If the oldest journal entry is reclaimed and front | |
465 | * pointer of c->journal.pin changes, it is unnecessary | |
466 | * to scan c->btree_cache anymore, just quit the loop and | |
467 | * flush out what we have already. | |
468 | */ | |
469 | if (now_fifo_front_p != fifo_front_p) | |
470 | break; | |
471 | /* | |
472 | * quit this loop if all matching btree nodes are | |
473 | * scanned and record in btree_nodes[] already. | |
474 | */ | |
475 | ref_nr = atomic_read(fifo_front_p); | |
476 | if (nr >= ref_nr) | |
477 | break; | |
478 | ||
91be66e1 | 479 | if (btree_node_journal_flush(b)) |
46f5aa88 | 480 | pr_err("BUG: flush_write bit should not be set here!\n"); |
91be66e1 CL |
481 | |
482 | mutex_lock(&b->write_lock); | |
483 | ||
484 | if (!btree_node_dirty(b)) { | |
485 | mutex_unlock(&b->write_lock); | |
486 | continue; | |
487 | } | |
488 | ||
489 | if (!btree_current_write(b)->journal) { | |
490 | mutex_unlock(&b->write_lock); | |
491 | continue; | |
249a5f6d | 492 | } |
cafe5635 | 493 | |
2aa8c529 CL |
494 | /* |
495 | * Only select the btree node which exactly references | |
496 | * the oldest journal entry. | |
497 | * | |
498 | * If the journal entry pointed by fifo_front_p is | |
499 | * reclaimed in parallel, don't worry: | |
500 | * - the list_for_each_xxx loop will quit when checking | |
501 | * next now_fifo_front_p. | |
502 | * - If there are matched nodes recorded in btree_nodes[], | |
503 | * they are clean now (this is why and how the oldest | |
504 | * journal entry can be reclaimed). These selected nodes | |
9c9b81c4 | 505 | * will be ignored and skipped in the following for-loop. |
2aa8c529 | 506 | */ |
4ec31cb6 CL |
507 | if (((btree_current_write(b)->journal - fifo_front_p) & |
508 | mask) != 0) { | |
2aa8c529 CL |
509 | mutex_unlock(&b->write_lock); |
510 | continue; | |
511 | } | |
512 | ||
50a260e8 | 513 | set_btree_node_journal_flush(b); |
91be66e1 CL |
514 | |
515 | mutex_unlock(&b->write_lock); | |
516 | ||
2aa8c529 CL |
517 | btree_nodes[nr++] = b; |
518 | /* | |
519 | * To avoid holding c->bucket_lock too long time, | |
520 | * only scan for BTREE_FLUSH_NR matched btree nodes | |
521 | * at most. If there are more btree nodes reference | |
522 | * the oldest journal entry, try to flush them next | |
523 | * time when btree_flush_write() is called. | |
524 | */ | |
525 | if (nr == BTREE_FLUSH_NR) | |
91be66e1 CL |
526 | break; |
527 | } | |
50a260e8 CL |
528 | mutex_unlock(&c->bucket_lock); |
529 | ||
2aa8c529 | 530 | for (i = 0; i < nr; i++) { |
91be66e1 CL |
531 | b = btree_nodes[i]; |
532 | if (!b) { | |
46f5aa88 | 533 | pr_err("BUG: btree_nodes[%d] is NULL\n", i); |
91be66e1 CL |
534 | continue; |
535 | } | |
536 | ||
537 | /* safe to check without holding b->write_lock */ | |
538 | if (!btree_node_journal_flush(b)) { | |
46f5aa88 | 539 | pr_err("BUG: bnode %p: journal_flush bit cleaned\n", b); |
91be66e1 CL |
540 | continue; |
541 | } | |
542 | ||
2a285686 | 543 | mutex_lock(&b->write_lock); |
a34a8bfd | 544 | if (!btree_current_write(b)->journal) { |
50a260e8 | 545 | clear_bit(BTREE_NODE_journal_flush, &b->flags); |
2a285686 | 546 | mutex_unlock(&b->write_lock); |
46f5aa88 | 547 | pr_debug("bnode %p: written by others\n", b); |
91be66e1 CL |
548 | continue; |
549 | } | |
550 | ||
551 | if (!btree_node_dirty(b)) { | |
552 | clear_bit(BTREE_NODE_journal_flush, &b->flags); | |
553 | mutex_unlock(&b->write_lock); | |
46f5aa88 | 554 | pr_debug("bnode %p: dirty bit cleaned by others\n", b); |
91be66e1 | 555 | continue; |
cafe5635 KO |
556 | } |
557 | ||
2a285686 | 558 | __bch_btree_node_write(b, NULL); |
50a260e8 | 559 | clear_bit(BTREE_NODE_journal_flush, &b->flags); |
2a285686 | 560 | mutex_unlock(&b->write_lock); |
cafe5635 | 561 | } |
91be66e1 | 562 | |
2aa8c529 | 563 | out: |
91be66e1 CL |
564 | spin_lock(&c->journal.flush_write_lock); |
565 | c->journal.btree_flushing = false; | |
566 | spin_unlock(&c->journal.flush_write_lock); | |
cafe5635 KO |
567 | } |
568 | ||
569 | #define last_seq(j) ((j)->seq - fifo_used(&(j)->pin) + 1) | |
570 | ||
4246a0b6 | 571 | static void journal_discard_endio(struct bio *bio) |
cafe5635 KO |
572 | { |
573 | struct journal_device *ja = | |
574 | container_of(bio, struct journal_device, discard_bio); | |
575 | struct cache *ca = container_of(ja, struct cache, journal); | |
576 | ||
577 | atomic_set(&ja->discard_in_flight, DISCARD_DONE); | |
578 | ||
579 | closure_wake_up(&ca->set->journal.wait); | |
580 | closure_put(&ca->set->cl); | |
581 | } | |
582 | ||
583 | static void journal_discard_work(struct work_struct *work) | |
584 | { | |
585 | struct journal_device *ja = | |
586 | container_of(work, struct journal_device, discard_work); | |
587 | ||
4e49ea4a | 588 | submit_bio(&ja->discard_bio); |
cafe5635 KO |
589 | } |
590 | ||
591 | static void do_journal_discard(struct cache *ca) | |
592 | { | |
593 | struct journal_device *ja = &ca->journal; | |
594 | struct bio *bio = &ja->discard_bio; | |
595 | ||
596 | if (!ca->discard) { | |
597 | ja->discard_idx = ja->last_idx; | |
598 | return; | |
599 | } | |
600 | ||
6d9d21e3 | 601 | switch (atomic_read(&ja->discard_in_flight)) { |
cafe5635 KO |
602 | case DISCARD_IN_FLIGHT: |
603 | return; | |
604 | ||
605 | case DISCARD_DONE: | |
606 | ja->discard_idx = (ja->discard_idx + 1) % | |
607 | ca->sb.njournal_buckets; | |
608 | ||
609 | atomic_set(&ja->discard_in_flight, DISCARD_READY); | |
df561f66 | 610 | fallthrough; |
cafe5635 KO |
611 | |
612 | case DISCARD_READY: | |
613 | if (ja->discard_idx == ja->last_idx) | |
614 | return; | |
615 | ||
616 | atomic_set(&ja->discard_in_flight, DISCARD_IN_FLIGHT); | |
617 | ||
49add496 | 618 | bio_init(bio, ca->bdev, bio->bi_inline_vecs, 1, REQ_OP_DISCARD); |
4f024f37 | 619 | bio->bi_iter.bi_sector = bucket_to_sector(ca->set, |
b1a67b0f | 620 | ca->sb.d[ja->discard_idx]); |
4f024f37 | 621 | bio->bi_iter.bi_size = bucket_bytes(ca); |
cafe5635 KO |
622 | bio->bi_end_io = journal_discard_endio; |
623 | ||
624 | closure_get(&ca->set->cl); | |
625 | INIT_WORK(&ja->discard_work, journal_discard_work); | |
0f843e65 | 626 | queue_work(bch_journal_wq, &ja->discard_work); |
cafe5635 KO |
627 | } |
628 | } | |
629 | ||
32feee36 CL |
630 | static unsigned int free_journal_buckets(struct cache_set *c) |
631 | { | |
632 | struct journal *j = &c->journal; | |
633 | struct cache *ca = c->cache; | |
634 | struct journal_device *ja = &c->cache->journal; | |
635 | unsigned int n; | |
636 | ||
637 | /* In case njournal_buckets is not power of 2 */ | |
638 | if (ja->cur_idx >= ja->discard_idx) | |
639 | n = ca->sb.njournal_buckets + ja->discard_idx - ja->cur_idx; | |
640 | else | |
641 | n = ja->discard_idx - ja->cur_idx; | |
642 | ||
643 | if (n > (1 + j->do_reserve)) | |
644 | return n - (1 + j->do_reserve); | |
645 | ||
646 | return 0; | |
647 | } | |
648 | ||
cafe5635 KO |
649 | static void journal_reclaim(struct cache_set *c) |
650 | { | |
651 | struct bkey *k = &c->journal.key; | |
08fdb2cd | 652 | struct cache *ca = c->cache; |
cafe5635 | 653 | uint64_t last_seq; |
08fdb2cd | 654 | struct journal_device *ja = &ca->journal; |
42361469 | 655 | atomic_t p __maybe_unused; |
cafe5635 | 656 | |
a728eacb TJ |
657 | atomic_long_inc(&c->reclaim); |
658 | ||
cafe5635 KO |
659 | while (!atomic_read(&fifo_front(&c->journal.pin))) |
660 | fifo_pop(&c->journal.pin, p); | |
661 | ||
662 | last_seq = last_seq(&c->journal); | |
663 | ||
664 | /* Update last_idx */ | |
665 | ||
08fdb2cd CL |
666 | while (ja->last_idx != ja->cur_idx && |
667 | ja->seq[ja->last_idx] < last_seq) | |
668 | ja->last_idx = (ja->last_idx + 1) % | |
669 | ca->sb.njournal_buckets; | |
cafe5635 | 670 | |
08fdb2cd | 671 | do_journal_discard(ca); |
cafe5635 KO |
672 | |
673 | if (c->journal.blocks_free) | |
a34a8bfd | 674 | goto out; |
cafe5635 | 675 | |
32feee36 | 676 | if (!free_journal_buckets(c)) |
08fdb2cd | 677 | goto out; |
cafe5635 | 678 | |
32feee36 | 679 | ja->cur_idx = (ja->cur_idx + 1) % ca->sb.njournal_buckets; |
08fdb2cd CL |
680 | k->ptr[0] = MAKE_PTR(0, |
681 | bucket_to_sector(c, ca->sb.d[ja->cur_idx]), | |
682 | ca->sb.nr_this_dev); | |
683 | atomic_long_inc(&c->reclaimed_journal_buckets); | |
cafe5635 | 684 | |
08fdb2cd CL |
685 | bkey_init(k); |
686 | SET_KEY_PTRS(k, 1); | |
4a784266 | 687 | c->journal.blocks_free = ca->sb.bucket_size >> c->block_bits; |
cafe5635 | 688 | |
a34a8bfd | 689 | out: |
cafe5635 KO |
690 | if (!journal_full(&c->journal)) |
691 | __closure_wake_up(&c->journal.wait); | |
692 | } | |
693 | ||
694 | void bch_journal_next(struct journal *j) | |
695 | { | |
696 | atomic_t p = { 1 }; | |
697 | ||
698 | j->cur = (j->cur == j->w) | |
699 | ? &j->w[1] | |
700 | : &j->w[0]; | |
701 | ||
702 | /* | |
703 | * The fifo_push() needs to happen at the same time as j->seq is | |
704 | * incremented for last_seq() to be calculated correctly | |
705 | */ | |
706 | BUG_ON(!fifo_push(&j->pin, p)); | |
707 | atomic_set(&fifo_back(&j->pin), 1); | |
708 | ||
709 | j->cur->data->seq = ++j->seq; | |
dabb4433 | 710 | j->cur->dirty = false; |
cafe5635 KO |
711 | j->cur->need_write = false; |
712 | j->cur->data->keys = 0; | |
713 | ||
714 | if (fifo_full(&j->pin)) | |
46f5aa88 | 715 | pr_debug("journal_pin full (%zu)\n", fifo_used(&j->pin)); |
cafe5635 KO |
716 | } |
717 | ||
4246a0b6 | 718 | static void journal_write_endio(struct bio *bio) |
cafe5635 KO |
719 | { |
720 | struct journal_write *w = bio->bi_private; | |
721 | ||
4e4cbee9 | 722 | cache_set_err_on(bio->bi_status, w->c, "journal io error"); |
7857d5d4 | 723 | closure_put(&w->c->journal.io); |
cafe5635 KO |
724 | } |
725 | ||
d4e3b928 | 726 | static CLOSURE_CALLBACK(journal_write); |
cafe5635 | 727 | |
d4e3b928 | 728 | static CLOSURE_CALLBACK(journal_write_done) |
cafe5635 | 729 | { |
d4e3b928 | 730 | closure_type(j, struct journal, io); |
cafe5635 KO |
731 | struct journal_write *w = (j->cur == j->w) |
732 | ? &j->w[1] | |
733 | : &j->w[0]; | |
734 | ||
735 | __closure_wake_up(&w->wait); | |
0f843e65 | 736 | continue_at_nobarrier(cl, journal_write, bch_journal_wq); |
cafe5635 KO |
737 | } |
738 | ||
d4e3b928 | 739 | static CLOSURE_CALLBACK(journal_write_unlock) |
20d3a518 | 740 | __releases(&c->journal.lock) |
cb7a583e | 741 | { |
d4e3b928 | 742 | closure_type(c, struct cache_set, journal.io); |
cb7a583e KO |
743 | |
744 | c->journal.io_in_flight = 0; | |
745 | spin_unlock(&c->journal.lock); | |
746 | } | |
747 | ||
d4e3b928 | 748 | static CLOSURE_CALLBACK(journal_write_unlocked) |
c19ed23a | 749 | __releases(c->journal.lock) |
cafe5635 | 750 | { |
d4e3b928 | 751 | closure_type(c, struct cache_set, journal.io); |
08fdb2cd | 752 | struct cache *ca = c->cache; |
cafe5635 KO |
753 | struct journal_write *w = c->journal.cur; |
754 | struct bkey *k = &c->journal.key; | |
4e1ebae3 | 755 | unsigned int i, sectors = set_blocks(w->data, block_bytes(ca)) * |
4a784266 | 756 | ca->sb.block_size; |
cafe5635 KO |
757 | |
758 | struct bio *bio; | |
759 | struct bio_list list; | |
1fae7cf0 | 760 | |
cafe5635 KO |
761 | bio_list_init(&list); |
762 | ||
763 | if (!w->need_write) { | |
cb7a583e | 764 | closure_return_with_destructor(cl, journal_write_unlock); |
77b5a084 | 765 | return; |
cafe5635 KO |
766 | } else if (journal_full(&c->journal)) { |
767 | journal_reclaim(c); | |
768 | spin_unlock(&c->journal.lock); | |
769 | ||
770 | btree_flush_write(c); | |
0f843e65 | 771 | continue_at(cl, journal_write, bch_journal_wq); |
77b5a084 | 772 | return; |
cafe5635 KO |
773 | } |
774 | ||
4e1ebae3 | 775 | c->journal.blocks_free -= set_blocks(w->data, block_bytes(ca)); |
cafe5635 KO |
776 | |
777 | w->data->btree_level = c->root->level; | |
778 | ||
779 | bkey_copy(&w->data->btree_root, &c->root->key); | |
780 | bkey_copy(&w->data->uuid_bucket, &c->uuid_bucket); | |
781 | ||
08fdb2cd | 782 | w->data->prio_bucket[ca->sb.nr_this_dev] = ca->prio_buckets[0]; |
4a784266 | 783 | w->data->magic = jset_magic(&ca->sb); |
cafe5635 KO |
784 | w->data->version = BCACHE_JSET_VERSION; |
785 | w->data->last_seq = last_seq(&c->journal); | |
786 | w->data->csum = csum_set(w->data); | |
787 | ||
788 | for (i = 0; i < KEY_PTRS(k); i++) { | |
11e9560e | 789 | ca = c->cache; |
cafe5635 KO |
790 | bio = &ca->journal.bio; |
791 | ||
792 | atomic_long_add(sectors, &ca->meta_sectors_written); | |
793 | ||
a7c50c94 CH |
794 | bio_reset(bio, ca->bdev, REQ_OP_WRITE | |
795 | REQ_SYNC | REQ_META | REQ_PREFLUSH | REQ_FUA); | |
4f024f37 | 796 | bio->bi_iter.bi_sector = PTR_OFFSET(k, i); |
4f024f37 | 797 | bio->bi_iter.bi_size = sectors << 9; |
cafe5635 KO |
798 | |
799 | bio->bi_end_io = journal_write_endio; | |
800 | bio->bi_private = w; | |
ff2695e5 | 801 | bch_bio_map(bio, w->data); |
cafe5635 | 802 | |
e78bd0d2 | 803 | trace_bcache_journal_write(bio, w->data->keys); |
cafe5635 KO |
804 | bio_list_add(&list, bio); |
805 | ||
806 | SET_PTR_OFFSET(k, i, PTR_OFFSET(k, i) + sectors); | |
807 | ||
808 | ca->journal.seq[ca->journal.cur_idx] = w->data->seq; | |
809 | } | |
810 | ||
1bee2add CL |
811 | /* If KEY_PTRS(k) == 0, this jset gets lost in air */ |
812 | BUG_ON(i == 0); | |
813 | ||
cafe5635 KO |
814 | atomic_dec_bug(&fifo_back(&c->journal.pin)); |
815 | bch_journal_next(&c->journal); | |
816 | journal_reclaim(c); | |
817 | ||
818 | spin_unlock(&c->journal.lock); | |
819 | ||
820 | while ((bio = bio_list_pop(&list))) | |
771f393e | 821 | closure_bio_submit(c, bio, cl); |
cafe5635 KO |
822 | |
823 | continue_at(cl, journal_write_done, NULL); | |
824 | } | |
825 | ||
d4e3b928 | 826 | static CLOSURE_CALLBACK(journal_write) |
cafe5635 | 827 | { |
d4e3b928 | 828 | closure_type(c, struct cache_set, journal.io); |
cafe5635 KO |
829 | |
830 | spin_lock(&c->journal.lock); | |
d4e3b928 | 831 | journal_write_unlocked(&cl->work); |
cafe5635 KO |
832 | } |
833 | ||
a34a8bfd | 834 | static void journal_try_write(struct cache_set *c) |
c19ed23a | 835 | __releases(c->journal.lock) |
cafe5635 | 836 | { |
7857d5d4 KO |
837 | struct closure *cl = &c->journal.io; |
838 | struct journal_write *w = c->journal.cur; | |
839 | ||
840 | w->need_write = true; | |
cafe5635 | 841 | |
cb7a583e KO |
842 | if (!c->journal.io_in_flight) { |
843 | c->journal.io_in_flight = 1; | |
844 | closure_call(cl, journal_write_unlocked, NULL, &c->cl); | |
845 | } else { | |
a34a8bfd | 846 | spin_unlock(&c->journal.lock); |
cb7a583e | 847 | } |
cafe5635 KO |
848 | } |
849 | ||
a34a8bfd | 850 | static struct journal_write *journal_wait_for_write(struct cache_set *c, |
6f10f7d1 | 851 | unsigned int nkeys) |
20d3a518 | 852 | __acquires(&c->journal.lock) |
cafe5635 | 853 | { |
a34a8bfd KO |
854 | size_t sectors; |
855 | struct closure cl; | |
5775e213 | 856 | bool wait = false; |
4a784266 | 857 | struct cache *ca = c->cache; |
cafe5635 | 858 | |
a34a8bfd KO |
859 | closure_init_stack(&cl); |
860 | ||
861 | spin_lock(&c->journal.lock); | |
862 | ||
863 | while (1) { | |
864 | struct journal_write *w = c->journal.cur; | |
865 | ||
866 | sectors = __set_blocks(w->data, w->data->keys + nkeys, | |
4a784266 | 867 | block_bytes(ca)) * ca->sb.block_size; |
a34a8bfd KO |
868 | |
869 | if (sectors <= min_t(size_t, | |
4a784266 | 870 | c->journal.blocks_free * ca->sb.block_size, |
a34a8bfd KO |
871 | PAGE_SECTORS << JSET_BITS)) |
872 | return w; | |
873 | ||
5775e213 KO |
874 | if (wait) |
875 | closure_wait(&c->journal.wait, &cl); | |
876 | ||
a34a8bfd | 877 | if (!journal_full(&c->journal)) { |
5775e213 KO |
878 | if (wait) |
879 | trace_bcache_journal_entry_full(c); | |
a34a8bfd KO |
880 | |
881 | /* | |
882 | * XXX: If we were inserting so many keys that they | |
883 | * won't fit in an _empty_ journal write, we'll | |
884 | * deadlock. For now, handle this in | |
885 | * bch_keylist_realloc() - but something to think about. | |
886 | */ | |
887 | BUG_ON(!w->data->keys); | |
888 | ||
a34a8bfd KO |
889 | journal_try_write(c); /* unlocks */ |
890 | } else { | |
5775e213 KO |
891 | if (wait) |
892 | trace_bcache_journal_full(c); | |
a34a8bfd | 893 | |
a34a8bfd KO |
894 | journal_reclaim(c); |
895 | spin_unlock(&c->journal.lock); | |
cafe5635 | 896 | |
a34a8bfd KO |
897 | btree_flush_write(c); |
898 | } | |
cafe5635 | 899 | |
a34a8bfd KO |
900 | closure_sync(&cl); |
901 | spin_lock(&c->journal.lock); | |
5775e213 | 902 | wait = true; |
cafe5635 KO |
903 | } |
904 | } | |
905 | ||
7857d5d4 KO |
906 | static void journal_write_work(struct work_struct *work) |
907 | { | |
908 | struct cache_set *c = container_of(to_delayed_work(work), | |
909 | struct cache_set, | |
910 | journal.work); | |
911 | spin_lock(&c->journal.lock); | |
dabb4433 KO |
912 | if (c->journal.cur->dirty) |
913 | journal_try_write(c); | |
914 | else | |
915 | spin_unlock(&c->journal.lock); | |
7857d5d4 KO |
916 | } |
917 | ||
cafe5635 KO |
918 | /* |
919 | * Entry point to the journalling code - bio_insert() and btree_invalidate() | |
920 | * pass bch_journal() a list of keys to be journalled, and then | |
921 | * bch_journal() hands those same keys off to btree_insert_async() | |
922 | */ | |
923 | ||
a34a8bfd KO |
924 | atomic_t *bch_journal(struct cache_set *c, |
925 | struct keylist *keys, | |
926 | struct closure *parent) | |
cafe5635 | 927 | { |
cafe5635 | 928 | struct journal_write *w; |
a34a8bfd | 929 | atomic_t *ret; |
cafe5635 | 930 | |
383ff218 CL |
931 | /* No journaling if CACHE_SET_IO_DISABLE set already */ |
932 | if (unlikely(test_bit(CACHE_SET_IO_DISABLE, &c->flags))) | |
933 | return NULL; | |
934 | ||
6f9414e0 | 935 | if (!CACHE_SYNC(&c->cache->sb)) |
a34a8bfd | 936 | return NULL; |
cafe5635 | 937 | |
a34a8bfd | 938 | w = journal_wait_for_write(c, bch_keylist_nkeys(keys)); |
c37511b8 | 939 | |
fafff81c | 940 | memcpy(bset_bkey_last(w->data), keys->keys, bch_keylist_bytes(keys)); |
a34a8bfd | 941 | w->data->keys += bch_keylist_nkeys(keys); |
cafe5635 | 942 | |
a34a8bfd KO |
943 | ret = &fifo_back(&c->journal.pin); |
944 | atomic_inc(ret); | |
cafe5635 | 945 | |
a34a8bfd KO |
946 | if (parent) { |
947 | closure_wait(&w->wait, parent); | |
7857d5d4 | 948 | journal_try_write(c); |
dabb4433 KO |
949 | } else if (!w->dirty) { |
950 | w->dirty = true; | |
afe78ab4 KK |
951 | queue_delayed_work(bch_flush_wq, &c->journal.work, |
952 | msecs_to_jiffies(c->journal_delay_ms)); | |
7857d5d4 KO |
953 | spin_unlock(&c->journal.lock); |
954 | } else { | |
955 | spin_unlock(&c->journal.lock); | |
cafe5635 | 956 | } |
a34a8bfd KO |
957 | |
958 | ||
959 | return ret; | |
960 | } | |
961 | ||
962 | void bch_journal_meta(struct cache_set *c, struct closure *cl) | |
963 | { | |
964 | struct keylist keys; | |
965 | atomic_t *ref; | |
966 | ||
967 | bch_keylist_init(&keys); | |
968 | ||
969 | ref = bch_journal(c, &keys, cl); | |
970 | if (ref) | |
971 | atomic_dec_bug(ref); | |
cafe5635 KO |
972 | } |
973 | ||
974 | void bch_journal_free(struct cache_set *c) | |
975 | { | |
976 | free_pages((unsigned long) c->journal.w[1].data, JSET_BITS); | |
977 | free_pages((unsigned long) c->journal.w[0].data, JSET_BITS); | |
978 | free_fifo(&c->journal.pin); | |
979 | } | |
980 | ||
981 | int bch_journal_alloc(struct cache_set *c) | |
982 | { | |
983 | struct journal *j = &c->journal; | |
984 | ||
cafe5635 | 985 | spin_lock_init(&j->lock); |
91be66e1 | 986 | spin_lock_init(&j->flush_write_lock); |
7857d5d4 | 987 | INIT_DELAYED_WORK(&j->work, journal_write_work); |
cafe5635 KO |
988 | |
989 | c->journal_delay_ms = 100; | |
990 | ||
991 | j->w[0].c = c; | |
992 | j->w[1].c = c; | |
993 | ||
249a5f6d | 994 | if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL)) || |
5fe48867 CL |
995 | !(j->w[0].data = (void *) __get_free_pages(GFP_KERNEL|__GFP_COMP, JSET_BITS)) || |
996 | !(j->w[1].data = (void *) __get_free_pages(GFP_KERNEL|__GFP_COMP, JSET_BITS))) | |
cafe5635 KO |
997 | return -ENOMEM; |
998 | ||
999 | return 0; | |
1000 | } |