// SPDX-License-Identifier: GPL-2.0

#include <linux/sizes.h>
#include <linux/list_sort.h>
#include "misc.h"
#include "ctree.h"
#include "block-group.h"
#include "space-info.h"
#include "disk-io.h"
#include "free-space-cache.h"
#include "free-space-tree.h"
#include "volumes.h"
#include "transaction.h"
#include "ref-verify.h"
#include "sysfs.h"
#include "tree-log.h"
#include "delalloc-space.h"
#include "discard.h"
#include "raid56.h"
#include "zoned.h"
#include "fs.h"
#include "accessors.h"
#include "extent-tree.h"

#ifdef CONFIG_BTRFS_DEBUG
int btrfs_should_fragment_free_space(struct btrfs_block_group *block_group)
{
	struct btrfs_fs_info *fs_info = block_group->fs_info;

	return (btrfs_test_opt(fs_info, FRAGMENT_METADATA) &&
		block_group->flags & BTRFS_BLOCK_GROUP_METADATA) ||
	       (btrfs_test_opt(fs_info, FRAGMENT_DATA) &&
		block_group->flags & BTRFS_BLOCK_GROUP_DATA);
}
#endif

/*
 * Return target flags in extended format or 0 if restripe for this chunk_type
 * is not in progress
 *
 * Should be called with balance_lock held
 */
static u64 get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags)
{
	struct btrfs_balance_control *bctl = fs_info->balance_ctl;
	u64 target = 0;

	if (!bctl)
		return 0;

	if (flags & BTRFS_BLOCK_GROUP_DATA &&
	    bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) {
		target = BTRFS_BLOCK_GROUP_DATA | bctl->data.target;
	} else if (flags & BTRFS_BLOCK_GROUP_SYSTEM &&
		   bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
		target = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target;
	} else if (flags & BTRFS_BLOCK_GROUP_METADATA &&
		   bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) {
		target = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target;
	}

	return target;
}

/*
 * @flags: available profiles in extended format (see ctree.h)
 *
 * Return reduced profile in chunk format. If profile changing is in progress
 * (either running or paused) picks the target profile (if it's already
 * available), otherwise falls back to plain reducing.
 */
static u64 btrfs_reduce_alloc_profile(struct btrfs_fs_info *fs_info, u64 flags)
{
	u64 num_devices = fs_info->fs_devices->rw_devices;
	u64 target;
	u64 raid_type;
	u64 allowed = 0;

	/*
	 * See if restripe for this chunk_type is in progress, if so try to
	 * reduce to the target profile
	 */
	spin_lock(&fs_info->balance_lock);
	target = get_restripe_target(fs_info, flags);
	if (target) {
		spin_unlock(&fs_info->balance_lock);
		return extended_to_chunk(target);
	}
	spin_unlock(&fs_info->balance_lock);

	/* First, mask out the RAID levels which aren't possible */
	for (raid_type = 0; raid_type < BTRFS_NR_RAID_TYPES; raid_type++) {
		if (num_devices >= btrfs_raid_array[raid_type].devs_min)
			allowed |= btrfs_raid_array[raid_type].bg_flag;
	}
	allowed &= flags;

	/* Select the highest-redundancy RAID level. */
	if (allowed & BTRFS_BLOCK_GROUP_RAID1C4)
		allowed = BTRFS_BLOCK_GROUP_RAID1C4;
	else if (allowed & BTRFS_BLOCK_GROUP_RAID6)
		allowed = BTRFS_BLOCK_GROUP_RAID6;
	else if (allowed & BTRFS_BLOCK_GROUP_RAID1C3)
		allowed = BTRFS_BLOCK_GROUP_RAID1C3;
	else if (allowed & BTRFS_BLOCK_GROUP_RAID5)
		allowed = BTRFS_BLOCK_GROUP_RAID5;
	else if (allowed & BTRFS_BLOCK_GROUP_RAID10)
		allowed = BTRFS_BLOCK_GROUP_RAID10;
	else if (allowed & BTRFS_BLOCK_GROUP_RAID1)
		allowed = BTRFS_BLOCK_GROUP_RAID1;
	else if (allowed & BTRFS_BLOCK_GROUP_DUP)
		allowed = BTRFS_BLOCK_GROUP_DUP;
	else if (allowed & BTRFS_BLOCK_GROUP_RAID0)
		allowed = BTRFS_BLOCK_GROUP_RAID0;

	flags &= ~BTRFS_BLOCK_GROUP_PROFILE_MASK;

	return extended_to_chunk(flags | allowed);
}

u64 btrfs_get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags)
{
	unsigned seq;
	u64 flags;

	do {
		flags = orig_flags;
		seq = read_seqbegin(&fs_info->profiles_lock);

		if (flags & BTRFS_BLOCK_GROUP_DATA)
			flags |= fs_info->avail_data_alloc_bits;
		else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
			flags |= fs_info->avail_system_alloc_bits;
		else if (flags & BTRFS_BLOCK_GROUP_METADATA)
			flags |= fs_info->avail_metadata_alloc_bits;
	} while (read_seqretry(&fs_info->profiles_lock, seq));

	return btrfs_reduce_alloc_profile(fs_info, flags);
}
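
/*
 * Illustrative example (not part of the original file): with the masking and
 * redundancy ordering above, a metadata profile of RAID6 | RAID1 on a
 * filesystem that currently has only two writable devices loses the RAID6
 * bit (its devs_min in btrfs_raid_array is not satisfied) and reduces to
 * RAID1. If a running or paused balance is converting that chunk type,
 * get_restripe_target() short-circuits the reduction and the conversion
 * target is returned instead.
 */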

void btrfs_get_block_group(struct btrfs_block_group *cache)
{
	refcount_inc(&cache->refs);
}

void btrfs_put_block_group(struct btrfs_block_group *cache)
{
	if (refcount_dec_and_test(&cache->refs)) {
		WARN_ON(cache->pinned > 0);
		/*
		 * If there was a failure to cleanup a log tree, very likely due
		 * to an IO failure on a writeback attempt of one or more of its
		 * extent buffers, we could not do proper (and cheap) unaccounting
		 * of their reserved space, so don't warn on reserved > 0 in that
		 * case.
		 */
		if (!(cache->flags & BTRFS_BLOCK_GROUP_METADATA) ||
		    !BTRFS_FS_LOG_CLEANUP_ERROR(cache->fs_info))
			WARN_ON(cache->reserved > 0);

		/*
		 * A block_group shouldn't be on the discard_list anymore.
		 * Remove the block_group from the discard_list to prevent us
		 * from causing a panic due to NULL pointer dereference.
		 */
		if (WARN_ON(!list_empty(&cache->discard_list)))
			btrfs_discard_cancel_work(&cache->fs_info->discard_ctl,
						  cache);

		kfree(cache->free_space_ctl);
		btrfs_free_chunk_map(cache->physical_map);
		kfree(cache);
	}
}

/*
 * This adds the block group to the fs_info rb tree for the block group cache
 */
static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
				       struct btrfs_block_group *block_group)
{
	struct rb_node **p;
	struct rb_node *parent = NULL;
	struct btrfs_block_group *cache;
	bool leftmost = true;

	ASSERT(block_group->length != 0);

	write_lock(&info->block_group_cache_lock);
	p = &info->block_group_cache_tree.rb_root.rb_node;

	while (*p) {
		parent = *p;
		cache = rb_entry(parent, struct btrfs_block_group, cache_node);
		if (block_group->start < cache->start) {
			p = &(*p)->rb_left;
		} else if (block_group->start > cache->start) {
			p = &(*p)->rb_right;
			leftmost = false;
		} else {
			write_unlock(&info->block_group_cache_lock);
			return -EEXIST;
		}
	}

	rb_link_node(&block_group->cache_node, parent, p);
	rb_insert_color_cached(&block_group->cache_node,
			       &info->block_group_cache_tree, leftmost);

	write_unlock(&info->block_group_cache_lock);

	return 0;
}

/*
 * This will return the block group at or after bytenr if contains is 0, else
 * it will return the block group that contains the bytenr
 */
static struct btrfs_block_group *block_group_cache_tree_search(
		struct btrfs_fs_info *info, u64 bytenr, int contains)
{
	struct btrfs_block_group *cache, *ret = NULL;
	struct rb_node *n;
	u64 end, start;

	read_lock(&info->block_group_cache_lock);
	n = info->block_group_cache_tree.rb_root.rb_node;

	while (n) {
		cache = rb_entry(n, struct btrfs_block_group, cache_node);
		end = cache->start + cache->length - 1;
		start = cache->start;

		if (bytenr < start) {
			if (!contains && (!ret || start < ret->start))
				ret = cache;
			n = n->rb_left;
		} else if (bytenr > start) {
			if (contains && bytenr <= end) {
				ret = cache;
				break;
			}
			n = n->rb_right;
		} else {
			ret = cache;
			break;
		}
	}
	if (ret)
		btrfs_get_block_group(ret);
	read_unlock(&info->block_group_cache_lock);

	return ret;
}

/*
 * Return the block group that starts at or after bytenr
 */
struct btrfs_block_group *btrfs_lookup_first_block_group(
		struct btrfs_fs_info *info, u64 bytenr)
{
	return block_group_cache_tree_search(info, bytenr, 0);
}

/*
 * Return the block group that contains the given bytenr
 */
struct btrfs_block_group *btrfs_lookup_block_group(
		struct btrfs_fs_info *info, u64 bytenr)
{
	return block_group_cache_tree_search(info, bytenr, 1);
}
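
/*
 * Illustrative usage (not part of the original file): both lookup helpers
 * return a referenced block group, so callers pair them with
 * btrfs_put_block_group(), e.g.:
 *
 *	struct btrfs_block_group *bg;
 *
 *	bg = btrfs_lookup_block_group(fs_info, bytenr);
 *	if (bg) {
 *		... use bg ...
 *		btrfs_put_block_group(bg);
 *	}
 *
 * btrfs_lookup_block_group() only succeeds if @bytenr falls inside an
 * existing block group, while btrfs_lookup_first_block_group() also accepts
 * an address in a hole and returns the next block group after it.
 */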

struct btrfs_block_group *btrfs_next_block_group(
		struct btrfs_block_group *cache)
{
	struct btrfs_fs_info *fs_info = cache->fs_info;
	struct rb_node *node;

	read_lock(&fs_info->block_group_cache_lock);

	/* If our block group was removed, we need a full search. */
	if (RB_EMPTY_NODE(&cache->cache_node)) {
		const u64 next_bytenr = cache->start + cache->length;

		read_unlock(&fs_info->block_group_cache_lock);
		btrfs_put_block_group(cache);
		return btrfs_lookup_first_block_group(fs_info, next_bytenr);
	}
	node = rb_next(&cache->cache_node);
	btrfs_put_block_group(cache);
	if (node) {
		cache = rb_entry(node, struct btrfs_block_group, cache_node);
		btrfs_get_block_group(cache);
	} else
		cache = NULL;
	read_unlock(&fs_info->block_group_cache_lock);
	return cache;
}

/*
 * Check if we can do a NOCOW write for a given extent.
 *
 * @fs_info:	The filesystem information object.
 * @bytenr:	Logical start address of the extent.
 *
 * Check if we can do a NOCOW write for the given extent, and increment the
 * number of NOCOW writers in the block group that contains the extent, as long
 * as the block group exists and it's currently not in read-only mode.
 *
 * Returns: A non-NULL block group pointer if we can do a NOCOW write, the caller
 * is responsible for calling btrfs_dec_nocow_writers() later.
 *
 * Or NULL if we can not do a NOCOW write
 */
struct btrfs_block_group *btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info,
						  u64 bytenr)
{
	struct btrfs_block_group *bg;
	bool can_nocow = true;

	bg = btrfs_lookup_block_group(fs_info, bytenr);
	if (!bg)
		return NULL;

	spin_lock(&bg->lock);
	if (bg->ro)
		can_nocow = false;
	else
		atomic_inc(&bg->nocow_writers);
	spin_unlock(&bg->lock);

	if (!can_nocow) {
		btrfs_put_block_group(bg);
		return NULL;
	}

	/* No put on block group, done by btrfs_dec_nocow_writers(). */
	return bg;
}

/*
 * Decrement the number of NOCOW writers in a block group.
 *
 * This is meant to be called after a previous call to btrfs_inc_nocow_writers(),
 * and on the block group returned by that call. Typically this is called after
 * creating an ordered extent for a NOCOW write, to prevent races with scrub and
 * relocation.
 *
 * After this call, the caller should not use the block group anymore. If it wants
 * to use it, then it should get a reference on it before calling this function.
 */
void btrfs_dec_nocow_writers(struct btrfs_block_group *bg)
{
	if (atomic_dec_and_test(&bg->nocow_writers))
		wake_up_var(&bg->nocow_writers);

	/* For the lookup done by a previous call to btrfs_inc_nocow_writers(). */
	btrfs_put_block_group(bg);
}
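
/*
 * Illustrative usage sketch (not part of the original file): a NOCOW write
 * path typically brackets ordered extent creation with the two helpers
 * above, e.g.:
 *
 *	bg = btrfs_inc_nocow_writers(fs_info, disk_bytenr);
 *	if (!bg)
 *		fall back to a COW write;
 *	... create the ordered extent for the NOCOW write ...
 *	btrfs_dec_nocow_writers(bg);
 *
 * The reference taken by btrfs_inc_nocow_writers() is dropped by
 * btrfs_dec_nocow_writers(), so @bg must not be used afterwards without
 * taking an extra reference first.
 */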

void btrfs_wait_nocow_writers(struct btrfs_block_group *bg)
{
	wait_var_event(&bg->nocow_writers, !atomic_read(&bg->nocow_writers));
}

void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info,
					const u64 start)
{
	struct btrfs_block_group *bg;

	bg = btrfs_lookup_block_group(fs_info, start);
	ASSERT(bg);
	if (atomic_dec_and_test(&bg->reservations))
		wake_up_var(&bg->reservations);
	btrfs_put_block_group(bg);
}

void btrfs_wait_block_group_reservations(struct btrfs_block_group *bg)
{
	struct btrfs_space_info *space_info = bg->space_info;

	ASSERT(bg->ro);

	if (!(bg->flags & BTRFS_BLOCK_GROUP_DATA))
		return;

	/*
	 * Our block group is read only but before we set it to read only,
	 * some task might have allocated an extent from it already, but it
	 * has not yet created a respective ordered extent (and added it to a
	 * root's list of ordered extents).
	 * Therefore wait for any task currently allocating extents, since the
	 * block group's reservations counter is incremented while a read lock
	 * on the groups' semaphore is held and decremented after releasing
	 * the read access on that semaphore and creating the ordered extent.
	 */
	down_write(&space_info->groups_sem);
	up_write(&space_info->groups_sem);

	wait_var_event(&bg->reservations, !atomic_read(&bg->reservations));
}

struct btrfs_caching_control *btrfs_get_caching_control(
		struct btrfs_block_group *cache)
{
	struct btrfs_caching_control *ctl;

	spin_lock(&cache->lock);
	if (!cache->caching_ctl) {
		spin_unlock(&cache->lock);
		return NULL;
	}

	ctl = cache->caching_ctl;
	refcount_inc(&ctl->count);
	spin_unlock(&cache->lock);
	return ctl;
}

void btrfs_put_caching_control(struct btrfs_caching_control *ctl)
{
	if (refcount_dec_and_test(&ctl->count))
		kfree(ctl);
}

/*
 * When we wait for progress in the block group caching, it's because our
 * allocation attempt failed at least once. So, we must sleep and let some
 * progress happen before we try again.
 *
 * This function will sleep at least once waiting for new free space to show
 * up, and then it will check the block group free space numbers for our min
 * num_bytes. Another option is to have it go ahead and look in the rbtree for
 * a free extent of a given size, but this is a good start.
 *
 * Callers of this must check if cache->cached == BTRFS_CACHE_ERROR before using
 * any of the information in this block group.
 */
void btrfs_wait_block_group_cache_progress(struct btrfs_block_group *cache,
					   u64 num_bytes)
{
	struct btrfs_caching_control *caching_ctl;
	int progress;

	caching_ctl = btrfs_get_caching_control(cache);
	if (!caching_ctl)
		return;

	/*
	 * We've already failed to allocate from this block group, so even if
	 * there's enough space in the block group it isn't contiguous enough to
	 * allow for an allocation, so wait for at least the next wakeup tick,
	 * or for the thing to be done.
	 */
	progress = atomic_read(&caching_ctl->progress);

	wait_event(caching_ctl->wait, btrfs_block_group_done(cache) ||
		   (progress != atomic_read(&caching_ctl->progress) &&
		    (cache->free_space_ctl->free_space >= num_bytes)));

	btrfs_put_caching_control(caching_ctl);
}

static int btrfs_caching_ctl_wait_done(struct btrfs_block_group *cache,
				       struct btrfs_caching_control *caching_ctl)
{
	wait_event(caching_ctl->wait, btrfs_block_group_done(cache));
	return cache->cached == BTRFS_CACHE_ERROR ? -EIO : 0;
}

static int btrfs_wait_block_group_cache_done(struct btrfs_block_group *cache)
{
	struct btrfs_caching_control *caching_ctl;
	int ret;

	caching_ctl = btrfs_get_caching_control(cache);
	if (!caching_ctl)
		return (cache->cached == BTRFS_CACHE_ERROR) ? -EIO : 0;
	ret = btrfs_caching_ctl_wait_done(cache, caching_ctl);
	btrfs_put_caching_control(caching_ctl);
	return ret;
}
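
/*
 * Summary (not part of the original file): a block group's cache state moves
 * roughly as BTRFS_CACHE_NO -> BTRFS_CACHE_STARTED -> BTRFS_CACHE_FINISHED,
 * or BTRFS_CACHE_ERROR on failure, driven by btrfs_cache_block_group() and
 * caching_thread() below. The two wait helpers differ in what they wait for:
 * btrfs_wait_block_group_cache_progress() returns once caching is done, or
 * once some progress happened and at least @num_bytes of free space is
 * available, while btrfs_wait_block_group_cache_done() always waits for the
 * final state and returns -EIO if caching ended in BTRFS_CACHE_ERROR.
 */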

#ifdef CONFIG_BTRFS_DEBUG
static void fragment_free_space(struct btrfs_block_group *block_group)
{
	struct btrfs_fs_info *fs_info = block_group->fs_info;
	u64 start = block_group->start;
	u64 len = block_group->length;
	u64 chunk = block_group->flags & BTRFS_BLOCK_GROUP_METADATA ?
		fs_info->nodesize : fs_info->sectorsize;
	u64 step = chunk << 1;

	while (len > chunk) {
		btrfs_remove_free_space(block_group, start, chunk);
		start += step;
		if (len < step)
			len = 0;
		else
			len -= step;
	}
}
#endif

/*
 * Add a free space range to the in memory free space cache of a block group.
 * This checks if the range contains super block locations and any such
 * locations are not added to the free space cache.
 *
 * @block_group:      The target block group.
 * @start:            Start offset of the range.
 * @end:              End offset of the range (exclusive).
 * @total_added_ret:  Optional pointer to return the total amount of space
 *                    added to the block group's free space cache.
 *
 * Returns 0 on success or < 0 on error.
 */
int btrfs_add_new_free_space(struct btrfs_block_group *block_group, u64 start,
			     u64 end, u64 *total_added_ret)
{
	struct btrfs_fs_info *info = block_group->fs_info;
	u64 extent_start, extent_end, size;
	int ret;

	if (total_added_ret)
		*total_added_ret = 0;

	while (start < end) {
		if (!find_first_extent_bit(&info->excluded_extents, start,
					   &extent_start, &extent_end,
					   EXTENT_DIRTY | EXTENT_UPTODATE,
					   NULL))
			break;

		if (extent_start <= start) {
			start = extent_end + 1;
		} else if (extent_start > start && extent_start < end) {
			size = extent_start - start;
			ret = btrfs_add_free_space_async_trimmed(block_group,
								 start, size);
			if (ret)
				return ret;
			if (total_added_ret)
				*total_added_ret += size;
			start = extent_end + 1;
		} else {
			break;
		}
	}

	if (start < end) {
		size = end - start;
		ret = btrfs_add_free_space_async_trimmed(block_group, start,
							 size);
		if (ret)
			return ret;
		if (total_added_ret)
			*total_added_ret += size;
	}

	return 0;
}
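
/*
 * Worked example (not part of the original file, offsets are illustrative):
 * if a block group spans [1G, 2G) and the excluded_extents tree marks a
 * superblock mirror at [1G + 64M, 1G + 64M + 64K], then
 *
 *	btrfs_add_new_free_space(bg, bg->start, bg->start + bg->length, &added);
 *
 * adds [1G, 1G + 64M) and (1G + 64M + 64K, 2G) to the in-memory free space
 * cache, skips the excluded range, and reports the sum of the two pieces in
 * @added.
 */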

/*
 * Get an arbitrary extent item index / max_index through the block group
 *
 * @block_group: the block group to sample from
 * @index:       the integral step through the block group to grab from
 * @max_index:   the granularity of the sampling
 * @found_key:   return value parameter for the item we find
 *
 * Pre-conditions on indices:
 * 0 <= index <= max_index
 * 0 < max_index
 *
 * Returns: 0 on success, 1 if the search didn't yield a useful item, negative
 * error code on error.
 */
static int sample_block_group_extent_item(struct btrfs_caching_control *caching_ctl,
					  struct btrfs_block_group *block_group,
					  int index, int max_index,
					  struct btrfs_key *found_key)
{
	struct btrfs_fs_info *fs_info = block_group->fs_info;
	struct btrfs_root *extent_root;
	u64 search_offset;
	u64 search_end = block_group->start + block_group->length;
	struct btrfs_path *path;
	struct btrfs_key search_key;
	int ret = 0;

	ASSERT(index >= 0);
	ASSERT(index <= max_index);
	ASSERT(max_index > 0);
	lockdep_assert_held(&caching_ctl->mutex);
	lockdep_assert_held_read(&fs_info->commit_root_sem);

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	extent_root = btrfs_extent_root(fs_info, max_t(u64, block_group->start,
						       BTRFS_SUPER_INFO_OFFSET));

	path->skip_locking = 1;
	path->search_commit_root = 1;
	path->reada = READA_FORWARD;

	search_offset = index * div_u64(block_group->length, max_index);
	search_key.objectid = block_group->start + search_offset;
	search_key.type = BTRFS_EXTENT_ITEM_KEY;
	search_key.offset = 0;

	btrfs_for_each_slot(extent_root, &search_key, found_key, path, ret) {
		/* Success; sampled an extent item in the block group */
		if (found_key->type == BTRFS_EXTENT_ITEM_KEY &&
		    found_key->objectid >= block_group->start &&
		    found_key->objectid + found_key->offset <= search_end)
			break;

		/* We can't possibly find a valid extent item anymore */
		if (found_key->objectid >= search_end) {
			ret = 1;
			break;
		}
	}

	lockdep_assert_held(&caching_ctl->mutex);
	lockdep_assert_held_read(&fs_info->commit_root_sem);
	btrfs_free_path(path);
	return ret;
}

/*
 * Best effort attempt to compute a block group's size class while caching it.
 *
 * @block_group: the block group we are caching
 *
 * We cannot infer the size class while adding free space extents, because that
 * logic doesn't care about contiguous file extents (it doesn't differentiate
 * between a 100M extent and 100 contiguous 1M extents). So we need to read the
 * file extent items. Reading all of them is quite wasteful, because usually
 * only a handful are enough to give a good answer. Therefore, we just grab 5 of
 * them at even steps through the block group and pick the smallest size class
 * we see. Since size class is best effort, and not guaranteed in general,
 * inaccuracy is acceptable.
 *
 * To be more explicit about why this algorithm makes sense:
 *
 * If we are caching in a block group from disk, then there are three major cases
 * to consider:
 * 1. the block group is well behaved and all extents in it are the same size
 *    class.
 * 2. the block group is mostly one size class with rare exceptions for last
 *    ditch allocations
 * 3. the block group was populated before size classes and can have a totally
 *    arbitrary mix of size classes.
 *
 * In case 1, looking at any extent in the block group will yield the correct
 * result. For the mixed cases, taking the minimum size class seems like a good
 * approximation, since gaps from frees will be usable to the size class. For
 * 2., a small handful of file extents is likely to yield the right answer. For
 * 3, we can either read every file extent, or admit that this is best effort
 * anyway and try to stay fast.
 *
 * Returns: 0 on success, negative error code on error.
 */
static int load_block_group_size_class(struct btrfs_caching_control *caching_ctl,
				       struct btrfs_block_group *block_group)
{
	struct btrfs_fs_info *fs_info = block_group->fs_info;
	struct btrfs_key key;
	int i;
	u64 min_size = block_group->length;
	enum btrfs_block_group_size_class size_class = BTRFS_BG_SZ_NONE;
	int ret;

	if (!btrfs_block_group_should_use_size_class(block_group))
		return 0;

	lockdep_assert_held(&caching_ctl->mutex);
	lockdep_assert_held_read(&fs_info->commit_root_sem);
	for (i = 0; i < 5; ++i) {
		ret = sample_block_group_extent_item(caching_ctl, block_group, i, 5, &key);
		if (ret < 0)
			goto out;
		if (ret > 0)
			continue;
		min_size = min_t(u64, min_size, key.offset);
		size_class = btrfs_calc_block_group_size_class(min_size);
	}
	if (size_class != BTRFS_BG_SZ_NONE) {
		spin_lock(&block_group->lock);
		block_group->size_class = size_class;
		spin_unlock(&block_group->lock);
	}
out:
	return ret;
}
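
/*
 * Illustrative example (not part of the original file): for a data block
 * group the loop above samples extent items at five evenly spaced offsets
 * (i * length / 5 for i = 0..4). If three samples miss and the two hits
 * return extents of 8M and 128K, min_size ends up at 128K and the block
 * group's size class is whatever btrfs_calc_block_group_size_class() maps
 * that smallest sampled extent to, i.e. the smallest observed class wins.
 */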

static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl)
{
	struct btrfs_block_group *block_group = caching_ctl->block_group;
	struct btrfs_fs_info *fs_info = block_group->fs_info;
	struct btrfs_root *extent_root;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_key key;
	u64 total_found = 0;
	u64 last = 0;
	u32 nritems;
	int ret;
	bool wakeup = true;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	last = max_t(u64, block_group->start, BTRFS_SUPER_INFO_OFFSET);
	extent_root = btrfs_extent_root(fs_info, last);

#ifdef CONFIG_BTRFS_DEBUG
	/*
	 * If we're fragmenting we don't want to make anybody think we can
	 * allocate from this block group until we've had a chance to fragment
	 * the free space.
	 */
	if (btrfs_should_fragment_free_space(block_group))
		wakeup = false;
#endif
	/*
	 * We don't want to deadlock with somebody trying to allocate a new
	 * extent for the extent root while also trying to search the extent
	 * root to add free space. So we skip locking and search the commit
	 * root, since it's read-only.
	 */
	path->skip_locking = 1;
	path->search_commit_root = 1;
	path->reada = READA_FORWARD;

	key.objectid = last;
	key.offset = 0;
	key.type = BTRFS_EXTENT_ITEM_KEY;

next:
	ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
	if (ret < 0)
		goto out;

	leaf = path->nodes[0];
	nritems = btrfs_header_nritems(leaf);

	while (1) {
		if (btrfs_fs_closing(fs_info) > 1) {
			last = (u64)-1;
			break;
		}

		if (path->slots[0] < nritems) {
			btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
		} else {
			ret = btrfs_find_next_key(extent_root, path, &key, 0, 0);
			if (ret)
				break;

			if (need_resched() ||
			    rwsem_is_contended(&fs_info->commit_root_sem)) {
				btrfs_release_path(path);
				up_read(&fs_info->commit_root_sem);
				mutex_unlock(&caching_ctl->mutex);
				cond_resched();
				mutex_lock(&caching_ctl->mutex);
				down_read(&fs_info->commit_root_sem);
				goto next;
			}

			ret = btrfs_next_leaf(extent_root, path);
			if (ret < 0)
				goto out;
			if (ret)
				break;
			leaf = path->nodes[0];
			nritems = btrfs_header_nritems(leaf);
			continue;
		}

		if (key.objectid < last) {
			key.objectid = last;
			key.offset = 0;
			key.type = BTRFS_EXTENT_ITEM_KEY;
			btrfs_release_path(path);
			goto next;
		}

		if (key.objectid < block_group->start) {
			path->slots[0]++;
			continue;
		}

		if (key.objectid >= block_group->start + block_group->length)
			break;

		if (key.type == BTRFS_EXTENT_ITEM_KEY ||
		    key.type == BTRFS_METADATA_ITEM_KEY) {
			u64 space_added;

			ret = btrfs_add_new_free_space(block_group, last,
						       key.objectid, &space_added);
			if (ret)
				goto out;
			total_found += space_added;
			if (key.type == BTRFS_METADATA_ITEM_KEY)
				last = key.objectid +
					fs_info->nodesize;
			else
				last = key.objectid + key.offset;

			if (total_found > CACHING_CTL_WAKE_UP) {
				total_found = 0;
				if (wakeup) {
					atomic_inc(&caching_ctl->progress);
					wake_up(&caching_ctl->wait);
				}
			}
		}
		path->slots[0]++;
	}

	ret = btrfs_add_new_free_space(block_group, last,
				       block_group->start + block_group->length,
				       NULL);
out:
	btrfs_free_path(path);
	return ret;
}

static inline void btrfs_free_excluded_extents(const struct btrfs_block_group *bg)
{
	clear_extent_bits(&bg->fs_info->excluded_extents, bg->start,
			  bg->start + bg->length - 1, EXTENT_UPTODATE);
}

static noinline void caching_thread(struct btrfs_work *work)
{
	struct btrfs_block_group *block_group;
	struct btrfs_fs_info *fs_info;
	struct btrfs_caching_control *caching_ctl;
	int ret;

	caching_ctl = container_of(work, struct btrfs_caching_control, work);
	block_group = caching_ctl->block_group;
	fs_info = block_group->fs_info;

	mutex_lock(&caching_ctl->mutex);
	down_read(&fs_info->commit_root_sem);

	load_block_group_size_class(caching_ctl, block_group);
	if (btrfs_test_opt(fs_info, SPACE_CACHE)) {
		ret = load_free_space_cache(block_group);
		if (ret == 1) {
			ret = 0;
			goto done;
		}

		/*
		 * We failed to load the space cache, set ourselves to
		 * CACHE_STARTED and carry on.
		 */
		spin_lock(&block_group->lock);
		block_group->cached = BTRFS_CACHE_STARTED;
		spin_unlock(&block_group->lock);
		wake_up(&caching_ctl->wait);
	}

	/*
	 * If we are in the transaction that populated the free space tree we
	 * can't actually cache from the free space tree as our commit root and
	 * real root are the same, so we could change the contents of the blocks
	 * while caching. Instead do the slow caching in this case, and after
	 * the transaction has committed we will be safe.
	 */
	if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE) &&
	    !(test_bit(BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED, &fs_info->flags)))
		ret = load_free_space_tree(caching_ctl);
	else
		ret = load_extent_tree_free(caching_ctl);
done:
	spin_lock(&block_group->lock);
	block_group->caching_ctl = NULL;
	block_group->cached = ret ? BTRFS_CACHE_ERROR : BTRFS_CACHE_FINISHED;
	spin_unlock(&block_group->lock);

#ifdef CONFIG_BTRFS_DEBUG
	if (btrfs_should_fragment_free_space(block_group)) {
		u64 bytes_used;

		spin_lock(&block_group->space_info->lock);
		spin_lock(&block_group->lock);
		bytes_used = block_group->length - block_group->used;
		block_group->space_info->bytes_used += bytes_used >> 1;
		spin_unlock(&block_group->lock);
		spin_unlock(&block_group->space_info->lock);
		fragment_free_space(block_group);
	}
#endif

	up_read(&fs_info->commit_root_sem);
	btrfs_free_excluded_extents(block_group);
	mutex_unlock(&caching_ctl->mutex);

	wake_up(&caching_ctl->wait);

	btrfs_put_caching_control(caching_ctl);
	btrfs_put_block_group(block_group);
}

int btrfs_cache_block_group(struct btrfs_block_group *cache, bool wait)
{
	struct btrfs_fs_info *fs_info = cache->fs_info;
	struct btrfs_caching_control *caching_ctl = NULL;
	int ret = 0;

	/* Allocator for zoned filesystems does not use the cache at all */
	if (btrfs_is_zoned(fs_info))
		return 0;

	caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
	if (!caching_ctl)
		return -ENOMEM;

	INIT_LIST_HEAD(&caching_ctl->list);
	mutex_init(&caching_ctl->mutex);
	init_waitqueue_head(&caching_ctl->wait);
	caching_ctl->block_group = cache;
	refcount_set(&caching_ctl->count, 2);
	atomic_set(&caching_ctl->progress, 0);
	btrfs_init_work(&caching_ctl->work, caching_thread, NULL);

	spin_lock(&cache->lock);
	if (cache->cached != BTRFS_CACHE_NO) {
		kfree(caching_ctl);

		caching_ctl = cache->caching_ctl;
		if (caching_ctl)
			refcount_inc(&caching_ctl->count);
		spin_unlock(&cache->lock);
		goto out;
	}
	WARN_ON(cache->caching_ctl);
	cache->caching_ctl = caching_ctl;
	cache->cached = BTRFS_CACHE_STARTED;
	spin_unlock(&cache->lock);

	write_lock(&fs_info->block_group_cache_lock);
	refcount_inc(&caching_ctl->count);
	list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
	write_unlock(&fs_info->block_group_cache_lock);

	btrfs_get_block_group(cache);

	btrfs_queue_work(fs_info->caching_workers, &caching_ctl->work);
out:
	if (wait && caching_ctl)
		ret = btrfs_caching_ctl_wait_done(cache, caching_ctl);
	if (caching_ctl)
		btrfs_put_caching_control(caching_ctl);

	return ret;
}
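
/*
 * Illustrative usage (not part of the original file): callers that need the
 * free space information immediately pass wait == true, e.g.:
 *
 *	ret = btrfs_cache_block_group(block_group, true);
 *	if (ret)
 *		return ret;	(e.g. -EIO if caching ended in BTRFS_CACHE_ERROR)
 *
 * while a caller may also kick caching off asynchronously with wait == false
 * and come back later via the wait helpers above.
 */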

static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
{
	u64 extra_flags = chunk_to_extended(flags) &
				BTRFS_EXTENDED_PROFILE_MASK;

	write_seqlock(&fs_info->profiles_lock);
	if (flags & BTRFS_BLOCK_GROUP_DATA)
		fs_info->avail_data_alloc_bits &= ~extra_flags;
	if (flags & BTRFS_BLOCK_GROUP_METADATA)
		fs_info->avail_metadata_alloc_bits &= ~extra_flags;
	if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
		fs_info->avail_system_alloc_bits &= ~extra_flags;
	write_sequnlock(&fs_info->profiles_lock);
}

/*
 * Clear incompat bits for the following feature(s):
 *
 * - RAID56 - in case there's neither RAID5 nor RAID6 profile block group
 *   in the whole filesystem
 *
 * - RAID1C34 - same as above for RAID1C3 and RAID1C4 block groups
 */
static void clear_incompat_bg_bits(struct btrfs_fs_info *fs_info, u64 flags)
{
	bool found_raid56 = false;
	bool found_raid1c34 = false;

	if ((flags & BTRFS_BLOCK_GROUP_RAID56_MASK) ||
	    (flags & BTRFS_BLOCK_GROUP_RAID1C3) ||
	    (flags & BTRFS_BLOCK_GROUP_RAID1C4)) {
		struct list_head *head = &fs_info->space_info;
		struct btrfs_space_info *sinfo;

		list_for_each_entry_rcu(sinfo, head, list) {
			down_read(&sinfo->groups_sem);
			if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID5]))
				found_raid56 = true;
			if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID6]))
				found_raid56 = true;
			if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID1C3]))
				found_raid1c34 = true;
			if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID1C4]))
				found_raid1c34 = true;
			up_read(&sinfo->groups_sem);
		}
		if (!found_raid56)
			btrfs_clear_fs_incompat(fs_info, RAID56);
		if (!found_raid1c34)
			btrfs_clear_fs_incompat(fs_info, RAID1C34);
	}
}

static int remove_block_group_item(struct btrfs_trans_handle *trans,
				   struct btrfs_path *path,
				   struct btrfs_block_group *block_group)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_root *root;
	struct btrfs_key key;
	int ret;

	root = btrfs_block_group_root(fs_info);
	key.objectid = block_group->start;
	key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
	key.offset = block_group->length;

	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	if (ret > 0)
		ret = -ENOENT;
	if (ret < 0)
		return ret;

	ret = btrfs_del_item(trans, root, path);
	return ret;
}

int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
			     struct btrfs_chunk_map *map)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_path *path;
	struct btrfs_block_group *block_group;
	struct btrfs_free_cluster *cluster;
	struct inode *inode;
	struct kobject *kobj = NULL;
	int ret;
	int index;
	int factor;
	struct btrfs_caching_control *caching_ctl = NULL;
	bool remove_map;
	bool remove_rsv = false;

	block_group = btrfs_lookup_block_group(fs_info, map->start);
	BUG_ON(!block_group);
	BUG_ON(!block_group->ro);

	trace_btrfs_remove_block_group(block_group);
	/*
	 * Free the reserved super bytes from this block group before
	 * removing it.
	 */
	btrfs_free_excluded_extents(block_group);
	btrfs_free_ref_tree_range(fs_info, block_group->start,
				  block_group->length);

	index = btrfs_bg_flags_to_raid_index(block_group->flags);
	factor = btrfs_bg_type_to_factor(block_group->flags);

	/* Make sure this block group isn't part of an allocation cluster. */
	cluster = &fs_info->data_alloc_cluster;
	spin_lock(&cluster->refill_lock);
	btrfs_return_cluster_to_free_space(block_group, cluster);
	spin_unlock(&cluster->refill_lock);

	/*
	 * Make sure this block group isn't part of a metadata
	 * allocation cluster.
	 */
	cluster = &fs_info->meta_alloc_cluster;
	spin_lock(&cluster->refill_lock);
	btrfs_return_cluster_to_free_space(block_group, cluster);
	spin_unlock(&cluster->refill_lock);

	btrfs_clear_treelog_bg(block_group);
	btrfs_clear_data_reloc_bg(block_group);

	path = btrfs_alloc_path();
	if (!path) {
		ret = -ENOMEM;
		goto out;
	}

	/*
	 * Get the inode first so any iput calls done for the io_list
	 * aren't the final iput (no unlinks allowed now).
	 */
	inode = lookup_free_space_inode(block_group, path);

	mutex_lock(&trans->transaction->cache_write_mutex);
	/*
	 * Make sure our free space cache IO is done before removing the
	 * free space inode.
	 */
	spin_lock(&trans->transaction->dirty_bgs_lock);
	if (!list_empty(&block_group->io_list)) {
		list_del_init(&block_group->io_list);

		WARN_ON(!IS_ERR(inode) && inode != block_group->io_ctl.inode);

		spin_unlock(&trans->transaction->dirty_bgs_lock);
		btrfs_wait_cache_io(trans, block_group, path);
		btrfs_put_block_group(block_group);
		spin_lock(&trans->transaction->dirty_bgs_lock);
	}

	if (!list_empty(&block_group->dirty_list)) {
		list_del_init(&block_group->dirty_list);
		remove_rsv = true;
		btrfs_put_block_group(block_group);
	}
	spin_unlock(&trans->transaction->dirty_bgs_lock);
	mutex_unlock(&trans->transaction->cache_write_mutex);

	ret = btrfs_remove_free_space_inode(trans, inode, block_group);
	if (ret)
		goto out;

	write_lock(&fs_info->block_group_cache_lock);
	rb_erase_cached(&block_group->cache_node,
			&fs_info->block_group_cache_tree);
	RB_CLEAR_NODE(&block_group->cache_node);

	/* Once for the block groups rbtree */
	btrfs_put_block_group(block_group);

	write_unlock(&fs_info->block_group_cache_lock);

	down_write(&block_group->space_info->groups_sem);
	/*
	 * We must use list_del_init so people can check to see if they
	 * are still on the list after taking the semaphore.
	 */
	list_del_init(&block_group->list);
	if (list_empty(&block_group->space_info->block_groups[index])) {
		kobj = block_group->space_info->block_group_kobjs[index];
		block_group->space_info->block_group_kobjs[index] = NULL;
		clear_avail_alloc_bits(fs_info, block_group->flags);
	}
	up_write(&block_group->space_info->groups_sem);
	clear_incompat_bg_bits(fs_info, block_group->flags);
	if (kobj) {
		kobject_del(kobj);
		kobject_put(kobj);
	}

	if (block_group->cached == BTRFS_CACHE_STARTED)
		btrfs_wait_block_group_cache_done(block_group);

	write_lock(&fs_info->block_group_cache_lock);
	caching_ctl = btrfs_get_caching_control(block_group);
	if (!caching_ctl) {
		struct btrfs_caching_control *ctl;

		list_for_each_entry(ctl, &fs_info->caching_block_groups, list) {
			if (ctl->block_group == block_group) {
				caching_ctl = ctl;
				refcount_inc(&caching_ctl->count);
				break;
			}
		}
	}
	if (caching_ctl)
		list_del_init(&caching_ctl->list);
	write_unlock(&fs_info->block_group_cache_lock);

	if (caching_ctl) {
		/* Once for the caching bgs list and once for us. */
		btrfs_put_caching_control(caching_ctl);
		btrfs_put_caching_control(caching_ctl);
	}

	spin_lock(&trans->transaction->dirty_bgs_lock);
	WARN_ON(!list_empty(&block_group->dirty_list));
	WARN_ON(!list_empty(&block_group->io_list));
	spin_unlock(&trans->transaction->dirty_bgs_lock);

	btrfs_remove_free_space_cache(block_group);

	spin_lock(&block_group->space_info->lock);
	list_del_init(&block_group->ro_list);

	if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
		WARN_ON(block_group->space_info->total_bytes
			< block_group->length);
		WARN_ON(block_group->space_info->bytes_readonly
			< block_group->length - block_group->zone_unusable);
		WARN_ON(block_group->space_info->bytes_zone_unusable
			< block_group->zone_unusable);
		WARN_ON(block_group->space_info->disk_total
			< block_group->length * factor);
	}
	block_group->space_info->total_bytes -= block_group->length;
	block_group->space_info->bytes_readonly -=
		(block_group->length - block_group->zone_unusable);
	block_group->space_info->bytes_zone_unusable -=
		block_group->zone_unusable;
	block_group->space_info->disk_total -= block_group->length * factor;

	spin_unlock(&block_group->space_info->lock);

	/*
	 * Remove the free space for the block group from the free space tree
	 * and the block group's item from the extent tree before marking the
	 * block group as removed. This is to prevent races with tasks that
	 * freeze and unfreeze a block group, this task and another task
	 * allocating a new block group - the unfreeze task ends up removing
	 * the block group's extent map before the task calling this function
	 * deletes the block group item from the extent tree, allowing for
	 * another task to attempt to create another block group with the same
	 * item key (and failing with -EEXIST and a transaction abort).
	 */
	ret = remove_block_group_free_space(trans, block_group);
	if (ret)
		goto out;

	ret = remove_block_group_item(trans, path, block_group);
	if (ret < 0)
		goto out;

	spin_lock(&block_group->lock);
	set_bit(BLOCK_GROUP_FLAG_REMOVED, &block_group->runtime_flags);

	/*
	 * At this point trimming or scrub can't start on this block group,
	 * because we removed the block group from the rbtree
	 * fs_info->block_group_cache_tree so no one can find it anymore and
	 * even if someone already got this block group before we removed it
	 * from the rbtree, they have already incremented block_group->frozen -
	 * if they didn't, for the trimming case they won't find any free space
	 * entries because we already removed them all when we called
	 * btrfs_remove_free_space_cache().
	 *
	 * And we must not remove the chunk map from the fs_info->mapping_tree
	 * to prevent the same logical address range and physical device space
	 * ranges from being reused for a new block group. This is needed to
	 * avoid races with trimming and scrub.
	 *
	 * An fs trim operation (btrfs_trim_fs() / btrfs_ioctl_fitrim()) is
	 * completely transactionless, so while it is trimming a range the
	 * currently running transaction might finish and a new one start,
	 * allowing for new block groups to be created that can reuse the same
	 * physical device locations unless we take this special care.
	 *
	 * There may also be an implicit trim operation if the file system
	 * is mounted with -odiscard. The same protections must remain
	 * in place until the extents have been discarded completely when
	 * the transaction commit has completed.
	 */
	remove_map = (atomic_read(&block_group->frozen) == 0);
	spin_unlock(&block_group->lock);

	if (remove_map)
		btrfs_remove_chunk_map(fs_info, map);

out:
	/* Once for the lookup reference */
	btrfs_put_block_group(block_group);
	if (remove_rsv)
		btrfs_dec_delayed_refs_rsv_bg_updates(fs_info);
	btrfs_free_path(path);
	return ret;
}

struct btrfs_trans_handle *btrfs_start_trans_remove_block_group(
		struct btrfs_fs_info *fs_info, const u64 chunk_offset)
{
	struct btrfs_root *root = btrfs_block_group_root(fs_info);
	struct btrfs_chunk_map *map;
	unsigned int num_items;

	map = btrfs_find_chunk_map(fs_info, chunk_offset, 1);
	ASSERT(map != NULL);
	ASSERT(map->start == chunk_offset);

	/*
	 * We need to reserve 3 + N units from the metadata space info in order
	 * to remove a block group (done at btrfs_remove_chunk() and at
	 * btrfs_remove_block_group()), which are used for:
	 *
	 * 1 unit for adding the free space inode's orphan (located in the tree
	 * of tree roots).
	 * 1 unit for deleting the block group item (located in the extent
	 * tree).
	 * 1 unit for deleting the free space item (located in tree of tree
	 * roots).
	 * N units for deleting N device extent items corresponding to each
	 * stripe (located in the device tree).
	 *
	 * In order to remove a block group we also need to reserve units in the
	 * system space info in order to update the chunk tree (update one or
	 * more device items and remove one chunk item), but this is done at
	 * btrfs_remove_chunk() through a call to check_system_chunk().
	 */
	num_items = 3 + map->num_stripes;
	btrfs_free_chunk_map(map);

	return btrfs_start_transaction_fallback_global_rsv(root, num_items);
}
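
/*
 * Worked example (not part of the original file): for a chunk with a RAID1
 * profile on two devices, map->num_stripes is 2, so the helper above reserves
 * 3 + 2 = 5 metadata units: the free space inode orphan, the block group
 * item, the free space item, and one device extent item per stripe.
 */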

/*
 * Mark block group @cache read-only, so later write won't happen to block
 * group @cache.
 *
 * If @force is not set, this function will only mark the block group readonly
 * if we have enough free space (1M) in other metadata/system block groups.
 * If @force is set, this function will mark the block group readonly
 * without checking free space.
 *
 * NOTE: This function doesn't care if other block groups can contain all the
 * data in this block group. That check should be done by relocation routine,
 * not this function.
 */
static int inc_block_group_ro(struct btrfs_block_group *cache, int force)
{
	struct btrfs_space_info *sinfo = cache->space_info;
	u64 num_bytes;
	int ret = -ENOSPC;

	spin_lock(&sinfo->lock);
	spin_lock(&cache->lock);

	if (cache->swap_extents) {
		ret = -ETXTBSY;
		goto out;
	}

	if (cache->ro) {
		cache->ro++;
		ret = 0;
		goto out;
	}

	num_bytes = cache->length - cache->reserved - cache->pinned -
		    cache->bytes_super - cache->zone_unusable - cache->used;

	/*
	 * Data never overcommits, even in mixed mode, so do just the straight
	 * check of left over space in how much we have allocated.
	 */
	if (force) {
		ret = 0;
	} else if (sinfo->flags & BTRFS_BLOCK_GROUP_DATA) {
		u64 sinfo_used = btrfs_space_info_used(sinfo, true);

		/*
		 * Here we make sure if we mark this bg RO, we still have enough
		 * free space as buffer.
		 */
		if (sinfo_used + num_bytes <= sinfo->total_bytes)
			ret = 0;
	} else {
		/*
		 * We overcommit metadata, so we need to do the
		 * btrfs_can_overcommit check here, and we need to pass in
		 * BTRFS_RESERVE_NO_FLUSH to give ourselves the most amount of
		 * leeway to allow us to mark this block group as read only.
		 */
		if (btrfs_can_overcommit(cache->fs_info, sinfo, num_bytes,
					 BTRFS_RESERVE_NO_FLUSH))
			ret = 0;
	}

	if (!ret) {
		sinfo->bytes_readonly += num_bytes;
		if (btrfs_is_zoned(cache->fs_info)) {
			/* Migrate zone_unusable bytes to readonly */
			sinfo->bytes_readonly += cache->zone_unusable;
			sinfo->bytes_zone_unusable -= cache->zone_unusable;
			cache->zone_unusable = 0;
		}
		cache->ro++;
		list_add_tail(&cache->ro_list, &sinfo->ro_bgs);
	}
out:
	spin_unlock(&cache->lock);
	spin_unlock(&sinfo->lock);
	if (ret == -ENOSPC && btrfs_test_opt(cache->fs_info, ENOSPC_DEBUG)) {
		btrfs_info(cache->fs_info,
			"unable to make block group %llu ro", cache->start);
		btrfs_dump_space_info(cache->fs_info, cache->space_info, 0, 0);
	}
	return ret;
}
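
/*
 * Illustrative summary (not part of the original file): the three branches
 * above encode the policy. Marking a data block group read-only only
 * succeeds while btrfs_space_info_used() + num_bytes still fits in
 * sinfo->total_bytes, a metadata or system block group may still be flipped
 * read-only if the overcommit check with BTRFS_RESERVE_NO_FLUSH passes, and
 * @force bypasses both checks.
 */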

static bool clean_pinned_extents(struct btrfs_trans_handle *trans,
				 struct btrfs_block_group *bg)
{
	struct btrfs_fs_info *fs_info = bg->fs_info;
	struct btrfs_transaction *prev_trans = NULL;
	const u64 start = bg->start;
	const u64 end = start + bg->length - 1;
	int ret;

	spin_lock(&fs_info->trans_lock);
	if (trans->transaction->list.prev != &fs_info->trans_list) {
		prev_trans = list_last_entry(&trans->transaction->list,
					     struct btrfs_transaction, list);
		refcount_inc(&prev_trans->use_count);
	}
	spin_unlock(&fs_info->trans_lock);

	/*
	 * Hold the unused_bg_unpin_mutex lock to avoid racing with
	 * btrfs_finish_extent_commit(). If we are at transaction N, another
	 * task might be running finish_extent_commit() for the previous
	 * transaction N - 1, and have seen a range belonging to the block
	 * group in pinned_extents before we were able to clear the whole block
	 * group range from pinned_extents. This means that task can lookup for
	 * the block group after we unpinned it from pinned_extents and removed
	 * it, leading to a BUG_ON() at unpin_extent_range().
	 */
	mutex_lock(&fs_info->unused_bg_unpin_mutex);
	if (prev_trans) {
		ret = clear_extent_bits(&prev_trans->pinned_extents, start, end,
					EXTENT_DIRTY);
		if (ret)
			goto out;
	}

	ret = clear_extent_bits(&trans->transaction->pinned_extents, start, end,
				EXTENT_DIRTY);
out:
	mutex_unlock(&fs_info->unused_bg_unpin_mutex);
	if (prev_trans)
		btrfs_put_transaction(prev_trans);

	return ret == 0;
}

/*
 * Process the unused_bgs list and remove any that don't have any allocated
 * space inside of them.
 */
void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
{
	struct btrfs_block_group *block_group;
	struct btrfs_space_info *space_info;
	struct btrfs_trans_handle *trans;
	const bool async_trim_enabled = btrfs_test_opt(fs_info, DISCARD_ASYNC);
	int ret = 0;

	if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
		return;

	if (btrfs_fs_closing(fs_info))
		return;

	/*
	 * Long running balances can keep us blocked here for eternity, so
	 * simply skip deletion if we're unable to get the mutex.
	 */
	if (!mutex_trylock(&fs_info->reclaim_bgs_lock))
		return;

	spin_lock(&fs_info->unused_bgs_lock);
	while (!list_empty(&fs_info->unused_bgs)) {
		int trimming;

		block_group = list_first_entry(&fs_info->unused_bgs,
					       struct btrfs_block_group,
					       bg_list);
		list_del_init(&block_group->bg_list);

		space_info = block_group->space_info;

		if (ret || btrfs_mixed_space_info(space_info)) {
			btrfs_put_block_group(block_group);
			continue;
		}
		spin_unlock(&fs_info->unused_bgs_lock);

		btrfs_discard_cancel_work(&fs_info->discard_ctl, block_group);

		/* Don't want to race with allocators so take the groups_sem */
		down_write(&space_info->groups_sem);

		/*
		 * Async discard moves the final block group discard to be prior
		 * to the unused_bgs code path. Therefore, if it's not fully
		 * trimmed, punt it back to the async discard lists.
		 */
		if (btrfs_test_opt(fs_info, DISCARD_ASYNC) &&
		    !btrfs_is_free_space_trimmed(block_group)) {
			trace_btrfs_skip_unused_block_group(block_group);
			up_write(&space_info->groups_sem);
			/* Requeue if we failed because of async discard */
			btrfs_discard_queue_work(&fs_info->discard_ctl,
						 block_group);
			goto next;
		}

		spin_lock(&block_group->lock);
		if (btrfs_is_block_group_used(block_group) || block_group->ro ||
		    list_is_singular(&block_group->list)) {
			/*
			 * We want to bail if we made new allocations or have
			 * outstanding allocations in this block group. We do
			 * the ro check in case balance is currently acting on
			 * this block group.
			 */
			trace_btrfs_skip_unused_block_group(block_group);
			spin_unlock(&block_group->lock);
			up_write(&space_info->groups_sem);
			goto next;
		}
		spin_unlock(&block_group->lock);

		/* We don't want to force the issue, only flip if it's ok. */
		ret = inc_block_group_ro(block_group, 0);
		up_write(&space_info->groups_sem);
		if (ret < 0) {
			ret = 0;
			goto next;
		}

		ret = btrfs_zone_finish(block_group);
		if (ret < 0) {
			btrfs_dec_block_group_ro(block_group);
			if (ret == -EAGAIN)
				ret = 0;
			goto next;
		}

		/*
		 * Want to do this before we do anything else so we can recover
		 * properly if we fail to join the transaction.
		 */
		trans = btrfs_start_trans_remove_block_group(fs_info,
							     block_group->start);
		if (IS_ERR(trans)) {
			btrfs_dec_block_group_ro(block_group);
			ret = PTR_ERR(trans);
			goto next;
		}

		/*
		 * We could have pending pinned extents for this block group,
		 * just delete them, we don't care about them anymore.
		 */
		if (!clean_pinned_extents(trans, block_group)) {
			btrfs_dec_block_group_ro(block_group);
			goto end_trans;
		}

		/*
		 * At this point, the block_group is read only and should fail
		 * new allocations. However, btrfs_finish_extent_commit() can
		 * cause this block_group to be placed back on the discard
		 * lists because now the block_group isn't fully discarded.
		 * Bail here and try again later after discarding everything.
		 */
		spin_lock(&fs_info->discard_ctl.lock);
		if (!list_empty(&block_group->discard_list)) {
			spin_unlock(&fs_info->discard_ctl.lock);
			btrfs_dec_block_group_ro(block_group);
			btrfs_discard_queue_work(&fs_info->discard_ctl,
						 block_group);
			goto end_trans;
		}
		spin_unlock(&fs_info->discard_ctl.lock);

		/* Reset pinned so btrfs_put_block_group doesn't complain */
		spin_lock(&space_info->lock);
		spin_lock(&block_group->lock);

		btrfs_space_info_update_bytes_pinned(fs_info, space_info,
						     -block_group->pinned);
		space_info->bytes_readonly += block_group->pinned;
		block_group->pinned = 0;

		spin_unlock(&block_group->lock);
		spin_unlock(&space_info->lock);

		/*
		 * The normal path here is an unused block group is passed here,
		 * then trimming is handled in the transaction commit path.
		 * Async discard interposes before this to do the trimming
		 * before coming down the unused block group path as trimming
		 * will no longer be done later in the transaction commit path.
		 */
		if (!async_trim_enabled && btrfs_test_opt(fs_info, DISCARD_ASYNC))
			goto flip_async;

		/*
		 * DISCARD can flip during remount. On zoned filesystems, we
		 * need to reset sequential-required zones.
		 */
		trimming = btrfs_test_opt(fs_info, DISCARD_SYNC) ||
				btrfs_is_zoned(fs_info);

		/* Implicit trim during transaction commit. */
		if (trimming)
			btrfs_freeze_block_group(block_group);

		/*
		 * Btrfs_remove_chunk will abort the transaction if things go
		 * horribly wrong.
		 */
		ret = btrfs_remove_chunk(trans, block_group->start);

		if (ret) {
			if (trimming)
				btrfs_unfreeze_block_group(block_group);
			goto end_trans;
		}

		/*
		 * If we're not mounted with -odiscard, we can just forget
		 * about this block group. Otherwise we'll need to wait
		 * until transaction commit to do the actual discard.
		 */
		if (trimming) {
			spin_lock(&fs_info->unused_bgs_lock);
			/*
			 * A concurrent scrub might have added us to the list
			 * fs_info->unused_bgs, so use a list_move operation
			 * to add the block group to the deleted_bgs list.
			 */
			list_move(&block_group->bg_list,
				  &trans->transaction->deleted_bgs);
			spin_unlock(&fs_info->unused_bgs_lock);
			btrfs_get_block_group(block_group);
		}
end_trans:
		btrfs_end_transaction(trans);
next:
		btrfs_put_block_group(block_group);
		spin_lock(&fs_info->unused_bgs_lock);
	}
	spin_unlock(&fs_info->unused_bgs_lock);
	mutex_unlock(&fs_info->reclaim_bgs_lock);
	return;

flip_async:
	btrfs_end_transaction(trans);
	mutex_unlock(&fs_info->reclaim_bgs_lock);
	btrfs_put_block_group(block_group);
	btrfs_discard_punt_unused_bgs_list(fs_info);
}
1662
32da5386 1663void btrfs_mark_bg_unused(struct btrfs_block_group *bg)
e3e0520b
JB
1664{
1665 struct btrfs_fs_info *fs_info = bg->fs_info;
1666
1667 spin_lock(&fs_info->unused_bgs_lock);
1668 if (list_empty(&bg->bg_list)) {
1669 btrfs_get_block_group(bg);
0657b20c 1670 trace_btrfs_add_unused_block_group(bg);
e3e0520b 1671 list_add_tail(&bg->bg_list, &fs_info->unused_bgs);
0657b20c 1672 } else if (!test_bit(BLOCK_GROUP_FLAG_NEW, &bg->runtime_flags)) {
a9f18971 1673 /* Pull out the block group from the reclaim_bgs list. */
0657b20c 1674 trace_btrfs_add_unused_block_group(bg);
a9f18971 1675 list_move_tail(&bg->bg_list, &fs_info->unused_bgs);
e3e0520b
JB
1676 }
1677 spin_unlock(&fs_info->unused_bgs_lock);
1678}
4358d963 1679
2ca0ec77
JT
1680/*
1681 * We want block groups with a low number of used bytes to be at the beginning
1682 * of the list, so they will get reclaimed first.
1683 */
1684static int reclaim_bgs_cmp(void *unused, const struct list_head *a,
1685 const struct list_head *b)
1686{
1687 const struct btrfs_block_group *bg1, *bg2;
1688
1689 bg1 = list_entry(a, struct btrfs_block_group, bg_list);
1690 bg2 = list_entry(b, struct btrfs_block_group, bg_list);
1691
1692 return bg1->used > bg2->used;
1693}
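/*
 * A quick illustration of the ordering this gives (values are made up):
 * list_sort() places @a after @b when the comparator returns a positive
 * value, so block groups using e.g. 16M, 900M and 512M end up ordered
 * 16M -> 512M -> 900M and the emptiest group is relocated first.
 */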
1694
3687fcb0
JT
1695static inline bool btrfs_should_reclaim(struct btrfs_fs_info *fs_info)
1696{
1697 if (btrfs_is_zoned(fs_info))
1698 return btrfs_zoned_should_reclaim(fs_info);
1699 return true;
1700}
1701
81531225
BB
1702static bool should_reclaim_block_group(struct btrfs_block_group *bg, u64 bytes_freed)
1703{
1704 const struct btrfs_space_info *space_info = bg->space_info;
1705 const int reclaim_thresh = READ_ONCE(space_info->bg_reclaim_threshold);
1706 const u64 new_val = bg->used;
1707 const u64 old_val = new_val + bytes_freed;
1708 u64 thresh;
1709
1710 if (reclaim_thresh == 0)
1711 return false;
1712
428c8e03 1713 thresh = mult_perc(bg->length, reclaim_thresh);
81531225
BB
1714
1715 /*
1716 * If we were below the threshold before, don't reclaim: we are likely a
1717 * brand new block group and we don't want to relocate new block groups.
1718 */
1719 if (old_val < thresh)
1720 return false;
1721 if (new_val >= thresh)
1722 return false;
1723 return true;
1724}
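/*
 * A worked example of the threshold math above (values are made up): with
 * bg_reclaim_threshold = 75 and a 1 GiB block group, thresh becomes
 * mult_perc(1G, 75) = 768 MiB. If 900 MiB were used and 200 MiB were just
 * freed, old_val = 900M >= thresh while new_val = 700M < thresh, so the
 * group is a reclaim candidate. Had the group only ever reached 300 MiB
 * used, old_val < thresh and it is skipped as a likely new block group.
 */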
1725
18bb8bbf
JT
1726void btrfs_reclaim_bgs_work(struct work_struct *work)
1727{
1728 struct btrfs_fs_info *fs_info =
1729 container_of(work, struct btrfs_fs_info, reclaim_bgs_work);
1730 struct btrfs_block_group *bg;
1731 struct btrfs_space_info *space_info;
18bb8bbf
JT
1732
1733 if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
1734 return;
1735
2f12741f
JB
1736 if (btrfs_fs_closing(fs_info))
1737 return;
1738
3687fcb0
JT
1739 if (!btrfs_should_reclaim(fs_info))
1740 return;
1741
ca5e4ea0
NA
1742 sb_start_write(fs_info->sb);
1743
1744 if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) {
1745 sb_end_write(fs_info->sb);
18bb8bbf 1746 return;
ca5e4ea0 1747 }
18bb8bbf 1748
9cc0b837
JT
1749 /*
1750 * Long running balances can keep us blocked here for eternity, so
1751 * simply skip reclaim if we're unable to get the mutex.
1752 */
1753 if (!mutex_trylock(&fs_info->reclaim_bgs_lock)) {
1754 btrfs_exclop_finish(fs_info);
ca5e4ea0 1755 sb_end_write(fs_info->sb);
9cc0b837
JT
1756 return;
1757 }
1758
18bb8bbf 1759 spin_lock(&fs_info->unused_bgs_lock);
2ca0ec77
JT
1760 /*
1761 * Sort happens under lock because we can't simply splice it and sort.
1762 * The block groups might still be in use and reachable via bg_list,
1763 * and their presence in the reclaim_bgs list must be preserved.
1764 */
1765 list_sort(NULL, &fs_info->reclaim_bgs, reclaim_bgs_cmp);
18bb8bbf 1766 while (!list_empty(&fs_info->reclaim_bgs)) {
5f93e776 1767 u64 zone_unusable;
1cea5cf0
FM
1768 int ret = 0;
1769
18bb8bbf
JT
1770 bg = list_first_entry(&fs_info->reclaim_bgs,
1771 struct btrfs_block_group,
1772 bg_list);
1773 list_del_init(&bg->bg_list);
1774
1775 space_info = bg->space_info;
1776 spin_unlock(&fs_info->unused_bgs_lock);
1777
1778 /* Don't race with allocators so take the groups_sem */
1779 down_write(&space_info->groups_sem);
1780
1781 spin_lock(&bg->lock);
1782 if (bg->reserved || bg->pinned || bg->ro) {
1783 /*
1784 * We want to bail if we made new allocations or have
1785 * outstanding allocations in this block group. We do
1786 * the ro check in case balance is currently acting on
1787 * this block group.
1788 */
1789 spin_unlock(&bg->lock);
1790 up_write(&space_info->groups_sem);
1791 goto next;
1792 }
cc4804bf
BB
1793 if (bg->used == 0) {
1794 /*
1795 * It is possible that we trigger relocation on a block
1796 * group as its extents are deleted and it first goes
1797 * below the threshold, then shortly after goes empty.
1798 *
1799 * In this case, relocating it does delete it, but has
1800 * some overhead in relocation specific metadata, looking
1801 * for the non-existent extents and running some extra
1802 * transactions, which we can avoid by using one of the
1803 * other mechanisms for dealing with empty block groups.
1804 */
1805 if (!btrfs_test_opt(fs_info, DISCARD_ASYNC))
1806 btrfs_mark_bg_unused(bg);
1807 spin_unlock(&bg->lock);
1808 up_write(&space_info->groups_sem);
1809 goto next;
81531225
BB
1810
1811 }
1812 /*
1813 * The block group might no longer meet the reclaim condition by
1814 * the time we get around to reclaiming it, so to avoid
1815 * reclaiming overly full block_groups, skip reclaiming them.
1816 *
1817 * Since the decision making process also depends on the amount
1818 * being freed, pass in a fake giant value to skip that extra
1819 * check, which is more meaningful when adding to the list in
1820 * the first place.
1821 */
1822 if (!should_reclaim_block_group(bg, bg->length)) {
1823 spin_unlock(&bg->lock);
1824 up_write(&space_info->groups_sem);
1825 goto next;
cc4804bf 1826 }
18bb8bbf
JT
1827 spin_unlock(&bg->lock);
1828
93463ff7
NA
1829 /*
1830 * Get out fast, in case we're read-only or unmounting the
1831 * filesystem. It is OK to drop block groups from the list even
1832 * for the read-only case. As we did sb_start_write(),
1833 * "mount -o remount,ro" won't happen and read-only filesystem
1834 * means it is forced read-only due to a fatal error. So, it
1835 * never gets back to read-write to let us reclaim again.
1836 */
1837 if (btrfs_need_cleaner_sleep(fs_info)) {
18bb8bbf
JT
1838 up_write(&space_info->groups_sem);
1839 goto next;
1840 }
1841
5f93e776
JT
1842 /*
1843 * Cache the zone_unusable value before turning the block group
1844 * to read only. As soon as the block group is read only, its
1845 * zone_unusable value gets moved to the block group's read-only
1846 * bytes and isn't available for calculations anymore.
1847 */
1848 zone_unusable = bg->zone_unusable;
18bb8bbf
JT
1849 ret = inc_block_group_ro(bg, 0);
1850 up_write(&space_info->groups_sem);
1851 if (ret < 0)
1852 goto next;
1853
5f93e776
JT
1854 btrfs_info(fs_info,
1855 "reclaiming chunk %llu with %llu%% used %llu%% unusable",
95cd356c
JT
1856 bg->start,
1857 div64_u64(bg->used * 100, bg->length),
5f93e776 1858 div64_u64(zone_unusable * 100, bg->length));
18bb8bbf
JT
1859 trace_btrfs_reclaim_block_group(bg);
1860 ret = btrfs_relocate_chunk(fs_info, bg->start);
74944c87
JB
1861 if (ret) {
1862 btrfs_dec_block_group_ro(bg);
18bb8bbf
JT
1863 btrfs_err(fs_info, "error relocating chunk %llu",
1864 bg->start);
74944c87 1865 }
18bb8bbf
JT
1866
1867next:
7e271809
NA
1868 if (ret)
1869 btrfs_mark_bg_to_reclaim(bg);
d96b3424 1870 btrfs_put_block_group(bg);
3ed01616
NA
1871
1872 mutex_unlock(&fs_info->reclaim_bgs_lock);
1873 /*
1874 * Reclaiming all the block groups in the list can take a really
1875 * long time. Prioritize cleaning up unused block groups.
1876 */
1877 btrfs_delete_unused_bgs(fs_info);
1878 /*
1879 * If we are interrupted by a balance, we can just bail out. The
1880 * cleaner thread will restart it again if necessary.
1881 */
1882 if (!mutex_trylock(&fs_info->reclaim_bgs_lock))
1883 goto end;
18bb8bbf
JT
1884 spin_lock(&fs_info->unused_bgs_lock);
1885 }
1886 spin_unlock(&fs_info->unused_bgs_lock);
1887 mutex_unlock(&fs_info->reclaim_bgs_lock);
3ed01616 1888end:
18bb8bbf 1889 btrfs_exclop_finish(fs_info);
ca5e4ea0 1890 sb_end_write(fs_info->sb);
18bb8bbf
JT
1891}
1892
1893void btrfs_reclaim_bgs(struct btrfs_fs_info *fs_info)
1894{
1895 spin_lock(&fs_info->unused_bgs_lock);
1896 if (!list_empty(&fs_info->reclaim_bgs))
1897 queue_work(system_unbound_wq, &fs_info->reclaim_bgs_work);
1898 spin_unlock(&fs_info->unused_bgs_lock);
1899}
1900
1901void btrfs_mark_bg_to_reclaim(struct btrfs_block_group *bg)
1902{
1903 struct btrfs_fs_info *fs_info = bg->fs_info;
1904
1905 spin_lock(&fs_info->unused_bgs_lock);
1906 if (list_empty(&bg->bg_list)) {
1907 btrfs_get_block_group(bg);
1908 trace_btrfs_add_reclaim_block_group(bg);
1909 list_add_tail(&bg->bg_list, &fs_info->reclaim_bgs);
1910 }
1911 spin_unlock(&fs_info->unused_bgs_lock);
1912}
1913
e3ba67a1
JT
1914static int read_bg_from_eb(struct btrfs_fs_info *fs_info, struct btrfs_key *key,
1915 struct btrfs_path *path)
1916{
7dc66abb 1917 struct btrfs_chunk_map *map;
e3ba67a1
JT
1918 struct btrfs_block_group_item bg;
1919 struct extent_buffer *leaf;
1920 int slot;
1921 u64 flags;
1922 int ret = 0;
1923
1924 slot = path->slots[0];
1925 leaf = path->nodes[0];
1926
7dc66abb
FM
1927 map = btrfs_find_chunk_map(fs_info, key->objectid, key->offset);
1928 if (!map) {
e3ba67a1
JT
1929 btrfs_err(fs_info,
1930 "logical %llu len %llu found bg but no related chunk",
1931 key->objectid, key->offset);
1932 return -ENOENT;
1933 }
1934
7dc66abb 1935 if (map->start != key->objectid || map->chunk_len != key->offset) {
e3ba67a1
JT
1936 btrfs_err(fs_info,
1937 "block group %llu len %llu mismatch with chunk %llu len %llu",
7dc66abb 1938 key->objectid, key->offset, map->start, map->chunk_len);
e3ba67a1 1939 ret = -EUCLEAN;
7dc66abb 1940 goto out_free_map;
e3ba67a1
JT
1941 }
1942
1943 read_extent_buffer(leaf, &bg, btrfs_item_ptr_offset(leaf, slot),
1944 sizeof(bg));
1945 flags = btrfs_stack_block_group_flags(&bg) &
1946 BTRFS_BLOCK_GROUP_TYPE_MASK;
1947
7dc66abb 1948 if (flags != (map->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
e3ba67a1
JT
1949 btrfs_err(fs_info,
1950"block group %llu len %llu type flags 0x%llx mismatch with chunk type flags 0x%llx",
1951 key->objectid, key->offset, flags,
7dc66abb 1952 (BTRFS_BLOCK_GROUP_TYPE_MASK & map->type));
e3ba67a1
JT
1953 ret = -EUCLEAN;
1954 }
1955
7dc66abb
FM
1956out_free_map:
1957 btrfs_free_chunk_map(map);
e3ba67a1
JT
1958 return ret;
1959}
1960
4358d963
JB
1961static int find_first_block_group(struct btrfs_fs_info *fs_info,
1962 struct btrfs_path *path,
1963 struct btrfs_key *key)
1964{
dfe8aec4 1965 struct btrfs_root *root = btrfs_block_group_root(fs_info);
e3ba67a1 1966 int ret;
4358d963 1967 struct btrfs_key found_key;
4358d963 1968
36dfbbe2 1969 btrfs_for_each_slot(root, key, &found_key, path, ret) {
4358d963
JB
1970 if (found_key.objectid >= key->objectid &&
1971 found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
36dfbbe2 1972 return read_bg_from_eb(fs_info, &found_key, path);
4358d963 1973 }
4358d963 1974 }
4358d963
JB
1975 return ret;
1976}
1977
1978static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
1979{
1980 u64 extra_flags = chunk_to_extended(flags) &
1981 BTRFS_EXTENDED_PROFILE_MASK;
1982
1983 write_seqlock(&fs_info->profiles_lock);
1984 if (flags & BTRFS_BLOCK_GROUP_DATA)
1985 fs_info->avail_data_alloc_bits |= extra_flags;
1986 if (flags & BTRFS_BLOCK_GROUP_METADATA)
1987 fs_info->avail_metadata_alloc_bits |= extra_flags;
1988 if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
1989 fs_info->avail_system_alloc_bits |= extra_flags;
1990 write_sequnlock(&fs_info->profiles_lock);
1991}
1992
43dd529a
DS
1993/*
1994 * Map a physical disk address to a list of logical addresses.
9ee9b979
NB
1995 *
1996 * @fs_info: the filesystem
96a14336
NB
1997 * @chunk_start: logical address of block group
1998 * @physical: physical address to map to logical addresses
1999 * @logical: return array of logical addresses which map to @physical
2000 * @naddrs: length of @logical
2001 * @stripe_len: size of IO stripe for the given block group
2002 *
2003 * Maps a particular @physical disk address to a list of @logical addresses.
2004 * Used primarily to exclude those portions of a block group that contain super
2005 * block copies.
2006 */
96a14336 2007int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start,
1eb82ef8 2008 u64 physical, u64 **logical, int *naddrs, int *stripe_len)
96a14336 2009{
7dc66abb 2010 struct btrfs_chunk_map *map;
96a14336
NB
2011 u64 *buf;
2012 u64 bytenr;
1776ad17
NB
2013 u64 data_stripe_length;
2014 u64 io_stripe_size;
2015 int i, nr = 0;
2016 int ret = 0;
96a14336 2017
7dc66abb
FM
2018 map = btrfs_get_chunk_map(fs_info, chunk_start, 1);
2019 if (IS_ERR(map))
96a14336
NB
2020 return -EIO;
2021
7dc66abb 2022 data_stripe_length = map->stripe_size;
a97699d1 2023 io_stripe_size = BTRFS_STRIPE_LEN;
7dc66abb 2024 chunk_start = map->start;
96a14336 2025
9e22b925
NB
2026 /* For RAID5/6 adjust to a full IO stripe length */
2027 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
cb091225 2028 io_stripe_size = btrfs_stripe_nr_to_offset(nr_data_stripes(map));
96a14336
NB
2029
2030 buf = kcalloc(map->num_stripes, sizeof(u64), GFP_NOFS);
1776ad17
NB
2031 if (!buf) {
2032 ret = -ENOMEM;
2033 goto out;
2034 }
96a14336
NB
2035
2036 for (i = 0; i < map->num_stripes; i++) {
1776ad17 2037 bool already_inserted = false;
6ded22c1
QW
2038 u32 stripe_nr;
2039 u32 offset;
1776ad17
NB
2040 int j;
2041
2042 if (!in_range(physical, map->stripes[i].physical,
2043 data_stripe_length))
96a14336
NB
2044 continue;
2045
a97699d1
QW
2046 stripe_nr = (physical - map->stripes[i].physical) >>
2047 BTRFS_STRIPE_LEN_SHIFT;
2048 offset = (physical - map->stripes[i].physical) &
2049 BTRFS_STRIPE_LEN_MASK;
96a14336 2050
ac067734 2051 if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
6ded22c1
QW
2052 BTRFS_BLOCK_GROUP_RAID10))
2053 stripe_nr = div_u64(stripe_nr * map->num_stripes + i,
2054 map->sub_stripes);
96a14336
NB
2055 /*
2056 * The remaining case would be RAID56, where we would multiply by
2057 * nr_data_stripes(); io_stripe_size above is already set to the full
2058 * stripe length for those profiles, so the math below covers it.
2059 */
138082f3 2060 bytenr = chunk_start + stripe_nr * io_stripe_size + offset;
1776ad17
NB
2061
2062 /* Ensure we don't add duplicate addresses */
96a14336 2063 for (j = 0; j < nr; j++) {
1776ad17
NB
2064 if (buf[j] == bytenr) {
2065 already_inserted = true;
96a14336 2066 break;
1776ad17 2067 }
96a14336 2068 }
1776ad17
NB
2069
2070 if (!already_inserted)
96a14336 2071 buf[nr++] = bytenr;
96a14336
NB
2072 }
2073
2074 *logical = buf;
2075 *naddrs = nr;
1776ad17
NB
2076 *stripe_len = io_stripe_size;
2077out:
7dc66abb 2078 btrfs_free_chunk_map(map);
1776ad17 2079 return ret;
96a14336
NB
2080}
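/*
 * A worked example of the mapping above for a SINGLE/DUP chunk (values are
 * made up and assume the usual 64 KiB BTRFS_STRIPE_LEN): if @physical is
 * 200 KiB past map->stripes[i].physical, then stripe_nr = 200K >> 16 = 3
 * and offset = 8 KiB, so the reported logical address is
 * chunk_start + 3 * 64K + 8K = chunk_start + 200 KiB. For RAID0/RAID10 the
 * stripe_nr adjustment above additionally accounts for the rotation of
 * stripes across devices.
 */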
2081
32da5386 2082static int exclude_super_stripes(struct btrfs_block_group *cache)
4358d963
JB
2083{
2084 struct btrfs_fs_info *fs_info = cache->fs_info;
12659251 2085 const bool zoned = btrfs_is_zoned(fs_info);
4358d963
JB
2086 u64 bytenr;
2087 u64 *logical;
2088 int stripe_len;
2089 int i, nr, ret;
2090
b3470b5d
DS
2091 if (cache->start < BTRFS_SUPER_INFO_OFFSET) {
2092 stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->start;
4358d963 2093 cache->bytes_super += stripe_len;
b1c8f527
FM
2094 ret = set_extent_bit(&fs_info->excluded_extents, cache->start,
2095 cache->start + stripe_len - 1,
2096 EXTENT_UPTODATE, NULL);
4358d963
JB
2097 if (ret)
2098 return ret;
2099 }
2100
2101 for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
2102 bytenr = btrfs_sb_offset(i);
1eb82ef8 2103 ret = btrfs_rmap_block(fs_info, cache->start,
4358d963
JB
2104 bytenr, &logical, &nr, &stripe_len);
2105 if (ret)
2106 return ret;
2107
12659251
NA
2108 /* Shouldn't have super stripes in sequential zones */
2109 if (zoned && nr) {
f1a07c2b 2110 kfree(logical);
12659251
NA
2111 btrfs_err(fs_info,
2112 "zoned: block group %llu must not contain super block",
2113 cache->start);
2114 return -EUCLEAN;
2115 }
2116
4358d963 2117 while (nr--) {
96f9b0f2
NB
2118 u64 len = min_t(u64, stripe_len,
2119 cache->start + cache->length - logical[nr]);
4358d963
JB
2120
2121 cache->bytes_super += len;
b1c8f527
FM
2122 ret = set_extent_bit(&fs_info->excluded_extents, logical[nr],
2123 logical[nr] + len - 1,
2124 EXTENT_UPTODATE, NULL);
4358d963
JB
2125 if (ret) {
2126 kfree(logical);
2127 return ret;
2128 }
2129 }
2130
2131 kfree(logical);
2132 }
2133 return 0;
2134}
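/*
 * For orientation (relying on the usual btrfs on-disk layout): the
 * BTRFS_SUPER_MIRROR_MAX copies returned by btrfs_sb_offset() live at the
 * fixed physical offsets 64 KiB, 64 MiB and 256 GiB. A block group whose
 * stripes cover any of those offsets gets up to stripe_len bytes per copy
 * added to ->bytes_super and marked in fs_info->excluded_extents, so the
 * allocator never hands out the space holding a superblock copy.
 */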
2135
32da5386 2136static struct btrfs_block_group *btrfs_create_block_group_cache(
9afc6649 2137 struct btrfs_fs_info *fs_info, u64 start)
4358d963 2138{
32da5386 2139 struct btrfs_block_group *cache;
4358d963
JB
2140
2141 cache = kzalloc(sizeof(*cache), GFP_NOFS);
2142 if (!cache)
2143 return NULL;
2144
2145 cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl),
2146 GFP_NOFS);
2147 if (!cache->free_space_ctl) {
2148 kfree(cache);
2149 return NULL;
2150 }
2151
b3470b5d 2152 cache->start = start;
4358d963
JB
2153
2154 cache->fs_info = fs_info;
2155 cache->full_stripe_len = btrfs_full_stripe_len(fs_info, start);
4358d963 2156
6e80d4f8
DZ
2157 cache->discard_index = BTRFS_DISCARD_INDEX_UNUSED;
2158
48aaeebe 2159 refcount_set(&cache->refs, 1);
4358d963
JB
2160 spin_lock_init(&cache->lock);
2161 init_rwsem(&cache->data_rwsem);
2162 INIT_LIST_HEAD(&cache->list);
2163 INIT_LIST_HEAD(&cache->cluster_list);
2164 INIT_LIST_HEAD(&cache->bg_list);
2165 INIT_LIST_HEAD(&cache->ro_list);
b0643e59 2166 INIT_LIST_HEAD(&cache->discard_list);
4358d963
JB
2167 INIT_LIST_HEAD(&cache->dirty_list);
2168 INIT_LIST_HEAD(&cache->io_list);
afba2bc0 2169 INIT_LIST_HEAD(&cache->active_bg_list);
cd79909b 2170 btrfs_init_free_space_ctl(cache, cache->free_space_ctl);
6b7304af 2171 atomic_set(&cache->frozen, 0);
4358d963 2172 mutex_init(&cache->free_space_lock);
4358d963
JB
2173
2174 return cache;
2175}
2176
2177/*
2178 * Iterate all chunks and verify that each of them has the corresponding block
2179 * group
2180 */
2181static int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info)
2182{
4358d963
JB
2183 u64 start = 0;
2184 int ret = 0;
2185
2186 while (1) {
7dc66abb
FM
2187 struct btrfs_chunk_map *map;
2188 struct btrfs_block_group *bg;
2189
4358d963 2190 /*
7dc66abb
FM
2191 * btrfs_find_chunk_map() will return the first chunk map
2192 * intersecting the range, so setting @length to 1 is enough to
4358d963
JB
2193 * get the first chunk.
2194 */
7dc66abb
FM
2195 map = btrfs_find_chunk_map(fs_info, start, 1);
2196 if (!map)
4358d963
JB
2197 break;
2198
7dc66abb 2199 bg = btrfs_lookup_block_group(fs_info, map->start);
4358d963
JB
2200 if (!bg) {
2201 btrfs_err(fs_info,
2202 "chunk start=%llu len=%llu doesn't have corresponding block group",
7dc66abb 2203 map->start, map->chunk_len);
4358d963 2204 ret = -EUCLEAN;
7dc66abb 2205 btrfs_free_chunk_map(map);
4358d963
JB
2206 break;
2207 }
7dc66abb 2208 if (bg->start != map->start || bg->length != map->chunk_len ||
4358d963 2209 (bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK) !=
7dc66abb 2210 (map->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
4358d963
JB
2211 btrfs_err(fs_info,
2212"chunk start=%llu len=%llu flags=0x%llx doesn't match block group start=%llu len=%llu flags=0x%llx",
7dc66abb
FM
2213 map->start, map->chunk_len,
2214 map->type & BTRFS_BLOCK_GROUP_TYPE_MASK,
b3470b5d 2215 bg->start, bg->length,
4358d963
JB
2216 bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK);
2217 ret = -EUCLEAN;
7dc66abb 2218 btrfs_free_chunk_map(map);
4358d963
JB
2219 btrfs_put_block_group(bg);
2220 break;
2221 }
7dc66abb
FM
2222 start = map->start + map->chunk_len;
2223 btrfs_free_chunk_map(map);
4358d963
JB
2224 btrfs_put_block_group(bg);
2225 }
2226 return ret;
2227}
2228
ffb9e0f0 2229static int read_one_block_group(struct btrfs_fs_info *info,
4afd2fe8 2230 struct btrfs_block_group_item *bgi,
d49a2ddb 2231 const struct btrfs_key *key,
ffb9e0f0
QW
2232 int need_clear)
2233{
32da5386 2234 struct btrfs_block_group *cache;
ffb9e0f0 2235 const bool mixed = btrfs_fs_incompat(info, MIXED_GROUPS);
ffb9e0f0
QW
2236 int ret;
2237
d49a2ddb 2238 ASSERT(key->type == BTRFS_BLOCK_GROUP_ITEM_KEY);
ffb9e0f0 2239
9afc6649 2240 cache = btrfs_create_block_group_cache(info, key->objectid);
ffb9e0f0
QW
2241 if (!cache)
2242 return -ENOMEM;
2243
4afd2fe8
JT
2244 cache->length = key->offset;
2245 cache->used = btrfs_stack_block_group_used(bgi);
7248e0ce 2246 cache->commit_used = cache->used;
4afd2fe8 2247 cache->flags = btrfs_stack_block_group_flags(bgi);
f7238e50 2248 cache->global_root_id = btrfs_stack_block_group_chunk_objectid(bgi);
9afc6649 2249
e3e39c72
MPS
2250 set_free_space_tree_thresholds(cache);
2251
ffb9e0f0
QW
2252 if (need_clear) {
2253 /*
2254 * When we mount with an old space cache, we need to
2255 * set BTRFS_DC_CLEAR and set dirty flag.
2256 *
2257 * a) Setting 'BTRFS_DC_CLEAR' makes sure that we
2258 * truncate the old free space cache inode and
2259 * setup a new one.
2260 * b) Setting 'dirty flag' makes sure that we flush
2261 * the new space cache info onto disk.
2262 */
2263 if (btrfs_test_opt(info, SPACE_CACHE))
2264 cache->disk_cache_state = BTRFS_DC_CLEAR;
2265 }
ffb9e0f0
QW
2266 if (!mixed && ((cache->flags & BTRFS_BLOCK_GROUP_METADATA) &&
2267 (cache->flags & BTRFS_BLOCK_GROUP_DATA))) {
2268 btrfs_err(info,
2269"bg %llu is a mixed block group but filesystem hasn't enabled mixed block groups",
2270 cache->start);
2271 ret = -EINVAL;
2272 goto error;
2273 }
2274
a94794d5 2275 ret = btrfs_load_block_group_zone_info(cache, false);
08e11a3d
NA
2276 if (ret) {
2277 btrfs_err(info, "zoned: failed to load zone info of bg %llu",
2278 cache->start);
2279 goto error;
2280 }
2281
ffb9e0f0
QW
2282 /*
2283 * We need to exclude the super stripes now so that the space info has
2284 * super bytes accounted for, otherwise we'll think we have more space
2285 * than we actually do.
2286 */
2287 ret = exclude_super_stripes(cache);
2288 if (ret) {
2289 /* We may have excluded something, so call this just in case. */
2290 btrfs_free_excluded_extents(cache);
2291 goto error;
2292 }
2293
2294 /*
169e0da9
NA
2295 * For a zoned filesystem, space after the allocation offset is the only
2296 * free space for a block group. So, we don't need any caching work.
2297 * btrfs_calc_zone_unusable() will set the amount of free space and
2298 * zone_unusable space.
2299 *
2300 * For a regular filesystem, check for two cases: either we are full, and
2301 * therefore don't need to bother with the caching work since we won't
2302 * find any space, or we are empty, and we can just add all the space
2303 * in and be done with it. This saves us _a_lot_ of time, particularly
2304 * in the full case.
ffb9e0f0 2305 */
169e0da9
NA
2306 if (btrfs_is_zoned(info)) {
2307 btrfs_calc_zone_unusable(cache);
c46c4247
NA
2308 /* Should not have any excluded extents. Just in case, though. */
2309 btrfs_free_excluded_extents(cache);
169e0da9 2310 } else if (cache->length == cache->used) {
ffb9e0f0
QW
2311 cache->cached = BTRFS_CACHE_FINISHED;
2312 btrfs_free_excluded_extents(cache);
2313 } else if (cache->used == 0) {
ffb9e0f0 2314 cache->cached = BTRFS_CACHE_FINISHED;
3b9f0995
FM
2315 ret = btrfs_add_new_free_space(cache, cache->start,
2316 cache->start + cache->length, NULL);
ffb9e0f0 2317 btrfs_free_excluded_extents(cache);
d8ccbd21
FM
2318 if (ret)
2319 goto error;
ffb9e0f0
QW
2320 }
2321
2322 ret = btrfs_add_block_group_cache(info, cache);
2323 if (ret) {
2324 btrfs_remove_free_space_cache(cache);
2325 goto error;
2326 }
2327 trace_btrfs_add_block_group(info, cache, 0);
723de71d 2328 btrfs_add_bg_to_space_info(info, cache);
ffb9e0f0
QW
2329
2330 set_avail_alloc_bits(info, cache->flags);
a09f23c3
AJ
2331 if (btrfs_chunk_writeable(info, cache->start)) {
2332 if (cache->used == 0) {
2333 ASSERT(list_empty(&cache->bg_list));
2334 if (btrfs_test_opt(info, DISCARD_ASYNC))
2335 btrfs_discard_queue_work(&info->discard_ctl, cache);
2336 else
2337 btrfs_mark_bg_unused(cache);
2338 }
2339 } else {
ffb9e0f0 2340 inc_block_group_ro(cache, 1);
ffb9e0f0 2341 }
a09f23c3 2342
ffb9e0f0
QW
2343 return 0;
2344error:
2345 btrfs_put_block_group(cache);
2346 return ret;
2347}
2348
42437a63
JB
2349static int fill_dummy_bgs(struct btrfs_fs_info *fs_info)
2350{
42437a63
JB
2351 struct rb_node *node;
2352 int ret = 0;
2353
7dc66abb
FM
2354 for (node = rb_first_cached(&fs_info->mapping_tree); node; node = rb_next(node)) {
2355 struct btrfs_chunk_map *map;
42437a63
JB
2356 struct btrfs_block_group *bg;
2357
7dc66abb
FM
2358 map = rb_entry(node, struct btrfs_chunk_map, rb_node);
2359 bg = btrfs_create_block_group_cache(fs_info, map->start);
42437a63
JB
2360 if (!bg) {
2361 ret = -ENOMEM;
2362 break;
2363 }
2364
2365 /* Fill dummy cache as FULL */
7dc66abb 2366 bg->length = map->chunk_len;
42437a63 2367 bg->flags = map->type;
42437a63 2368 bg->cached = BTRFS_CACHE_FINISHED;
7dc66abb 2369 bg->used = map->chunk_len;
42437a63
JB
2370 bg->flags = map->type;
2371 ret = btrfs_add_block_group_cache(fs_info, bg);
2b29726c
QW
2372 /*
2373 * We may have some valid block group cache added already, in
2374 * that case we skip to the next one.
2375 */
2376 if (ret == -EEXIST) {
2377 ret = 0;
2378 btrfs_put_block_group(bg);
2379 continue;
2380 }
2381
42437a63
JB
2382 if (ret) {
2383 btrfs_remove_free_space_cache(bg);
2384 btrfs_put_block_group(bg);
2385 break;
2386 }
2b29726c 2387
723de71d 2388 btrfs_add_bg_to_space_info(fs_info, bg);
42437a63
JB
2389
2390 set_avail_alloc_bits(fs_info, bg->flags);
2391 }
2392 if (!ret)
2393 btrfs_init_global_block_rsv(fs_info);
2394 return ret;
2395}
2396
4358d963
JB
2397int btrfs_read_block_groups(struct btrfs_fs_info *info)
2398{
dfe8aec4 2399 struct btrfs_root *root = btrfs_block_group_root(info);
4358d963
JB
2400 struct btrfs_path *path;
2401 int ret;
32da5386 2402 struct btrfs_block_group *cache;
4358d963
JB
2403 struct btrfs_space_info *space_info;
2404 struct btrfs_key key;
4358d963
JB
2405 int need_clear = 0;
2406 u64 cache_gen;
4358d963 2407
81d5d614
QW
2408 /*
2409 * Either no extent root (with ibadroots rescue option) or we have
2410 * unsupported RO options. The fs can never be mounted read-write, so no
2411 * need to waste time searching block group items.
2412 *
2413 * This also allows new extent tree related changes to be RO compat,
2414 * no need for a full incompat flag.
2415 */
2416 if (!root || (btrfs_super_compat_ro_flags(info->super_copy) &
2417 ~BTRFS_FEATURE_COMPAT_RO_SUPP))
42437a63
JB
2418 return fill_dummy_bgs(info);
2419
4358d963
JB
2420 key.objectid = 0;
2421 key.offset = 0;
2422 key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
2423 path = btrfs_alloc_path();
2424 if (!path)
2425 return -ENOMEM;
4358d963
JB
2426
2427 cache_gen = btrfs_super_cache_generation(info->super_copy);
2428 if (btrfs_test_opt(info, SPACE_CACHE) &&
2429 btrfs_super_generation(info->super_copy) != cache_gen)
2430 need_clear = 1;
2431 if (btrfs_test_opt(info, CLEAR_CACHE))
2432 need_clear = 1;
2433
2434 while (1) {
4afd2fe8
JT
2435 struct btrfs_block_group_item bgi;
2436 struct extent_buffer *leaf;
2437 int slot;
2438
4358d963
JB
2439 ret = find_first_block_group(info, path, &key);
2440 if (ret > 0)
2441 break;
2442 if (ret != 0)
2443 goto error;
2444
4afd2fe8
JT
2445 leaf = path->nodes[0];
2446 slot = path->slots[0];
2447
2448 read_extent_buffer(leaf, &bgi, btrfs_item_ptr_offset(leaf, slot),
2449 sizeof(bgi));
2450
2451 btrfs_item_key_to_cpu(leaf, &key, slot);
2452 btrfs_release_path(path);
2453 ret = read_one_block_group(info, &bgi, &key, need_clear);
ffb9e0f0 2454 if (ret < 0)
4358d963 2455 goto error;
ffb9e0f0
QW
2456 key.objectid += key.offset;
2457 key.offset = 0;
4358d963 2458 }
7837fa88 2459 btrfs_release_path(path);
4358d963 2460
72804905 2461 list_for_each_entry(space_info, &info->space_info, list) {
49ea112d
JB
2462 int i;
2463
2464 for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
2465 if (list_empty(&space_info->block_groups[i]))
2466 continue;
2467 cache = list_first_entry(&space_info->block_groups[i],
2468 struct btrfs_block_group,
2469 list);
2470 btrfs_sysfs_add_block_group_type(cache);
2471 }
2472
4358d963
JB
2473 if (!(btrfs_get_alloc_profile(info, space_info->flags) &
2474 (BTRFS_BLOCK_GROUP_RAID10 |
2475 BTRFS_BLOCK_GROUP_RAID1_MASK |
2476 BTRFS_BLOCK_GROUP_RAID56_MASK |
2477 BTRFS_BLOCK_GROUP_DUP)))
2478 continue;
2479 /*
2480 * Avoid allocating from un-mirrored block group if there are
2481 * mirrored block groups.
2482 */
2483 list_for_each_entry(cache,
2484 &space_info->block_groups[BTRFS_RAID_RAID0],
2485 list)
e11c0406 2486 inc_block_group_ro(cache, 1);
4358d963
JB
2487 list_for_each_entry(cache,
2488 &space_info->block_groups[BTRFS_RAID_SINGLE],
2489 list)
e11c0406 2490 inc_block_group_ro(cache, 1);
4358d963
JB
2491 }
2492
2493 btrfs_init_global_block_rsv(info);
2494 ret = check_chunk_block_group_mappings(info);
2495error:
2496 btrfs_free_path(path);
2b29726c
QW
2497 /*
2498 * We've hit some error while reading the extent tree, and have the
2499 * rescue=ibadroots mount option set.
2500 * Try to fill the tree using dummy block groups so that the user can
2501 * continue to mount and grab their data.
2502 */
2503 if (ret && btrfs_test_opt(info, IGNOREBADROOTS))
2504 ret = fill_dummy_bgs(info);
4358d963
JB
2505 return ret;
2506}
2507
79bd3712
FM
2508/*
2509 * This function, insert_block_group_item(), belongs to the phase 2 of chunk
2510 * allocation.
2511 *
2512 * See the comment at btrfs_chunk_alloc() for details about the chunk allocation
2513 * phases.
2514 */
97f4728a
QW
2515static int insert_block_group_item(struct btrfs_trans_handle *trans,
2516 struct btrfs_block_group *block_group)
2517{
2518 struct btrfs_fs_info *fs_info = trans->fs_info;
2519 struct btrfs_block_group_item bgi;
dfe8aec4 2520 struct btrfs_root *root = btrfs_block_group_root(fs_info);
97f4728a 2521 struct btrfs_key key;
675dfe12
FM
2522 u64 old_commit_used;
2523 int ret;
97f4728a
QW
2524
2525 spin_lock(&block_group->lock);
2526 btrfs_set_stack_block_group_used(&bgi, block_group->used);
2527 btrfs_set_stack_block_group_chunk_objectid(&bgi,
f7238e50 2528 block_group->global_root_id);
97f4728a 2529 btrfs_set_stack_block_group_flags(&bgi, block_group->flags);
675dfe12
FM
2530 old_commit_used = block_group->commit_used;
2531 block_group->commit_used = block_group->used;
97f4728a
QW
2532 key.objectid = block_group->start;
2533 key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
2534 key.offset = block_group->length;
2535 spin_unlock(&block_group->lock);
2536
675dfe12
FM
2537 ret = btrfs_insert_item(trans, root, &key, &bgi, sizeof(bgi));
2538 if (ret < 0) {
2539 spin_lock(&block_group->lock);
2540 block_group->commit_used = old_commit_used;
2541 spin_unlock(&block_group->lock);
2542 }
2543
2544 return ret;
97f4728a
QW
2545}
2546
2eadb9e7
NB
2547static int insert_dev_extent(struct btrfs_trans_handle *trans,
2548 struct btrfs_device *device, u64 chunk_offset,
2549 u64 start, u64 num_bytes)
2550{
2551 struct btrfs_fs_info *fs_info = device->fs_info;
2552 struct btrfs_root *root = fs_info->dev_root;
2553 struct btrfs_path *path;
2554 struct btrfs_dev_extent *extent;
2555 struct extent_buffer *leaf;
2556 struct btrfs_key key;
2557 int ret;
2558
2559 WARN_ON(!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state));
2560 WARN_ON(test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state));
2561 path = btrfs_alloc_path();
2562 if (!path)
2563 return -ENOMEM;
2564
2565 key.objectid = device->devid;
2566 key.type = BTRFS_DEV_EXTENT_KEY;
2567 key.offset = start;
2568 ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(*extent));
2569 if (ret)
2570 goto out;
2571
2572 leaf = path->nodes[0];
2573 extent = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_extent);
2574 btrfs_set_dev_extent_chunk_tree(leaf, extent, BTRFS_CHUNK_TREE_OBJECTID);
2575 btrfs_set_dev_extent_chunk_objectid(leaf, extent,
2576 BTRFS_FIRST_CHUNK_TREE_OBJECTID);
2577 btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset);
2578
2579 btrfs_set_dev_extent_length(leaf, extent, num_bytes);
50564b65 2580 btrfs_mark_buffer_dirty(trans, leaf);
2eadb9e7
NB
2581out:
2582 btrfs_free_path(path);
2583 return ret;
2584}
2585
2586/*
2587 * This function belongs to phase 2.
2588 *
2589 * See the comment at btrfs_chunk_alloc() for details about the chunk allocation
2590 * phases.
2591 */
2592static int insert_dev_extents(struct btrfs_trans_handle *trans,
2593 u64 chunk_offset, u64 chunk_size)
2594{
2595 struct btrfs_fs_info *fs_info = trans->fs_info;
2596 struct btrfs_device *device;
7dc66abb 2597 struct btrfs_chunk_map *map;
2eadb9e7 2598 u64 dev_offset;
2eadb9e7
NB
2599 int i;
2600 int ret = 0;
2601
7dc66abb
FM
2602 map = btrfs_get_chunk_map(fs_info, chunk_offset, chunk_size);
2603 if (IS_ERR(map))
2604 return PTR_ERR(map);
2eadb9e7 2605
2eadb9e7
NB
2606 /*
2607 * Take the device list mutex to prevent races with the final phase of
2608 * a device replace operation that replaces the device object associated
2609 * with the map's stripes, because the device object's id can change
2610 * at any time during that final phase of the device replace operation
2611 * (dev-replace.c:btrfs_dev_replace_finishing()), so we could grab the
2612 * replaced device and then see it with an ID of BTRFS_DEV_REPLACE_DEVID,
2613 * resulting in persisting a device extent item with such ID.
2614 */
2615 mutex_lock(&fs_info->fs_devices->device_list_mutex);
2616 for (i = 0; i < map->num_stripes; i++) {
2617 device = map->stripes[i].dev;
2618 dev_offset = map->stripes[i].physical;
2619
2620 ret = insert_dev_extent(trans, device, chunk_offset, dev_offset,
71fca47b 2621 map->stripe_size);
2eadb9e7
NB
2622 if (ret)
2623 break;
2624 }
2625 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2626
7dc66abb 2627 btrfs_free_chunk_map(map);
2eadb9e7
NB
2628 return ret;
2629}
2630
79bd3712
FM
2631/*
2632 * This function, btrfs_create_pending_block_groups(), belongs to the phase 2 of
2633 * chunk allocation.
2634 *
2635 * See the comment at btrfs_chunk_alloc() for details about the chunk allocation
2636 * phases.
2637 */
4358d963
JB
2638void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans)
2639{
2640 struct btrfs_fs_info *fs_info = trans->fs_info;
32da5386 2641 struct btrfs_block_group *block_group;
4358d963
JB
2642 int ret = 0;
2643
4358d963 2644 while (!list_empty(&trans->new_bgs)) {
49ea112d
JB
2645 int index;
2646
4358d963 2647 block_group = list_first_entry(&trans->new_bgs,
32da5386 2648 struct btrfs_block_group,
4358d963
JB
2649 bg_list);
2650 if (ret)
2651 goto next;
2652
49ea112d
JB
2653 index = btrfs_bg_flags_to_raid_index(block_group->flags);
2654
97f4728a 2655 ret = insert_block_group_item(trans, block_group);
4358d963
JB
2656 if (ret)
2657 btrfs_abort_transaction(trans, ret);
3349b57f
JB
2658 if (!test_bit(BLOCK_GROUP_FLAG_CHUNK_ITEM_INSERTED,
2659 &block_group->runtime_flags)) {
79bd3712
FM
2660 mutex_lock(&fs_info->chunk_mutex);
2661 ret = btrfs_chunk_alloc_add_chunk_item(trans, block_group);
2662 mutex_unlock(&fs_info->chunk_mutex);
2663 if (ret)
2664 btrfs_abort_transaction(trans, ret);
2665 }
2eadb9e7
NB
2666 ret = insert_dev_extents(trans, block_group->start,
2667 block_group->length);
4358d963
JB
2668 if (ret)
2669 btrfs_abort_transaction(trans, ret);
2670 add_block_group_free_space(trans, block_group);
49ea112d
JB
2671
2672 /*
2673 * If we restriped during balance, we may have added a new raid
2674 * type, so now add the sysfs entries when it is safe to do so.
2675 * We don't have to worry about locking here as it's handled in
2676 * btrfs_sysfs_add_block_group_type.
2677 */
2678 if (block_group->space_info->block_group_kobjs[index] == NULL)
2679 btrfs_sysfs_add_block_group_type(block_group);
2680
4358d963
JB
2681 /* Already aborted the transaction if it failed. */
2682next:
9ef17228 2683 btrfs_dec_delayed_refs_rsv_bg_inserts(fs_info);
4358d963 2684 list_del_init(&block_group->bg_list);
0657b20c 2685 clear_bit(BLOCK_GROUP_FLAG_NEW, &block_group->runtime_flags);
4358d963
JB
2686 }
2687 btrfs_trans_release_chunk_metadata(trans);
2688}
2689
f7238e50
JB
2690/*
2691 * For extent tree v2 we use the block_group_item->chunk_offset to point at our
2692 * global root id. For v1 it's always set to BTRFS_FIRST_CHUNK_TREE_OBJECTID.
2693 */
2694static u64 calculate_global_root_id(struct btrfs_fs_info *fs_info, u64 offset)
2695{
2696 u64 div = SZ_1G;
2697 u64 index;
2698
2699 if (!btrfs_fs_incompat(fs_info, EXTENT_TREE_V2))
2700 return BTRFS_FIRST_CHUNK_TREE_OBJECTID;
2701
2702 /* If we have a smaller fs index based on 128MiB. */
2703 if (btrfs_super_total_bytes(fs_info->super_copy) <= (SZ_1G * 10ULL))
2704 div = SZ_128M;
2705
2706 offset = div64_u64(offset, div);
2707 div64_u64_rem(offset, fs_info->nr_global_roots, &index);
2708 return index;
2709}
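/*
 * A worked example of the mapping above (values are made up): on an
 * extent-tree-v2 filesystem larger than 10 GiB with nr_global_roots = 4, a
 * block group at offset 5 GiB uses div = 1 GiB, so the returned index is
 * (5G / 1G) mod 4 = 1. On a filesystem of 10 GiB or less the divisor drops
 * to 128 MiB, spreading even small offsets across the global roots.
 */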
2710
79bd3712 2711struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *trans,
5758d1bd 2712 u64 type,
79bd3712 2713 u64 chunk_offset, u64 size)
4358d963
JB
2714{
2715 struct btrfs_fs_info *fs_info = trans->fs_info;
32da5386 2716 struct btrfs_block_group *cache;
4358d963
JB
2717 int ret;
2718
2719 btrfs_set_log_full_commit(trans);
2720
9afc6649 2721 cache = btrfs_create_block_group_cache(fs_info, chunk_offset);
4358d963 2722 if (!cache)
79bd3712 2723 return ERR_PTR(-ENOMEM);
4358d963 2724
0657b20c
FM
2725 /*
2726 * Mark it as new before adding it to the rbtree of block groups or any
2727 * list, so that no other task finds it and calls btrfs_mark_bg_unused()
2728 * before the new flag is set.
2729 */
2730 set_bit(BLOCK_GROUP_FLAG_NEW, &cache->runtime_flags);
2731
9afc6649 2732 cache->length = size;
e3e39c72 2733 set_free_space_tree_thresholds(cache);
4358d963 2734 cache->flags = type;
4358d963 2735 cache->cached = BTRFS_CACHE_FINISHED;
f7238e50
JB
2736 cache->global_root_id = calculate_global_root_id(fs_info, cache->start);
2737
997e3e2e 2738 if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE))
0d7764ff 2739 set_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, &cache->runtime_flags);
08e11a3d 2740
a94794d5 2741 ret = btrfs_load_block_group_zone_info(cache, true);
08e11a3d
NA
2742 if (ret) {
2743 btrfs_put_block_group(cache);
79bd3712 2744 return ERR_PTR(ret);
08e11a3d
NA
2745 }
2746
4358d963
JB
2747 ret = exclude_super_stripes(cache);
2748 if (ret) {
2749 /* We may have excluded something, so call this just in case */
2750 btrfs_free_excluded_extents(cache);
2751 btrfs_put_block_group(cache);
79bd3712 2752 return ERR_PTR(ret);
4358d963
JB
2753 }
2754
3b9f0995 2755 ret = btrfs_add_new_free_space(cache, chunk_offset, chunk_offset + size, NULL);
4358d963 2756 btrfs_free_excluded_extents(cache);
d8ccbd21
FM
2757 if (ret) {
2758 btrfs_put_block_group(cache);
2759 return ERR_PTR(ret);
2760 }
4358d963 2761
4358d963
JB
2762 /*
2763 * Ensure the corresponding space_info object is created and
2764 * assigned to our block group. We want our bg to be added to the rbtree
2765 * with its ->space_info set.
2766 */
2767 cache->space_info = btrfs_find_space_info(fs_info, cache->flags);
2768 ASSERT(cache->space_info);
2769
2770 ret = btrfs_add_block_group_cache(fs_info, cache);
2771 if (ret) {
2772 btrfs_remove_free_space_cache(cache);
2773 btrfs_put_block_group(cache);
79bd3712 2774 return ERR_PTR(ret);
4358d963
JB
2775 }
2776
2777 /*
2778 * Now that our block group has its ->space_info set and is inserted in
2779 * the rbtree, update the space info's counters.
2780 */
2781 trace_btrfs_add_block_group(fs_info, cache, 1);
723de71d 2782 btrfs_add_bg_to_space_info(fs_info, cache);
4358d963
JB
2783 btrfs_update_global_block_rsv(fs_info);
2784
9d4b0a12
JB
2785#ifdef CONFIG_BTRFS_DEBUG
2786 if (btrfs_should_fragment_free_space(cache)) {
5758d1bd 2787 cache->space_info->bytes_used += size >> 1;
9d4b0a12
JB
2788 fragment_free_space(cache);
2789 }
2790#endif
4358d963
JB
2791
2792 list_add_tail(&cache->bg_list, &trans->new_bgs);
9ef17228 2793 btrfs_inc_delayed_refs_rsv_bg_inserts(fs_info);
4358d963
JB
2794
2795 set_avail_alloc_bits(fs_info, type);
79bd3712 2796 return cache;
4358d963 2797}
26ce2095 2798
b12de528
QW
2799/*
2800 * Mark one block group RO, can be called several times for the same block
2801 * group.
2802 *
2803 * @cache: the destination block group
2804 * @do_chunk_alloc: whether we need to do chunk pre-allocation, this is to
2805 * ensure we still have some free space after marking this
2806 * block group RO.
2807 */
2808int btrfs_inc_block_group_ro(struct btrfs_block_group *cache,
2809 bool do_chunk_alloc)
26ce2095
JB
2810{
2811 struct btrfs_fs_info *fs_info = cache->fs_info;
2812 struct btrfs_trans_handle *trans;
dfe8aec4 2813 struct btrfs_root *root = btrfs_block_group_root(fs_info);
26ce2095
JB
2814 u64 alloc_flags;
2815 int ret;
b6e9f16c 2816 bool dirty_bg_running;
26ce2095 2817
2d192fc4
QW
2818 /*
2819 * This can only happen when we are doing a read-only scrub on a
2820 * read-only mount.
2821 * In that case we should not start a new transaction on a read-only fs.
2822 * Thus here we skip all chunk allocations.
2823 */
2824 if (sb_rdonly(fs_info->sb)) {
2825 mutex_lock(&fs_info->ro_block_group_mutex);
2826 ret = inc_block_group_ro(cache, 0);
2827 mutex_unlock(&fs_info->ro_block_group_mutex);
2828 return ret;
2829 }
2830
b6e9f16c 2831 do {
dfe8aec4 2832 trans = btrfs_join_transaction(root);
b6e9f16c
NB
2833 if (IS_ERR(trans))
2834 return PTR_ERR(trans);
26ce2095 2835
b6e9f16c 2836 dirty_bg_running = false;
26ce2095 2837
b6e9f16c
NB
2838 /*
2839 * We're not allowed to set block groups readonly after the dirty
2840 * block group cache has started writing. If it already started,
2841 * back off and let this transaction commit.
2842 */
2843 mutex_lock(&fs_info->ro_block_group_mutex);
2844 if (test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &trans->transaction->flags)) {
2845 u64 transid = trans->transid;
26ce2095 2846
b6e9f16c
NB
2847 mutex_unlock(&fs_info->ro_block_group_mutex);
2848 btrfs_end_transaction(trans);
2849
2850 ret = btrfs_wait_for_commit(fs_info, transid);
2851 if (ret)
2852 return ret;
2853 dirty_bg_running = true;
2854 }
2855 } while (dirty_bg_running);
26ce2095 2856
b12de528 2857 if (do_chunk_alloc) {
26ce2095 2858 /*
b12de528
QW
2859 * If we are changing raid levels, try to allocate a
2860 * corresponding block group with the new raid level.
26ce2095 2861 */
349e120e 2862 alloc_flags = btrfs_get_alloc_profile(fs_info, cache->flags);
b12de528
QW
2863 if (alloc_flags != cache->flags) {
2864 ret = btrfs_chunk_alloc(trans, alloc_flags,
2865 CHUNK_ALLOC_FORCE);
2866 /*
2867 * ENOSPC is allowed here, we may have enough space
2868 * already allocated at the new raid level to carry on
2869 */
2870 if (ret == -ENOSPC)
2871 ret = 0;
2872 if (ret < 0)
2873 goto out;
2874 }
26ce2095
JB
2875 }
2876
a7a63acc 2877 ret = inc_block_group_ro(cache, 0);
26ce2095
JB
2878 if (!ret)
2879 goto out;
7561551e
QW
2880 if (ret == -ETXTBSY)
2881 goto unlock_out;
2882
2883 /*
eefaf0a1 2884 * Skip chunk allocation if the bg is SYSTEM, this is to avoid a system
7561551e
QW
2885 * chunk allocation storm that could exhaust the system chunk array. Otherwise
2886 * we still want to try our best to mark the block group read-only.
2887 */
2888 if (!do_chunk_alloc && ret == -ENOSPC &&
2889 (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM))
2890 goto unlock_out;
2891
26ce2095
JB
2892 alloc_flags = btrfs_get_alloc_profile(fs_info, cache->space_info->flags);
2893 ret = btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE);
2894 if (ret < 0)
2895 goto out;
b6a98021
NA
2896 /*
2897 * We have allocated a new chunk. We also need to activate that chunk to
2898 * grant metadata tickets for zoned filesystem.
2899 */
2900 ret = btrfs_zoned_activate_one_bg(fs_info, cache->space_info, true);
2901 if (ret < 0)
2902 goto out;
2903
e11c0406 2904 ret = inc_block_group_ro(cache, 0);
195a49ea
FM
2905 if (ret == -ETXTBSY)
2906 goto unlock_out;
26ce2095
JB
2907out:
2908 if (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM) {
349e120e 2909 alloc_flags = btrfs_get_alloc_profile(fs_info, cache->flags);
26ce2095
JB
2910 mutex_lock(&fs_info->chunk_mutex);
2911 check_system_chunk(trans, alloc_flags);
2912 mutex_unlock(&fs_info->chunk_mutex);
2913 }
b12de528 2914unlock_out:
26ce2095
JB
2915 mutex_unlock(&fs_info->ro_block_group_mutex);
2916
2917 btrfs_end_transaction(trans);
2918 return ret;
2919}
2920
32da5386 2921void btrfs_dec_block_group_ro(struct btrfs_block_group *cache)
26ce2095
JB
2922{
2923 struct btrfs_space_info *sinfo = cache->space_info;
2924 u64 num_bytes;
2925
2926 BUG_ON(!cache->ro);
2927
2928 spin_lock(&sinfo->lock);
2929 spin_lock(&cache->lock);
2930 if (!--cache->ro) {
169e0da9
NA
2931 if (btrfs_is_zoned(cache->fs_info)) {
2932 /* Migrate zone_unusable bytes back */
98173255
NA
2933 cache->zone_unusable =
2934 (cache->alloc_offset - cache->used) +
2935 (cache->length - cache->zone_capacity);
169e0da9
NA
2936 sinfo->bytes_zone_unusable += cache->zone_unusable;
2937 sinfo->bytes_readonly -= cache->zone_unusable;
2938 }
f9f28e5b
NA
2939 num_bytes = cache->length - cache->reserved -
2940 cache->pinned - cache->bytes_super -
2941 cache->zone_unusable - cache->used;
2942 sinfo->bytes_readonly -= num_bytes;
26ce2095
JB
2943 list_del_init(&cache->ro_list);
2944 }
2945 spin_unlock(&cache->lock);
2946 spin_unlock(&sinfo->lock);
2947}
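/*
 * A worked example of the zoned migration above (values are made up): for a
 * block group with length 256 MiB, zone_capacity 192 MiB, alloc_offset
 * 128 MiB and used 96 MiB, zone_unusable becomes
 * (128M - 96M) + (256M - 192M) = 96 MiB, which is moved from the space
 * info's readonly bytes back to its zone_unusable bytes before the rest of
 * the readonly accounting is undone.
 */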
77745c05 2948
3be4d8ef
QW
2949static int update_block_group_item(struct btrfs_trans_handle *trans,
2950 struct btrfs_path *path,
2951 struct btrfs_block_group *cache)
77745c05
JB
2952{
2953 struct btrfs_fs_info *fs_info = trans->fs_info;
2954 int ret;
dfe8aec4 2955 struct btrfs_root *root = btrfs_block_group_root(fs_info);
77745c05
JB
2956 unsigned long bi;
2957 struct extent_buffer *leaf;
bf38be65 2958 struct btrfs_block_group_item bgi;
b3470b5d 2959 struct btrfs_key key;
7248e0ce
QW
2960 u64 old_commit_used;
2961 u64 used;
2962
2963 /*
2964 * Block group item updates can be triggered outside of the commit
2965 * transaction critical section, thus we need a consistent view of used bytes.
2966 * We cannot use cache->used directly outside of the spin lock, as it
2967 * may be changed.
2968 */
2969 spin_lock(&cache->lock);
2970 old_commit_used = cache->commit_used;
2971 used = cache->used;
2972 /* No change in used bytes, can safely skip it. */
2973 if (cache->commit_used == used) {
2974 spin_unlock(&cache->lock);
2975 return 0;
2976 }
2977 cache->commit_used = used;
2978 spin_unlock(&cache->lock);
b3470b5d
DS
2979
2980 key.objectid = cache->start;
2981 key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
2982 key.offset = cache->length;
77745c05 2983
3be4d8ef 2984 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
77745c05
JB
2985 if (ret) {
2986 if (ret > 0)
2987 ret = -ENOENT;
2988 goto fail;
2989 }
2990
2991 leaf = path->nodes[0];
2992 bi = btrfs_item_ptr_offset(leaf, path->slots[0]);
7248e0ce 2993 btrfs_set_stack_block_group_used(&bgi, used);
de0dc456 2994 btrfs_set_stack_block_group_chunk_objectid(&bgi,
f7238e50 2995 cache->global_root_id);
de0dc456 2996 btrfs_set_stack_block_group_flags(&bgi, cache->flags);
bf38be65 2997 write_extent_buffer(leaf, &bgi, bi, sizeof(bgi));
50564b65 2998 btrfs_mark_buffer_dirty(trans, leaf);
77745c05
JB
2999fail:
3000 btrfs_release_path(path);
2d6cd791
FM
3001 /*
3002 * We didn't update the block group item, need to revert commit_used
3003 * unless the block group item didn't exist yet - this is to prevent a
3004 * race with a concurrent insertion of the block group item, by
3005 * insert_block_group_item(), that happened just after we attempted the
3006 * update. In that case we would reset commit_used to 0 just after the
3007 * insertion set it to a value greater than 0 - if the block group later
3008 * ends up with 0 used bytes, we would incorrectly skip its update.
3009 */
3010 if (ret < 0 && ret != -ENOENT) {
7248e0ce
QW
3011 spin_lock(&cache->lock);
3012 cache->commit_used = old_commit_used;
3013 spin_unlock(&cache->lock);
3014 }
77745c05
JB
3015 return ret;
3016
3017}
3018
32da5386 3019static int cache_save_setup(struct btrfs_block_group *block_group,
77745c05
JB
3020 struct btrfs_trans_handle *trans,
3021 struct btrfs_path *path)
3022{
3023 struct btrfs_fs_info *fs_info = block_group->fs_info;
77745c05
JB
3024 struct inode *inode = NULL;
3025 struct extent_changeset *data_reserved = NULL;
3026 u64 alloc_hint = 0;
3027 int dcs = BTRFS_DC_ERROR;
0044ae11 3028 u64 cache_size = 0;
77745c05
JB
3029 int retries = 0;
3030 int ret = 0;
3031
af456a2c
BB
3032 if (!btrfs_test_opt(fs_info, SPACE_CACHE))
3033 return 0;
3034
77745c05
JB
3035 /*
3036 * If this block group is smaller than 100 megs don't bother caching the
3037 * block group.
3038 */
b3470b5d 3039 if (block_group->length < (100 * SZ_1M)) {
77745c05
JB
3040 spin_lock(&block_group->lock);
3041 block_group->disk_cache_state = BTRFS_DC_WRITTEN;
3042 spin_unlock(&block_group->lock);
3043 return 0;
3044 }
3045
bf31f87f 3046 if (TRANS_ABORTED(trans))
77745c05
JB
3047 return 0;
3048again:
3049 inode = lookup_free_space_inode(block_group, path);
3050 if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) {
3051 ret = PTR_ERR(inode);
3052 btrfs_release_path(path);
3053 goto out;
3054 }
3055
3056 if (IS_ERR(inode)) {
3057 BUG_ON(retries);
3058 retries++;
3059
3060 if (block_group->ro)
3061 goto out_free;
3062
3063 ret = create_free_space_inode(trans, block_group, path);
3064 if (ret)
3065 goto out_free;
3066 goto again;
3067 }
3068
3069 /*
3070 * We want to set the generation to 0, that way if anything goes wrong
3071 * from here on out we know not to trust this cache when we load up next
3072 * time.
3073 */
3074 BTRFS_I(inode)->generation = 0;
8b9d0322 3075 ret = btrfs_update_inode(trans, BTRFS_I(inode));
77745c05
JB
3076 if (ret) {
3077 /*
3078 * So theoretically we could recover from this, simply set the
3079 * super cache generation to 0 so we know to invalidate the
3080 * cache, but then we'd have to keep track of the block groups
3081 * that fail this way so we know we _have_ to reset this cache
3082 * before the next commit or risk reading stale cache. So to
3083 * limit our exposure to horrible edge cases let's just abort the
3084 * transaction, this only happens in really bad situations
3085 * anyway.
3086 */
3087 btrfs_abort_transaction(trans, ret);
3088 goto out_put;
3089 }
3090 WARN_ON(ret);
3091
3092 /* We've already setup this transaction, go ahead and exit */
3093 if (block_group->cache_generation == trans->transid &&
3094 i_size_read(inode)) {
3095 dcs = BTRFS_DC_SETUP;
3096 goto out_put;
3097 }
3098
3099 if (i_size_read(inode) > 0) {
3100 ret = btrfs_check_trunc_cache_free_space(fs_info,
3101 &fs_info->global_block_rsv);
3102 if (ret)
3103 goto out_put;
3104
3105 ret = btrfs_truncate_free_space_cache(trans, NULL, inode);
3106 if (ret)
3107 goto out_put;
3108 }
3109
3110 spin_lock(&block_group->lock);
3111 if (block_group->cached != BTRFS_CACHE_FINISHED ||
3112 !btrfs_test_opt(fs_info, SPACE_CACHE)) {
3113 /*
3114 * Don't bother trying to write stuff out _if_
3115 * a) we're not cached,
3116 * b) we're mounted with the nospace_cache option, or
3117 * c) we're using the v2 space cache (FREE_SPACE_TREE).
3118 */
3119 dcs = BTRFS_DC_WRITTEN;
3120 spin_unlock(&block_group->lock);
3121 goto out_put;
3122 }
3123 spin_unlock(&block_group->lock);
3124
3125 /*
3126 * We hit an ENOSPC when setting up the cache in this transaction, just
3127 * skip doing the setup, we've already cleared the cache so we're safe.
3128 */
3129 if (test_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags)) {
3130 ret = -ENOSPC;
3131 goto out_put;
3132 }
3133
3134 /*
3135 * Try to preallocate enough space based on how big the block group is.
3136 * Keep in mind this has to include any pinned space which could end up
3137 * taking up quite a bit since it's not folded into the other space
3138 * cache.
3139 */
0044ae11
QW
3140 cache_size = div_u64(block_group->length, SZ_256M);
3141 if (!cache_size)
3142 cache_size = 1;
77745c05 3143
0044ae11
QW
3144 cache_size *= 16;
3145 cache_size *= fs_info->sectorsize;
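/*
 * A worked example of the sizing above (values are made up): for a 1 GiB
 * block group with a 4 KiB sectorsize, cache_size = (1G / 256M) * 16
 * sectors = 64 sectors = 256 KiB preallocated for the v1 free space cache
 * file, scaling up with the size of the block group.
 */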
77745c05 3146
36ea6f3e 3147 ret = btrfs_check_data_free_space(BTRFS_I(inode), &data_reserved, 0,
1daedb1d 3148 cache_size, false);
77745c05
JB
3149 if (ret)
3150 goto out_put;
3151
0044ae11
QW
3152 ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, cache_size,
3153 cache_size, cache_size,
77745c05
JB
3154 &alloc_hint);
3155 /*
3156 * Our cache requires contiguous chunks so that we don't modify a bunch
3157 * of metadata or split extents when writing the cache out, which means
3158 * we can hit ENOSPC if we are heavily fragmented, in addition to just normal
3159 * out of space conditions. So if we hit this just skip setting up any
3160 * other block groups for this transaction, maybe we'll unpin enough
3161 * space the next time around.
3162 */
3163 if (!ret)
3164 dcs = BTRFS_DC_SETUP;
3165 else if (ret == -ENOSPC)
3166 set_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags);
3167
3168out_put:
3169 iput(inode);
3170out_free:
3171 btrfs_release_path(path);
3172out:
3173 spin_lock(&block_group->lock);
3174 if (!ret && dcs == BTRFS_DC_SETUP)
3175 block_group->cache_generation = trans->transid;
3176 block_group->disk_cache_state = dcs;
3177 spin_unlock(&block_group->lock);
3178
3179 extent_changeset_free(data_reserved);
3180 return ret;
3181}
3182
3183int btrfs_setup_space_cache(struct btrfs_trans_handle *trans)
3184{
3185 struct btrfs_fs_info *fs_info = trans->fs_info;
32da5386 3186 struct btrfs_block_group *cache, *tmp;
77745c05
JB
3187 struct btrfs_transaction *cur_trans = trans->transaction;
3188 struct btrfs_path *path;
3189
3190 if (list_empty(&cur_trans->dirty_bgs) ||
3191 !btrfs_test_opt(fs_info, SPACE_CACHE))
3192 return 0;
3193
3194 path = btrfs_alloc_path();
3195 if (!path)
3196 return -ENOMEM;
3197
3198 /* Could add new block groups, use _safe just in case */
3199 list_for_each_entry_safe(cache, tmp, &cur_trans->dirty_bgs,
3200 dirty_list) {
3201 if (cache->disk_cache_state == BTRFS_DC_CLEAR)
3202 cache_save_setup(cache, trans, path);
3203 }
3204
3205 btrfs_free_path(path);
3206 return 0;
3207}
3208
3209/*
3210 * Transaction commit does final block group cache writeback during a critical
3211 * section where nothing is allowed to change the FS. This is required in
3212 * order for the cache to actually match the block group, but can introduce a
3213 * lot of latency into the commit.
3214 *
3215 * So, btrfs_start_dirty_block_groups is here to kick off block group cache IO.
3216 * There's a chance we'll have to redo some of it if the block group changes
3217 * again during the commit, but it greatly reduces the commit latency by
3218 * getting rid of the easy block groups while we're still allowing others to
3219 * join the commit.
3220 */
3221int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans)
3222{
3223 struct btrfs_fs_info *fs_info = trans->fs_info;
32da5386 3224 struct btrfs_block_group *cache;
77745c05
JB
3225 struct btrfs_transaction *cur_trans = trans->transaction;
3226 int ret = 0;
3227 int should_put;
3228 struct btrfs_path *path = NULL;
3229 LIST_HEAD(dirty);
3230 struct list_head *io = &cur_trans->io_bgs;
77745c05
JB
3231 int loops = 0;
3232
3233 spin_lock(&cur_trans->dirty_bgs_lock);
3234 if (list_empty(&cur_trans->dirty_bgs)) {
3235 spin_unlock(&cur_trans->dirty_bgs_lock);
3236 return 0;
3237 }
3238 list_splice_init(&cur_trans->dirty_bgs, &dirty);
3239 spin_unlock(&cur_trans->dirty_bgs_lock);
3240
3241again:
3242 /* Make sure all the block groups on our dirty list actually exist */
3243 btrfs_create_pending_block_groups(trans);
3244
3245 if (!path) {
3246 path = btrfs_alloc_path();
938fcbfb
JB
3247 if (!path) {
3248 ret = -ENOMEM;
3249 goto out;
3250 }
77745c05
JB
3251 }
3252
3253 /*
3254 * cache_write_mutex is here only to save us from balance or automatic
3255 * removal of empty block groups deleting this block group while we are
3256 * writing out the cache
3257 */
3258 mutex_lock(&trans->transaction->cache_write_mutex);
3259 while (!list_empty(&dirty)) {
3260 bool drop_reserve = true;
3261
32da5386 3262 cache = list_first_entry(&dirty, struct btrfs_block_group,
77745c05
JB
3263 dirty_list);
3264 /*
3265 * This can happen if something re-dirties a block group that
3266 * is already under IO. Just wait for it to finish and then do
3267 * it all again
3268 */
3269 if (!list_empty(&cache->io_list)) {
3270 list_del_init(&cache->io_list);
3271 btrfs_wait_cache_io(trans, cache, path);
3272 btrfs_put_block_group(cache);
3273 }
3274
3275
3276 /*
3277 * btrfs_wait_cache_io uses the cache->dirty_list to decide if
3278 * it should update the cache_state. Don't delete until after
3279 * we wait.
3280 *
3281 * Since we're not running in the commit critical section,
3282 * we need the dirty_bgs_lock to protect from update_block_group.
3283 */
3284 spin_lock(&cur_trans->dirty_bgs_lock);
3285 list_del_init(&cache->dirty_list);
3286 spin_unlock(&cur_trans->dirty_bgs_lock);
3287
3288 should_put = 1;
3289
3290 cache_save_setup(cache, trans, path);
3291
3292 if (cache->disk_cache_state == BTRFS_DC_SETUP) {
3293 cache->io_ctl.inode = NULL;
3294 ret = btrfs_write_out_cache(trans, cache, path);
3295 if (ret == 0 && cache->io_ctl.inode) {
77745c05
JB
3296 should_put = 0;
3297
3298 /*
3299 * The cache_write_mutex protects the
3300 * io_list; also refer to the definition of
3301 * btrfs_transaction::io_bgs for more details.
3302 */
3303 list_add_tail(&cache->io_list, io);
3304 } else {
3305 /*
3306 * If we failed to write the cache, the
3307 * generation will be bad and life goes on
3308 */
3309 ret = 0;
3310 }
3311 }
3312 if (!ret) {
3be4d8ef 3313 ret = update_block_group_item(trans, path, cache);
77745c05
JB
3314 /*
3315 * Our block group might still be attached to the list
3316 * of new block groups in the transaction handle of some
3317 * other task (struct btrfs_trans_handle->new_bgs). This
3318 * means its block group item isn't yet in the extent
3319 * tree. If this happens ignore the error, as we will
3320 * try again later in the critical section of the
3321 * transaction commit.
3322 */
3323 if (ret == -ENOENT) {
3324 ret = 0;
3325 spin_lock(&cur_trans->dirty_bgs_lock);
3326 if (list_empty(&cache->dirty_list)) {
3327 list_add_tail(&cache->dirty_list,
3328 &cur_trans->dirty_bgs);
3329 btrfs_get_block_group(cache);
3330 drop_reserve = false;
3331 }
3332 spin_unlock(&cur_trans->dirty_bgs_lock);
3333 } else if (ret) {
3334 btrfs_abort_transaction(trans, ret);
3335 }
3336 }
3337
3338 /* If it's not on the io list, we need to put the block group */
3339 if (should_put)
3340 btrfs_put_block_group(cache);
3341 if (drop_reserve)
f66e0209 3342 btrfs_dec_delayed_refs_rsv_bg_updates(fs_info);
77745c05
JB
3343 /*
3344 * Avoid blocking other tasks for too long. It might even save
3345 * us from writing caches for block groups that are going to be
3346 * removed.
3347 */
3348 mutex_unlock(&trans->transaction->cache_write_mutex);
938fcbfb
JB
3349 if (ret)
3350 goto out;
77745c05
JB
3351 mutex_lock(&trans->transaction->cache_write_mutex);
3352 }
3353 mutex_unlock(&trans->transaction->cache_write_mutex);
3354
3355 /*
3356 * Go through delayed refs for all the stuff we've just kicked off
3357 * and then loop back (just once)
3358 */
34d1eb0e
JB
3359 if (!ret)
3360 ret = btrfs_run_delayed_refs(trans, 0);
77745c05
JB
3361 if (!ret && loops == 0) {
3362 loops++;
3363 spin_lock(&cur_trans->dirty_bgs_lock);
3364 list_splice_init(&cur_trans->dirty_bgs, &dirty);
3365 /*
3366 * dirty_bgs_lock protects us from concurrent block group
3367 * deletes too (not just cache_write_mutex).
3368 */
3369 if (!list_empty(&dirty)) {
3370 spin_unlock(&cur_trans->dirty_bgs_lock);
3371 goto again;
3372 }
3373 spin_unlock(&cur_trans->dirty_bgs_lock);
938fcbfb
JB
3374 }
3375out:
3376 if (ret < 0) {
3377 spin_lock(&cur_trans->dirty_bgs_lock);
3378 list_splice_init(&dirty, &cur_trans->dirty_bgs);
3379 spin_unlock(&cur_trans->dirty_bgs_lock);
77745c05
JB
3380 btrfs_cleanup_dirty_bgs(cur_trans, fs_info);
3381 }
3382
3383 btrfs_free_path(path);
3384 return ret;
3385}
3386
3387int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans)
3388{
3389 struct btrfs_fs_info *fs_info = trans->fs_info;
32da5386 3390 struct btrfs_block_group *cache;
77745c05
JB
3391 struct btrfs_transaction *cur_trans = trans->transaction;
3392 int ret = 0;
3393 int should_put;
3394 struct btrfs_path *path;
3395 struct list_head *io = &cur_trans->io_bgs;
77745c05
JB
3396
3397 path = btrfs_alloc_path();
3398 if (!path)
3399 return -ENOMEM;
3400
3401 /*
3402 * Even though we are in the critical section of the transaction commit,
3403 * we can still have concurrent tasks adding elements to this
3404 * transaction's list of dirty block groups. These tasks correspond to
3405 * endio free space workers started when writeback finishes for a
3406 * space cache, which run inode.c:btrfs_finish_ordered_io(), and can
3407 * allocate new block groups as a result of COWing nodes of the root
3408 * tree when updating the free space inode. The writeback for the space
3409 * caches is triggered by an earlier call to
3410 * btrfs_start_dirty_block_groups() and iterations of the following
3411 * loop.
3412 * Also we want to do the cache_save_setup first and then run the
3413 * delayed refs to make sure we have the best chance at doing this all
3414 * in one shot.
3415 */
3416 spin_lock(&cur_trans->dirty_bgs_lock);
3417 while (!list_empty(&cur_trans->dirty_bgs)) {
3418 cache = list_first_entry(&cur_trans->dirty_bgs,
32da5386 3419 struct btrfs_block_group,
77745c05
JB
3420 dirty_list);
3421
3422 /*
3423 * This can happen if cache_save_setup re-dirties a block group
3424 * that is already under IO. Just wait for it to finish and
3425 * then do it all again
3426 */
3427 if (!list_empty(&cache->io_list)) {
3428 spin_unlock(&cur_trans->dirty_bgs_lock);
3429 list_del_init(&cache->io_list);
3430 btrfs_wait_cache_io(trans, cache, path);
3431 btrfs_put_block_group(cache);
3432 spin_lock(&cur_trans->dirty_bgs_lock);
3433 }
3434
3435 /*
3436 * Don't remove from the dirty list until after we've waited on
3437 * any pending IO
3438 */
3439 list_del_init(&cache->dirty_list);
3440 spin_unlock(&cur_trans->dirty_bgs_lock);
3441 should_put = 1;
3442
3443 cache_save_setup(cache, trans, path);
3444
3445 if (!ret)
8a526c44 3446 ret = btrfs_run_delayed_refs(trans, U64_MAX);
77745c05
JB
3447
3448 if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP) {
3449 cache->io_ctl.inode = NULL;
3450 ret = btrfs_write_out_cache(trans, cache, path);
3451 if (ret == 0 && cache->io_ctl.inode) {
77745c05
JB
3452 should_put = 0;
3453 list_add_tail(&cache->io_list, io);
3454 } else {
3455 /*
3456 * If we failed to write the cache, the
3457 * generation will be bad and life goes on
3458 */
3459 ret = 0;
3460 }
3461 }
3462 if (!ret) {
3be4d8ef 3463 ret = update_block_group_item(trans, path, cache);
77745c05
JB
3464 /*
3465 * One of the free space endio workers might have
3466 * created a new block group while updating a free space
3467 * cache's inode (at inode.c:btrfs_finish_ordered_io())
3468 * and hasn't released its transaction handle yet, in
3469 * which case the new block group is still attached to
3470 * its transaction handle and its creation has not
3471 * finished yet (no block group item in the extent tree
3472 * yet, etc). If this is the case, wait for all free
3473 * space endio workers to finish and retry. This is a
260db43c 3474 * very rare case so no need for a more efficient and
77745c05
JB
3475 * complex approach.
3476 */
3477 if (ret == -ENOENT) {
3478 wait_event(cur_trans->writer_wait,
3479 atomic_read(&cur_trans->num_writers) == 1);
3be4d8ef 3480 ret = update_block_group_item(trans, path, cache);
77745c05
JB
3481 }
3482 if (ret)
3483 btrfs_abort_transaction(trans, ret);
3484 }
3485
3486 /* If it's not on the io list, we need to put the block group */
3487 if (should_put)
3488 btrfs_put_block_group(cache);
f66e0209 3489 btrfs_dec_delayed_refs_rsv_bg_updates(fs_info);
77745c05
JB
3490 spin_lock(&cur_trans->dirty_bgs_lock);
3491 }
3492 spin_unlock(&cur_trans->dirty_bgs_lock);
3493
3494 /*
3495 * Refer to the definition of io_bgs member for details why it's safe
3496 * to use it without any locking
3497 */
3498 while (!list_empty(io)) {
32da5386 3499 cache = list_first_entry(io, struct btrfs_block_group,
77745c05
JB
3500 io_list);
3501 list_del_init(&cache->io_list);
3502 btrfs_wait_cache_io(trans, cache, path);
3503 btrfs_put_block_group(cache);
3504 }
3505
3506 btrfs_free_path(path);
3507 return ret;
3508}
606d1bf1
JB
3509
3510int btrfs_update_block_group(struct btrfs_trans_handle *trans,
11b66fa6 3511 u64 bytenr, u64 num_bytes, bool alloc)
606d1bf1
JB
3512{
3513 struct btrfs_fs_info *info = trans->fs_info;
4d20c1de
FM
3514 struct btrfs_space_info *space_info;
3515 struct btrfs_block_group *cache;
606d1bf1 3516 u64 old_val;
4d20c1de 3517 bool reclaim = false;
f66e0209 3518 bool bg_already_dirty = true;
606d1bf1 3519 int factor;
606d1bf1
JB
3520
3521 /* Block accounting for super block */
3522 spin_lock(&info->delalloc_root_lock);
3523 old_val = btrfs_super_bytes_used(info->super_copy);
3524 if (alloc)
3525 old_val += num_bytes;
3526 else
3527 old_val -= num_bytes;
3528 btrfs_set_super_bytes_used(info->super_copy, old_val);
3529 spin_unlock(&info->delalloc_root_lock);
3530
4d20c1de
FM
3531 cache = btrfs_lookup_block_group(info, bytenr);
3532 if (!cache)
3533 return -ENOENT;
ac2f1e63 3534
4d20c1de
FM
3535 /* An extent cannot span multiple block groups. */
3536 ASSERT(bytenr + num_bytes <= cache->start + cache->length);
606d1bf1 3537
4d20c1de
FM
3538 space_info = cache->space_info;
3539 factor = btrfs_bg_type_to_factor(cache->flags);
606d1bf1 3540
4d20c1de
FM
3541 /*
3542 * If this block group has free space cache written out, we need to make
3543 * sure to load it if we are removing space. This is because we need
3544 * the unpinning stage to actually add the space back to the block group,
3545 * otherwise we will leak space.
3546 */
3547 if (!alloc && !btrfs_block_group_done(cache))
3548 btrfs_cache_block_group(cache, true);
606d1bf1 3549
4d20c1de
FM
3550 spin_lock(&space_info->lock);
3551 spin_lock(&cache->lock);
606d1bf1 3552
4d20c1de
FM
3553 if (btrfs_test_opt(info, SPACE_CACHE) &&
3554 cache->disk_cache_state < BTRFS_DC_CLEAR)
3555 cache->disk_cache_state = BTRFS_DC_CLEAR;
606d1bf1 3556
4d20c1de
FM
3557 old_val = cache->used;
3558 if (alloc) {
3559 old_val += num_bytes;
3560 cache->used = old_val;
3561 cache->reserved -= num_bytes;
3562 space_info->bytes_reserved -= num_bytes;
3563 space_info->bytes_used += num_bytes;
3564 space_info->disk_used += num_bytes * factor;
3565 spin_unlock(&cache->lock);
3566 spin_unlock(&space_info->lock);
3567 } else {
3568 old_val -= num_bytes;
3569 cache->used = old_val;
3570 cache->pinned += num_bytes;
3571 btrfs_space_info_update_bytes_pinned(info, space_info, num_bytes);
3572 space_info->bytes_used -= num_bytes;
3573 space_info->disk_used -= num_bytes * factor;
606d1bf1 3574
4d20c1de 3575 reclaim = should_reclaim_block_group(cache, num_bytes);
606d1bf1 3576
4d20c1de
FM
3577 spin_unlock(&cache->lock);
3578 spin_unlock(&space_info->lock);
606d1bf1 3579
4d20c1de
FM
3580 set_extent_bit(&trans->transaction->pinned_extents, bytenr,
3581 bytenr + num_bytes - 1, EXTENT_DIRTY, NULL);
3582 }
3583
3584 spin_lock(&trans->transaction->dirty_bgs_lock);
3585 if (list_empty(&cache->dirty_list)) {
3586 list_add_tail(&cache->dirty_list, &trans->transaction->dirty_bgs);
f66e0209 3587 bg_already_dirty = false;
4d20c1de
FM
3588 btrfs_get_block_group(cache);
3589 }
3590 spin_unlock(&trans->transaction->dirty_bgs_lock);
3591
3592 /*
3593 * No longer have used bytes in this block group, queue it for deletion.
3594 * We do this after adding the block group to the dirty list to avoid
3595 * races between cleaner kthread and space cache writeout.
3596 */
3597 if (!alloc && old_val == 0) {
3598 if (!btrfs_test_opt(info, DISCARD_ASYNC))
3599 btrfs_mark_bg_unused(cache);
3600 } else if (!alloc && reclaim) {
3601 btrfs_mark_bg_to_reclaim(cache);
606d1bf1
JB
3602 }
3603
4d20c1de
FM
3604 btrfs_put_block_group(cache);
3605
606d1bf1 3606 /* Modified block groups are accounted for in the delayed_refs_rsv. */
f66e0209
FM
3607 if (!bg_already_dirty)
3608 btrfs_inc_delayed_refs_rsv_bg_updates(info);
4d20c1de
FM
3609
3610 return 0;
606d1bf1
JB
3611}
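
/*
 * Editor's illustration (not part of the kernel source; struct and function
 * names are invented): a minimal model of the space_info accounting done
 * above on the allocation path. The "factor" reflects how many bytes hit the
 * disk per logical byte for the block group's profile (for example 2 for DUP
 * or RAID1, 1 for SINGLE).
 */
struct example_space_counters {
	u64 bytes_reserved;	/* reserved but not yet allocated */
	u64 bytes_used;		/* allocated logical bytes */
	u64 disk_used;		/* physical bytes, scaled by the profile */
};

static inline void example_account_alloc(struct example_space_counters *c,
					 u64 num_bytes, int factor)
{
	/* Mirrors the "alloc" branch of btrfs_update_block_group(). */
	c->bytes_reserved -= num_bytes;
	c->bytes_used += num_bytes;
	c->disk_used += num_bytes * factor;
}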
3612
43dd529a
DS
3613/*
3614 * Update the block_group and space info counters.
3615 *
606d1bf1
JB
3616 * @cache: The cache we are manipulating
3617 * @ram_bytes: The number of bytes of file content; this will be the same
3618 * as @num_bytes except for the compression path.
3619 * @num_bytes: The number of bytes in question
3620 * @delalloc: The blocks are allocated for the delalloc write
3621 *
3622 * This is called by the allocator when it reserves space. If this is a
3623 * reservation and the block group has become read only we cannot make the
3624 * reservation and return -EAGAIN, otherwise this function always succeeds.
3625 */
32da5386 3626int btrfs_add_reserved_bytes(struct btrfs_block_group *cache,
52bb7a21
BB
3627 u64 ram_bytes, u64 num_bytes, int delalloc,
3628 bool force_wrong_size_class)
606d1bf1
JB
3629{
3630 struct btrfs_space_info *space_info = cache->space_info;
52bb7a21 3631 enum btrfs_block_group_size_class size_class;
606d1bf1
JB
3632 int ret = 0;
3633
3634 spin_lock(&space_info->lock);
3635 spin_lock(&cache->lock);
3636 if (cache->ro) {
3637 ret = -EAGAIN;
52bb7a21
BB
3638 goto out;
3639 }
99ffb43e 3640
cb0922f2 3641 if (btrfs_block_group_should_use_size_class(cache)) {
52bb7a21
BB
3642 size_class = btrfs_calc_block_group_size_class(num_bytes);
3643 ret = btrfs_use_block_group_size_class(cache, size_class, force_wrong_size_class);
3644 if (ret)
3645 goto out;
606d1bf1 3646 }
52bb7a21
BB
3647 cache->reserved += num_bytes;
3648 space_info->bytes_reserved += num_bytes;
3649 trace_btrfs_space_reservation(cache->fs_info, "space_info",
3650 space_info->flags, num_bytes, 1);
3651 btrfs_space_info_update_bytes_may_use(cache->fs_info,
3652 space_info, -ram_bytes);
3653 if (delalloc)
3654 cache->delalloc_bytes += num_bytes;
3655
3656 /*
3657 * Compression can use less space than we reserved, so wake tickets if
3658 * that happens.
3659 */
3660 if (num_bytes < ram_bytes)
3661 btrfs_try_granting_tickets(cache->fs_info, space_info);
3662out:
606d1bf1
JB
3663 spin_unlock(&cache->lock);
3664 spin_unlock(&space_info->lock);
3665 return ret;
3666}
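
/*
 * Editor's worked example (not from the source): a buffered write reserves
 * 128K of data space (@ram_bytes = 128K, charged to bytes_may_use). If
 * compression shrinks the extent to 32K, this is called with
 * @num_bytes = 32K: bytes_may_use drops by the full 128K while only 32K
 * moves into bytes_reserved, and the 96K difference may satisfy waiting
 * space tickets via btrfs_try_granting_tickets().
 */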
3667
43dd529a
DS
3668/*
3669 * Update the block_group and space info counters.
3670 *
606d1bf1
JB
3671 * @cache: The cache we are manipulating
3672 * @num_bytes: The number of bytes in question
3673 * @delalloc: The blocks are allocated for the delalloc write
3674 *
3675 * This is called by somebody who is freeing space that was never actually used
3676 * on disk. For example, if you reserve some space for a new leaf in transaction
3677 * A and, before transaction A commits, you free that leaf, you call this to
3678 * clear the reservation.
3679 */
32da5386 3680void btrfs_free_reserved_bytes(struct btrfs_block_group *cache,
606d1bf1
JB
3681 u64 num_bytes, int delalloc)
3682{
3683 struct btrfs_space_info *space_info = cache->space_info;
3684
3685 spin_lock(&space_info->lock);
3686 spin_lock(&cache->lock);
3687 if (cache->ro)
3688 space_info->bytes_readonly += num_bytes;
3689 cache->reserved -= num_bytes;
3690 space_info->bytes_reserved -= num_bytes;
3691 space_info->max_extent_size = 0;
3692
3693 if (delalloc)
3694 cache->delalloc_bytes -= num_bytes;
3695 spin_unlock(&cache->lock);
3308234a
JB
3696
3697 btrfs_try_granting_tickets(cache->fs_info, space_info);
606d1bf1
JB
3698 spin_unlock(&space_info->lock);
3699}
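
/*
 * Editor's sketch (assumption; the function name is invented): the usual
 * pairing of the two helpers above. Space reserved with
 * btrfs_add_reserved_bytes() that never becomes an on-disk allocation is
 * handed back with btrfs_free_reserved_bytes(), for example a metadata
 * extent reserved in a transaction and then freed before the commit.
 */
static int example_reserve_then_cancel(struct btrfs_block_group *bg,
				       u64 num_bytes, int delalloc)
{
	int ret;

	/* Reserve uncompressed data, so ram_bytes == num_bytes. */
	ret = btrfs_add_reserved_bytes(bg, num_bytes, num_bytes, delalloc, false);
	if (ret)
		return ret;

	/* ... the caller decides not to use the space after all ... */

	/* Take the bytes back out of cache->reserved / bytes_reserved. */
	btrfs_free_reserved_bytes(bg, num_bytes, delalloc);
	return 0;
}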
07730d87
JB
3700
3701static void force_metadata_allocation(struct btrfs_fs_info *info)
3702{
3703 struct list_head *head = &info->space_info;
3704 struct btrfs_space_info *found;
3705
72804905 3706 list_for_each_entry(found, head, list) {
07730d87
JB
3707 if (found->flags & BTRFS_BLOCK_GROUP_METADATA)
3708 found->force_alloc = CHUNK_ALLOC_FORCE;
3709 }
07730d87
JB
3710}
3711
3712static int should_alloc_chunk(struct btrfs_fs_info *fs_info,
3713 struct btrfs_space_info *sinfo, int force)
3714{
3715 u64 bytes_used = btrfs_space_info_used(sinfo, false);
3716 u64 thresh;
3717
3718 if (force == CHUNK_ALLOC_FORCE)
3719 return 1;
3720
3721 /*
3722 * In limited mode, we want to have some free space up to
3723 * about 1% of the FS size.
3724 */
3725 if (force == CHUNK_ALLOC_LIMITED) {
3726 thresh = btrfs_super_total_bytes(fs_info->super_copy);
428c8e03 3727 thresh = max_t(u64, SZ_64M, mult_perc(thresh, 1));
07730d87
JB
3728
3729 if (sinfo->total_bytes - bytes_used < thresh)
3730 return 1;
3731 }
3732
428c8e03 3733 if (bytes_used + SZ_2M < mult_perc(sinfo->total_bytes, 80))
07730d87
JB
3734 return 0;
3735 return 1;
3736}
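
/*
 * Editor's worked example (not from the source): on a 1TiB filesystem in
 * CHUNK_ALLOC_LIMITED mode the threshold is max(SZ_64M, 1% of 1TiB), about
 * 10GiB, so a chunk is allocated once less than that remains free in the
 * space_info. In the non-forced, non-limited case a chunk is allocated only
 * once used bytes (plus a 2M slack) reach 80% of the space_info's
 * total_bytes.
 */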
3737
3738int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, u64 type)
3739{
3740 u64 alloc_flags = btrfs_get_alloc_profile(trans->fs_info, type);
3741
3742 return btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE);
3743}
3744
820c363b 3745static struct btrfs_block_group *do_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags)
79bd3712
FM
3746{
3747 struct btrfs_block_group *bg;
3748 int ret;
3749
3750 /*
3751 * Check if we have enough space in the system space info because we
3752 * will need to update device items in the chunk btree and insert a new
3753 * chunk item in the chunk btree as well. This will allocate a new
3754 * system block group if needed.
3755 */
3756 check_system_chunk(trans, flags);
3757
f6f39f7a 3758 bg = btrfs_create_chunk(trans, flags);
79bd3712
FM
3759 if (IS_ERR(bg)) {
3760 ret = PTR_ERR(bg);
3761 goto out;
3762 }
3763
79bd3712
FM
3764 ret = btrfs_chunk_alloc_add_chunk_item(trans, bg);
3765 /*
3766 * Normally we are not expected to fail with -ENOSPC here, since we have
3767 * previously reserved space in the system space_info and allocated one
ecd84d54 3768 * new system chunk if necessary. However there are three exceptions:
79bd3712
FM
3769 *
3770 * 1) We may have enough free space in the system space_info but all the
3771 * existing system block groups have a profile which cannot be used
3772 * for extent allocation.
3773 *
3774 * This happens when mounting in degraded mode. For example we have a
3775 * RAID1 filesystem with 2 devices, lose one device and mount the fs
3776 * using the other device in degraded mode. If we then allocate a chunk,
3777 * we may have enough free space in the existing system space_info, but
3778 * none of the block groups can be used for extent allocation since they
3779 * have a RAID1 profile, and because we are in degraded mode with a
3780 * single device, we are forced to allocate a new system chunk with a
3781 * SINGLE profile. Making check_system_chunk() iterate over all system
3782 * block groups and check if they have a usable profile and enough space
3783 * can be slow on very large filesystems, so we tolerate the -ENOSPC and
3784 * try again after forcing allocation of a new system chunk. Like this
3785 * we avoid paying the cost of that search in normal circumstances, when
3786 * we were not mounted in degraded mode;
3787 *
3788 * 2) We had enough free space in the system space_info, and one suitable
3789 * block group to allocate from when we called check_system_chunk()
3790 * above. However right after we called it, the only system block group
3791 * with enough free space got turned into RO mode by a running scrub,
3792 * and in this case we have to allocate a new one and retry. We only
3793 * need do this allocate and retry once, since we have a transaction
ecd84d54
FM
3794 * handle and scrub uses the commit root to search for block groups;
3795 *
3796 * 3) We had one system block group with enough free space when we called
3797 * check_system_chunk(), but after that, right before we tried to
3798 * allocate the last extent buffer we needed, a discard operation came
3799 * in and it temporarily removed the last free space entry from the
3800 * block group (discard removes a free space entry, discards it, and
3801 * then adds back the entry to the block group cache).
79bd3712
FM
3802 */
3803 if (ret == -ENOSPC) {
3804 const u64 sys_flags = btrfs_system_alloc_profile(trans->fs_info);
3805 struct btrfs_block_group *sys_bg;
3806
f6f39f7a 3807 sys_bg = btrfs_create_chunk(trans, sys_flags);
79bd3712
FM
3808 if (IS_ERR(sys_bg)) {
3809 ret = PTR_ERR(sys_bg);
3810 btrfs_abort_transaction(trans, ret);
3811 goto out;
3812 }
3813
3814 ret = btrfs_chunk_alloc_add_chunk_item(trans, sys_bg);
3815 if (ret) {
3816 btrfs_abort_transaction(trans, ret);
3817 goto out;
3818 }
3819
3820 ret = btrfs_chunk_alloc_add_chunk_item(trans, bg);
3821 if (ret) {
3822 btrfs_abort_transaction(trans, ret);
3823 goto out;
3824 }
3825 } else if (ret) {
3826 btrfs_abort_transaction(trans, ret);
3827 goto out;
3828 }
3829out:
3830 btrfs_trans_release_chunk_metadata(trans);
3831
820c363b
NA
3832 if (ret)
3833 return ERR_PTR(ret);
3834
3835 btrfs_get_block_group(bg);
3836 return bg;
79bd3712
FM
3837}
3838
07730d87 3839/*
79bd3712
FM
3840 * Chunk allocation is done in 2 phases:
3841 *
3842 * 1) Phase 1 - through btrfs_chunk_alloc() we allocate device extents for
3843 * the chunk, the chunk mapping, create its block group and add the items
3844 * that belong in the chunk btree to it - more specifically, we need to
3845 * update device items in the chunk btree and add a new chunk item to it.
3846 *
3847 * 2) Phase 2 - through btrfs_create_pending_block_groups(), we add the block
3848 * group item to the extent btree and the device extent items to the devices
3849 * btree.
3850 *
3851 * This is done to prevent deadlocks. For example when COWing a node from the
3852 * extent btree we are holding a write lock on the node's parent and if we
3853 * trigger chunk allocation and attempt to insert the new block group item
3854 * in the extent btree right away, we could deadlock because the path for the
3855 * insertion can include that parent node. At first glance it seems impossible
3856 * to trigger chunk allocation after starting a transaction since tasks should
3857 * reserve enough transaction units (metadata space); however, while that is true
3858 * most of the time, chunk allocation may still be triggered for several reasons:
3859 *
3860 * 1) When reserving metadata, we check if there is enough free space in the
3861 * metadata space_info and therefore don't trigger allocation of a new chunk.
3862 * However later when the task actually tries to COW an extent buffer from
3863 * the extent btree or from the device btree for example, it is forced to
3864 * allocate a new block group (chunk) because the only one that had enough
3865 * free space was just turned to RO mode by a running scrub for example (or
3866 * device replace, block group reclaim thread, etc), so we can not use it
3867 * for allocating an extent and end up being forced to allocate a new one;
3868 *
3869 * 2) Because we only check that the metadata space_info has enough free bytes,
3870 * we end up not allocating a new metadata chunk in that case. However if
3871 * the filesystem was mounted in degraded mode, none of the existing block
3872 * groups might be suitable for extent allocation due to their incompatible
3873 * profile (e.g. mounting a 2-device filesystem, where all block groups
3874 * use a RAID1 profile, in degraded mode using a single device). In this case
3875 * when the task attempts to COW some extent buffer of the extent btree for
3876 * example, it will trigger allocation of a new metadata block group with a
3877 * suitable profile (SINGLE profile in the example of the degraded mount of
3878 * the RAID1 filesystem);
3879 *
3880 * 3) The task has reserved enough transaction units / metadata space, but when
3881 * it attempts to COW an extent buffer from the extent or device btree for
3882 * example, it does not find any free extent in any metadata block group,
3883 * therefore forced to try to allocate a new metadata block group.
3884 * This is because some other task allocated all available extents in the
3885 * meanwhile - this typically happens with tasks that don't reserve space
3886 * properly, either intentionally or as a bug. One example where this is
3887 * done intentionally is fsync, as it does not reserve any transaction units
3888 * and ends up allocating a variable number of metadata extents for log
ecd84d54
FM
3889 * tree extent buffers;
3890 *
3891 * 4) The task has reserved enough transaction units / metadata space, but right
3892 * before it tries to allocate the last extent buffer it needs, a discard
3893 * operation comes in and, temporarily, removes the last free space entry from
3894 * the only metadata block group that had free space (discard starts by
3895 * removing a free space entry from a block group, then does the discard
3896 * operation and, once it's done, it adds back the free space entry to the
3897 * block group).
79bd3712
FM
3898 *
3899 * We also need this 2 phases setup when adding a device to a filesystem with
3900 * a seed device - we must create new metadata and system chunks without adding
3901 * any of the block group items to the chunk, extent and device btrees. If we
3902 * did not do it this way, we would get ENOSPC when attempting to update those
3903 * btrees, since all the chunks from the seed device are read-only.
3904 *
3905 * Phase 1 does the updates and insertions to the chunk btree because if we had
3906 * it done in phase 2 and have a thundering herd of tasks allocating chunks in
3907 * parallel, we risk having too many system chunks allocated by many tasks if
3908 * many tasks reach phase 1 without the previous ones completing phase 2. In the
3909 * extreme case this leads to exhaustion of the system chunk array in the
3910 * superblock. This is easier to trigger if using a btree node/leaf size of 64K
3911 * and with RAID filesystems (so we have more device items in the chunk btree).
3912 * This has happened before and commit eafa4fd0ad0607 ("btrfs: fix exhaustion of
3913 * the system chunk array due to concurrent allocations") provides more details.
3914 *
2bb2e00e
FM
3915 * Allocation of system chunks does not happen through this function. A task that
3916 * needs to update the chunk btree (the only btree that uses system chunks), must
3917 * preallocate chunk space by calling either check_system_chunk() or
3918 * btrfs_reserve_chunk_metadata() - the former is used when allocating a data or
3919 * metadata chunk or when removing a chunk, while the latter is used before doing
3920 * a modification to the chunk btree - use cases for the latter are adding,
3921 * removing and resizing a device as well as relocation of a system chunk.
3922 * See the comment below for more details.
79bd3712
FM
3923 *
3924 * The reservation of system space, done through check_system_chunk(), as well
3925 * as all the updates and insertions into the chunk btree must be done while
3926 * holding fs_info->chunk_mutex. This is important to guarantee that while COWing
3927 * an extent buffer from the chunks btree we never trigger allocation of a new
3928 * system chunk, which would result in a deadlock (trying to lock twice an
3929 * extent buffer of the chunk btree, first time before triggering the chunk
3930 * allocation and the second time during chunk allocation while attempting to
3931 * update the chunks btree). The system chunk array is also updated while holding
3932 * that mutex. The same logic applies to removing chunks - we must reserve system
3933 * space, update the chunk btree and the system chunk array in the superblock
3934 * while holding fs_info->chunk_mutex.
3935 *
3936 * This function, btrfs_chunk_alloc(), belongs to phase 1.
3937 *
3938 * If @force is CHUNK_ALLOC_FORCE:
07730d87
JB
3939 * - return 1 if it successfully allocates a chunk,
3940 * - return errors including -ENOSPC otherwise.
79bd3712 3941 * If @force is NOT CHUNK_ALLOC_FORCE:
07730d87
JB
3942 * - return 0 if it doesn't need to allocate a new chunk,
3943 * - return 1 if it successfully allocates a chunk,
3944 * - return errors including -ENOSPC otherwise.
3945 */
3946int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags,
3947 enum btrfs_chunk_alloc_enum force)
3948{
3949 struct btrfs_fs_info *fs_info = trans->fs_info;
3950 struct btrfs_space_info *space_info;
820c363b 3951 struct btrfs_block_group *ret_bg;
07730d87
JB
3952 bool wait_for_alloc = false;
3953 bool should_alloc = false;
760e69c4 3954 bool from_extent_allocation = false;
07730d87
JB
3955 int ret = 0;
3956
760e69c4
NA
3957 if (force == CHUNK_ALLOC_FORCE_FOR_EXTENT) {
3958 from_extent_allocation = true;
3959 force = CHUNK_ALLOC_FORCE;
3960 }
3961
07730d87
JB
3962 /* Don't re-enter if we're already allocating a chunk */
3963 if (trans->allocating_chunk)
3964 return -ENOSPC;
79bd3712 3965 /*
2bb2e00e
FM
3966 * Allocation of system chunks cannot happen through this path, as we
3967 * could end up in a deadlock if we are allocating a data or metadata
3968 * chunk and there is another task modifying the chunk btree.
3969 *
3970 * This is because while we are holding the chunk mutex, we will attempt
3971 * to add the new chunk item to the chunk btree or update an existing
3972 * device item in the chunk btree, while the other task that is modifying
3973 * the chunk btree is attempting to COW an extent buffer while holding a
3974 * lock on it and on its parent - if the COW operation triggers a system
3975 * chunk allocation, then we can deadlock because we are holding the
3976 * chunk mutex and we may need to access that extent buffer or its parent
3977 * in order to add the chunk item or update a device item.
3978 *
3979 * Tasks that want to modify the chunk tree should reserve system space
3980 * before updating the chunk btree, by calling either
3981 * btrfs_reserve_chunk_metadata() or check_system_chunk().
3982 * It's possible that after a task reserves the space, it still ends up
3983 * here - this happens in the cases described above at do_chunk_alloc().
3984 * The task will have to either retry or fail.
79bd3712 3985 */
2bb2e00e 3986 if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
79bd3712 3987 return -ENOSPC;
07730d87
JB
3988
3989 space_info = btrfs_find_space_info(fs_info, flags);
3990 ASSERT(space_info);
3991
3992 do {
3993 spin_lock(&space_info->lock);
3994 if (force < space_info->force_alloc)
3995 force = space_info->force_alloc;
3996 should_alloc = should_alloc_chunk(fs_info, space_info, force);
3997 if (space_info->full) {
3998 /* No more free physical space */
3999 if (should_alloc)
4000 ret = -ENOSPC;
4001 else
4002 ret = 0;
4003 spin_unlock(&space_info->lock);
4004 return ret;
4005 } else if (!should_alloc) {
4006 spin_unlock(&space_info->lock);
4007 return 0;
4008 } else if (space_info->chunk_alloc) {
4009 /*
4010 * Someone is already allocating, so we need to block
4011 * until this someone is finished and then loop to
4012 * recheck if we should continue with our allocation
4013 * attempt.
4014 */
4015 wait_for_alloc = true;
1314ca78 4016 force = CHUNK_ALLOC_NO_FORCE;
07730d87
JB
4017 spin_unlock(&space_info->lock);
4018 mutex_lock(&fs_info->chunk_mutex);
4019 mutex_unlock(&fs_info->chunk_mutex);
4020 } else {
4021 /* Proceed with allocation */
4022 space_info->chunk_alloc = 1;
4023 wait_for_alloc = false;
4024 spin_unlock(&space_info->lock);
4025 }
4026
4027 cond_resched();
4028 } while (wait_for_alloc);
4029
4030 mutex_lock(&fs_info->chunk_mutex);
4031 trans->allocating_chunk = true;
4032
4033 /*
4034 * If we have mixed data/metadata chunks we want to make sure we keep
4035 * allocating mixed chunks instead of individual chunks.
4036 */
4037 if (btrfs_mixed_space_info(space_info))
4038 flags |= (BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA);
4039
4040 /*
4041 * If we're doing a data chunk, go ahead and make sure that
4042 * we keep a reasonable number of metadata chunks allocated in the
4043 * FS as well.
4044 */
4045 if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) {
4046 fs_info->data_chunk_allocations++;
4047 if (!(fs_info->data_chunk_allocations %
4048 fs_info->metadata_ratio))
4049 force_metadata_allocation(fs_info);
4050 }
4051
820c363b 4052 ret_bg = do_chunk_alloc(trans, flags);
07730d87
JB
4053 trans->allocating_chunk = false;
4054
760e69c4 4055 if (IS_ERR(ret_bg)) {
820c363b 4056 ret = PTR_ERR(ret_bg);
5a7d107e 4057 } else if (from_extent_allocation && (flags & BTRFS_BLOCK_GROUP_DATA)) {
760e69c4
NA
4058 /*
4059 * New block group is likely to be used soon. Try to activate
4060 * it now. Failure is OK for now.
4061 */
4062 btrfs_zone_activate(ret_bg);
4063 }
4064
4065 if (!ret)
820c363b
NA
4066 btrfs_put_block_group(ret_bg);
4067
07730d87
JB
4068 spin_lock(&space_info->lock);
4069 if (ret < 0) {
4070 if (ret == -ENOSPC)
4071 space_info->full = 1;
4072 else
4073 goto out;
4074 } else {
4075 ret = 1;
4076 space_info->max_extent_size = 0;
4077 }
4078
4079 space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
4080out:
4081 space_info->chunk_alloc = 0;
4082 spin_unlock(&space_info->lock);
4083 mutex_unlock(&fs_info->chunk_mutex);
07730d87
JB
4084
4085 return ret;
4086}
4087
4088static u64 get_profile_num_devs(struct btrfs_fs_info *fs_info, u64 type)
4089{
4090 u64 num_dev;
4091
4092 num_dev = btrfs_raid_array[btrfs_bg_flags_to_raid_index(type)].devs_max;
4093 if (!num_dev)
4094 num_dev = fs_info->fs_devices->rw_devices;
4095
4096 return num_dev;
4097}
4098
2bb2e00e
FM
4099static void reserve_chunk_space(struct btrfs_trans_handle *trans,
4100 u64 bytes,
4101 u64 type)
07730d87
JB
4102{
4103 struct btrfs_fs_info *fs_info = trans->fs_info;
4104 struct btrfs_space_info *info;
4105 u64 left;
07730d87 4106 int ret = 0;
07730d87
JB
4107
4108 /*
4109 * Needed because we can end up allocating a system chunk and for an
4110 * atomic and race free space reservation in the chunk block reserve.
4111 */
4112 lockdep_assert_held(&fs_info->chunk_mutex);
4113
4114 info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
4115 spin_lock(&info->lock);
4116 left = info->total_bytes - btrfs_space_info_used(info, true);
4117 spin_unlock(&info->lock);
4118
2bb2e00e 4119 if (left < bytes && btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
07730d87 4120 btrfs_info(fs_info, "left=%llu, need=%llu, flags=%llu",
2bb2e00e 4121 left, bytes, type);
07730d87
JB
4122 btrfs_dump_space_info(fs_info, info, 0, 0);
4123 }
4124
2bb2e00e 4125 if (left < bytes) {
07730d87 4126 u64 flags = btrfs_system_alloc_profile(fs_info);
79bd3712 4127 struct btrfs_block_group *bg;
07730d87
JB
4128
4129 /*
4130 * Ignore failure to create system chunk. We might end up not
4131 * needing it, as we might not need to COW all nodes/leaves from
4132 * the paths we visit in the chunk tree (they were already COWed
4133 * or created in the current transaction for example).
4134 */
f6f39f7a 4135 bg = btrfs_create_chunk(trans, flags);
79bd3712
FM
4136 if (IS_ERR(bg)) {
4137 ret = PTR_ERR(bg);
2bb2e00e 4138 } else {
b6a98021
NA
4139 /*
4140 * We have a new chunk. We also need to activate it for
4141 * zoned filesystem.
4142 */
4143 ret = btrfs_zoned_activate_one_bg(fs_info, info, true);
4144 if (ret < 0)
4145 return;
4146
79bd3712
FM
4147 /*
4148 * If we fail to add the chunk item here, we end up
4149 * trying again at phase 2 of chunk allocation, at
4150 * btrfs_create_pending_block_groups(). So ignore
2bb2e00e
FM
4151 * any error here. An ENOSPC here could happen, due to
4152 * the cases described at do_chunk_alloc() - the system
4153 * block group we just created was just turned into RO
4154 * mode by a scrub for example, or a running discard
4155 * temporarily removed its free space entries, etc.
79bd3712
FM
4156 */
4157 btrfs_chunk_alloc_add_chunk_item(trans, bg);
4158 }
07730d87
JB
4159 }
4160
4161 if (!ret) {
9270501c 4162 ret = btrfs_block_rsv_add(fs_info,
07730d87 4163 &fs_info->chunk_block_rsv,
2bb2e00e 4164 bytes, BTRFS_RESERVE_NO_FLUSH);
1cb3db1c 4165 if (!ret)
2bb2e00e 4166 trans->chunk_bytes_reserved += bytes;
07730d87
JB
4167 }
4168}
4169
2bb2e00e
FM
4170/*
4171 * Reserve space in the system space for allocating or removing a chunk.
4172 * The caller must be holding fs_info->chunk_mutex.
4173 */
4174void check_system_chunk(struct btrfs_trans_handle *trans, u64 type)
4175{
4176 struct btrfs_fs_info *fs_info = trans->fs_info;
4177 const u64 num_devs = get_profile_num_devs(fs_info, type);
4178 u64 bytes;
4179
4180 /* num_devs device items to update and 1 chunk item to add or remove. */
4181 bytes = btrfs_calc_metadata_size(fs_info, num_devs) +
4182 btrfs_calc_insert_metadata_size(fs_info, 1);
4183
4184 reserve_chunk_space(trans, bytes, type);
4185}
4186
4187/*
4188 * Reserve space in the system space, if needed, for doing a modification to the
4189 * chunk btree.
4190 *
4191 * @trans: A transaction handle.
4192 * @is_item_insertion: Indicate if the modification is for inserting a new item
4193 * in the chunk btree or if it's for the deletion or update
4194 * of an existing item.
4195 *
4196 * This is used in a context where we need to update the chunk btree outside
4197 * block group allocation and removal, to avoid a deadlock with a concurrent
4198 * task that is allocating a metadata or data block group and therefore needs to
4199 * update the chunk btree while holding the chunk mutex. After the update to the
4200 * chunk btree is done, btrfs_trans_release_chunk_metadata() should be called.
4201 *
4202 */
4203void btrfs_reserve_chunk_metadata(struct btrfs_trans_handle *trans,
4204 bool is_item_insertion)
4205{
4206 struct btrfs_fs_info *fs_info = trans->fs_info;
4207 u64 bytes;
4208
4209 if (is_item_insertion)
4210 bytes = btrfs_calc_insert_metadata_size(fs_info, 1);
4211 else
4212 bytes = btrfs_calc_metadata_size(fs_info, 1);
4213
4214 mutex_lock(&fs_info->chunk_mutex);
4215 reserve_chunk_space(trans, bytes, BTRFS_BLOCK_GROUP_SYSTEM);
4216 mutex_unlock(&fs_info->chunk_mutex);
4217}
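
/*
 * Editor's sketch (assumption; the function name is invented): the call
 * order described in the comment above for a chunk btree modification made
 * outside of block group allocation/removal.
 */
static void example_modify_chunk_btree(struct btrfs_trans_handle *trans)
{
	/* Reserve system space for one chunk btree item insertion. */
	btrfs_reserve_chunk_metadata(trans, true);

	/* ... insert the new item into the chunk btree here ... */

	/* Drop whatever is left of the reservation once we are done. */
	btrfs_trans_release_chunk_metadata(trans);
}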
4218
3e43c279
JB
4219void btrfs_put_block_group_cache(struct btrfs_fs_info *info)
4220{
32da5386 4221 struct btrfs_block_group *block_group;
3e43c279 4222
50c31eaa
JB
4223 block_group = btrfs_lookup_first_block_group(info, 0);
4224 while (block_group) {
4225 btrfs_wait_block_group_cache_done(block_group);
4226 spin_lock(&block_group->lock);
4227 if (test_and_clear_bit(BLOCK_GROUP_FLAG_IREF,
4228 &block_group->runtime_flags)) {
4229 struct inode *inode = block_group->inode;
4230
4231 block_group->inode = NULL;
3e43c279 4232 spin_unlock(&block_group->lock);
3e43c279 4233
50c31eaa
JB
4234 ASSERT(block_group->io_ctl.inode == NULL);
4235 iput(inode);
4236 } else {
4237 spin_unlock(&block_group->lock);
4238 }
4239 block_group = btrfs_next_block_group(block_group);
3e43c279
JB
4240 }
4241}
4242
4243/*
4244 * Must be called only after stopping all workers, since we could have block
4245 * group caching kthreads running, and therefore they could race with us if we
4246 * freed the block groups before stopping them.
4247 */
4248int btrfs_free_block_groups(struct btrfs_fs_info *info)
4249{
32da5386 4250 struct btrfs_block_group *block_group;
3e43c279
JB
4251 struct btrfs_space_info *space_info;
4252 struct btrfs_caching_control *caching_ctl;
4253 struct rb_node *n;
4254
13bb483d
NA
4255 if (btrfs_is_zoned(info)) {
4256 if (info->active_meta_bg) {
4257 btrfs_put_block_group(info->active_meta_bg);
4258 info->active_meta_bg = NULL;
4259 }
4260 if (info->active_system_bg) {
4261 btrfs_put_block_group(info->active_system_bg);
4262 info->active_system_bg = NULL;
4263 }
4264 }
4265
16b0c258 4266 write_lock(&info->block_group_cache_lock);
3e43c279
JB
4267 while (!list_empty(&info->caching_block_groups)) {
4268 caching_ctl = list_entry(info->caching_block_groups.next,
4269 struct btrfs_caching_control, list);
4270 list_del(&caching_ctl->list);
4271 btrfs_put_caching_control(caching_ctl);
4272 }
16b0c258 4273 write_unlock(&info->block_group_cache_lock);
3e43c279
JB
4274
4275 spin_lock(&info->unused_bgs_lock);
4276 while (!list_empty(&info->unused_bgs)) {
4277 block_group = list_first_entry(&info->unused_bgs,
32da5386 4278 struct btrfs_block_group,
3e43c279
JB
4279 bg_list);
4280 list_del_init(&block_group->bg_list);
4281 btrfs_put_block_group(block_group);
4282 }
3e43c279 4283
18bb8bbf
JT
4284 while (!list_empty(&info->reclaim_bgs)) {
4285 block_group = list_first_entry(&info->reclaim_bgs,
4286 struct btrfs_block_group,
4287 bg_list);
4288 list_del_init(&block_group->bg_list);
4289 btrfs_put_block_group(block_group);
4290 }
4291 spin_unlock(&info->unused_bgs_lock);
4292
afba2bc0
NA
4293 spin_lock(&info->zone_active_bgs_lock);
4294 while (!list_empty(&info->zone_active_bgs)) {
4295 block_group = list_first_entry(&info->zone_active_bgs,
4296 struct btrfs_block_group,
4297 active_bg_list);
4298 list_del_init(&block_group->active_bg_list);
4299 btrfs_put_block_group(block_group);
4300 }
4301 spin_unlock(&info->zone_active_bgs_lock);
4302
16b0c258 4303 write_lock(&info->block_group_cache_lock);
08dddb29 4304 while ((n = rb_last(&info->block_group_cache_tree.rb_root)) != NULL) {
32da5386 4305 block_group = rb_entry(n, struct btrfs_block_group,
3e43c279 4306 cache_node);
08dddb29
FM
4307 rb_erase_cached(&block_group->cache_node,
4308 &info->block_group_cache_tree);
3e43c279 4309 RB_CLEAR_NODE(&block_group->cache_node);
16b0c258 4310 write_unlock(&info->block_group_cache_lock);
3e43c279
JB
4311
4312 down_write(&block_group->space_info->groups_sem);
4313 list_del(&block_group->list);
4314 up_write(&block_group->space_info->groups_sem);
4315
4316 /*
4317 * We haven't cached this block group, which means we could
4318 * possibly have excluded extents on this block group.
4319 */
4320 if (block_group->cached == BTRFS_CACHE_NO ||
4321 block_group->cached == BTRFS_CACHE_ERROR)
4322 btrfs_free_excluded_extents(block_group);
4323
4324 btrfs_remove_free_space_cache(block_group);
4325 ASSERT(block_group->cached != BTRFS_CACHE_STARTED);
4326 ASSERT(list_empty(&block_group->dirty_list));
4327 ASSERT(list_empty(&block_group->io_list));
4328 ASSERT(list_empty(&block_group->bg_list));
48aaeebe 4329 ASSERT(refcount_read(&block_group->refs) == 1);
195a49ea 4330 ASSERT(block_group->swap_extents == 0);
3e43c279
JB
4331 btrfs_put_block_group(block_group);
4332
16b0c258 4333 write_lock(&info->block_group_cache_lock);
3e43c279 4334 }
16b0c258 4335 write_unlock(&info->block_group_cache_lock);
3e43c279 4336
3e43c279
JB
4337 btrfs_release_global_block_rsv(info);
4338
4339 while (!list_empty(&info->space_info)) {
4340 space_info = list_entry(info->space_info.next,
4341 struct btrfs_space_info,
4342 list);
4343
4344 /*
4345 * Do not hide this behind enospc_debug, this is actually
4346 * important and indicates a real bug if this happens.
4347 */
4348 if (WARN_ON(space_info->bytes_pinned > 0 ||
3e43c279
JB
4349 space_info->bytes_may_use > 0))
4350 btrfs_dump_space_info(info, space_info, 0, 0);
40cdc509
FM
4351
4352 /*
4353 * If there was a failure to cleanup a log tree, very likely due
4354 * to an IO failure on a writeback attempt of one or more of its
4355 * extent buffers, we could not do proper (and cheap) unaccounting
4356 * of their reserved space, so don't warn on bytes_reserved > 0 in
4357 * that case.
4358 */
4359 if (!(space_info->flags & BTRFS_BLOCK_GROUP_METADATA) ||
4360 !BTRFS_FS_LOG_CLEANUP_ERROR(info)) {
4361 if (WARN_ON(space_info->bytes_reserved > 0))
4362 btrfs_dump_space_info(info, space_info, 0, 0);
4363 }
4364
d611add4 4365 WARN_ON(space_info->reclaim_size > 0);
3e43c279
JB
4366 list_del(&space_info->list);
4367 btrfs_sysfs_remove_space_info(space_info);
4368 }
4369 return 0;
4370}
684b752b
FM
4371
4372void btrfs_freeze_block_group(struct btrfs_block_group *cache)
4373{
4374 atomic_inc(&cache->frozen);
4375}
4376
4377void btrfs_unfreeze_block_group(struct btrfs_block_group *block_group)
4378{
4379 struct btrfs_fs_info *fs_info = block_group->fs_info;
684b752b
FM
4380 bool cleanup;
4381
4382 spin_lock(&block_group->lock);
4383 cleanup = (atomic_dec_and_test(&block_group->frozen) &&
3349b57f 4384 test_bit(BLOCK_GROUP_FLAG_REMOVED, &block_group->runtime_flags));
684b752b
FM
4385 spin_unlock(&block_group->lock);
4386
4387 if (cleanup) {
7dc66abb
FM
4388 struct btrfs_chunk_map *map;
4389
4390 map = btrfs_find_chunk_map(fs_info, block_group->start, 1);
4391 /* Logic error, can't happen. */
4392 ASSERT(map);
4393
4394 btrfs_remove_chunk_map(fs_info, map);
4395
4396 /* Once for our lookup reference. */
4397 btrfs_free_chunk_map(map);
684b752b
FM
4398
4399 /*
4400 * We may have left one free space entry and other possible
4401 * tasks trimming this block group have left 1 entry each one.
4402 * Free them if any.
4403 */
fc80f7ac 4404 btrfs_remove_free_space_cache(block_group);
684b752b
FM
4405 }
4406}
195a49ea
FM
4407
4408bool btrfs_inc_block_group_swap_extents(struct btrfs_block_group *bg)
4409{
4410 bool ret = true;
4411
4412 spin_lock(&bg->lock);
4413 if (bg->ro)
4414 ret = false;
4415 else
4416 bg->swap_extents++;
4417 spin_unlock(&bg->lock);
4418
4419 return ret;
4420}
4421
4422void btrfs_dec_block_group_swap_extents(struct btrfs_block_group *bg, int amount)
4423{
4424 spin_lock(&bg->lock);
4425 ASSERT(!bg->ro);
4426 ASSERT(bg->swap_extents >= amount);
4427 bg->swap_extents -= amount;
4428 spin_unlock(&bg->lock);
4429}
52bb7a21
BB
4430
4431enum btrfs_block_group_size_class btrfs_calc_block_group_size_class(u64 size)
4432{
4433 if (size <= SZ_128K)
4434 return BTRFS_BG_SZ_SMALL;
4435 if (size <= SZ_8M)
4436 return BTRFS_BG_SZ_MEDIUM;
4437 return BTRFS_BG_SZ_LARGE;
4438}
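
/*
 * Editor's examples, derived from the thresholds above:
 *   btrfs_calc_block_group_size_class(SZ_64K) == BTRFS_BG_SZ_SMALL
 *   btrfs_calc_block_group_size_class(SZ_1M)  == BTRFS_BG_SZ_MEDIUM
 *   btrfs_calc_block_group_size_class(SZ_16M) == BTRFS_BG_SZ_LARGE
 */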
4439
4440/*
4441 * Handle a block group allocating an extent in a size class
4442 *
4443 * @bg: The block group we allocated in.
4444 * @size_class: The size class of the allocation.
4445 * @force_wrong_size_class: Whether we are desperate enough to allow
4446 * mismatched size classes.
4447 *
4448 * Returns: 0 if the size class was valid for this block_group, -EAGAIN in the
4449 * case of a race that leads to the wrong size class without
4450 * force_wrong_size_class set.
4451 *
4452 * find_free_extent will skip block groups with a mismatched size class until
4453 * it really needs to avoid ENOSPC. In that case it will set
4454 * force_wrong_size_class. However, if a block group is newly allocated and
4455 * doesn't yet have a size class, then it is possible for two allocations of
4456 * different sizes to race and both try to use it. The loser is caught here and
4457 * has to retry.
4458 */
4459int btrfs_use_block_group_size_class(struct btrfs_block_group *bg,
4460 enum btrfs_block_group_size_class size_class,
4461 bool force_wrong_size_class)
4462{
4463 ASSERT(size_class != BTRFS_BG_SZ_NONE);
4464
4465 /* The new allocation is in the right size class, do nothing */
4466 if (bg->size_class == size_class)
4467 return 0;
4468 /*
4469 * The new allocation is in a mismatched size class.
4470 * This means one of two things:
4471 *
4472 * 1. Two tasks in find_free_extent for different size_classes raced
4473 * and hit the same empty block_group. Make the loser try again.
4474 * 2. A call to find_free_extent got desperate enough to set
4475 * 'force_wrong_size_class'. Don't change the size_class, but allow the
4476 * allocation.
4477 */
4478 if (bg->size_class != BTRFS_BG_SZ_NONE) {
4479 if (force_wrong_size_class)
4480 return 0;
4481 return -EAGAIN;
4482 }
4483 /*
4484 * The happy new block group case: the new allocation is the first
4485 * one in the block_group so we set size_class.
4486 */
4487 bg->size_class = size_class;
4488
4489 return 0;
4490}
cb0922f2
BB
4491
4492bool btrfs_block_group_should_use_size_class(struct btrfs_block_group *bg)
4493{
4494 if (btrfs_is_zoned(bg->fs_info))
4495 return false;
4496 if (!btrfs_is_block_group_data_only(bg))
4497 return false;
4498 return true;
4499}