// SPDX-License-Identifier: GPL-2.0

#include "misc.h"
#include "ctree.h"
#include "space-info.h"
#include "sysfs.h"
#include "volumes.h"
#include "free-space-cache.h"
#include "ordered-data.h"
#include "transaction.h"
#include "block-group.h"
#include "zoned.h"
#include "fs.h"
#include "accessors.h"
#include "extent-tree.h"
/*
 * HOW DOES SPACE RESERVATION WORK
 *
 * If you want to know about delalloc specifically, there is a separate comment
 * for that with the delalloc code.  This comment is about how the whole system
 * works generally.
 *
 * BASIC CONCEPTS
 *
 *   1) space_info.  This is the ultimate arbiter of how much space we can use.
 *   There's a description of the bytes_ fields with the struct declaration,
 *   refer to that for specifics on each field.  Suffice it to say that for
 *   reservations we care about total_bytes - SUM(space_info->bytes_) when
 *   determining if there is space to make an allocation.  There is a space_info
 *   for METADATA, SYSTEM, and DATA areas.
 *
 *   2) block_rsv's.  These are basically buckets for every different type of
 *   metadata reservation we have.  You can see the comment in the block_rsv
 *   code on the rules for each type, but generally block_rsv->reserved is how
 *   much space is accounted for in space_info->bytes_may_use.
 *
 *   3) btrfs_calc*_size.  These are the worst case calculations we use based
 *   on the number of items we will want to modify.  We have one for changing
 *   items, and one for inserting new items.  Generally we use these helpers to
 *   determine the size of the block reserves, and then use the actual bytes
 *   values to adjust the space_info counters.
 *
 * MAKING RESERVATIONS, THE NORMAL CASE
 *
 *   We call into either btrfs_reserve_data_bytes() or
 *   btrfs_reserve_metadata_bytes(), depending on which we're looking for, with
 *   num_bytes we want to reserve.
 *
 *   ->reserve
 *     space_info->bytes_may_reserve += num_bytes
 *
 *   ->extent allocation
 *     Call btrfs_add_reserved_bytes() which does
 *     space_info->bytes_may_reserve -= num_bytes
 *     space_info->bytes_reserved += extent_bytes
 *
 *   ->insert reference
 *     Call btrfs_update_block_group() which does
 *     space_info->bytes_reserved -= extent_bytes
 *     space_info->bytes_used += extent_bytes
 *
 * MAKING RESERVATIONS, FLUSHING NORMALLY (non-priority)
 *
 *   Assume we are unable to simply make the reservation because we do not have
 *   enough space
 *
 *   -> __reserve_bytes
 *     create a reserve_ticket with ->bytes set to our reservation, add it to
 *     the tail of space_info->tickets, kick async flush thread
 *
 *   ->handle_reserve_ticket
 *     wait on ticket->wait for ->bytes to be reduced to 0, or ->error to be set
 *     on the ticket.
 *
 *   -> btrfs_async_reclaim_metadata_space/btrfs_async_reclaim_data_space
 *     Flushes various things attempting to free up space.
 *
 *   -> btrfs_try_granting_tickets()
 *     This is called by anything that either subtracts space from
 *     space_info->bytes_may_use, ->bytes_pinned, etc, or adds to the
 *     space_info->total_bytes.  This loops through the ->priority_tickets and
 *     then the ->tickets list checking to see if the reservation can be
 *     completed.  If it can the space is added to space_info->bytes_may_use and
 *     the ticket is woken up.
 *
 *   -> ticket wakeup
 *     Check if ->bytes == 0, if it does we got our reservation and we can carry
 *     on, if not return the appropriate error (ENOSPC, but can be EINTR if we
 *     got a signal).
 *
 * MAKING RESERVATIONS, FLUSHING HIGH PRIORITY
 *
 *   Same as the above, except we add ourselves to the
 *   space_info->priority_tickets, and we do not use ticket->wait, we simply
 *   call flush_space() ourselves for the states that are safe for us to call
 *   without deadlocking and hope for the best.
 *
 * THE FLUSHING STATES
 *
 *   Generally speaking we will have two cases for each state, a "nice" state
 *   and an "ALL THE THINGS" state.  In btrfs we delay a lot of work in order to
 *   reduce the locking overhead on the various trees, and even to keep from
 *   doing any work at all in the case of delayed refs.  Each of these delayed
 *   things however hold reservations, and so letting them run allows us to
 *   reclaim space so we can make new reservations.
 *
 *   FLUSH_DELAYED_ITEMS
 *     Every inode has a delayed item to update the inode.  Take a simple write
 *     for example, we would update the inode item at write time to update the
 *     mtime, and then again at finish_ordered_io() time in order to update the
 *     isize or bytes.  We keep these delayed items to coalesce these operations
 *     into a single operation done on demand.  These are an easy way to reclaim
 *     metadata space.
 *
 *   FLUSH_DELALLOC
 *     Look at the delalloc comment to get an idea of how much space is reserved
 *     for delayed allocation.  We can reclaim some of this space simply by
 *     running delalloc, but usually we need to wait for ordered extents to
 *     reclaim the bulk of this space.
 *
 *   FLUSH_DELAYED_REFS
 *     We have a block reserve for the outstanding delayed refs space, and every
 *     delayed ref operation holds a reservation.  Running these is a quick way
 *     to reclaim space, but we want to hold this until the end because COW can
 *     churn a lot and we can avoid making some extent tree modifications if we
 *     are able to delay for as long as possible.
 *
 *   ALLOC_CHUNK
 *     We will skip this the first time through space reservation, because of
 *     overcommit and we don't want to have a lot of useless metadata space when
 *     our worst case reservations will likely never come true.
 *
 *   RUN_DELAYED_IPUTS
 *     If we're freeing inodes we're likely freeing checksums, file extent
 *     items, and extent tree items.  Loads of space could be freed up by these
 *     operations, however they won't be usable until the transaction commits.
 *
 *   COMMIT_TRANS
 *     This will commit the transaction.  Historically we had a lot of logic
 *     surrounding whether or not we'd commit the transaction, but this was born
 *     out of a pre-tickets era where we could end up committing the transaction
 *     thousands of times in a row without making progress.  Now thanks to our
 *     ticketing system we know if we're not making progress and can error
 *     everybody out after a few commits rather than burning the disk hoping for
 *     a different answer.
 *
 * OVERCOMMIT
 *
 *   Because we hold so many reservations for metadata we will allow you to
 *   reserve more space than is currently free in the currently allocated
 *   metadata space.  This only happens with metadata, data does not allow
 *   overcommitting.
 *
 *   You can see the current logic for when we allow overcommit in
 *   btrfs_can_overcommit(), but it only applies to unallocated space.  If there
 *   is no unallocated space to be had, all reservations are kept within the
 *   free space in the allocated metadata chunks.
 *
 *   Because of overcommitting, you generally want to use the
 *   btrfs_can_overcommit() logic for metadata allocations, as it does the right
 *   thing with or without extra unallocated space.
 */
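/*
 * Example (illustrative sketch, not called anywhere): the normal reservation
 * round trip described above, condensed into code.  The helper name and its
 * use of a caller-supplied block_rsv are hypothetical; real callers reserve
 * through the block-rsv helpers.
 */
static inline int example_reserve_one_item(struct btrfs_fs_info *fs_info,
					   struct btrfs_block_rsv *rsv)
{
	/* Worst case bytes for inserting a single item (btrfs_calc*_size). */
	u64 num_bytes = btrfs_calc_insert_metadata_size(fs_info, 1);

	/*
	 * On success this adds num_bytes to space_info->bytes_may_use; the
	 * later extent allocation and reference insertion migrate it to
	 * bytes_reserved and then bytes_used, as described above.
	 */
	return btrfs_reserve_metadata_bytes(fs_info, rsv, num_bytes,
					    BTRFS_RESERVE_FLUSH_ALL);
}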
u64 __pure btrfs_space_info_used(struct btrfs_space_info *s_info,
			  bool may_use_included)
{
	ASSERT(s_info);
	return s_info->bytes_used + s_info->bytes_reserved +
		s_info->bytes_pinned + s_info->bytes_readonly +
		s_info->bytes_zone_unusable +
		(may_use_included ? s_info->bytes_may_use : 0);
}
/*
 * After adding space to the filesystem, we need to clear the full flags
 * on all the space infos.
 */
void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
{
	struct list_head *head = &info->space_info;
	struct btrfs_space_info *found;

	list_for_each_entry(found, head, list)
		found->full = 0;
}
/*
 * Block groups with more than this value (percents) of unusable space will be
 * scheduled for background reclaim.
 */
#define BTRFS_DEFAULT_ZONED_RECLAIM_THRESH			(75)
/*
 * Calculate the chunk size depending on the volume type (regular or zoned).
 */
static u64 calc_chunk_size(const struct btrfs_fs_info *fs_info, u64 flags)
{
	if (btrfs_is_zoned(fs_info))
		return fs_info->zone_size;

	ASSERT(flags & BTRFS_BLOCK_GROUP_TYPE_MASK);

	if (flags & BTRFS_BLOCK_GROUP_DATA)
		return BTRFS_MAX_DATA_CHUNK_SIZE;
	else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
		return SZ_32M;

	/* Handle BTRFS_BLOCK_GROUP_METADATA */
	if (fs_info->fs_devices->total_rw_bytes > 50ULL * SZ_1G)
		return SZ_1G;

	return SZ_256M;
}
/*
 * Update the default chunk size.
 */
void btrfs_update_space_info_chunk_size(struct btrfs_space_info *space_info,
					u64 chunk_size)
{
	WRITE_ONCE(space_info->chunk_size, chunk_size);
}
static int create_space_info(struct btrfs_fs_info *info, u64 flags)
{
	struct btrfs_space_info *space_info;
	int i;
	int ret;

	space_info = kzalloc(sizeof(*space_info), GFP_NOFS);
	if (!space_info)
		return -ENOMEM;

	for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
		INIT_LIST_HEAD(&space_info->block_groups[i]);
	init_rwsem(&space_info->groups_sem);
	spin_lock_init(&space_info->lock);
	space_info->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK;
	space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
	INIT_LIST_HEAD(&space_info->ro_bgs);
	INIT_LIST_HEAD(&space_info->tickets);
	INIT_LIST_HEAD(&space_info->priority_tickets);
	space_info->clamp = 1;
	btrfs_update_space_info_chunk_size(space_info, calc_chunk_size(info, flags));

	if (btrfs_is_zoned(info))
		space_info->bg_reclaim_threshold = BTRFS_DEFAULT_ZONED_RECLAIM_THRESH;

	ret = btrfs_sysfs_add_space_info_type(info, space_info);
	if (ret)
		return ret;

	list_add(&space_info->list, &info->space_info);
	if (flags & BTRFS_BLOCK_GROUP_DATA)
		info->data_sinfo = space_info;

	return ret;
}
int btrfs_init_space_info(struct btrfs_fs_info *fs_info)
{
	struct btrfs_super_block *disk_super;
	u64 features;
	u64 flags;
	int mixed = 0;
	int ret;

	disk_super = fs_info->super_copy;
	if (!btrfs_super_root(disk_super))
		return -EINVAL;

	features = btrfs_super_incompat_flags(disk_super);
	if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
		mixed = 1;

	flags = BTRFS_BLOCK_GROUP_SYSTEM;
	ret = create_space_info(fs_info, flags);
	if (ret)
		goto out;

	if (mixed) {
		flags = BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA;
		ret = create_space_info(fs_info, flags);
	} else {
		flags = BTRFS_BLOCK_GROUP_METADATA;
		ret = create_space_info(fs_info, flags);
		if (ret)
			goto out;

		flags = BTRFS_BLOCK_GROUP_DATA;
		ret = create_space_info(fs_info, flags);
	}
out:
	return ret;
}
void btrfs_add_bg_to_space_info(struct btrfs_fs_info *info,
				struct btrfs_block_group *block_group)
{
	struct btrfs_space_info *found;
	int factor, index;

	factor = btrfs_bg_type_to_factor(block_group->flags);

	found = btrfs_find_space_info(info, block_group->flags);
	ASSERT(found);
	spin_lock(&found->lock);
	found->total_bytes += block_group->length;
	found->disk_total += block_group->length * factor;
	found->bytes_used += block_group->used;
	found->disk_used += block_group->used * factor;
	found->bytes_readonly += block_group->bytes_super;
	found->bytes_zone_unusable += block_group->zone_unusable;
	if (block_group->length > 0)
		found->full = 0;
	btrfs_try_granting_tickets(info, found);
	spin_unlock(&found->lock);

	block_group->space_info = found;

	index = btrfs_bg_flags_to_raid_index(block_group->flags);
	down_write(&found->groups_sem);
	list_add_tail(&block_group->list, &found->block_groups[index]);
	up_write(&found->groups_sem);
}
struct btrfs_space_info *btrfs_find_space_info(struct btrfs_fs_info *info,
					       u64 flags)
{
	struct list_head *head = &info->space_info;
	struct btrfs_space_info *found;

	flags &= BTRFS_BLOCK_GROUP_TYPE_MASK;

	list_for_each_entry(found, head, list) {
		if (found->flags & flags)
			return found;
	}
	return NULL;
}
static u64 calc_available_free_space(struct btrfs_fs_info *fs_info,
				     struct btrfs_space_info *space_info,
				     enum btrfs_reserve_flush_enum flush)
{
	u64 profile;
	u64 avail;
	int factor;

	if (space_info->flags & BTRFS_BLOCK_GROUP_SYSTEM)
		profile = btrfs_system_alloc_profile(fs_info);
	else
		profile = btrfs_metadata_alloc_profile(fs_info);

	avail = atomic64_read(&fs_info->free_chunk_space);

	/*
	 * If we have dup, raid1 or raid10 then only half of the free
	 * space is actually usable.  For raid56, the space info used
	 * doesn't include the parity drive, so we don't have to
	 * change the math.
	 */
	factor = btrfs_bg_type_to_factor(profile);
	avail = div_u64(avail, factor);

	/*
	 * If we aren't flushing all things, let us overcommit up to
	 * 1/2 of the space.  If we can flush, don't let us overcommit
	 * too much, let it overcommit up to 1/8 of the space.
	 */
	if (flush == BTRFS_RESERVE_FLUSH_ALL)
		avail >>= 3;
	else
		avail >>= 1;
	return avail;
}
int btrfs_can_overcommit(struct btrfs_fs_info *fs_info,
			 struct btrfs_space_info *space_info, u64 bytes,
			 enum btrfs_reserve_flush_enum flush)
{
	u64 avail;
	u64 used;

	/* Don't overcommit when in mixed mode */
	if (space_info->flags & BTRFS_BLOCK_GROUP_DATA)
		return 0;

	used = btrfs_space_info_used(space_info, true);
	if (test_bit(BTRFS_FS_ACTIVE_ZONE_TRACKING, &fs_info->flags) &&
	    (space_info->flags & BTRFS_BLOCK_GROUP_METADATA))
		avail = 0;
	else
		avail = calc_available_free_space(fs_info, space_info, flush);

	if (used + bytes < space_info->total_bytes + avail)
		return 1;
	return 0;
}
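/*
 * Example (illustrative sketch, not called anywhere): how reservation code
 * combines the plain free space check with btrfs_can_overcommit(), mirroring
 * the check done in __reserve_bytes() below.  The helper name is hypothetical
 * and the caller is assumed to hold space_info->lock.
 */
static inline bool example_can_reserve(struct btrfs_fs_info *fs_info,
				       struct btrfs_space_info *space_info,
				       u64 bytes,
				       enum btrfs_reserve_flush_enum flush)
{
	u64 used = btrfs_space_info_used(space_info, true);

	return (used + bytes <= space_info->total_bytes) ||
	       btrfs_can_overcommit(fs_info, space_info, bytes, flush);
}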
static void remove_ticket(struct btrfs_space_info *space_info,
			  struct reserve_ticket *ticket)
{
	if (!list_empty(&ticket->list)) {
		list_del_init(&ticket->list);
		ASSERT(space_info->reclaim_size >= ticket->bytes);
		space_info->reclaim_size -= ticket->bytes;
	}
}
/*
 * This is for space we already have accounted in space_info->bytes_may_use, so
 * basically when we're returning space from block_rsv's.
 */
void btrfs_try_granting_tickets(struct btrfs_fs_info *fs_info,
				struct btrfs_space_info *space_info)
{
	struct list_head *head;
	enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_NO_FLUSH;

	lockdep_assert_held(&space_info->lock);

	head = &space_info->priority_tickets;
again:
	while (!list_empty(head)) {
		struct reserve_ticket *ticket;
		u64 used = btrfs_space_info_used(space_info, true);

		ticket = list_first_entry(head, struct reserve_ticket, list);

		/* Check and see if our ticket can be satisfied now. */
		if ((used + ticket->bytes <= space_info->total_bytes) ||
		    btrfs_can_overcommit(fs_info, space_info, ticket->bytes,
					 flush)) {
			btrfs_space_info_update_bytes_may_use(fs_info,
							      space_info,
							      ticket->bytes);
			remove_ticket(space_info, ticket);
			ticket->bytes = 0;
			space_info->tickets_id++;
			wake_up(&ticket->wait);
		} else {
			break;
		}
	}

	if (head == &space_info->priority_tickets) {
		head = &space_info->tickets;
		flush = BTRFS_RESERVE_FLUSH_ALL;
		goto again;
	}
}
#define DUMP_BLOCK_RSV(fs_info, rsv_name)				\
do {									\
	struct btrfs_block_rsv *__rsv = &(fs_info)->rsv_name;		\
	spin_lock(&__rsv->lock);					\
	btrfs_info(fs_info, #rsv_name ": size %llu reserved %llu",	\
		   __rsv->size, __rsv->reserved);			\
	spin_unlock(&__rsv->lock);					\
} while (0)
static const char *space_info_flag_to_str(const struct btrfs_space_info *space_info)
{
	switch (space_info->flags) {
	case BTRFS_BLOCK_GROUP_SYSTEM:
		return "SYSTEM";
	case BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA:
		return "DATA+METADATA";
	case BTRFS_BLOCK_GROUP_DATA:
		return "DATA";
	case BTRFS_BLOCK_GROUP_METADATA:
		return "METADATA";
	default:
		return "UNKNOWN";
	}
}
static void dump_global_block_rsv(struct btrfs_fs_info *fs_info)
{
	DUMP_BLOCK_RSV(fs_info, global_block_rsv);
	DUMP_BLOCK_RSV(fs_info, trans_block_rsv);
	DUMP_BLOCK_RSV(fs_info, chunk_block_rsv);
	DUMP_BLOCK_RSV(fs_info, delayed_block_rsv);
	DUMP_BLOCK_RSV(fs_info, delayed_refs_rsv);
}
static void __btrfs_dump_space_info(struct btrfs_fs_info *fs_info,
				    struct btrfs_space_info *info)
{
	const char *flag_str = space_info_flag_to_str(info);
	lockdep_assert_held(&info->lock);

	/* The free space could be negative in case of overcommit */
	btrfs_info(fs_info, "space_info %s has %lld free, is %sfull",
		   flag_str,
		   (s64)(info->total_bytes - btrfs_space_info_used(info, true)),
		   info->full ? "" : "not ");
	btrfs_info(fs_info,
		"space_info total=%llu, used=%llu, pinned=%llu, reserved=%llu, may_use=%llu, readonly=%llu zone_unusable=%llu",
		info->total_bytes, info->bytes_used, info->bytes_pinned,
		info->bytes_reserved, info->bytes_may_use,
		info->bytes_readonly, info->bytes_zone_unusable);
}
void btrfs_dump_space_info(struct btrfs_fs_info *fs_info,
			   struct btrfs_space_info *info, u64 bytes,
			   int dump_block_groups)
{
	struct btrfs_block_group *cache;
	int index = 0;

	spin_lock(&info->lock);
	__btrfs_dump_space_info(fs_info, info);
	dump_global_block_rsv(fs_info);
	spin_unlock(&info->lock);

	if (!dump_block_groups)
		return;

	down_read(&info->groups_sem);
again:
	list_for_each_entry(cache, &info->block_groups[index], list) {
		spin_lock(&cache->lock);
		btrfs_info(fs_info,
			"block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %llu zone_unusable %s",
			cache->start, cache->length, cache->used, cache->pinned,
			cache->reserved, cache->zone_unusable,
			cache->ro ? "[readonly]" : "");
		spin_unlock(&cache->lock);
		btrfs_dump_free_space(cache, bytes);
	}
	if (++index < BTRFS_NR_RAID_TYPES)
		goto again;
	up_read(&info->groups_sem);
}
static inline u64 calc_reclaim_items_nr(const struct btrfs_fs_info *fs_info,
					u64 to_reclaim)
{
	u64 bytes;
	u64 nr;

	bytes = btrfs_calc_insert_metadata_size(fs_info, 1);
	nr = div64_u64(to_reclaim, bytes);
	if (!nr)
		nr = 1;
	return nr;
}

static inline u64 calc_delayed_refs_nr(const struct btrfs_fs_info *fs_info,
				       u64 to_reclaim)
{
	const u64 bytes = btrfs_calc_delayed_ref_bytes(fs_info, 1);
	u64 nr;

	nr = div64_u64(to_reclaim, bytes);
	if (!nr)
		nr = 1;
	return nr;
}
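/*
 * Worked example (assuming btrfs_calc_insert_metadata_size() computes
 * nodesize * BTRFS_MAX_LEVEL * 2 per item, per its definition elsewhere):
 * with a 16KiB nodesize one item costs 16KiB * 8 * 2 = 256KiB, so a
 * to_reclaim of 1MiB yields nr = 4, and the FLUSH_DELAYED_ITEMS_NR caller
 * below doubles that to 8 items.
 */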
#define EXTENT_SIZE_PER_ITEM	SZ_256K

/*
 * Shrink the metadata reservation for delalloc.
 */
static void shrink_delalloc(struct btrfs_fs_info *fs_info,
			    struct btrfs_space_info *space_info,
			    u64 to_reclaim, bool wait_ordered,
			    bool for_preempt)
{
	struct btrfs_trans_handle *trans;
	u64 delalloc_bytes;
	u64 ordered_bytes;
	u64 items;
	long time_left;
	int loops;

	delalloc_bytes = percpu_counter_sum_positive(&fs_info->delalloc_bytes);
	ordered_bytes = percpu_counter_sum_positive(&fs_info->ordered_bytes);
	if (delalloc_bytes == 0 && ordered_bytes == 0)
		return;

	/* Calc the number of the pages we need flush for space reservation */
	if (to_reclaim == U64_MAX) {
		items = U64_MAX;
	} else {
		/*
		 * to_reclaim is set to however much metadata we need to
		 * reclaim, but reclaiming that much data doesn't really track
		 * exactly.  What we really want to do is reclaim full inode's
		 * worth of reservations, however that's not available to us
		 * here.  We will take a fraction of the delalloc bytes for our
		 * flushing loops and hope for the best.  Delalloc will expand
		 * the amount we write to cover an entire dirty extent, which
		 * will reclaim the metadata reservation for that range.  If
		 * it's not enough subsequent flush stages will be more
		 * aggressive.
		 */
		to_reclaim = max(to_reclaim, delalloc_bytes >> 3);
		items = calc_reclaim_items_nr(fs_info, to_reclaim) * 2;
	}

	trans = current->journal_info;

	/*
	 * If we are doing more ordered than delalloc we need to just wait on
	 * ordered extents, otherwise we'll waste time trying to flush delalloc
	 * that likely won't give us the space back we need.
	 */
	if (ordered_bytes > delalloc_bytes && !for_preempt)
		wait_ordered = true;

	loops = 0;
	while ((delalloc_bytes || ordered_bytes) && loops < 3) {
		u64 temp = min(delalloc_bytes, to_reclaim) >> PAGE_SHIFT;
		long nr_pages = min_t(u64, temp, LONG_MAX);
		int async_pages;

		btrfs_start_delalloc_roots(fs_info, nr_pages, true);

		/*
		 * We need to make sure any outstanding async pages are now
		 * processed before we continue.  This is because things like
		 * sync_inode() try to be smart and skip writing if the inode is
		 * marked clean.  We don't use filemap_fwrite for flushing
		 * because we want to control how many pages we write out at a
		 * time, thus this is the only safe way to make sure we've
		 * waited for outstanding compressed workers to have started
		 * their jobs and thus have ordered extents set up properly.
		 *
		 * This exists because we do not want to wait for each
		 * individual inode to finish its async work, we simply want to
		 * start the IO on everybody, and then come back here and wait
		 * for all of the async work to catch up.  Once we're done with
		 * that we know we'll have ordered extents for everything and we
		 * can decide if we wait for that or not.
		 *
		 * If we choose to replace this in the future, make absolutely
		 * sure that the proper waiting is being done in the async case,
		 * as there have been bugs in that area before.
		 */
		async_pages = atomic_read(&fs_info->async_delalloc_pages);
		if (!async_pages)
			goto skip_async;

		/*
		 * We don't want to wait forever, if we wrote fewer pages in
		 * this loop than we have outstanding, only wait for that number
		 * of pages, otherwise we can wait for all async pages to finish
		 * before continuing.
		 */
		if (async_pages > nr_pages)
			async_pages -= nr_pages;
		else
			async_pages = 0;
		wait_event(fs_info->async_submit_wait,
			   atomic_read(&fs_info->async_delalloc_pages) <=
			   async_pages);
skip_async:
		loops++;
		if (wait_ordered && !trans) {
			btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1);
		} else {
			time_left = schedule_timeout_killable(1);
			if (time_left)
				break;
		}

		/*
		 * If we are for preemption we just want a one-shot of delalloc
		 * flushing so we can stop flushing if we decide we don't need
		 * to anymore.
		 */
		if (for_preempt)
			break;

		spin_lock(&space_info->lock);
		if (list_empty(&space_info->tickets) &&
		    list_empty(&space_info->priority_tickets)) {
			spin_unlock(&space_info->lock);
			break;
		}
		spin_unlock(&space_info->lock);

		delalloc_bytes = percpu_counter_sum_positive(
						&fs_info->delalloc_bytes);
		ordered_bytes = percpu_counter_sum_positive(
						&fs_info->ordered_bytes);
	}
}
/*
 * Try to flush some data based on policy set by @state. This is only advisory
 * and may fail for various reasons. The caller is supposed to examine the
 * state of @space_info to detect the outcome.
 */
static void flush_space(struct btrfs_fs_info *fs_info,
		       struct btrfs_space_info *space_info, u64 num_bytes,
		       enum btrfs_flush_state state, bool for_preempt)
{
	struct btrfs_root *root = fs_info->tree_root;
	struct btrfs_trans_handle *trans;
	int nr;
	int ret = 0;

	switch (state) {
	case FLUSH_DELAYED_ITEMS_NR:
	case FLUSH_DELAYED_ITEMS:
		if (state == FLUSH_DELAYED_ITEMS_NR)
			nr = calc_reclaim_items_nr(fs_info, num_bytes) * 2;
		else
			nr = -1;

		trans = btrfs_join_transaction(root);
		if (IS_ERR(trans)) {
			ret = PTR_ERR(trans);
			break;
		}
		ret = btrfs_run_delayed_items_nr(trans, nr);
		btrfs_end_transaction(trans);
		break;
	case FLUSH_DELALLOC:
	case FLUSH_DELALLOC_WAIT:
	case FLUSH_DELALLOC_FULL:
		if (state == FLUSH_DELALLOC_FULL)
			num_bytes = U64_MAX;
		shrink_delalloc(fs_info, space_info, num_bytes,
				state != FLUSH_DELALLOC, for_preempt);
		break;
	case FLUSH_DELAYED_REFS_NR:
	case FLUSH_DELAYED_REFS:
		trans = btrfs_join_transaction(root);
		if (IS_ERR(trans)) {
			ret = PTR_ERR(trans);
			break;
		}
		if (state == FLUSH_DELAYED_REFS_NR)
			nr = calc_delayed_refs_nr(fs_info, num_bytes);
		else
			nr = 0;
		btrfs_run_delayed_refs(trans, nr);
		btrfs_end_transaction(trans);
		break;
	case ALLOC_CHUNK:
	case ALLOC_CHUNK_FORCE:
		/*
		 * For metadata space on zoned filesystem, reaching here means we
		 * don't have enough space left in active_total_bytes. Try to
		 * activate a block group first, because we may have inactive
		 * block group already allocated.
		 */
		ret = btrfs_zoned_activate_one_bg(fs_info, space_info, false);
		if (ret < 0)
			break;
		else if (ret == 1)
			break;

		trans = btrfs_join_transaction(root);
		if (IS_ERR(trans)) {
			ret = PTR_ERR(trans);
			break;
		}
		ret = btrfs_chunk_alloc(trans,
				btrfs_get_alloc_profile(fs_info, space_info->flags),
				(state == ALLOC_CHUNK) ? CHUNK_ALLOC_NO_FORCE :
					CHUNK_ALLOC_FORCE);
		btrfs_end_transaction(trans);

		/*
		 * For metadata space on zoned filesystem, allocating a new chunk
		 * is not enough. We still need to activate the block group.
		 * Activate the newly allocated block group by (maybe) finishing
		 * a fully written block group.
		 */
		if (ret == 1) {
			ret = btrfs_zoned_activate_one_bg(fs_info, space_info, true);
			/*
			 * Revert to the original ret regardless of whether we
			 * could finish one block group or not.
			 */
			if (ret >= 0)
				ret = 1;
		}

		if (ret > 0 || ret == -ENOSPC)
			ret = 0;
		break;
	case RUN_DELAYED_IPUTS:
		/*
		 * If we have pending delayed iputs then we could free up a
		 * bunch of pinned space, so make sure we run the iputs before
		 * we do our pinned bytes check below.
		 */
		btrfs_run_delayed_iputs(fs_info);
		btrfs_wait_on_delayed_iputs(fs_info);
		break;
	case COMMIT_TRANS:
		ASSERT(current->journal_info == NULL);
		trans = btrfs_join_transaction(root);
		if (IS_ERR(trans)) {
			ret = PTR_ERR(trans);
			break;
		}
		ret = btrfs_commit_transaction(trans);
		break;
	default:
		ret = -ENOSPC;
		break;
	}

	trace_btrfs_flush_space(fs_info, space_info->flags, num_bytes, state,
				ret, for_preempt);
	return;
}
static u64 btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info,
					    struct btrfs_space_info *space_info)
{
	u64 used;
	u64 avail;
	u64 to_reclaim = space_info->reclaim_size;

	lockdep_assert_held(&space_info->lock);

	avail = calc_available_free_space(fs_info, space_info,
					  BTRFS_RESERVE_FLUSH_ALL);
	used = btrfs_space_info_used(space_info, true);

	/*
	 * We may be flushing because suddenly we have less space than we had
	 * before, and now we're well over-committed based on our current free
	 * space.  If that's the case add in our overage so we make sure to put
	 * appropriate pressure on the flushing state machine.
	 */
	if (space_info->total_bytes + avail < used)
		to_reclaim += used - (space_info->total_bytes + avail);

	return to_reclaim;
}
static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info,
				    struct btrfs_space_info *space_info)
{
	u64 global_rsv_size = fs_info->global_block_rsv.reserved;
	u64 ordered, delalloc;
	u64 thresh;
	u64 used;

	thresh = mult_perc(space_info->total_bytes, 90);

	lockdep_assert_held(&space_info->lock);

	/* If we're just plain full then async reclaim just slows us down. */
	if ((space_info->bytes_used + space_info->bytes_reserved +
	     global_rsv_size) >= thresh)
		return false;

	used = space_info->bytes_may_use + space_info->bytes_pinned;

	/* The total flushable belongs to the global rsv, don't flush. */
	if (global_rsv_size >= used)
		return false;

	/*
	 * 128MiB is 1/4 of the maximum global rsv size.  If we have less than
	 * that devoted to other reservations then there's no sense in flushing,
	 * we don't have a lot of things that need flushing.
	 */
	if (used - global_rsv_size <= SZ_128M)
		return false;

	/*
	 * We have tickets queued, bail so we don't compete with the async
	 * flushers.
	 */
	if (space_info->reclaim_size)
		return false;

	/*
	 * If we have over half of the free space occupied by reservations or
	 * pinned then we want to start flushing.
	 *
	 * We do not do the traditional thing here, which is to say
	 *
	 *   if (used >= ((total_bytes + avail) / 2))
	 *     do the flushing;
	 *
	 * because this doesn't quite work how we want.  If we had more than 50%
	 * of the space_info used by bytes_used and we had 0 available we'd just
	 * constantly run the background flusher.  Instead we want it to kick in
	 * if our reclaimable space exceeds our clamped free space.
	 *
	 * Our clamping range is 2^1 -> 2^8.  Practically speaking that means
	 * the following:
	 *
	 * Amount of RAM        Minimum threshold       Maximum threshold
	 *
	 *        256GiB                     1GiB                  128GiB
	 *        128GiB                   512MiB                   64GiB
	 *         64GiB                   256MiB                   32GiB
	 *         32GiB                   128MiB                   16GiB
	 *         16GiB                    64MiB                    8GiB
	 *
	 * These are the range our thresholds will fall in, corresponding to how
	 * much delalloc we need for the background flusher to kick in.
	 */
	thresh = calc_available_free_space(fs_info, space_info,
					   BTRFS_RESERVE_FLUSH_ALL);
	used = space_info->bytes_used + space_info->bytes_reserved +
	       space_info->bytes_readonly + global_rsv_size;
	if (used < space_info->total_bytes)
		thresh += space_info->total_bytes - used;
	thresh >>= space_info->clamp;

	used = space_info->bytes_pinned;

	/*
	 * If we have more ordered bytes than delalloc bytes then we're either
	 * doing a lot of DIO, or we simply don't have a lot of delalloc waiting
	 * around.  Preemptive flushing is only useful in that it can free up
	 * space before tickets need to wait for things to finish.  In the case
	 * of ordered extents, preemptively waiting on ordered extents gets us
	 * nothing, if our reservations are tied up in ordered extents we'll
	 * simply have to slow down writers by forcing them to wait on ordered
	 * extents.
	 *
	 * In the case that ordered is larger than delalloc, only include the
	 * block reserves that we would actually be able to directly reclaim
	 * from.  In this case if we're heavy on metadata operations this will
	 * clearly be heavy enough to warrant preemptive flushing.  In the case
	 * of heavy DIO or ordered reservations, preemptive flushing will just
	 * waste time and cause us to slow down.
	 *
	 * We want to make sure we truly are maxed out on ordered however, so
	 * cut ordered in half, and if it's still higher than delalloc then we
	 * can keep flushing.  This is to avoid the case where we start
	 * flushing, and now delalloc == ordered and we stop preemptively
	 * flushing when we could still have several gigs of delalloc to flush.
	 */
	ordered = percpu_counter_read_positive(&fs_info->ordered_bytes) >> 1;
	delalloc = percpu_counter_read_positive(&fs_info->delalloc_bytes);
	if (ordered >= delalloc)
		used += fs_info->delayed_refs_rsv.reserved +
			fs_info->delayed_block_rsv.reserved;
	else
		used += space_info->bytes_may_use - global_rsv_size;

	return (used >= thresh && !btrfs_fs_closing(fs_info) &&
		!test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state));
}
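/*
 * Worked example of the clamp math above (numbers are hypothetical): with
 * 10GiB of free space in the space_info and clamp == 3, the preemptive
 * threshold is 10GiB >> 3 = 1.25GiB of reclaimable bytes.  Each time
 * __reserve_bytes() is forced to queue a ticket, maybe_clamp_preempt() can
 * raise clamp (up to 8), halving the threshold, while a single quick trip
 * through the preempt worker backs clamp off again.
 */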
static bool steal_from_global_rsv(struct btrfs_fs_info *fs_info,
				  struct btrfs_space_info *space_info,
				  struct reserve_ticket *ticket)
{
	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
	u64 min_bytes;

	if (!ticket->steal)
		return false;

	if (global_rsv->space_info != space_info)
		return false;

	spin_lock(&global_rsv->lock);
	min_bytes = mult_perc(global_rsv->size, 10);
	if (global_rsv->reserved < min_bytes + ticket->bytes) {
		spin_unlock(&global_rsv->lock);
		return false;
	}
	global_rsv->reserved -= ticket->bytes;
	remove_ticket(space_info, ticket);
	ticket->bytes = 0;
	wake_up(&ticket->wait);
	space_info->tickets_id++;
	if (global_rsv->reserved < global_rsv->size)
		global_rsv->full = 0;
	spin_unlock(&global_rsv->lock);

	return true;
}
/*
 * maybe_fail_all_tickets - we've exhausted our flushing, start failing tickets
 * @fs_info - fs_info for this fs
 * @space_info - the space info we were flushing
 *
 * We call this when we've exhausted our flushing ability and haven't made
 * progress in satisfying tickets.  The reservation code handles tickets in
 * order, so if there is a large ticket first and then smaller ones we could
 * very well satisfy the smaller tickets.  This will attempt to wake up any
 * tickets in the list to catch this case.
 *
 * This function returns true if it was able to make progress by clearing out
 * other tickets, or if it stumbles across a ticket that was smaller than the
 * first ticket.
 */
static bool maybe_fail_all_tickets(struct btrfs_fs_info *fs_info,
				   struct btrfs_space_info *space_info)
{
	struct reserve_ticket *ticket;
	u64 tickets_id = space_info->tickets_id;
	const bool aborted = BTRFS_FS_ERROR(fs_info);

	trace_btrfs_fail_all_tickets(fs_info, space_info);

	if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
		btrfs_info(fs_info, "cannot satisfy tickets, dumping space info");
		__btrfs_dump_space_info(fs_info, space_info);
	}

	while (!list_empty(&space_info->tickets) &&
	       tickets_id == space_info->tickets_id) {
		ticket = list_first_entry(&space_info->tickets,
					  struct reserve_ticket, list);

		if (!aborted && steal_from_global_rsv(fs_info, space_info, ticket))
			return true;

		if (!aborted && btrfs_test_opt(fs_info, ENOSPC_DEBUG))
			btrfs_info(fs_info, "failing ticket with %llu bytes",
				   ticket->bytes);

		remove_ticket(space_info, ticket);
		if (aborted)
			ticket->error = -EIO;
		else
			ticket->error = -ENOSPC;
		wake_up(&ticket->wait);

		/*
		 * We're just throwing tickets away, so more flushing may not
		 * trip over btrfs_try_granting_tickets, so we need to call it
		 * here to see if we can make progress with the next ticket in
		 * the list.
		 */
		if (!aborted)
			btrfs_try_granting_tickets(fs_info, space_info);
	}
	return (tickets_id != space_info->tickets_id);
}
/*
 * This is for normal flushers, it can wait all day if it wants to.  It will
 * loop and continuously try to flush as long as it is making progress.  We
 * count progress as clearing off tickets each time we have to loop.
 */
static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
{
	struct btrfs_fs_info *fs_info;
	struct btrfs_space_info *space_info;
	u64 to_reclaim;
	enum btrfs_flush_state flush_state;
	int commit_cycles = 0;
	u64 last_tickets_id;

	fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work);
	space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);

	spin_lock(&space_info->lock);
	to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info);
	if (!to_reclaim) {
		space_info->flush = 0;
		spin_unlock(&space_info->lock);
		return;
	}
	last_tickets_id = space_info->tickets_id;
	spin_unlock(&space_info->lock);

	flush_state = FLUSH_DELAYED_ITEMS_NR;
	do {
		flush_space(fs_info, space_info, to_reclaim, flush_state, false);
		spin_lock(&space_info->lock);
		if (list_empty(&space_info->tickets)) {
			space_info->flush = 0;
			spin_unlock(&space_info->lock);
			return;
		}
		to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info,
							      space_info);
		if (last_tickets_id == space_info->tickets_id) {
			flush_state++;
		} else {
			last_tickets_id = space_info->tickets_id;
			flush_state = FLUSH_DELAYED_ITEMS_NR;
			if (commit_cycles)
				commit_cycles--;
		}

		/*
		 * We do not want to empty the system of delalloc unless we're
		 * under heavy pressure, so allow one trip through the flushing
		 * logic before we start doing a FLUSH_DELALLOC_FULL.
		 */
		if (flush_state == FLUSH_DELALLOC_FULL && !commit_cycles)
			flush_state++;

		/*
		 * We don't want to force a chunk allocation until we've tried
		 * pretty hard to reclaim space.  Think of the case where we
		 * freed up a bunch of space and so have a lot of pinned space
		 * to reclaim.  We would rather use that than possibly create an
		 * underutilized metadata chunk.  So if this is our first run
		 * through the flushing state machine skip ALLOC_CHUNK_FORCE and
		 * commit the transaction.  If nothing has changed the next go
		 * around then we can force a chunk allocation.
		 */
		if (flush_state == ALLOC_CHUNK_FORCE && !commit_cycles)
			flush_state++;

		if (flush_state > COMMIT_TRANS) {
			commit_cycles++;
			if (commit_cycles > 2) {
				if (maybe_fail_all_tickets(fs_info, space_info)) {
					flush_state = FLUSH_DELAYED_ITEMS_NR;
					commit_cycles--;
				} else {
					space_info->flush = 0;
				}
			} else {
				flush_state = FLUSH_DELAYED_ITEMS_NR;
			}
		}
		spin_unlock(&space_info->lock);
	} while (flush_state <= COMMIT_TRANS);
}
/*
 * This handles pre-flushing of metadata space before we get to the point that
 * we need to start blocking threads on tickets.  The logic here is different
 * from the other flush paths because it doesn't rely on tickets to tell us how
 * much we need to flush, instead it attempts to keep us below the 80% full
 * watermark of space by flushing whichever reservation pool is currently the
 * largest.
 */
static void btrfs_preempt_reclaim_metadata_space(struct work_struct *work)
{
	struct btrfs_fs_info *fs_info;
	struct btrfs_space_info *space_info;
	struct btrfs_block_rsv *delayed_block_rsv;
	struct btrfs_block_rsv *delayed_refs_rsv;
	struct btrfs_block_rsv *global_rsv;
	struct btrfs_block_rsv *trans_rsv;
	int loops = 0;

	fs_info = container_of(work, struct btrfs_fs_info,
			       preempt_reclaim_work);
	space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
	delayed_block_rsv = &fs_info->delayed_block_rsv;
	delayed_refs_rsv = &fs_info->delayed_refs_rsv;
	global_rsv = &fs_info->global_block_rsv;
	trans_rsv = &fs_info->trans_block_rsv;

	spin_lock(&space_info->lock);
	while (need_preemptive_reclaim(fs_info, space_info)) {
		enum btrfs_flush_state flush;
		u64 delalloc_size = 0;
		u64 to_reclaim, block_rsv_size;
		u64 global_rsv_size = global_rsv->reserved;

		loops++;

		/*
		 * We don't have a precise counter for the metadata being
		 * reserved for delalloc, so we'll approximate it by subtracting
		 * out the block rsv's space from the bytes_may_use.  If that
		 * amount is higher than the individual reserves, then we can
		 * assume it's tied up in delalloc reservations.
		 */
		block_rsv_size = global_rsv_size +
			delayed_block_rsv->reserved +
			delayed_refs_rsv->reserved +
			trans_rsv->reserved;
		if (block_rsv_size < space_info->bytes_may_use)
			delalloc_size = space_info->bytes_may_use - block_rsv_size;

		/*
		 * We don't want to include the global_rsv in our calculation,
		 * because that's space we can't touch.  Subtract it from the
		 * block_rsv_size for the next checks.
		 */
		block_rsv_size -= global_rsv_size;

		/*
		 * We really want to avoid flushing delalloc too much, as it
		 * could result in poor allocation patterns, so only flush it if
		 * it's larger than the rest of the pools combined.
		 */
		if (delalloc_size > block_rsv_size) {
			to_reclaim = delalloc_size;
			flush = FLUSH_DELALLOC;
		} else if (space_info->bytes_pinned >
			   (delayed_block_rsv->reserved +
			    delayed_refs_rsv->reserved)) {
			to_reclaim = space_info->bytes_pinned;
			flush = COMMIT_TRANS;
		} else if (delayed_block_rsv->reserved >
			   delayed_refs_rsv->reserved) {
			to_reclaim = delayed_block_rsv->reserved;
			flush = FLUSH_DELAYED_ITEMS_NR;
		} else {
			to_reclaim = delayed_refs_rsv->reserved;
			flush = FLUSH_DELAYED_REFS_NR;
		}

		spin_unlock(&space_info->lock);

		/*
		 * We don't want to reclaim everything, just a portion, so scale
		 * down the to_reclaim by 1/4.  If it takes us down to 0,
		 * reclaim 1 items worth.
		 */
		to_reclaim >>= 2;
		if (!to_reclaim)
			to_reclaim = btrfs_calc_insert_metadata_size(fs_info, 1);
		flush_space(fs_info, space_info, to_reclaim, flush, true);
		cond_resched();
		spin_lock(&space_info->lock);
	}

	/* We only went through once, back off our clamping. */
	if (loops == 1 && !space_info->reclaim_size)
		space_info->clamp = max(1, space_info->clamp - 1);
	trace_btrfs_done_preemptive_reclaim(fs_info, space_info);
	spin_unlock(&space_info->lock);
}
/*
 * FLUSH_DELALLOC_WAIT:
 *   Space is freed from flushing delalloc in one of two ways.
 *
 *   1) compression is on and we allocate less space than we reserved
 *   2) we are overwriting existing space
 *
 *   For #1 that extra space is reclaimed as soon as the delalloc pages are
 *   COWed, by way of btrfs_add_reserved_bytes() which adds the actual extent
 *   length to ->bytes_reserved, and subtracts the reserved space from
 *   ->bytes_may_use.
 *
 *   For #2 this is trickier.  Once the ordered extent runs we will drop the
 *   extent in the range we are overwriting, which creates a delayed ref for
 *   that freed extent.  This however is not reclaimed until the transaction
 *   commits, thus the next stages.
 *
 * RUN_DELAYED_IPUTS
 *   If we are freeing inodes, we want to make sure all delayed iputs have
 *   completed, because they could have been on an inode with i_nlink == 0, and
 *   thus have been truncated and freed up space.  But again this space is not
 *   immediately reusable, it comes in the form of a delayed ref, which must be
 *   run and then the transaction must be committed.
 *
 * COMMIT_TRANS
 *   This is where we reclaim all of the pinned space generated by running the
 *   iputs
 *
 * ALLOC_CHUNK_FORCE
 *   For data we start with alloc chunk force, however we could have been full
 *   before, and then the transaction commit could have freed new block groups,
 *   so if we now have space to allocate do the force chunk allocation.
 */
static const enum btrfs_flush_state data_flush_states[] = {
	FLUSH_DELALLOC_FULL,
	RUN_DELAYED_IPUTS,
	COMMIT_TRANS,
	ALLOC_CHUNK_FORCE,
};
static void btrfs_async_reclaim_data_space(struct work_struct *work)
{
	struct btrfs_fs_info *fs_info;
	struct btrfs_space_info *space_info;
	u64 last_tickets_id;
	enum btrfs_flush_state flush_state = 0;

	fs_info = container_of(work, struct btrfs_fs_info, async_data_reclaim_work);
	space_info = fs_info->data_sinfo;

	spin_lock(&space_info->lock);
	if (list_empty(&space_info->tickets)) {
		space_info->flush = 0;
		spin_unlock(&space_info->lock);
		return;
	}
	last_tickets_id = space_info->tickets_id;
	spin_unlock(&space_info->lock);

	while (!space_info->full) {
		flush_space(fs_info, space_info, U64_MAX, ALLOC_CHUNK_FORCE, false);
		spin_lock(&space_info->lock);
		if (list_empty(&space_info->tickets)) {
			space_info->flush = 0;
			spin_unlock(&space_info->lock);
			return;
		}

		/* Something happened, fail everything and bail. */
		if (BTRFS_FS_ERROR(fs_info))
			goto aborted_fs;
		last_tickets_id = space_info->tickets_id;
		spin_unlock(&space_info->lock);
	}

	while (flush_state < ARRAY_SIZE(data_flush_states)) {
		flush_space(fs_info, space_info, U64_MAX,
			    data_flush_states[flush_state], false);
		spin_lock(&space_info->lock);
		if (list_empty(&space_info->tickets)) {
			space_info->flush = 0;
			spin_unlock(&space_info->lock);
			return;
		}

		if (last_tickets_id == space_info->tickets_id) {
			flush_state++;
		} else {
			last_tickets_id = space_info->tickets_id;
			flush_state = 0;
		}

		if (flush_state >= ARRAY_SIZE(data_flush_states)) {
			if (space_info->full) {
				if (maybe_fail_all_tickets(fs_info, space_info))
					flush_state = 0;
				else
					space_info->flush = 0;
			} else {
				flush_state = 0;
			}

			/* Something happened, fail everything and bail. */
			if (BTRFS_FS_ERROR(fs_info))
				goto aborted_fs;
		}
		spin_unlock(&space_info->lock);
	}
	return;

aborted_fs:
	maybe_fail_all_tickets(fs_info, space_info);
	space_info->flush = 0;
	spin_unlock(&space_info->lock);
}
void btrfs_init_async_reclaim_work(struct btrfs_fs_info *fs_info)
{
	INIT_WORK(&fs_info->async_reclaim_work, btrfs_async_reclaim_metadata_space);
	INIT_WORK(&fs_info->async_data_reclaim_work, btrfs_async_reclaim_data_space);
	INIT_WORK(&fs_info->preempt_reclaim_work,
		  btrfs_preempt_reclaim_metadata_space);
}
static const enum btrfs_flush_state priority_flush_states[] = {
	FLUSH_DELAYED_ITEMS_NR,
	FLUSH_DELAYED_ITEMS,
	ALLOC_CHUNK,
};

static const enum btrfs_flush_state evict_flush_states[] = {
	FLUSH_DELAYED_ITEMS_NR,
	FLUSH_DELAYED_ITEMS,
	FLUSH_DELAYED_REFS_NR,
	FLUSH_DELAYED_REFS,
	FLUSH_DELALLOC,
	FLUSH_DELALLOC_WAIT,
	FLUSH_DELALLOC_FULL,
	ALLOC_CHUNK,
	COMMIT_TRANS,
};
static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info,
				struct btrfs_space_info *space_info,
				struct reserve_ticket *ticket,
				const enum btrfs_flush_state *states,
				int states_nr)
{
	u64 to_reclaim;
	int flush_state = 0;

	spin_lock(&space_info->lock);
	to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info);
	/*
	 * This is the priority reclaim path, so to_reclaim could be >0 still
	 * because we may have only satisfied the priority tickets and still
	 * left non priority tickets on the list.  We would then have
	 * to_reclaim but ->bytes == 0.
	 */
	if (ticket->bytes == 0) {
		spin_unlock(&space_info->lock);
		return;
	}

	while (flush_state < states_nr) {
		spin_unlock(&space_info->lock);
		flush_space(fs_info, space_info, to_reclaim, states[flush_state],
			    false);
		flush_state++;
		spin_lock(&space_info->lock);
		if (ticket->bytes == 0) {
			spin_unlock(&space_info->lock);
			return;
		}
	}

	/* Attempt to steal from the global rsv if we can. */
	if (!steal_from_global_rsv(fs_info, space_info, ticket)) {
		ticket->error = -ENOSPC;
		remove_ticket(space_info, ticket);
	}

	/*
	 * We must run try_granting_tickets here because we could be a large
	 * ticket in front of a smaller ticket that can now be satisfied with
	 * the available space.
	 */
	btrfs_try_granting_tickets(fs_info, space_info);
	spin_unlock(&space_info->lock);
}
static void priority_reclaim_data_space(struct btrfs_fs_info *fs_info,
					struct btrfs_space_info *space_info,
					struct reserve_ticket *ticket)
{
	spin_lock(&space_info->lock);

	/* We could have been granted before we got here. */
	if (ticket->bytes == 0) {
		spin_unlock(&space_info->lock);
		return;
	}

	while (!space_info->full) {
		spin_unlock(&space_info->lock);
		flush_space(fs_info, space_info, U64_MAX, ALLOC_CHUNK_FORCE, false);
		spin_lock(&space_info->lock);
		if (ticket->bytes == 0) {
			spin_unlock(&space_info->lock);
			return;
		}
	}

	ticket->error = -ENOSPC;
	remove_ticket(space_info, ticket);
	btrfs_try_granting_tickets(fs_info, space_info);
	spin_unlock(&space_info->lock);
}
static void wait_reserve_ticket(struct btrfs_fs_info *fs_info,
				struct btrfs_space_info *space_info,
				struct reserve_ticket *ticket)
{
	DEFINE_WAIT(wait);
	int ret = 0;

	spin_lock(&space_info->lock);
	while (ticket->bytes > 0 && ticket->error == 0) {
		ret = prepare_to_wait_event(&ticket->wait, &wait, TASK_KILLABLE);
		if (ret) {
			/*
			 * Delete us from the list. After we unlock the space
			 * info, we don't want the async reclaim job to reserve
			 * space for this ticket. If that would happen, then the
			 * ticket's task would not know that space was reserved
			 * despite getting an error, resulting in a space leak
			 * (bytes_may_use counter of our space_info).
			 */
			remove_ticket(space_info, ticket);
			ticket->error = -EINTR;
			break;
		}
		spin_unlock(&space_info->lock);

		schedule();

		finish_wait(&ticket->wait, &wait);
		spin_lock(&space_info->lock);
	}
	spin_unlock(&space_info->lock);
}
/*
 * Do the appropriate flushing and waiting for a ticket.
 *
 * @fs_info:    the filesystem
 * @space_info: space info for the reservation
 * @ticket:     ticket for the reservation
 * @start_ns:   timestamp when the reservation started
 * @orig_bytes: amount of bytes originally reserved
 * @flush:      how much we can flush
 *
 * This does the work of figuring out how to flush for the ticket, waiting for
 * the reservation, and returning the appropriate error if there is one.
 */
static int handle_reserve_ticket(struct btrfs_fs_info *fs_info,
				 struct btrfs_space_info *space_info,
				 struct reserve_ticket *ticket,
				 u64 start_ns, u64 orig_bytes,
				 enum btrfs_reserve_flush_enum flush)
{
	int ret;

	switch (flush) {
	case BTRFS_RESERVE_FLUSH_DATA:
	case BTRFS_RESERVE_FLUSH_ALL:
	case BTRFS_RESERVE_FLUSH_ALL_STEAL:
		wait_reserve_ticket(fs_info, space_info, ticket);
		break;
	case BTRFS_RESERVE_FLUSH_LIMIT:
		priority_reclaim_metadata_space(fs_info, space_info, ticket,
						priority_flush_states,
						ARRAY_SIZE(priority_flush_states));
		break;
	case BTRFS_RESERVE_FLUSH_EVICT:
		priority_reclaim_metadata_space(fs_info, space_info, ticket,
						evict_flush_states,
						ARRAY_SIZE(evict_flush_states));
		break;
	case BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE:
		priority_reclaim_data_space(fs_info, space_info, ticket);
		break;
	default:
		ASSERT(0);
		break;
	}

	ret = ticket->error;
	ASSERT(list_empty(&ticket->list));
	/*
	 * Check that we can't have an error set if the reservation succeeded,
	 * as that would confuse tasks and lead them to error out without
	 * releasing reserved space (if an error happens the expectation is that
	 * space wasn't reserved at all).
	 */
	ASSERT(!(ticket->bytes == 0 && ticket->error));
	trace_btrfs_reserve_ticket(fs_info, space_info->flags, orig_bytes,
				   start_ns, flush, ticket->error);
	return ret;
}
/*
 * This returns true if this flush state will go through the ordinary flushing
 * path.
 */
static inline bool is_normal_flushing(enum btrfs_reserve_flush_enum flush)
{
	return	(flush == BTRFS_RESERVE_FLUSH_ALL) ||
		(flush == BTRFS_RESERVE_FLUSH_ALL_STEAL);
}

static inline void maybe_clamp_preempt(struct btrfs_fs_info *fs_info,
				       struct btrfs_space_info *space_info)
{
	u64 ordered = percpu_counter_sum_positive(&fs_info->ordered_bytes);
	u64 delalloc = percpu_counter_sum_positive(&fs_info->delalloc_bytes);

	/*
	 * If we're heavy on ordered operations then clamping won't help us.  We
	 * need to clamp specifically to keep up with dirty'ing buffered
	 * writers, because there's not a 1:1 correlation of writing delalloc
	 * and freeing space, like there is with flushing delayed refs or
	 * delayed nodes.  If we're already more ordered than delalloc then
	 * we're keeping up, otherwise we aren't and should probably clamp.
	 */
	if (ordered < delalloc)
		space_info->clamp = min(space_info->clamp + 1, 8);
}

static inline bool can_steal(enum btrfs_reserve_flush_enum flush)
{
	return (flush == BTRFS_RESERVE_FLUSH_ALL_STEAL ||
		flush == BTRFS_RESERVE_FLUSH_EVICT);
}

/*
 * NO_FLUSH and FLUSH_EMERGENCY don't want to create a ticket, they just want to
 * fail as quickly as possible.
 */
static inline bool can_ticket(enum btrfs_reserve_flush_enum flush)
{
	return (flush != BTRFS_RESERVE_NO_FLUSH &&
		flush != BTRFS_RESERVE_FLUSH_EMERGENCY);
}
/*
 * Try to reserve bytes from the block_rsv's space.
 *
 * @fs_info:    the filesystem
 * @space_info: space info we want to allocate from
 * @orig_bytes: number of bytes we want
 * @flush:      whether or not we can flush to make our reservation
 *
 * This will reserve orig_bytes number of bytes from the space info associated
 * with the block_rsv.  If there is not enough space it will make an attempt to
 * flush out space to make room.  It will do this by flushing delalloc if
 * possible or committing the transaction.  If flush is 0 then no attempts to
 * regain reservations will be made and this will fail if there is not enough
 * space already.
 */
static int __reserve_bytes(struct btrfs_fs_info *fs_info,
			   struct btrfs_space_info *space_info, u64 orig_bytes,
			   enum btrfs_reserve_flush_enum flush)
{
	struct work_struct *async_work;
	struct reserve_ticket ticket;
	u64 start_ns = 0;
	u64 used;
	int ret = -ENOSPC;
	bool pending_tickets;

	ASSERT(orig_bytes);
	/*
	 * If we have a transaction handle (current->journal_info != NULL), then
	 * the flush method cannot be BTRFS_RESERVE_FLUSH_ALL* or
	 * BTRFS_RESERVE_FLUSH_EVICT, as we could deadlock because those
	 * flushing methods can trigger transaction commits.
	 */
	if (current->journal_info) {
		/* One assert per line for easier debugging. */
		ASSERT(flush != BTRFS_RESERVE_FLUSH_ALL);
		ASSERT(flush != BTRFS_RESERVE_FLUSH_ALL_STEAL);
		ASSERT(flush != BTRFS_RESERVE_FLUSH_EVICT);
	}

	if (flush == BTRFS_RESERVE_FLUSH_DATA)
		async_work = &fs_info->async_data_reclaim_work;
	else
		async_work = &fs_info->async_reclaim_work;

	spin_lock(&space_info->lock);
	used = btrfs_space_info_used(space_info, true);

	/*
	 * We don't want NO_FLUSH allocations to jump everybody, they can
	 * generally handle ENOSPC in a different way, so treat them the same as
	 * normal flushers when it comes to skipping pending tickets.
	 */
	if (is_normal_flushing(flush) || (flush == BTRFS_RESERVE_NO_FLUSH))
		pending_tickets = !list_empty(&space_info->tickets) ||
			!list_empty(&space_info->priority_tickets);
	else
		pending_tickets = !list_empty(&space_info->priority_tickets);

	/*
	 * Carry on if we have enough space (short-circuit) OR call
	 * can_overcommit() to ensure we can overcommit to continue.
	 */
	if (!pending_tickets &&
	    ((used + orig_bytes <= space_info->total_bytes) ||
	     btrfs_can_overcommit(fs_info, space_info, orig_bytes, flush))) {
		btrfs_space_info_update_bytes_may_use(fs_info, space_info,
						      orig_bytes);
		ret = 0;
	}

	/*
	 * Things are dire, we need to make a reservation so we don't abort.  We
	 * will let this reservation go through as long as we have actual space
	 * left to allocate for the block.
	 */
	if (ret && unlikely(flush == BTRFS_RESERVE_FLUSH_EMERGENCY)) {
		used = btrfs_space_info_used(space_info, false);
		if (used + orig_bytes <= space_info->total_bytes) {
			btrfs_space_info_update_bytes_may_use(fs_info, space_info,
							      orig_bytes);
			ret = 0;
		}
	}

	/*
	 * If we couldn't make a reservation then setup our reservation ticket
	 * and kick the async worker if it's not already running.
	 *
	 * If we are a priority flusher then we just need to add our ticket to
	 * the list and we will do our own flushing further down.
	 */
	if (ret && can_ticket(flush)) {
		ticket.bytes = orig_bytes;
		ticket.error = 0;
		space_info->reclaim_size += ticket.bytes;
		init_waitqueue_head(&ticket.wait);
		ticket.steal = can_steal(flush);
		if (trace_btrfs_reserve_ticket_enabled())
			start_ns = ktime_get_ns();

		if (flush == BTRFS_RESERVE_FLUSH_ALL ||
		    flush == BTRFS_RESERVE_FLUSH_ALL_STEAL ||
		    flush == BTRFS_RESERVE_FLUSH_DATA) {
			list_add_tail(&ticket.list, &space_info->tickets);
			if (!space_info->flush) {
				/*
				 * We were forced to add a reserve ticket, so
				 * our preemptive flushing is unable to keep
				 * up.  Clamp down on the threshold for the
				 * preemptive flushing in order to keep up with
				 * the workload.
				 */
				maybe_clamp_preempt(fs_info, space_info);

				space_info->flush = 1;
				trace_btrfs_trigger_flush(fs_info,
							  space_info->flags,
							  orig_bytes, flush,
							  "enospc");
				queue_work(system_unbound_wq, async_work);
			}
		} else {
			list_add_tail(&ticket.list,
				      &space_info->priority_tickets);
		}
	} else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
		/*
		 * We will do the space reservation dance during log replay,
		 * which means we won't have fs_info->fs_root set, so don't do
		 * the async reclaim as we will panic.
		 */
		if (!test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags) &&
		    !work_busy(&fs_info->preempt_reclaim_work) &&
		    need_preemptive_reclaim(fs_info, space_info)) {
			trace_btrfs_trigger_flush(fs_info, space_info->flags,
						  orig_bytes, flush, "preempt");
			queue_work(system_unbound_wq,
				   &fs_info->preempt_reclaim_work);
		}
	}
	spin_unlock(&space_info->lock);
	if (!ret || !can_ticket(flush))
		return ret;

	return handle_reserve_ticket(fs_info, space_info, &ticket, start_ns,
				     orig_bytes, flush);
}
/*
 * Try to reserve metadata bytes from the block_rsv's space.
 *
 * @fs_info:    the filesystem
 * @block_rsv:  block_rsv we're allocating for
 * @orig_bytes: number of bytes we want
 * @flush:      whether or not we can flush to make our reservation
 *
 * This will reserve orig_bytes number of bytes from the space info associated
 * with the block_rsv.  If there is not enough space it will make an attempt to
 * flush out space to make room.  It will do this by flushing delalloc if
 * possible or committing the transaction.  If flush is 0 then no attempts to
 * regain reservations will be made and this will fail if there is not enough
 * space already.
 */
int btrfs_reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
				 struct btrfs_block_rsv *block_rsv,
				 u64 orig_bytes,
				 enum btrfs_reserve_flush_enum flush)
{
	int ret;

	ret = __reserve_bytes(fs_info, block_rsv->space_info, orig_bytes, flush);
	if (ret == -ENOSPC) {
		trace_btrfs_space_reservation(fs_info, "space_info:enospc",
					      block_rsv->space_info->flags,
					      orig_bytes, 1);

		if (btrfs_test_opt(fs_info, ENOSPC_DEBUG))
			btrfs_dump_space_info(fs_info, block_rsv->space_info,
					      orig_bytes, 0);
	}
	return ret;
}
/*
 * Try to reserve data bytes for an allocation.
 *
 * @fs_info: the filesystem
 * @bytes:   number of bytes we need
 * @flush:   how we are allowed to flush
 *
 * This will reserve bytes from the data space info.  If there is not enough
 * space then we will attempt to flush space as specified by flush.
 */
int btrfs_reserve_data_bytes(struct btrfs_fs_info *fs_info, u64 bytes,
			     enum btrfs_reserve_flush_enum flush)
{
	struct btrfs_space_info *data_sinfo = fs_info->data_sinfo;
	int ret;

	ASSERT(flush == BTRFS_RESERVE_FLUSH_DATA ||
	       flush == BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE ||
	       flush == BTRFS_RESERVE_NO_FLUSH);
	ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_DATA);

	ret = __reserve_bytes(fs_info, data_sinfo, bytes, flush);
	if (ret == -ENOSPC) {
		trace_btrfs_space_reservation(fs_info, "space_info:enospc",
					      data_sinfo->flags, bytes, 1);

		if (btrfs_test_opt(fs_info, ENOSPC_DEBUG))
			btrfs_dump_space_info(fs_info, data_sinfo, bytes, 0);
	}
	return ret;
}
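/*
 * Example (illustrative sketch, not called anywhere): reserving data space
 * ahead of a buffered write, the way delalloc-path callers use the function
 * above.  The length is hypothetical, and per the asserts above the caller
 * must not hold a transaction when using BTRFS_RESERVE_FLUSH_DATA.
 */
static inline int example_reserve_data(struct btrfs_fs_info *fs_info)
{
	const u64 len = SZ_1M;	/* hypothetical write length */

	/* May flush data space (chunk allocation, delalloc, commit) on ENOSPC. */
	return btrfs_reserve_data_bytes(fs_info, len, BTRFS_RESERVE_FLUSH_DATA);
}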
/* Dump all the space infos when we abort a transaction due to ENOSPC. */
__cold void btrfs_dump_space_info_for_trans_abort(struct btrfs_fs_info *fs_info)
{
	struct btrfs_space_info *space_info;

	btrfs_info(fs_info, "dumping space info:");
	list_for_each_entry(space_info, &fs_info->space_info, list) {
		spin_lock(&space_info->lock);
		__btrfs_dump_space_info(fs_info, space_info);
		spin_unlock(&space_info->lock);
	}
	dump_global_block_rsv(fs_info);
}
/*
 * Account the unused space of all the readonly block groups in the space_info.
 * Takes mirrors into account.
 */
u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
{
	struct btrfs_block_group *block_group;
	u64 free_bytes = 0;
	int factor;

	/* It's df, we don't care if it's racy */
	if (list_empty(&sinfo->ro_bgs))
		return 0;

	spin_lock(&sinfo->lock);
	list_for_each_entry(block_group, &sinfo->ro_bgs, ro_list) {
		spin_lock(&block_group->lock);

		if (!block_group->ro) {
			spin_unlock(&block_group->lock);
			continue;
		}

		factor = btrfs_bg_type_to_factor(block_group->flags);
		free_bytes += (block_group->length -
			       block_group->used) * factor;

		spin_unlock(&block_group->lock);
	}
	spin_unlock(&sinfo->lock);

	return free_bytes;
}