// SPDX-License-Identifier: GPL-2.0

#include <linux/slab.h>
#include <trace/events/btrfs.h>
#include "extent-io-tree.h"
#include "btrfs_inode.h"

static struct kmem_cache *extent_state_cache;

static inline bool extent_state_in_tree(const struct extent_state *state)
{
	return !RB_EMPTY_NODE(&state->rb_node);
}

#ifdef CONFIG_BTRFS_DEBUG
static LIST_HEAD(states);
static DEFINE_SPINLOCK(leak_lock);

static inline void btrfs_leak_debug_add_state(struct extent_state *state)
{
	unsigned long flags;

	spin_lock_irqsave(&leak_lock, flags);
	list_add(&state->leak_list, &states);
	spin_unlock_irqrestore(&leak_lock, flags);
}

static inline void btrfs_leak_debug_del_state(struct extent_state *state)
{
	unsigned long flags;

	spin_lock_irqsave(&leak_lock, flags);
	list_del(&state->leak_list);
	spin_unlock_irqrestore(&leak_lock, flags);
}

static inline void btrfs_extent_state_leak_debug_check(void)
{
	struct extent_state *state;

	while (!list_empty(&states)) {
		state = list_entry(states.next, struct extent_state, leak_list);
		pr_err("BTRFS: state leak: start %llu end %llu state %u in tree %d refs %d\n",
		       state->start, state->end, state->state,
		       extent_state_in_tree(state),
		       refcount_read(&state->refs));
		list_del(&state->leak_list);
		kmem_cache_free(extent_state_cache, state);
	}
}

#define btrfs_debug_check_extent_io_range(tree, start, end)		\
	__btrfs_debug_check_extent_io_range(__func__, (tree), (start), (end))
static inline void __btrfs_debug_check_extent_io_range(const char *caller,
						       struct extent_io_tree *tree,
						       u64 start, u64 end)
{
	struct btrfs_inode *inode = tree->inode;
	u64 isize;

	if (!inode)
		return;

	isize = i_size_read(&inode->vfs_inode);
	if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) {
		btrfs_debug_rl(inode->root->fs_info,
			       "%s: ino %llu isize %llu odd range [%llu,%llu]",
			       caller, btrfs_ino(inode), isize, start, end);
	}
}
#else
#define btrfs_leak_debug_add_state(state)		do {} while (0)
#define btrfs_leak_debug_del_state(state)		do {} while (0)
#define btrfs_extent_state_leak_debug_check()		do {} while (0)
#define btrfs_debug_check_extent_io_range(c, s, e)	do {} while (0)
#endif

/*
 * For the file_extent_tree, we want to hold the inode lock when we look up and
 * update the disk_i_size, but lockdep will complain because for the io_tree we
 * hold the tree lock and take the inode lock when setting delalloc.  These two
 * things are unrelated, so make a class for the file_extent_tree so we don't
 * get the two locking patterns mixed up.
 */
static struct lock_class_key file_extent_tree_class;

struct tree_entry {
	u64 start;
	u64 end;
	struct rb_node rb_node;
};

void extent_io_tree_init(struct btrfs_fs_info *fs_info,
			 struct extent_io_tree *tree, unsigned int owner)
{
	tree->fs_info = fs_info;
	tree->state = RB_ROOT;
	spin_lock_init(&tree->lock);
	tree->inode = NULL;
	tree->owner = owner;
	if (owner == IO_TREE_INODE_FILE_EXTENT)
		lockdep_set_class(&tree->lock, &file_extent_tree_class);
}
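
/*
 * Usage sketch (illustrative only, not from this file): callers typically
 * initialize one io tree per inode or device with an owner constant from
 * extent-io-tree.h, e.g.:
 *
 *	extent_io_tree_init(fs_info, &inode->io_tree, IO_TREE_INODE_IO);
 */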

void extent_io_tree_release(struct extent_io_tree *tree)
{
	spin_lock(&tree->lock);
	/*
	 * Do a single barrier for the waitqueue_active check here, the state
	 * of the waitqueue should not change once extent_io_tree_release is
	 * called.
	 */
	smp_mb();
	while (!RB_EMPTY_ROOT(&tree->state)) {
		struct rb_node *node;
		struct extent_state *state;

		node = rb_first(&tree->state);
		state = rb_entry(node, struct extent_state, rb_node);
		rb_erase(&state->rb_node, &tree->state);
		RB_CLEAR_NODE(&state->rb_node);
		/*
		 * btree io trees aren't supposed to have tasks waiting for
		 * changes in the flags of extent states ever.
		 */
		ASSERT(!waitqueue_active(&state->wq));
		free_extent_state(state);

		cond_resched_lock(&tree->lock);
	}
	spin_unlock(&tree->lock);
}

static struct extent_state *alloc_extent_state(gfp_t mask)
{
	struct extent_state *state;

	/*
	 * The given mask might not be appropriate for the slab allocator,
	 * drop the unsupported bits.
	 */
	mask &= ~(__GFP_DMA32|__GFP_HIGHMEM);
	state = kmem_cache_alloc(extent_state_cache, mask);
	if (!state)
		return state;
	state->state = 0;
	RB_CLEAR_NODE(&state->rb_node);
	btrfs_leak_debug_add_state(state);
	refcount_set(&state->refs, 1);
	init_waitqueue_head(&state->wq);
	trace_alloc_extent_state(state, mask, _RET_IP_);
	return state;
}

static struct extent_state *alloc_extent_state_atomic(struct extent_state *prealloc)
{
	if (!prealloc)
		prealloc = alloc_extent_state(GFP_ATOMIC);

	return prealloc;
}

void free_extent_state(struct extent_state *state)
{
	if (!state)
		return;
	if (refcount_dec_and_test(&state->refs)) {
		WARN_ON(extent_state_in_tree(state));
		btrfs_leak_debug_del_state(state);
		trace_free_extent_state(state, _RET_IP_);
		kmem_cache_free(extent_state_cache, state);
	}
}

static int add_extent_changeset(struct extent_state *state, u32 bits,
				struct extent_changeset *changeset,
				int set)
{
	int ret;

	if (!changeset)
		return 0;
	if (set && (state->state & bits) == bits)
		return 0;
	if (!set && (state->state & bits) == 0)
		return 0;
	changeset->bytes_changed += state->end - state->start + 1;
	ret = ulist_add(&changeset->range_changed, state->start, state->end,
			GFP_ATOMIC);
	return ret;
}

static inline struct extent_state *next_state(struct extent_state *state)
{
	struct rb_node *next = rb_next(&state->rb_node);

	if (next)
		return rb_entry(next, struct extent_state, rb_node);
	else
		return NULL;
}

static inline struct extent_state *prev_state(struct extent_state *state)
{
	struct rb_node *next = rb_prev(&state->rb_node);

	if (next)
		return rb_entry(next, struct extent_state, rb_node);
	else
		return NULL;
}

/*
 * Search @tree for an entry that contains @offset. Such entry would have
 * entry->start <= offset && entry->end >= offset.
 *
 * @tree:       the tree to search
 * @offset:     offset that should fall within an entry in @tree
 * @node_ret:   pointer where new node should be anchored (used when inserting an
 *	        entry in the tree)
 * @parent_ret: points to entry which would have been the parent of the entry
 *	        (used when inserting an entry in the tree)
 *
 * Return a pointer to the entry that contains @offset byte address and don't change
 * @node_ret and @parent_ret.
 *
 * If no such entry exists, return a pointer to the first entry that ends after
 * @offset (or NULL if everything ends before it) and fill the parameters
 * @node_ret and @parent_ret.
 */
static inline struct extent_state *tree_search_for_insert(struct extent_io_tree *tree,
							  u64 offset,
							  struct rb_node ***node_ret,
							  struct rb_node **parent_ret)
{
	struct rb_root *root = &tree->state;
	struct rb_node **node = &root->rb_node;
	struct rb_node *prev = NULL;
	struct extent_state *entry = NULL;

	while (*node) {
		prev = *node;
		entry = rb_entry(prev, struct extent_state, rb_node);

		if (offset < entry->start)
			node = &(*node)->rb_left;
		else if (offset > entry->end)
			node = &(*node)->rb_right;
		else
			return entry;
	}

	if (node_ret)
		*node_ret = node;
	if (parent_ret)
		*parent_ret = prev;

	/* Search neighbors until we find the first one past the end */
	while (entry && offset > entry->end)
		entry = next_state(entry);

	return entry;
}

/*
 * Search offset in the tree or fill neighbor rbtree node pointers.
 *
 * @tree:      the tree to search
 * @offset:    offset that should fall within an entry in @tree
 * @next_ret:  pointer to the first entry whose range ends after @offset
 * @prev_ret:  pointer to the first entry whose range begins before @offset
 *
 * Return a pointer to the entry that contains @offset byte address. If no
 * such entry exists, then return NULL and fill @prev_ret and @next_ret.
 * Otherwise return the found entry and other pointers are left untouched.
 */
static struct extent_state *tree_search_prev_next(struct extent_io_tree *tree,
						  u64 offset,
						  struct extent_state **prev_ret,
						  struct extent_state **next_ret)
{
	struct rb_root *root = &tree->state;
	struct rb_node **node = &root->rb_node;
	struct extent_state *orig_prev;
	struct extent_state *entry = NULL;

	ASSERT(prev_ret);
	ASSERT(next_ret);

	while (*node) {
		entry = rb_entry(*node, struct extent_state, rb_node);

		if (offset < entry->start)
			node = &(*node)->rb_left;
		else if (offset > entry->end)
			node = &(*node)->rb_right;
		else
			return entry;
	}

	orig_prev = entry;
	while (entry && offset > entry->end)
		entry = next_state(entry);
	*next_ret = entry;
	entry = orig_prev;

	while (entry && offset < entry->start)
		entry = prev_state(entry);
	*prev_ret = entry;

	return NULL;
}

/*
 * Inexact rb-tree search, return the next entry if @offset is not found
 */
static inline struct extent_state *tree_search(struct extent_io_tree *tree, u64 offset)
{
	return tree_search_for_insert(tree, offset, NULL, NULL);
}
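
/*
 * Example of the inexact semantics (illustrative only): with states [0, 4095]
 * and [8192, 12287] in the tree, tree_search(tree, 6000) returns the
 * [8192, 12287] state (the next one), while tree_search(tree, 20000) returns
 * NULL because no state ends at or after that offset.
 */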

static void extent_io_tree_panic(struct extent_io_tree *tree, int err)
{
	btrfs_panic(tree->fs_info, err,
		    "locking error: extent tree was modified by another thread while locked");
}

/*
 * Utility function to look for merge candidates inside a given range.  Any
 * extents with matching state are merged together into a single extent in the
 * tree.  Extents with EXTENT_IO in their state field are not merged because
 * the end_io handlers need to be able to do operations on them without
 * sleeping (or doing allocations/splits).
 *
 * This should be called with the tree lock held.
 */
static void merge_state(struct extent_io_tree *tree, struct extent_state *state)
{
	struct extent_state *other;

	if (state->state & (EXTENT_LOCKED | EXTENT_BOUNDARY))
		return;

	other = prev_state(state);
	if (other && other->end == state->start - 1 &&
	    other->state == state->state) {
		if (tree->inode)
			btrfs_merge_delalloc_extent(tree->inode, state, other);
		state->start = other->start;
		rb_erase(&other->rb_node, &tree->state);
		RB_CLEAR_NODE(&other->rb_node);
		free_extent_state(other);
	}
	other = next_state(state);
	if (other && other->start == state->end + 1 &&
	    other->state == state->state) {
		if (tree->inode)
			btrfs_merge_delalloc_extent(tree->inode, state, other);
		state->end = other->end;
		rb_erase(&other->rb_node, &tree->state);
		RB_CLEAR_NODE(&other->rb_node);
		free_extent_state(other);
	}
}

static void set_state_bits(struct extent_io_tree *tree,
			   struct extent_state *state,
			   u32 bits, struct extent_changeset *changeset)
{
	u32 bits_to_set = bits & ~EXTENT_CTLBITS;
	int ret;

	if (tree->inode)
		btrfs_set_delalloc_extent(tree->inode, state, bits);

	ret = add_extent_changeset(state, bits_to_set, changeset, 1);
	BUG_ON(ret < 0);
	state->state |= bits_to_set;
}

/*
 * Insert an extent_state struct into the tree.  'bits' are set on the
 * struct before it is inserted.
 *
 * This may return -EEXIST if the extent is already there, in which case the
 * state struct is freed.
 *
 * The tree lock is not taken internally.  This is a utility function and
 * probably isn't what you want to call (see set/clear_extent_bit).
 */
static int insert_state(struct extent_io_tree *tree,
			struct extent_state *state,
			u32 bits, struct extent_changeset *changeset)
{
	struct rb_node **node;
	struct rb_node *parent = NULL;
	const u64 end = state->end;

	set_state_bits(tree, state, bits, changeset);

	node = &tree->state.rb_node;
	while (*node) {
		struct extent_state *entry;

		parent = *node;
		entry = rb_entry(parent, struct extent_state, rb_node);

		if (end < entry->start) {
			node = &(*node)->rb_left;
		} else if (end > entry->end) {
			node = &(*node)->rb_right;
		} else {
			btrfs_err(tree->fs_info,
				  "found node %llu %llu on insert of %llu %llu",
				  entry->start, entry->end, state->start, end);
			return -EEXIST;
		}
	}

	rb_link_node(&state->rb_node, parent, node);
	rb_insert_color(&state->rb_node, &tree->state);

	merge_state(tree, state);

	return 0;
}

/*
 * Insert state to @tree to the location given by @node and @parent.
 */
static void insert_state_fast(struct extent_io_tree *tree,
			      struct extent_state *state, struct rb_node **node,
			      struct rb_node *parent, unsigned bits,
			      struct extent_changeset *changeset)
{
	set_state_bits(tree, state, bits, changeset);
	rb_link_node(&state->rb_node, parent, node);
	rb_insert_color(&state->rb_node, &tree->state);
	merge_state(tree, state);
}

/*
 * Split a given extent state struct in two, inserting the preallocated
 * struct 'prealloc' as the newly created second half.  'split' indicates an
 * offset inside 'orig' where it should be split.
 *
 * Before calling, the tree has 'orig' at [orig->start, orig->end].  After
 * calling, there are two extent state structs in the tree:
 *
 * prealloc: [orig->start, split - 1]
 * orig:     [split, orig->end]
 *
 * The tree locks are not taken by this function.  They need to be held by the
 * caller.
 */
static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
		       struct extent_state *prealloc, u64 split)
{
	struct rb_node *parent = NULL;
	struct rb_node **node;

	if (tree->inode)
		btrfs_split_delalloc_extent(tree->inode, orig, split);

	prealloc->start = orig->start;
	prealloc->end = split - 1;
	prealloc->state = orig->state;
	orig->start = split;

	parent = &orig->rb_node;
	node = &parent;
	while (*node) {
		struct extent_state *entry;

		parent = *node;
		entry = rb_entry(parent, struct extent_state, rb_node);

		if (prealloc->end < entry->start) {
			node = &(*node)->rb_left;
		} else if (prealloc->end > entry->end) {
			node = &(*node)->rb_right;
		} else {
			free_extent_state(prealloc);
			return -EEXIST;
		}
	}

	rb_link_node(&prealloc->rb_node, parent, node);
	rb_insert_color(&prealloc->rb_node, &tree->state);

	return 0;
}
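
/*
 * Worked example (illustrative only): splitting orig == [0, 8191] at
 * split == 4096 leaves prealloc == [0, 4095] and orig == [4096, 8191], both
 * carrying the state bits orig had before the split.
 */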

/*
 * Utility function to clear some bits in an extent state struct.  It will
 * optionally wake up anyone waiting on this state (wake == 1).
 *
 * If no bits are set on the state struct after clearing things, the
 * struct is freed and removed from the tree
 */
static struct extent_state *clear_state_bit(struct extent_io_tree *tree,
					    struct extent_state *state,
					    u32 bits, int wake,
					    struct extent_changeset *changeset)
{
	struct extent_state *next;
	u32 bits_to_clear = bits & ~EXTENT_CTLBITS;
	int ret;

	if (tree->inode)
		btrfs_clear_delalloc_extent(tree->inode, state, bits);

	ret = add_extent_changeset(state, bits_to_clear, changeset, 0);
	BUG_ON(ret < 0);
	state->state &= ~bits_to_clear;
	if (wake)
		wake_up(&state->wq);
	if (state->state == 0) {
		next = next_state(state);
		if (extent_state_in_tree(state)) {
			rb_erase(&state->rb_node, &tree->state);
			RB_CLEAR_NODE(&state->rb_node);
			free_extent_state(state);
		} else {
			WARN_ON(1);
		}
	} else {
		merge_state(tree, state);
		next = next_state(state);
	}
	return next;
}

/*
 * Detect if extent bits request NOWAIT semantics and set the gfp mask accordingly,
 * unset the EXTENT_NOWAIT bit.
 */
static void set_gfp_mask_from_bits(u32 *bits, gfp_t *mask)
{
	*mask = (*bits & EXTENT_NOWAIT ? GFP_NOWAIT : GFP_NOFS);
	*bits &= EXTENT_NOWAIT - 1;
}
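
/*
 * For example (illustrative only):
 *
 *	u32 bits = EXTENT_DELALLOC | EXTENT_NOWAIT;
 *	gfp_t mask;
 *
 *	set_gfp_mask_from_bits(&bits, &mask);
 *
 * leaves bits == EXTENT_DELALLOC and mask == GFP_NOWAIT: EXTENT_NOWAIT is the
 * highest extent bit, so masking with EXTENT_NOWAIT - 1 keeps every bit below
 * it and drops the NOWAIT flag itself.
 */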

/*
 * Clear some bits on a range in the tree.  This may require splitting or
 * inserting elements in the tree, so the gfp mask is used to indicate which
 * allocations or sleeping are allowed.
 *
 * Passing EXTENT_LOCKED in @bits kicks any sleepers, and passing
 * EXTENT_CLEAR_ALL_BITS removes the given range from the tree regardless of
 * state (ie for truncate).
 *
 * The range [start, end] is inclusive.
 *
 * This takes the tree lock, and returns 0 on success and < 0 on error.
 */
int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
		       u32 bits, struct extent_state **cached_state,
		       struct extent_changeset *changeset)
{
	struct extent_state *state;
	struct extent_state *cached;
	struct extent_state *prealloc = NULL;
	u64 last_end;
	int err;
	int clear = 0;
	int wake;
	int delete = (bits & EXTENT_CLEAR_ALL_BITS);
	gfp_t mask;

	set_gfp_mask_from_bits(&bits, &mask);
	btrfs_debug_check_extent_io_range(tree, start, end);
	trace_btrfs_clear_extent_bit(tree, start, end - start + 1, bits);

	if (delete)
		bits |= ~EXTENT_CTLBITS;

	if (bits & EXTENT_DELALLOC)
		bits |= EXTENT_NORESERVE;

	wake = (bits & EXTENT_LOCKED) ? 1 : 0;
	if (bits & (EXTENT_LOCKED | EXTENT_BOUNDARY))
		clear = 1;
again:
	if (!prealloc) {
		/*
		 * Don't care for allocation failure here because we might end
		 * up not needing the pre-allocated extent state at all, which
		 * is the case if we only have in the tree extent states that
		 * cover our input range and don't also cover any other range.
		 * If we end up needing a new extent state we allocate it later.
		 */
		prealloc = alloc_extent_state(mask);
	}

	spin_lock(&tree->lock);
	if (cached_state) {
		cached = *cached_state;

		if (clear) {
			*cached_state = NULL;
			cached_state = NULL;
		}

		if (cached && extent_state_in_tree(cached) &&
		    cached->start <= start && cached->end > start) {
			if (clear)
				refcount_dec(&cached->refs);
			state = cached;
			goto hit_next;
		}
		if (clear)
			free_extent_state(cached);
	}

	/* This search will find the extents that end after our range starts. */
	state = tree_search(tree, start);
	if (!state)
		goto out;
hit_next:
	if (state->start > end)
		goto out;
	WARN_ON(state->end < start);
	last_end = state->end;

	/* The state doesn't have the wanted bits, go ahead. */
	if (!(state->state & bits)) {
		state = next_state(state);
		goto next;
	}

	/*
	 *     | ---- desired range ---- |
	 *  | state | or
	 *  | ------------- state -------------- |
	 *
	 * We need to split the extent we found, and may flip bits on second
	 * half.
	 *
	 * If the extent we found extends past our range, we just split and
	 * search again.  It'll get split again the next time though.
	 *
	 * If the extent we found is inside our range, we clear the desired bit
	 * on it.
	 */

	if (state->start < start) {
		prealloc = alloc_extent_state_atomic(prealloc);
		if (!prealloc)
			goto search_again;
		err = split_state(tree, state, prealloc, start);
		if (err)
			extent_io_tree_panic(tree, err);

		prealloc = NULL;
		if (err)
			goto out;
		if (state->end <= end) {
			state = clear_state_bit(tree, state, bits, wake, changeset);
			goto next;
		}
		goto search_again;
	}
	/*
	 * | ---- desired range ---- |
	 *                        | state |
	 * We need to split the extent, and clear the bit on the first half.
	 */
	if (state->start <= end && state->end > end) {
		prealloc = alloc_extent_state_atomic(prealloc);
		if (!prealloc)
			goto search_again;
		err = split_state(tree, state, prealloc, end + 1);
		if (err)
			extent_io_tree_panic(tree, err);

		if (wake)
			wake_up(&state->wq);

		clear_state_bit(tree, prealloc, bits, wake, changeset);

		prealloc = NULL;
		goto out;
	}

	state = clear_state_bit(tree, state, bits, wake, changeset);
next:
	if (last_end == (u64)-1)
		goto out;
	start = last_end + 1;
	if (start <= end && state && !need_resched())
		goto hit_next;

search_again:
	if (start > end)
		goto out;
	spin_unlock(&tree->lock);
	if (gfpflags_allow_blocking(mask))
		cond_resched();
	goto again;

out:
	spin_unlock(&tree->lock);
	if (prealloc)
		free_extent_state(prealloc);

	return 0;
}
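
/*
 * Usage sketch (illustrative only): most callers go through the
 * clear_extent_bit() wrapper from extent-io-tree.h, which passes a NULL
 * changeset, e.g. to drop delalloc accounting bits on a range:
 *
 *	clear_extent_bit(tree, start, end,
 *			 EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING, &cached);
 */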

static void wait_on_state(struct extent_io_tree *tree,
			  struct extent_state *state)
		__releases(tree->lock)
		__acquires(tree->lock)
{
	DEFINE_WAIT(wait);

	prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE);
	spin_unlock(&tree->lock);
	schedule();
	spin_lock(&tree->lock);
	finish_wait(&state->wq, &wait);
}

/*
 * Wait for one or more bits to clear on a range in the state tree.
 * The range [start, end] is inclusive.
 * The tree lock is taken by this function.
 */
void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bits,
		     struct extent_state **cached_state)
{
	struct extent_state *state;

	btrfs_debug_check_extent_io_range(tree, start, end);

	spin_lock(&tree->lock);
again:
	/*
	 * Maintain cached_state, as we may not remove it from the tree if there
	 * are more bits than the bits we're waiting on set on this state.
	 */
	if (cached_state && *cached_state) {
		state = *cached_state;
		if (extent_state_in_tree(state) &&
		    state->start <= start && start < state->end)
			goto process_node;
	}
	while (1) {
		/*
		 * This search will find all the extents that end after our
		 * range starts.
		 */
		state = tree_search(tree, start);
process_node:
		if (!state)
			break;
		if (state->start > end)
			goto out;

		if (state->state & bits) {
			start = state->start;
			refcount_inc(&state->refs);
			wait_on_state(tree, state);
			free_extent_state(state);
			goto again;
		}
		start = state->end + 1;

		if (start > end)
			break;

		if (!cond_resched_lock(&tree->lock)) {
			state = next_state(state);
			goto process_node;
		}
	}
out:
	/* This state is no longer useful, clear it and free it up. */
	if (cached_state && *cached_state) {
		state = *cached_state;
		*cached_state = NULL;
		free_extent_state(state);
	}
	spin_unlock(&tree->lock);
}
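
/*
 * Usage sketch (illustrative only): lock_extent() below uses this to block
 * until a contended range is unlocked before retrying:
 *
 *	wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED, &failed_state);
 */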

static void cache_state_if_flags(struct extent_state *state,
				 struct extent_state **cached_ptr,
				 unsigned flags)
{
	if (cached_ptr && !(*cached_ptr)) {
		if (!flags || (state->state & flags)) {
			*cached_ptr = state;
			refcount_inc(&state->refs);
		}
	}
}

static void cache_state(struct extent_state *state,
			struct extent_state **cached_ptr)
{
	return cache_state_if_flags(state, cached_ptr,
				    EXTENT_LOCKED | EXTENT_BOUNDARY);
}

/*
 * Find the first state struct with 'bits' set after 'start', and return it.
 * tree->lock must be held.  NULL will be returned if nothing was found after
 * 'start'.
 */
static struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree,
							u64 start, u32 bits)
{
	struct extent_state *state;

	/*
	 * This search will find all the extents that end after our range
	 * starts.
	 */
	state = tree_search(tree, start);
	while (state) {
		if (state->end >= start && (state->state & bits))
			return state;
		state = next_state(state);
	}
	return NULL;
}

/*
 * Find the first offset in the io tree with one or more @bits set.
 *
 * Note: If there are multiple bits set in @bits, any of them will match.
 *
 * Return true if we find something, and update @start_ret and @end_ret.
 * Return false if we found nothing.
 */
bool find_first_extent_bit(struct extent_io_tree *tree, u64 start,
			   u64 *start_ret, u64 *end_ret, u32 bits,
			   struct extent_state **cached_state)
{
	struct extent_state *state;
	bool ret = false;

	spin_lock(&tree->lock);
	if (cached_state && *cached_state) {
		state = *cached_state;
		if (state->end == start - 1 && extent_state_in_tree(state)) {
			while ((state = next_state(state)) != NULL) {
				if (state->state & bits)
					goto got_it;
			}
			free_extent_state(*cached_state);
			*cached_state = NULL;
			goto out;
		}
		free_extent_state(*cached_state);
		*cached_state = NULL;
	}

	state = find_first_extent_bit_state(tree, start, bits);
got_it:
	if (state) {
		cache_state_if_flags(state, cached_state, 0);
		*start_ret = state->start;
		*end_ret = state->end;
		ret = true;
	}
out:
	spin_unlock(&tree->lock);
	return ret;
}
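
/*
 * Usage sketch (illustrative only): walking every dirty range from offset 0,
 * where process_range() is a hypothetical helper:
 *
 *	u64 found_start, found_end;
 *	u64 cur = 0;
 *
 *	while (find_first_extent_bit(tree, cur, &found_start, &found_end,
 *				     EXTENT_DIRTY, NULL)) {
 *		process_range(found_start, found_end);
 *		cur = found_end + 1;
 *	}
 */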

/*
 * Find a contiguous area of bits
 *
 * @tree:      io tree to check
 * @start:     offset to start the search from
 * @start_ret: the first offset we found with the bits set
 * @end_ret:   the final contiguous range of the bits that were set
 * @bits:      bits to look for
 *
 * set_extent_bit and clear_extent_bit can temporarily split contiguous ranges
 * to set bits appropriately, and then merge them again.  During this time it
 * will drop the tree->lock, so use this helper if you want to find the actual
 * contiguous area for given bits.  We will search to the first bit we find,
 * and then walk down the tree until we find a non-contiguous area.  The area
 * returned will be the full contiguous area with the bits set.
 */
int find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start,
			       u64 *start_ret, u64 *end_ret, u32 bits)
{
	struct extent_state *state;
	int ret = 1;

	spin_lock(&tree->lock);
	state = find_first_extent_bit_state(tree, start, bits);
	if (state) {
		*start_ret = state->start;
		*end_ret = state->end;
		while ((state = next_state(state)) != NULL) {
			if (state->start > (*end_ret + 1))
				break;
			*end_ret = state->end;
		}
		ret = 0;
	}
	spin_unlock(&tree->lock);
	return ret;
}

/*
 * Find a contiguous range of bytes in the file marked as delalloc, not more
 * than 'max_bytes'.  start and end are used to return the range found.
 *
 * True is returned if we find something, false if nothing was in the tree.
 */
bool btrfs_find_delalloc_range(struct extent_io_tree *tree, u64 *start,
			       u64 *end, u64 max_bytes,
			       struct extent_state **cached_state)
{
	struct extent_state *state;
	u64 cur_start = *start;
	bool found = false;
	u64 total_bytes = 0;

	spin_lock(&tree->lock);

	/*
	 * This search will find all the extents that end after our range
	 * starts.
	 */
	state = tree_search(tree, cur_start);
	if (!state) {
		*end = (u64)-1;
		goto out;
	}

	while (state) {
		if (found && (state->start != cur_start ||
			      (state->state & EXTENT_BOUNDARY))) {
			goto out;
		}
		if (!(state->state & EXTENT_DELALLOC)) {
			if (!found)
				*end = state->end;
			goto out;
		}
		if (!found) {
			*start = state->start;
			*cached_state = state;
			refcount_inc(&state->refs);
		}
		found = true;
		*end = state->end;
		cur_start = state->end + 1;
		total_bytes += state->end - state->start + 1;
		if (total_bytes >= max_bytes)
			break;
		state = next_state(state);
	}
out:
	spin_unlock(&tree->lock);
	return found;
}

/*
 * Set some bits on a range in the tree.  This may require allocations or
 * sleeping.  By default all allocations use GFP_NOFS, use EXTENT_NOWAIT for
 * GFP_NOWAIT.
 *
 * If any of the exclusive bits are set, this will fail with -EEXIST if some
 * part of the range already has the desired bits set.  The extent_state of the
 * existing range is returned in failed_state in this case, and the start of the
 * existing range is returned in failed_start.  failed_state is used as an
 * optimization for wait_extent_bit, failed_start must be used as the source of
 * truth as failed_state may have changed since we returned.
 *
 * [start, end] is inclusive.  This takes the tree lock.
 */
static int __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
			    u32 bits, u64 *failed_start,
			    struct extent_state **failed_state,
			    struct extent_state **cached_state,
			    struct extent_changeset *changeset)
{
	struct extent_state *state;
	struct extent_state *prealloc = NULL;
	struct rb_node **p = NULL;
	struct rb_node *parent = NULL;
	int err = 0;
	u64 last_start;
	u64 last_end;
	u32 exclusive_bits = (bits & EXTENT_LOCKED);
	gfp_t mask;

	set_gfp_mask_from_bits(&bits, &mask);
	btrfs_debug_check_extent_io_range(tree, start, end);
	trace_btrfs_set_extent_bit(tree, start, end - start + 1, bits);

	if (exclusive_bits)
		ASSERT(failed_start);
	else
		ASSERT(failed_start == NULL && failed_state == NULL);
again:
	if (!prealloc) {
		/*
		 * Don't care for allocation failure here because we might end
		 * up not needing the pre-allocated extent state at all, which
		 * is the case if we only have in the tree extent states that
		 * cover our input range and don't also cover any other range.
		 * If we end up needing a new extent state we allocate it later.
		 */
		prealloc = alloc_extent_state(mask);
	}

	spin_lock(&tree->lock);
	if (cached_state && *cached_state) {
		state = *cached_state;
		if (state->start <= start && state->end > start &&
		    extent_state_in_tree(state))
			goto hit_next;
	}
	/*
	 * This search will find all the extents that end after our range
	 * starts.
	 */
	state = tree_search_for_insert(tree, start, &p, &parent);
	if (!state) {
		prealloc = alloc_extent_state_atomic(prealloc);
		if (!prealloc)
			goto search_again;
		prealloc->start = start;
		prealloc->end = end;
		insert_state_fast(tree, prealloc, p, parent, bits, changeset);
		cache_state(prealloc, cached_state);
		prealloc = NULL;
		goto out;
	}
hit_next:
	last_start = state->start;
	last_end = state->end;

	/*
	 * | ---- desired range ---- |
	 * | state |
	 *
	 * Just lock what we found and keep going.
	 */
	if (state->start == start && state->end <= end) {
		if (state->state & exclusive_bits) {
			*failed_start = state->start;
			cache_state(state, failed_state);
			err = -EEXIST;
			goto out;
		}

		set_state_bits(tree, state, bits, changeset);
		cache_state(state, cached_state);
		merge_state(tree, state);
		if (last_end == (u64)-1)
			goto out;
		start = last_end + 1;
		state = next_state(state);
		if (start < end && state && state->start == start &&
		    !need_resched())
			goto hit_next;
		goto search_again;
	}

	/*
	 *     | ---- desired range ---- |
	 * | state |
	 *   or
	 * | ------------- state -------------- |
	 *
	 * We need to split the extent we found, and may flip bits on second
	 * half.
	 *
	 * If the extent we found extends past our range, we just split and
	 * search again.  It'll get split again the next time though.
	 *
	 * If the extent we found is inside our range, we set the desired bit
	 * on it.
	 */
	if (state->start < start) {
		if (state->state & exclusive_bits) {
			*failed_start = start;
			cache_state(state, failed_state);
			err = -EEXIST;
			goto out;
		}

		/*
		 * If this extent already has all the bits we want set, then
		 * skip it, not necessary to split it or do anything with it.
		 */
		if ((state->state & bits) == bits) {
			start = state->end + 1;
			cache_state(state, cached_state);
			goto search_again;
		}

		prealloc = alloc_extent_state_atomic(prealloc);
		if (!prealloc)
			goto search_again;
		err = split_state(tree, state, prealloc, start);
		if (err)
			extent_io_tree_panic(tree, err);

		prealloc = NULL;
		if (err)
			goto out;
		if (state->end <= end) {
			set_state_bits(tree, state, bits, changeset);
			cache_state(state, cached_state);
			merge_state(tree, state);
			if (last_end == (u64)-1)
				goto out;
			start = last_end + 1;
			state = next_state(state);
			if (start < end && state && state->start == start &&
			    !need_resched())
				goto hit_next;
		}
		goto search_again;
	}
	/*
	 * | ---- desired range ---- |
	 *     | state | or               | state |
	 *
	 * There's a hole, we need to insert something in it and ignore the
	 * extent we found.
	 */
	if (state->start > start) {
		u64 this_end;

		if (end < last_start)
			this_end = end;
		else
			this_end = last_start - 1;

		prealloc = alloc_extent_state_atomic(prealloc);
		if (!prealloc)
			goto search_again;

		/*
		 * Avoid to free 'prealloc' if it can be merged with the later
		 * extent.
		 */
		prealloc->start = start;
		prealloc->end = this_end;
		err = insert_state(tree, prealloc, bits, changeset);
		if (err)
			extent_io_tree_panic(tree, err);

		cache_state(prealloc, cached_state);
		prealloc = NULL;
		start = this_end + 1;
		goto search_again;
	}
	/*
	 * | ---- desired range ---- |
	 *                        | state |
	 *
	 * We need to split the extent, and set the bit on the first half
	 */
	if (state->start <= end && state->end > end) {
		if (state->state & exclusive_bits) {
			*failed_start = start;
			cache_state(state, failed_state);
			err = -EEXIST;
			goto out;
		}

		prealloc = alloc_extent_state_atomic(prealloc);
		if (!prealloc)
			goto search_again;
		err = split_state(tree, state, prealloc, end + 1);
		if (err)
			extent_io_tree_panic(tree, err);

		set_state_bits(tree, prealloc, bits, changeset);
		cache_state(prealloc, cached_state);
		merge_state(tree, prealloc);
		prealloc = NULL;
		goto out;
	}

search_again:
	if (start > end)
		goto out;
	spin_unlock(&tree->lock);
	if (gfpflags_allow_blocking(mask))
		cond_resched();
	goto again;

out:
	spin_unlock(&tree->lock);
	if (prealloc)
		free_extent_state(prealloc);

	return err;
}

int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
		   u32 bits, struct extent_state **cached_state)
{
	return __set_extent_bit(tree, start, end, bits, NULL, NULL,
				cached_state, NULL);
}
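
/*
 * Usage sketch (illustrative only): setting a bit while keeping a cached
 * state for a follow-up call on the same range; the caller owns the extra
 * reference and must drop it:
 *
 *	struct extent_state *cached = NULL;
 *
 *	set_extent_bit(tree, start, end, EXTENT_DIRTY, &cached);
 *	...
 *	free_extent_state(cached);
 */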

/*
 * Convert all bits in a given range from one bit to another
 *
 * @tree:	  the io tree to search
 * @start:	  the start offset in bytes
 * @end:	  the end offset in bytes (inclusive)
 * @bits:	  the bits to set in this range
 * @clear_bits:	  the bits to clear in this range
 * @cached_state: state that we're going to cache
 *
 * This will go through and set bits for the given range.  If any states exist
 * already in this range they are set with the given bit and cleared of the
 * clear_bits.  This is only meant to be used by things that are mergeable, ie.
 * converting from say DELALLOC to DIRTY.  This is not meant to be used with
 * boundary bits like LOCK.
 *
 * All allocations are done with GFP_NOFS.
 */
int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
		       u32 bits, u32 clear_bits,
		       struct extent_state **cached_state)
{
	struct extent_state *state;
	struct extent_state *prealloc = NULL;
	struct rb_node **p = NULL;
	struct rb_node *parent = NULL;
	int err = 0;
	u64 last_start;
	u64 last_end;
	bool first_iteration = true;

	btrfs_debug_check_extent_io_range(tree, start, end);
	trace_btrfs_convert_extent_bit(tree, start, end - start + 1, bits,
				       clear_bits);

again:
	if (!prealloc) {
		/*
		 * Best effort, don't worry if extent state allocation fails
		 * here for the first iteration. We might have a cached state
		 * that matches exactly the target range, in which case no
		 * extent state allocations are needed. We'll only know this
		 * after locking the tree.
		 */
		prealloc = alloc_extent_state(GFP_NOFS);
		if (!prealloc && !first_iteration)
			return -ENOMEM;
	}

	spin_lock(&tree->lock);
	if (cached_state && *cached_state) {
		state = *cached_state;
		if (state->start <= start && state->end > start &&
		    extent_state_in_tree(state))
			goto hit_next;
	}

	/*
	 * This search will find all the extents that end after our range
	 * starts.
	 */
	state = tree_search_for_insert(tree, start, &p, &parent);
	if (!state) {
		prealloc = alloc_extent_state_atomic(prealloc);
		if (!prealloc) {
			err = -ENOMEM;
			goto out;
		}
		prealloc->start = start;
		prealloc->end = end;
		insert_state_fast(tree, prealloc, p, parent, bits, NULL);
		cache_state(prealloc, cached_state);
		prealloc = NULL;
		goto out;
	}
hit_next:
	last_start = state->start;
	last_end = state->end;

	/*
	 * | ---- desired range ---- |
	 * | state |
	 *
	 * Just lock what we found and keep going.
	 */
	if (state->start == start && state->end <= end) {
		set_state_bits(tree, state, bits, NULL);
		cache_state(state, cached_state);
		state = clear_state_bit(tree, state, clear_bits, 0, NULL);
		if (last_end == (u64)-1)
			goto out;
		start = last_end + 1;
		if (start < end && state && state->start == start &&
		    !need_resched())
			goto hit_next;
		goto search_again;
	}

	/*
	 *     | ---- desired range ---- |
	 * | state |
	 *   or
	 * | ------------- state -------------- |
	 *
	 * We need to split the extent we found, and may flip bits on second
	 * half.
	 *
	 * If the extent we found extends past our range, we just split and
	 * search again.  It'll get split again the next time though.
	 *
	 * If the extent we found is inside our range, we set the desired bit
	 * on it.
	 */
	if (state->start < start) {
		prealloc = alloc_extent_state_atomic(prealloc);
		if (!prealloc) {
			err = -ENOMEM;
			goto out;
		}
		err = split_state(tree, state, prealloc, start);
		if (err)
			extent_io_tree_panic(tree, err);
		prealloc = NULL;
		if (err)
			goto out;
		if (state->end <= end) {
			set_state_bits(tree, state, bits, NULL);
			cache_state(state, cached_state);
			state = clear_state_bit(tree, state, clear_bits, 0, NULL);
			if (last_end == (u64)-1)
				goto out;
			start = last_end + 1;
			if (start < end && state && state->start == start &&
			    !need_resched())
				goto hit_next;
		}
		goto search_again;
	}
	/*
	 * | ---- desired range ---- |
	 *     | state | or               | state |
	 *
	 * There's a hole, we need to insert something in it and ignore the
	 * extent we found.
	 */
	if (state->start > start) {
		u64 this_end;

		if (end < last_start)
			this_end = end;
		else
			this_end = last_start - 1;

		prealloc = alloc_extent_state_atomic(prealloc);
		if (!prealloc) {
			err = -ENOMEM;
			goto out;
		}

		/*
		 * Avoid to free 'prealloc' if it can be merged with the later
		 * extent.
		 */
		prealloc->start = start;
		prealloc->end = this_end;
		err = insert_state(tree, prealloc, bits, NULL);
		if (err)
			extent_io_tree_panic(tree, err);
		cache_state(prealloc, cached_state);
		prealloc = NULL;
		start = this_end + 1;
		goto search_again;
	}
	/*
	 * | ---- desired range ---- |
	 *                        | state |
	 *
	 * We need to split the extent, and set the bit on the first half.
	 */
	if (state->start <= end && state->end > end) {
		prealloc = alloc_extent_state_atomic(prealloc);
		if (!prealloc) {
			err = -ENOMEM;
			goto out;
		}

		err = split_state(tree, state, prealloc, end + 1);
		if (err)
			extent_io_tree_panic(tree, err);

		set_state_bits(tree, prealloc, bits, NULL);
		cache_state(prealloc, cached_state);
		clear_state_bit(tree, prealloc, clear_bits, 0, NULL);
		prealloc = NULL;
		goto out;
	}

search_again:
	if (start > end)
		goto out;
	spin_unlock(&tree->lock);
	cond_resched();
	first_iteration = false;
	goto again;

out:
	spin_unlock(&tree->lock);
	if (prealloc)
		free_extent_state(prealloc);

	return err;
}
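
/*
 * Usage sketch (illustrative only, modeled on the transaction code): after
 * submitting a batch of dirty ranges for writeback, convert them so a later
 * pass can wait on them:
 *
 *	convert_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT,
 *			   EXTENT_DIRTY, &cached_state);
 */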

/*
 * Find the first range that has @bits not set. This range could start before
 * @start.
 *
 * @tree:      the tree to search
 * @start:     offset at/after which the found extent should start
 * @start_ret: records the beginning of the range
 * @end_ret:   records the end of the range (inclusive)
 * @bits:      the set of bits which must be unset
 *
 * Since unallocated range is also considered one which doesn't have the bits
 * set it's possible that @end_ret contains -1, this happens in case the range
 * spans (last_range_end, end of device]. In this case it's up to the caller to
 * trim @end_ret to the appropriate size.
 */
void find_first_clear_extent_bit(struct extent_io_tree *tree, u64 start,
				 u64 *start_ret, u64 *end_ret, u32 bits)
{
	struct extent_state *state;
	struct extent_state *prev = NULL, *next = NULL;

	spin_lock(&tree->lock);

	/* Find first extent with bits cleared */
	while (1) {
		state = tree_search_prev_next(tree, start, &prev, &next);
		if (!state && !next && !prev) {
			/*
			 * Tree is completely empty, send full range and let
			 * caller deal with it
			 */
			*start_ret = 0;
			*end_ret = -1;
			goto out;
		} else if (!state && !next) {
			/*
			 * We are past the last allocated chunk, set start at
			 * the end of the last extent.
			 */
			*start_ret = prev->end + 1;
			*end_ret = -1;
			goto out;
		} else if (!state) {
			state = next;
		}

		/*
		 * At this point 'state' either contains 'start' or start is
		 * before 'state'
		 */
		if (in_range(start, state->start, state->end - state->start + 1)) {
			if (state->state & bits) {
				/*
				 * |--range with bits sets--|
				 *    |
				 *    start
				 */
				start = state->end + 1;
			} else {
				/*
				 * 'start' falls within a range that doesn't
				 * have the bits set, so take its start as the
				 * beginning of the desired range
				 *
				 * |--range with bits cleared----|
				 *      |
				 *      start
				 */
				*start_ret = state->start;
				break;
			}
		} else {
			/*
			 * |---prev range---|---hole/unset---|---node range---|
			 *                          |
			 *                        start
			 *
			 * or
			 *
			 * |---hole/unset--||--first node--|
			 *        0   |
			 *          start
			 */
			if (prev)
				*start_ret = prev->end + 1;
			else
				*start_ret = 0;
			break;
		}
	}

	/*
	 * Find the longest stretch from start until an entry which has the
	 * bits set
	 */
	while (state) {
		if (state->end >= start && !(state->state & bits)) {
			*end_ret = state->end;
		} else {
			*end_ret = state->start - 1;
			break;
		}
		state = next_state(state);
	}
out:
	spin_unlock(&tree->lock);
}
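
/*
 * Example of the open-ended result (illustrative only): if the last range
 * with @bits set ends at byte N and the search starts past N, the function
 * fills *start_ret == N + 1 and *end_ret == (u64)-1, and the caller is
 * expected to trim *end_ret down to the device size.
 */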

/*
 * Count the number of bytes in the tree that have a given bit(s) set for a
 * given range.
 *
 * @tree:         The io tree to search.
 * @start:        The start offset of the range. This value is updated to the
 *                offset of the first byte found with the given bit(s), so it
 *                can end up being bigger than the initial value.
 * @search_end:   The end offset (inclusive value) of the search range.
 * @max_bytes:    The maximum byte count we are interested in. The search stops
 *                once it reaches this count.
 * @bits:         The bits the range must have in order to be accounted for.
 *                If multiple bits are set, then only subranges that have all
 *                the bits set are accounted for.
 * @contig:       Indicate if we should ignore holes in the range or not. If
 *                this is true, then stop once we find a hole.
 * @cached_state: A cached state to be used across multiple calls to this
 *                function in order to speedup searches. Use NULL if this is
 *                called only once or if each call does not start where the
 *                previous one ended.
 *
 * Returns the total number of bytes found within the given range that have
 * all given bits set. If the returned number of bytes is greater than zero
 * then @start is updated with the offset of the first byte with the bits set.
 */
u64 count_range_bits(struct extent_io_tree *tree,
		     u64 *start, u64 search_end, u64 max_bytes,
		     u32 bits, int contig,
		     struct extent_state **cached_state)
{
	struct extent_state *state = NULL;
	struct extent_state *cached;
	u64 cur_start = *start;
	u64 total_bytes = 0;
	u64 last = 0;
	int found = 0;

	if (WARN_ON(search_end < cur_start))
		return 0;

	spin_lock(&tree->lock);

	if (!cached_state || !*cached_state)
		goto search;

	cached = *cached_state;

	if (!extent_state_in_tree(cached))
		goto search;

	if (cached->start <= cur_start && cur_start <= cached->end) {
		state = cached;
	} else if (cached->start > cur_start) {
		struct extent_state *prev;

		/*
		 * The cached state starts after our search range's start. Check
		 * if the previous state record starts at or before the range we
		 * are looking for, and if so, use it - this is a common case
		 * when there are holes between records in the tree. If there is
		 * no previous state record, we can start from our cached state.
		 */
		prev = prev_state(cached);
		if (!prev)
			state = cached;
		else if (prev->start <= cur_start && cur_start <= prev->end)
			state = prev;
	}

	/*
	 * This search will find all the extents that end after our range
	 * starts.
	 */
search:
	if (!state)
		state = tree_search(tree, cur_start);

	while (state) {
		if (state->start > search_end)
			break;
		if (contig && found && state->start > last + 1)
			break;
		if (state->end >= cur_start && (state->state & bits) == bits) {
			total_bytes += min(search_end, state->end) + 1 -
				       max(cur_start, state->start);
			if (total_bytes >= max_bytes)
				break;
			if (!found) {
				*start = max(cur_start, state->start);
				found = 1;
			}
			last = state->end;
		} else if (contig && found) {
			break;
		}
		state = next_state(state);
	}

	if (cached_state) {
		free_extent_state(*cached_state);
		*cached_state = state;
		if (state)
			refcount_inc(&state->refs);
	}

	spin_unlock(&tree->lock);

	return total_bytes;
}
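
/*
 * Usage sketch (illustrative only): counting how many delalloc bytes exist
 * in a range, not stopping at holes (contig == 0) and without a cache:
 *
 *	u64 first = range_start;
 *	u64 bytes;
 *
 *	bytes = count_range_bits(tree, &first, range_end, (u64)-1,
 *				 EXTENT_DELALLOC, 0, NULL);
 */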

/*
 * Search a range in the state tree for a given mask.  If 'filled' == 1, this
 * returns 1 only if every extent in the tree has the bits set.  Otherwise, 1
 * is returned if any bit in the range is found set.
 */
int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
		   u32 bits, int filled, struct extent_state *cached)
{
	struct extent_state *state = NULL;
	int bitset = 0;

	spin_lock(&tree->lock);
	if (cached && extent_state_in_tree(cached) && cached->start <= start &&
	    cached->end > start)
		state = cached;
	else
		state = tree_search(tree, start);
	while (state && start <= end) {
		if (filled && state->start > start) {
			bitset = 0;
			break;
		}

		if (state->start > end)
			break;

		if (state->state & bits) {
			bitset = 1;
			if (!filled)
				break;
		} else if (filled) {
			bitset = 0;
			break;
		}

		if (state->end == (u64)-1)
			break;

		start = state->end + 1;
		if (start > end)
			break;
		state = next_state(state);
	}

	/* We ran out of states and were still inside of our range. */
	if (filled && !state)
		bitset = 0;
	spin_unlock(&tree->lock);
	return bitset;
}
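
/*
 * Usage sketch (illustrative only): asking whether every byte of a range has
 * a bit set (filled == 1) versus whether any byte does (filled == 0):
 *
 *	whole = test_range_bit(tree, start, end, EXTENT_DEFRAG, 1, NULL);
 *	any = test_range_bit(tree, start, end, EXTENT_DEFRAG, 0, NULL);
 */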

/* Wrappers around set/clear extent bit */
int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
			   u32 bits, struct extent_changeset *changeset)
{
	/*
	 * We don't support EXTENT_LOCKED yet, as current changeset will
	 * record any bits changed, so for EXTENT_LOCKED case, it will
	 * either fail with -EEXIST or changeset will record the whole
	 * range.
	 */
	ASSERT(!(bits & EXTENT_LOCKED));

	return __set_extent_bit(tree, start, end, bits, NULL, NULL, NULL, changeset);
}

int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
			     u32 bits, struct extent_changeset *changeset)
{
	/*
	 * Don't support EXTENT_LOCKED case, same reason as
	 * set_record_extent_bits().
	 */
	ASSERT(!(bits & EXTENT_LOCKED));

	return __clear_extent_bit(tree, start, end, bits, NULL, changeset);
}

int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
		    struct extent_state **cached)
{
	int err;
	u64 failed_start;

	err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, &failed_start,
			       NULL, cached, NULL);
	if (err == -EEXIST) {
		if (failed_start > start)
			clear_extent_bit(tree, start, failed_start - 1,
					 EXTENT_LOCKED, cached);
		return 0;
	}
	return 1;
}

/*
 * Either insert or lock the state struct between start and end.  If the range
 * is already locked, wait for it to be unlocked and then retry.
 */
int lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
		struct extent_state **cached_state)
{
	struct extent_state *failed_state = NULL;
	int err;
	u64 failed_start;

	err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, &failed_start,
			       &failed_state, cached_state, NULL);
	while (err == -EEXIST) {
		if (failed_start != start)
			clear_extent_bit(tree, start, failed_start - 1,
					 EXTENT_LOCKED, cached_state);

		wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED,
				&failed_state);
		err = __set_extent_bit(tree, start, end, EXTENT_LOCKED,
				       &failed_start, &failed_state,
				       cached_state, NULL);
	}
	return err;
}
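
/*
 * Usage sketch (illustrative only): the usual pairing with unlock_extent()
 * (the counterpart declared in extent-io-tree.h) around work on a range:
 *
 *	struct extent_state *cached = NULL;
 *
 *	lock_extent(&inode->io_tree, start, end, &cached);
 *	... operate on [start, end] ...
 *	unlock_extent(&inode->io_tree, start, end, &cached);
 */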

void __cold extent_state_free_cachep(void)
{
	btrfs_extent_state_leak_debug_check();
	kmem_cache_destroy(extent_state_cache);
}

int __init extent_state_init_cachep(void)
{
	extent_state_cache = kmem_cache_create("btrfs_extent_state",
					       sizeof(struct extent_state), 0,
					       SLAB_MEM_SPREAD, NULL);
	if (!extent_state_cache)
		return -ENOMEM;

	return 0;
}