// SPDX-License-Identifier: GPL-2.0

#include <linux/slab.h>
#include <trace/events/btrfs.h>
#include "extent-io-tree.h"
#include "btrfs_inode.h"

static struct kmem_cache *extent_state_cache;

static inline bool extent_state_in_tree(const struct extent_state *state)
{
	return !RB_EMPTY_NODE(&state->rb_node);
}

#ifdef CONFIG_BTRFS_DEBUG
static LIST_HEAD(states);
static DEFINE_SPINLOCK(leak_lock);

static inline void btrfs_leak_debug_add_state(struct extent_state *state)
{
	unsigned long flags;

	spin_lock_irqsave(&leak_lock, flags);
	list_add(&state->leak_list, &states);
	spin_unlock_irqrestore(&leak_lock, flags);
}

static inline void btrfs_leak_debug_del_state(struct extent_state *state)
{
	unsigned long flags;

	spin_lock_irqsave(&leak_lock, flags);
	list_del(&state->leak_list);
	spin_unlock_irqrestore(&leak_lock, flags);
}

static inline void btrfs_extent_state_leak_debug_check(void)
{
	struct extent_state *state;

	while (!list_empty(&states)) {
		state = list_entry(states.next, struct extent_state, leak_list);
		pr_err("BTRFS: state leak: start %llu end %llu state %u in tree %d refs %d\n",
		       state->start, state->end, state->state,
		       extent_state_in_tree(state),
		       refcount_read(&state->refs));
		list_del(&state->leak_list);
		kmem_cache_free(extent_state_cache, state);
	}
}

#define btrfs_debug_check_extent_io_range(tree, start, end)	\
	__btrfs_debug_check_extent_io_range(__func__, (tree), (start), (end))
static inline void __btrfs_debug_check_extent_io_range(const char *caller,
						       struct extent_io_tree *tree,
						       u64 start, u64 end)
{
	struct btrfs_inode *inode = tree->inode;
	u64 isize;

	if (!inode)
		return;

	isize = i_size_read(&inode->vfs_inode);
	if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) {
		btrfs_debug_rl(inode->root->fs_info,
		    "%s: ino %llu isize %llu odd range [%llu,%llu]",
			caller, btrfs_ino(inode), isize, start, end);
	}
}
#else
#define btrfs_leak_debug_add_state(state)		do {} while (0)
#define btrfs_leak_debug_del_state(state)		do {} while (0)
#define btrfs_extent_state_leak_debug_check()		do {} while (0)
#define btrfs_debug_check_extent_io_range(c, s, e)	do {} while (0)
#endif

/*
 * For the file_extent_tree, we want to hold the inode lock when we look up and
 * update the disk_i_size, but lockdep will complain because in the io_tree
 * case we hold the tree lock while taking the inode lock when setting
 * delalloc. These two things are unrelated, so make a separate class for the
 * file_extent_tree so we don't get the two locking patterns mixed up.
 */
static struct lock_class_key file_extent_tree_class;

struct tree_entry {
	u64 start;
	u64 end;
	struct rb_node rb_node;
};

void extent_io_tree_init(struct btrfs_fs_info *fs_info,
			 struct extent_io_tree *tree, unsigned int owner)
{
	tree->fs_info = fs_info;
	tree->state = RB_ROOT;
	spin_lock_init(&tree->lock);
	tree->inode = NULL;
	tree->owner = owner;
	if (owner == IO_TREE_INODE_FILE_EXTENT)
		lockdep_set_class(&tree->lock, &file_extent_tree_class);
}

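/*
 * A minimal usage sketch (hypothetical caller, not part of this file): an
 * inode's main io tree is typically initialized once when the in-memory
 * inode is set up, e.g.:
 *
 *	extent_io_tree_init(fs_info, &inode->io_tree, IO_TREE_INODE_IO);
 *	inode->io_tree.inode = inode;
 *
 * Only the IO_TREE_INODE_FILE_EXTENT owner gets the separate lockdep class
 * set up above.
 */
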
/*
 * Empty an io tree, removing and freeing every extent state record from the
 * tree. This should be called once we are sure no other task can access the
 * tree anymore, so no tree updates happen after we empty the tree and there
 * aren't any waiters on any extent state record (EXTENT_LOCKED bit is never
 * set on any extent state when calling this function).
 */
void extent_io_tree_release(struct extent_io_tree *tree)
{
	spin_lock(&tree->lock);
	/*
	 * Do a single barrier for the waitqueue_active check here, the state
	 * of the waitqueue should not change once extent_io_tree_release is
	 * called.
	 */
	smp_mb();
	while (!RB_EMPTY_ROOT(&tree->state)) {
		struct rb_node *node;
		struct extent_state *state;

		node = rb_first(&tree->state);
		state = rb_entry(node, struct extent_state, rb_node);
		rb_erase(&state->rb_node, &tree->state);
		RB_CLEAR_NODE(&state->rb_node);
		ASSERT(!(state->state & EXTENT_LOCKED));
		ASSERT(!waitqueue_active(&state->wq));
		free_extent_state(state);

		cond_resched_lock(&tree->lock);
	}
	spin_unlock(&tree->lock);
}

static struct extent_state *alloc_extent_state(gfp_t mask)
{
	struct extent_state *state;

	/*
	 * The given mask might be not appropriate for the slab allocator,
	 * drop the unsupported bits
	 */
	mask &= ~(__GFP_DMA32|__GFP_HIGHMEM);
	state = kmem_cache_alloc(extent_state_cache, mask);
	if (!state)
		return state;
	state->state = 0;
	RB_CLEAR_NODE(&state->rb_node);
	btrfs_leak_debug_add_state(state);
	refcount_set(&state->refs, 1);
	init_waitqueue_head(&state->wq);
	trace_alloc_extent_state(state, mask, _RET_IP_);
	return state;
}

static struct extent_state *alloc_extent_state_atomic(struct extent_state *prealloc)
{
	if (!prealloc)
		prealloc = alloc_extent_state(GFP_ATOMIC);

	return prealloc;
}

void free_extent_state(struct extent_state *state)
{
	if (!state)
		return;
	if (refcount_dec_and_test(&state->refs)) {
		WARN_ON(extent_state_in_tree(state));
		btrfs_leak_debug_del_state(state);
		trace_free_extent_state(state, _RET_IP_);
		kmem_cache_free(extent_state_cache, state);
	}
}

static int add_extent_changeset(struct extent_state *state, u32 bits,
				struct extent_changeset *changeset,
				int set)
{
	int ret;

	if (!changeset)
		return 0;
	if (set && (state->state & bits) == bits)
		return 0;
	if (!set && (state->state & bits) == 0)
		return 0;
	changeset->bytes_changed += state->end - state->start + 1;
	ret = ulist_add(&changeset->range_changed, state->start, state->end,
			GFP_ATOMIC);
	return ret;
}

static inline struct extent_state *next_state(struct extent_state *state)
{
	struct rb_node *next = rb_next(&state->rb_node);

	if (next)
		return rb_entry(next, struct extent_state, rb_node);
	else
		return NULL;
}

static inline struct extent_state *prev_state(struct extent_state *state)
{
	struct rb_node *next = rb_prev(&state->rb_node);

	if (next)
		return rb_entry(next, struct extent_state, rb_node);
	else
		return NULL;
}

/*
 * Search @tree for an entry that contains @offset. Such an entry would have
 * entry->start <= offset && entry->end >= offset.
 *
 * @tree:       the tree to search
 * @offset:     offset that should fall within an entry in @tree
 * @node_ret:   pointer where the new node should be anchored (used when
 *              inserting an entry in the tree)
 * @parent_ret: points to the entry which would have been the parent of the
 *              entry containing @offset
 *
 * Return a pointer to the entry that contains @offset byte address and don't
 * change @node_ret and @parent_ret.
 *
 * If no such entry exists, return a pointer to the entry that ends before
 * @offset and fill parameters @node_ret and @parent_ret, ie. does not return
 * NULL.
 */
static inline struct extent_state *tree_search_for_insert(struct extent_io_tree *tree,
							   u64 offset,
							   struct rb_node ***node_ret,
							   struct rb_node **parent_ret)
{
	struct rb_root *root = &tree->state;
	struct rb_node **node = &root->rb_node;
	struct rb_node *prev = NULL;
	struct extent_state *entry = NULL;

	while (*node) {
		prev = *node;
		entry = rb_entry(prev, struct extent_state, rb_node);

		if (offset < entry->start)
			node = &(*node)->rb_left;
		else if (offset > entry->end)
			node = &(*node)->rb_right;
		else
			return entry;
	}

	if (node_ret)
		*node_ret = node;
	if (parent_ret)
		*parent_ret = prev;

	/* Search neighbors until we find the first one past the end */
	while (entry && offset > entry->end)
		entry = next_state(entry);

	return entry;
}

/*
 * Search offset in the tree or fill neighbor rbtree node pointers.
 *
 * @tree:      the tree to search
 * @offset:    offset that should fall within an entry in @tree
 * @next_ret:  pointer to the first entry whose range ends after @offset
 * @prev_ret:  pointer to the first entry whose range begins before @offset
 *
 * Return a pointer to the entry that contains @offset byte address. If no
 * such entry exists, then return NULL and fill @prev_ret and @next_ret.
 * Otherwise return the found entry and other pointers are left untouched.
 */
static struct extent_state *tree_search_prev_next(struct extent_io_tree *tree,
						  u64 offset,
						  struct extent_state **prev_ret,
						  struct extent_state **next_ret)
{
	struct rb_root *root = &tree->state;
	struct rb_node **node = &root->rb_node;
	struct extent_state *orig_prev;
	struct extent_state *entry = NULL;

	ASSERT(prev_ret);
	ASSERT(next_ret);

	while (*node) {
		entry = rb_entry(*node, struct extent_state, rb_node);

		if (offset < entry->start)
			node = &(*node)->rb_left;
		else if (offset > entry->end)
			node = &(*node)->rb_right;
		else
			return entry;
	}

	orig_prev = entry;
	while (entry && offset > entry->end)
		entry = next_state(entry);
	*next_ret = entry;
	entry = orig_prev;

	while (entry && offset < entry->start)
		entry = prev_state(entry);
	*prev_ret = entry;

	return NULL;
}

/*
 * Inexact rb-tree search, return the next entry if @offset is not found
 */
static inline struct extent_state *tree_search(struct extent_io_tree *tree, u64 offset)
{
	return tree_search_for_insert(tree, offset, NULL, NULL);
}

static void extent_io_tree_panic(struct extent_io_tree *tree, int err)
{
	btrfs_panic(tree->fs_info, err,
	"locking error: extent tree was modified by another thread while locked");
}

static void merge_prev_state(struct extent_io_tree *tree, struct extent_state *state)
{
	struct extent_state *prev;

	prev = prev_state(state);
	if (prev && prev->end == state->start - 1 && prev->state == state->state) {
		if (tree->inode)
			btrfs_merge_delalloc_extent(tree->inode, state, prev);
		state->start = prev->start;
		rb_erase(&prev->rb_node, &tree->state);
		RB_CLEAR_NODE(&prev->rb_node);
		free_extent_state(prev);
	}
}

static void merge_next_state(struct extent_io_tree *tree, struct extent_state *state)
{
	struct extent_state *next;

	next = next_state(state);
	if (next && next->start == state->end + 1 && next->state == state->state) {
		if (tree->inode)
			btrfs_merge_delalloc_extent(tree->inode, state, next);
		state->end = next->end;
		rb_erase(&next->rb_node, &tree->state);
		RB_CLEAR_NODE(&next->rb_node);
		free_extent_state(next);
	}
}

/*
 * Utility function to look for merge candidates inside a given range. Any
 * extents with matching state are merged together into a single extent in the
 * tree. Extents with EXTENT_IO in their state field are not merged because
 * the end_io handlers need to be able to do operations on them without
 * sleeping (or doing allocations/splits).
 *
 * This should be called with the tree lock held.
 */
static void merge_state(struct extent_io_tree *tree, struct extent_state *state)
{
	if (state->state & (EXTENT_LOCKED | EXTENT_BOUNDARY))
		return;

	merge_prev_state(tree, state);
	merge_next_state(tree, state);
}

static void set_state_bits(struct extent_io_tree *tree,
			   struct extent_state *state,
			   u32 bits, struct extent_changeset *changeset)
{
	u32 bits_to_set = bits & ~EXTENT_CTLBITS;
	int ret;

	if (tree->inode)
		btrfs_set_delalloc_extent(tree->inode, state, bits);

	ret = add_extent_changeset(state, bits_to_set, changeset, 1);
	BUG_ON(ret < 0);
	state->state |= bits_to_set;
}

/*
 * Insert an extent_state struct into the tree. 'bits' are set on the
 * struct before it is inserted.
 *
 * Returns a pointer to the struct extent_state record containing the range
 * requested for insertion, which may be the same as the given struct or it
 * may be an existing record in the tree that was expanded to accommodate the
 * requested range. In case of an extent_state different from the one that was
 * given, the latter can be freed or reused by the caller.
 *
 * On error it returns an error pointer.
 *
 * The tree lock is not taken internally. This is a utility function and
 * probably isn't what you want to call (see set/clear_extent_bit).
 */
static struct extent_state *insert_state(struct extent_io_tree *tree,
					 struct extent_state *state,
					 u32 bits,
					 struct extent_changeset *changeset)
{
	struct rb_node **node;
	struct rb_node *parent = NULL;
	const u64 start = state->start - 1;
	const u64 end = state->end + 1;
	const bool try_merge = !(bits & (EXTENT_LOCKED | EXTENT_BOUNDARY));

	set_state_bits(tree, state, bits, changeset);

	node = &tree->state.rb_node;
	while (*node) {
		struct extent_state *entry;

		parent = *node;
		entry = rb_entry(parent, struct extent_state, rb_node);

		if (state->end < entry->start) {
			if (try_merge && end == entry->start &&
			    state->state == entry->state) {
				if (tree->inode)
					btrfs_merge_delalloc_extent(tree->inode,
								    state, entry);
				entry->start = state->start;
				merge_prev_state(tree, entry);
				state->state = 0;
				return entry;
			}
			node = &(*node)->rb_left;
		} else if (state->end > entry->end) {
			if (try_merge && entry->end == start &&
			    state->state == entry->state) {
				if (tree->inode)
					btrfs_merge_delalloc_extent(tree->inode,
								    state, entry);
				entry->end = state->end;
				merge_next_state(tree, entry);
				state->state = 0;
				return entry;
			}
			node = &(*node)->rb_right;
		} else {
			btrfs_err(tree->fs_info,
			       "found node %llu %llu on insert of %llu %llu",
			       entry->start, entry->end, state->start, state->end);
			return ERR_PTR(-EEXIST);
		}
	}

	rb_link_node(&state->rb_node, parent, node);
	rb_insert_color(&state->rb_node, &tree->state);

	return state;
}

/*
 * Insert state to @tree to the location given by @node and @parent.
 */
static void insert_state_fast(struct extent_io_tree *tree,
			      struct extent_state *state, struct rb_node **node,
			      struct rb_node *parent, unsigned bits,
			      struct extent_changeset *changeset)
{
	set_state_bits(tree, state, bits, changeset);
	rb_link_node(&state->rb_node, parent, node);
	rb_insert_color(&state->rb_node, &tree->state);
	merge_state(tree, state);
}

/*
 * Split a given extent state struct in two, inserting the preallocated
 * struct 'prealloc' as the newly created second half. 'split' indicates an
 * offset inside 'orig' where it should be split.
 *
 * Before calling, the tree has 'orig' at [orig->start, orig->end]. After
 * calling, there are two extent state structs in the tree:
 * prealloc: [orig->start, split - 1]
 * orig: [ split, orig->end ]
 *
 * The tree locks are not taken by this function. They need to be held
 * by the caller.
 */
static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
		       struct extent_state *prealloc, u64 split)
{
	struct rb_node *parent = NULL;
	struct rb_node **node;

	if (tree->inode)
		btrfs_split_delalloc_extent(tree->inode, orig, split);

	prealloc->start = orig->start;
	prealloc->end = split - 1;
	prealloc->state = orig->state;
	orig->start = split;

	parent = &orig->rb_node;
	node = &parent;
	while (*node) {
		struct extent_state *entry;

		parent = *node;
		entry = rb_entry(parent, struct extent_state, rb_node);

		if (prealloc->end < entry->start) {
			node = &(*node)->rb_left;
		} else if (prealloc->end > entry->end) {
			node = &(*node)->rb_right;
		} else {
			free_extent_state(prealloc);
			return -EEXIST;
		}
	}

	rb_link_node(&prealloc->rb_node, parent, node);
	rb_insert_color(&prealloc->rb_node, &tree->state);

	return 0;
}

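/*
 * Worked example (illustrative numbers only): with 'orig' covering
 * [0, 131071] and split == 65536, after a successful split_state() the tree
 * holds prealloc = [0, 65535] and orig = [65536, 131071], both carrying the
 * same state bits.
 */
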
/*
 * Utility function to clear some bits in an extent state struct. It will
 * optionally wake up anyone waiting on this state (wake == 1).
 *
 * If no bits are set on the state struct after clearing things, the
 * struct is freed and removed from the tree
 */
static struct extent_state *clear_state_bit(struct extent_io_tree *tree,
					    struct extent_state *state,
					    u32 bits, int wake,
					    struct extent_changeset *changeset)
{
	struct extent_state *next;
	u32 bits_to_clear = bits & ~EXTENT_CTLBITS;
	int ret;

	if (tree->inode)
		btrfs_clear_delalloc_extent(tree->inode, state, bits);

	ret = add_extent_changeset(state, bits_to_clear, changeset, 0);
	BUG_ON(ret < 0);
	state->state &= ~bits_to_clear;
	if (wake)
		wake_up(&state->wq);
	if (state->state == 0) {
		next = next_state(state);
		if (extent_state_in_tree(state)) {
			rb_erase(&state->rb_node, &tree->state);
			RB_CLEAR_NODE(&state->rb_node);
			free_extent_state(state);
		} else {
			WARN_ON(1);
		}
	} else {
		merge_state(tree, state);
		next = next_state(state);
	}
	return next;
}

/*
 * Detect if extent bits request NOWAIT semantics and set the gfp mask accordingly,
 * unset the EXTENT_NOWAIT bit.
 */
static void set_gfp_mask_from_bits(u32 *bits, gfp_t *mask)
{
	*mask = (*bits & EXTENT_NOWAIT ? GFP_NOWAIT : GFP_NOFS);
	*bits &= EXTENT_NOWAIT - 1;
}

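/*
 * Example (assumption: EXTENT_NOWAIT is the highest extent bit defined, which
 * is what the "EXTENT_NOWAIT - 1" mask relies on): passing bits of
 * EXTENT_DELALLOC | EXTENT_NOWAIT yields *mask == GFP_NOWAIT and leaves
 * *bits == EXTENT_DELALLOC.
 */
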
/*
 * Clear some bits on a range in the tree. This may require splitting or
 * inserting elements in the tree, so the gfp mask is used to indicate which
 * allocations or sleeping are allowed.
 *
 * Pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove the given
 * range from the tree regardless of state (ie for truncate).
 *
 * The range [start, end] is inclusive.
 *
 * This takes the tree lock, and returns 0 on success and < 0 on error.
 */
int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
		       u32 bits, struct extent_state **cached_state,
		       struct extent_changeset *changeset)
{
	struct extent_state *state;
	struct extent_state *cached;
	struct extent_state *prealloc = NULL;
	u64 last_end;
	int err;
	int clear = 0;
	int wake;
	int delete = (bits & EXTENT_CLEAR_ALL_BITS);
	gfp_t mask;

	set_gfp_mask_from_bits(&bits, &mask);
	btrfs_debug_check_extent_io_range(tree, start, end);
	trace_btrfs_clear_extent_bit(tree, start, end - start + 1, bits);

	if (delete)
		bits |= ~EXTENT_CTLBITS;

	if (bits & EXTENT_DELALLOC)
		bits |= EXTENT_NORESERVE;

	wake = (bits & EXTENT_LOCKED) ? 1 : 0;
	if (bits & (EXTENT_LOCKED | EXTENT_BOUNDARY))
		clear = 1;
again:
	if (!prealloc) {
		/*
		 * Don't care for allocation failure here because we might end
		 * up not needing the pre-allocated extent state at all, which
		 * is the case if we only have in the tree extent states that
		 * cover our input range and don't cover any other range.
		 * If we end up needing a new extent state we allocate it later.
		 */
		prealloc = alloc_extent_state(mask);
	}

	spin_lock(&tree->lock);
	if (cached_state) {
		cached = *cached_state;

		if (clear) {
			*cached_state = NULL;
			cached_state = NULL;
		}

		if (cached && extent_state_in_tree(cached) &&
		    cached->start <= start && cached->end > start) {
			if (clear)
				refcount_dec(&cached->refs);
			state = cached;
			goto hit_next;
		}
		if (clear)
			free_extent_state(cached);
	}

	/* This search will find the extents that end after our range starts. */
	state = tree_search(tree, start);
	if (!state)
		goto out;
hit_next:
	if (state->start > end)
		goto out;
	WARN_ON(state->end < start);
	last_end = state->end;

	/* The state doesn't have the wanted bits, go ahead. */
	if (!(state->state & bits)) {
		state = next_state(state);
		goto next;
	}

	/*
	 *     | ---- desired range ---- |
	 *  | state | or
	 *  | ------------- state -------------- |
	 *
	 * We need to split the extent we found, and may flip bits on second
	 * half.
	 *
	 * If the extent we found extends past our range, we just split and
	 * search again. It'll get split again the next time though.
	 *
	 * If the extent we found is inside our range, we clear the desired bit
	 * on it.
	 */
	if (state->start < start) {
		prealloc = alloc_extent_state_atomic(prealloc);
		if (!prealloc)
			goto search_again;
		err = split_state(tree, state, prealloc, start);
		if (err)
			extent_io_tree_panic(tree, err);

		prealloc = NULL;
		if (err)
			goto out;
		if (state->end <= end) {
			state = clear_state_bit(tree, state, bits, wake, changeset);
			goto next;
		}
		goto search_again;
	}
	/*
	 * | ---- desired range ---- |
	 *                        | state |
	 * We need to split the extent, and clear the bit on the first half.
	 */
	if (state->start <= end && state->end > end) {
		prealloc = alloc_extent_state_atomic(prealloc);
		if (!prealloc)
			goto search_again;
		err = split_state(tree, state, prealloc, end + 1);
		if (err)
			extent_io_tree_panic(tree, err);

		if (wake)
			wake_up(&state->wq);

		clear_state_bit(tree, prealloc, bits, wake, changeset);

		prealloc = NULL;
		goto out;
	}

	state = clear_state_bit(tree, state, bits, wake, changeset);
next:
	if (last_end == (u64)-1)
		goto out;
	start = last_end + 1;
	if (start <= end && state && !need_resched())
		goto hit_next;

search_again:
	if (start > end)
		goto out;
	spin_unlock(&tree->lock);
	if (gfpflags_allow_blocking(mask))
		cond_resched();
	goto again;

out:
	spin_unlock(&tree->lock);
	if (prealloc)
		free_extent_state(prealloc);

	return 0;
}

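/*
 * A minimal usage sketch (hypothetical caller, not part of this file): most
 * callers go through the clear_extent_bit()/clear_extent_bits() wrappers in
 * extent-io-tree.h, e.g. dropping delalloc accounting bits over a range:
 *
 *	clear_extent_bit(&inode->io_tree, start, end,
 *			 EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING, NULL);
 *
 * A cached extent state from a previous lookup can be passed instead of NULL
 * to skip the initial tree search.
 */
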
static void wait_on_state(struct extent_io_tree *tree,
			  struct extent_state *state)
		__releases(tree->lock)
		__acquires(tree->lock)
{
	DEFINE_WAIT(wait);

	prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE);
	spin_unlock(&tree->lock);
	schedule();
	spin_lock(&tree->lock);
	finish_wait(&state->wq, &wait);
}

/*
 * Wait for one or more bits to clear on a range in the state tree.
 * The range [start, end] is inclusive.
 * The tree lock is taken by this function.
 */
void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bits,
		     struct extent_state **cached_state)
{
	struct extent_state *state;

	btrfs_debug_check_extent_io_range(tree, start, end);

	spin_lock(&tree->lock);
again:
	/*
	 * Maintain cached_state, as we may not remove it from the tree if there
	 * are more bits than the bits we're waiting on set on this state.
	 */
	if (cached_state && *cached_state) {
		state = *cached_state;
		if (extent_state_in_tree(state) &&
		    state->start <= start && start < state->end)
			goto process_node;
	}
	while (1) {
		/*
		 * This search will find all the extents that end after our
		 * range starts.
		 */
		state = tree_search(tree, start);
process_node:
		if (!state)
			break;
		if (state->start > end)
			goto out;

		if (state->state & bits) {
			start = state->start;
			refcount_inc(&state->refs);
			wait_on_state(tree, state);
			free_extent_state(state);
			goto again;
		}
		start = state->end + 1;

		if (start > end)
			break;

		if (!cond_resched_lock(&tree->lock)) {
			state = next_state(state);
			goto process_node;
		}
	}
out:
	/* This state is no longer useful, clear it and free it up. */
	if (cached_state && *cached_state) {
		state = *cached_state;
		*cached_state = NULL;
		free_extent_state(state);
	}
	spin_unlock(&tree->lock);
}

static void cache_state_if_flags(struct extent_state *state,
				 struct extent_state **cached_ptr,
				 unsigned flags)
{
	if (cached_ptr && !(*cached_ptr)) {
		if (!flags || (state->state & flags)) {
			*cached_ptr = state;
			refcount_inc(&state->refs);
		}
	}
}

static void cache_state(struct extent_state *state,
			struct extent_state **cached_ptr)
{
	return cache_state_if_flags(state, cached_ptr,
				    EXTENT_LOCKED | EXTENT_BOUNDARY);
}

/*
 * Find the first state struct with 'bits' set after 'start', and return it.
 * tree->lock must be held. NULL will be returned if nothing was found after
 * 'start'.
 */
static struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree,
							u64 start, u32 bits)
{
	struct extent_state *state;

	/*
	 * This search will find all the extents that end after our range
	 * starts.
	 */
	state = tree_search(tree, start);
	while (state) {
		if (state->end >= start && (state->state & bits))
			return state;
		state = next_state(state);
	}
	return NULL;
}

/*
 * Find the first offset in the io tree with one or more @bits set.
 *
 * Note: If there are multiple bits set in @bits, any of them will match.
 *
 * Return true if we find something, and update @start_ret and @end_ret.
 * Return false if we found nothing.
 */
bool find_first_extent_bit(struct extent_io_tree *tree, u64 start,
			   u64 *start_ret, u64 *end_ret, u32 bits,
			   struct extent_state **cached_state)
{
	struct extent_state *state;
	bool ret = false;

	spin_lock(&tree->lock);
	if (cached_state && *cached_state) {
		state = *cached_state;
		if (state->end == start - 1 && extent_state_in_tree(state)) {
			while ((state = next_state(state)) != NULL) {
				if (state->state & bits)
					goto got_it;
			}
			free_extent_state(*cached_state);
			*cached_state = NULL;
			goto out;
		}
		free_extent_state(*cached_state);
		*cached_state = NULL;
	}

	state = find_first_extent_bit_state(tree, start, bits);
got_it:
	if (state) {
		cache_state_if_flags(state, cached_state, 0);
		*start_ret = state->start;
		*end_ret = state->end;
		ret = true;
	}
out:
	spin_unlock(&tree->lock);
	return ret;
}

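/*
 * A minimal usage sketch (hypothetical caller, not part of this file):
 * iterating over all ranges that have a bit set, reusing the cached state
 * between calls:
 *
 *	struct extent_state *cached = NULL;
 *	u64 cur = 0, found_start, found_end;
 *
 *	while (find_first_extent_bit(tree, cur, &found_start, &found_end,
 *				     EXTENT_DIRTY, &cached)) {
 *		// process [found_start, found_end] ...
 *		cur = found_end + 1;
 *	}
 *	free_extent_state(cached);
 */
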
/*
 * Find a contiguous area of bits
 *
 * @tree:      io tree to check
 * @start:     offset to start the search from
 * @start_ret: the first offset we found with the bits set
 * @end_ret:   the final contiguous range of the bits that were set
 * @bits:      bits to look for
 *
 * set_extent_bit and clear_extent_bit can temporarily split contiguous ranges
 * to set bits appropriately, and then merge them again. During this time it
 * will drop the tree->lock, so use this helper if you want to find the actual
 * contiguous area for given bits. We will search to the first bit we find, and
 * then walk down the tree until we find a non-contiguous area. The area
 * returned will be the full contiguous area with the bits set.
 */
int find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start,
			       u64 *start_ret, u64 *end_ret, u32 bits)
{
	struct extent_state *state;
	int ret = 1;

	spin_lock(&tree->lock);
	state = find_first_extent_bit_state(tree, start, bits);
	if (state) {
		*start_ret = state->start;
		*end_ret = state->end;
		while ((state = next_state(state)) != NULL) {
			if (state->start > (*end_ret + 1))
				break;
			*end_ret = state->end;
		}
		ret = 0;
	}
	spin_unlock(&tree->lock);
	return ret;
}

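/*
 * A minimal usage sketch (hypothetical caller, not part of this file):
 *
 *	u64 found_start, found_end;
 *
 *	if (find_contiguous_extent_bit(tree, start, &found_start, &found_end,
 *				       EXTENT_DIRTY) == 0) {
 *		// [found_start, found_end] is the full contiguous area with
 *		// EXTENT_DIRTY set
 *	}
 *
 * Note the 0-on-success return convention here, unlike the bool returned by
 * find_first_extent_bit() above.
 */
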
/*
 * Find a contiguous range of bytes in the file marked as delalloc, not more
 * than 'max_bytes'. start and end are used to return the range.
 *
 * True is returned if we find something, false if nothing was in the tree.
 */
bool btrfs_find_delalloc_range(struct extent_io_tree *tree, u64 *start,
			       u64 *end, u64 max_bytes,
			       struct extent_state **cached_state)
{
	struct extent_state *state;
	u64 cur_start = *start;
	bool found = false;
	u64 total_bytes = 0;

	spin_lock(&tree->lock);

	/*
	 * This search will find all the extents that end after our range
	 * starts.
	 */
	state = tree_search(tree, cur_start);
	if (!state) {
		*end = (u64)-1;
		goto out;
	}

	while (state) {
		if (found && (state->start != cur_start ||
			      (state->state & EXTENT_BOUNDARY))) {
			goto out;
		}
		if (!(state->state & EXTENT_DELALLOC)) {
			if (!found)
				*end = state->end;
			goto out;
		}
		if (!found) {
			*start = state->start;
			*cached_state = state;
			refcount_inc(&state->refs);
		}
		found = true;
		*end = state->end;
		cur_start = state->end + 1;
		total_bytes += state->end - state->start + 1;
		if (total_bytes >= max_bytes)
			break;
		state = next_state(state);
	}
out:
	spin_unlock(&tree->lock);
	return found;
}

/*
 * Set some bits on a range in the tree. This may require allocations or
 * sleeping. By default all allocations use GFP_NOFS, use EXTENT_NOWAIT for
 * GFP_NOWAIT.
 *
 * If any of the exclusive bits are set, this will fail with -EEXIST if some
 * part of the range already has the desired bits set. The extent_state of the
 * existing range is returned in failed_state in this case, and the start of
 * the existing range is returned in failed_start. failed_state is used as an
 * optimization for wait_extent_bit, failed_start must be used as the source of
 * truth as failed_state may have changed since we returned.
 *
 * [start, end] is inclusive. This takes the tree lock.
 */
static int __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
			    u32 bits, u64 *failed_start,
			    struct extent_state **failed_state,
			    struct extent_state **cached_state,
			    struct extent_changeset *changeset)
{
	struct extent_state *state;
	struct extent_state *prealloc = NULL;
	struct rb_node **p = NULL;
	struct rb_node *parent = NULL;
	int err = 0;
	u64 last_start;
	u64 last_end;
	u32 exclusive_bits = (bits & EXTENT_LOCKED);
	gfp_t mask;

	set_gfp_mask_from_bits(&bits, &mask);
	btrfs_debug_check_extent_io_range(tree, start, end);
	trace_btrfs_set_extent_bit(tree, start, end - start + 1, bits);

	if (exclusive_bits)
		ASSERT(failed_start);
	else
		ASSERT(failed_start == NULL && failed_state == NULL);
again:
	if (!prealloc) {
		/*
		 * Don't care for allocation failure here because we might end
		 * up not needing the pre-allocated extent state at all, which
		 * is the case if we only have in the tree extent states that
		 * cover our input range and don't cover any other range.
		 * If we end up needing a new extent state we allocate it later.
		 */
		prealloc = alloc_extent_state(mask);
	}

	spin_lock(&tree->lock);
	if (cached_state && *cached_state) {
		state = *cached_state;
		if (state->start <= start && state->end > start &&
		    extent_state_in_tree(state))
			goto hit_next;
	}
	/*
	 * This search will find all the extents that end after our range
	 * starts.
	 */
	state = tree_search_for_insert(tree, start, &p, &parent);
	if (!state) {
		prealloc = alloc_extent_state_atomic(prealloc);
		if (!prealloc)
			goto search_again;
		prealloc->start = start;
		prealloc->end = end;
		insert_state_fast(tree, prealloc, p, parent, bits, changeset);
		cache_state(prealloc, cached_state);
		prealloc = NULL;
		goto out;
	}
hit_next:
	last_start = state->start;
	last_end = state->end;

	/*
	 * | ---- desired range ---- |
	 * | state |
	 *
	 * Just lock what we found and keep going
	 */
	if (state->start == start && state->end <= end) {
		if (state->state & exclusive_bits) {
			*failed_start = state->start;
			cache_state(state, failed_state);
			err = -EEXIST;
			goto out;
		}

		set_state_bits(tree, state, bits, changeset);
		cache_state(state, cached_state);
		merge_state(tree, state);
		if (last_end == (u64)-1)
			goto out;
		start = last_end + 1;
		state = next_state(state);
		if (start < end && state && state->start == start &&
		    !need_resched())
			goto hit_next;
		goto search_again;
	}

	/*
	 *     | ---- desired range ---- |
	 * | state |
	 *   or
	 * | ------------- state -------------- |
	 *
	 * We need to split the extent we found, and may flip bits on second
	 * half.
	 *
	 * If the extent we found extends past our range, we just split and
	 * search again. It'll get split again the next time though.
	 *
	 * If the extent we found is inside our range, we set the desired bit
	 * on it.
	 */
	if (state->start < start) {
		if (state->state & exclusive_bits) {
			*failed_start = start;
			cache_state(state, failed_state);
			err = -EEXIST;
			goto out;
		}

		/*
		 * If this extent already has all the bits we want set, then
		 * skip it, not necessary to split it or do anything with it.
		 */
		if ((state->state & bits) == bits) {
			start = state->end + 1;
			cache_state(state, cached_state);
			goto search_again;
		}

		prealloc = alloc_extent_state_atomic(prealloc);
		if (!prealloc)
			goto search_again;
		err = split_state(tree, state, prealloc, start);
		if (err)
			extent_io_tree_panic(tree, err);

		prealloc = NULL;
		if (err)
			goto out;
		if (state->end <= end) {
			set_state_bits(tree, state, bits, changeset);
			cache_state(state, cached_state);
			merge_state(tree, state);
			if (last_end == (u64)-1)
				goto out;
			start = last_end + 1;
			state = next_state(state);
			if (start < end && state && state->start == start &&
			    !need_resched())
				goto hit_next;
		}
		goto search_again;
	}
	/*
	 * | ---- desired range ---- |
	 *     | state | or               | state |
	 *
	 * There's a hole, we need to insert something in it and ignore the
	 * extent we found.
	 */
	if (state->start > start) {
		u64 this_end;
		struct extent_state *inserted_state;

		if (end < last_start)
			this_end = end;
		else
			this_end = last_start - 1;

		prealloc = alloc_extent_state_atomic(prealloc);
		if (!prealloc)
			goto search_again;

		/*
		 * Avoid freeing 'prealloc' if it can be merged with the next
		 * extent.
		 */
		prealloc->start = start;
		prealloc->end = this_end;
		inserted_state = insert_state(tree, prealloc, bits, changeset);
		if (IS_ERR(inserted_state)) {
			err = PTR_ERR(inserted_state);
			extent_io_tree_panic(tree, err);
		}

		cache_state(inserted_state, cached_state);
		if (inserted_state == prealloc)
			prealloc = NULL;
		start = this_end + 1;
		goto search_again;
	}
	/*
	 * | ---- desired range ---- |
	 *                        | state |
	 *
	 * We need to split the extent, and set the bit on the first half
	 */
	if (state->start <= end && state->end > end) {
		if (state->state & exclusive_bits) {
			*failed_start = start;
			cache_state(state, failed_state);
			err = -EEXIST;
			goto out;
		}

		prealloc = alloc_extent_state_atomic(prealloc);
		if (!prealloc)
			goto search_again;
		err = split_state(tree, state, prealloc, end + 1);
		if (err)
			extent_io_tree_panic(tree, err);

		set_state_bits(tree, prealloc, bits, changeset);
		cache_state(prealloc, cached_state);
		merge_state(tree, prealloc);
		prealloc = NULL;
	}
	goto out;

search_again:
	if (start > end)
		goto out;
	spin_unlock(&tree->lock);
	if (gfpflags_allow_blocking(mask))
		cond_resched();
	goto again;

out:
	spin_unlock(&tree->lock);
	if (prealloc)
		free_extent_state(prealloc);

	return err;
}

int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
		   u32 bits, struct extent_state **cached_state)
{
	return __set_extent_bit(tree, start, end, bits, NULL, NULL,
				cached_state, NULL);
}

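/*
 * A minimal usage sketch (hypothetical caller, not part of this file):
 * tagging a byte range, optionally without sleeping allocations:
 *
 *	struct extent_state *cached = NULL;
 *
 *	set_extent_bit(&inode->io_tree, start, end, EXTENT_DELALLOC, &cached);
 *
 * Adding EXTENT_NOWAIT to the bits makes the internal extent state
 * allocations use GFP_NOWAIT instead of GFP_NOFS (see
 * set_gfp_mask_from_bits() above).
 */
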
/*
 * Convert all bits in a given range from one bit to another
 *
 * @tree:         the io tree to search
 * @start:        the start offset in bytes
 * @end:          the end offset in bytes (inclusive)
 * @bits:         the bits to set in this range
 * @clear_bits:   the bits to clear in this range
 * @cached_state: state that we're going to cache
 *
 * This will go through and set bits for the given range. If any states exist
 * already in this range they are set with the given bit and cleared of the
 * clear_bits. This is only meant to be used by things that are mergeable, ie.
 * converting from say DELALLOC to DIRTY. This is not meant to be used with
 * boundary bits like LOCK.
 *
 * All allocations are done with GFP_NOFS.
 */
int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
		       u32 bits, u32 clear_bits,
		       struct extent_state **cached_state)
{
	struct extent_state *state;
	struct extent_state *prealloc = NULL;
	struct rb_node **p = NULL;
	struct rb_node *parent = NULL;
	int err = 0;
	u64 last_start;
	u64 last_end;
	bool first_iteration = true;

	btrfs_debug_check_extent_io_range(tree, start, end);
	trace_btrfs_convert_extent_bit(tree, start, end - start + 1, bits,
				       clear_bits);

again:
	if (!prealloc) {
		/*
		 * Best effort, don't worry if extent state allocation fails
		 * here for the first iteration. We might have a cached state
		 * that matches exactly the target range, in which case no
		 * extent state allocations are needed. We'll only know this
		 * after locking the tree.
		 */
		prealloc = alloc_extent_state(GFP_NOFS);
		if (!prealloc && !first_iteration)
			return -ENOMEM;
	}

	spin_lock(&tree->lock);
	if (cached_state && *cached_state) {
		state = *cached_state;
		if (state->start <= start && state->end > start &&
		    extent_state_in_tree(state))
			goto hit_next;
	}

	/*
	 * This search will find all the extents that end after our range
	 * starts.
	 */
	state = tree_search_for_insert(tree, start, &p, &parent);
	if (!state) {
		prealloc = alloc_extent_state_atomic(prealloc);
		if (!prealloc) {
			err = -ENOMEM;
			goto out;
		}
		prealloc->start = start;
		prealloc->end = end;
		insert_state_fast(tree, prealloc, p, parent, bits, NULL);
		cache_state(prealloc, cached_state);
		prealloc = NULL;
		goto out;
	}
hit_next:
	last_start = state->start;
	last_end = state->end;

	/*
	 * | ---- desired range ---- |
	 * | state |
	 *
	 * Just lock what we found and keep going.
	 */
	if (state->start == start && state->end <= end) {
		set_state_bits(tree, state, bits, NULL);
		cache_state(state, cached_state);
		state = clear_state_bit(tree, state, clear_bits, 0, NULL);
		if (last_end == (u64)-1)
			goto out;
		start = last_end + 1;
		if (start < end && state && state->start == start &&
		    !need_resched())
			goto hit_next;
		goto search_again;
	}

	/*
	 *     | ---- desired range ---- |
	 * | state |
	 *   or
	 * | ------------- state -------------- |
	 *
	 * We need to split the extent we found, and may flip bits on second
	 * half.
	 *
	 * If the extent we found extends past our range, we just split and
	 * search again. It'll get split again the next time though.
	 *
	 * If the extent we found is inside our range, we set the desired bit
	 * on it.
	 */
	if (state->start < start) {
		prealloc = alloc_extent_state_atomic(prealloc);
		if (!prealloc) {
			err = -ENOMEM;
			goto out;
		}
		err = split_state(tree, state, prealloc, start);
		if (err)
			extent_io_tree_panic(tree, err);
		prealloc = NULL;
		if (err)
			goto out;
		if (state->end <= end) {
			set_state_bits(tree, state, bits, NULL);
			cache_state(state, cached_state);
			state = clear_state_bit(tree, state, clear_bits, 0, NULL);
			if (last_end == (u64)-1)
				goto out;
			start = last_end + 1;
			if (start < end && state && state->start == start &&
			    !need_resched())
				goto hit_next;
		}
		goto search_again;
	}
	/*
	 * | ---- desired range ---- |
	 *     | state | or               | state |
	 *
	 * There's a hole, we need to insert something in it and ignore the
	 * extent we found.
	 */
	if (state->start > start) {
		u64 this_end;
		struct extent_state *inserted_state;

		if (end < last_start)
			this_end = end;
		else
			this_end = last_start - 1;

		prealloc = alloc_extent_state_atomic(prealloc);
		if (!prealloc) {
			err = -ENOMEM;
			goto out;
		}

		/*
		 * Avoid freeing 'prealloc' if it can be merged with the next
		 * extent.
		 */
		prealloc->start = start;
		prealloc->end = this_end;
		inserted_state = insert_state(tree, prealloc, bits, NULL);
		if (IS_ERR(inserted_state)) {
			err = PTR_ERR(inserted_state);
			extent_io_tree_panic(tree, err);
		}
		cache_state(inserted_state, cached_state);
		if (inserted_state == prealloc)
			prealloc = NULL;
		start = this_end + 1;
		goto search_again;
	}
	/*
	 * | ---- desired range ---- |
	 *                        | state |
	 *
	 * We need to split the extent, and set the bit on the first half.
	 */
	if (state->start <= end && state->end > end) {
		prealloc = alloc_extent_state_atomic(prealloc);
		if (!prealloc) {
			err = -ENOMEM;
			goto out;
		}

		err = split_state(tree, state, prealloc, end + 1);
		if (err)
			extent_io_tree_panic(tree, err);

		set_state_bits(tree, prealloc, bits, NULL);
		cache_state(prealloc, cached_state);
		clear_state_bit(tree, prealloc, clear_bits, 0, NULL);
		prealloc = NULL;
	}
	goto out;

search_again:
	if (start > end)
		goto out;
	spin_unlock(&tree->lock);
	cond_resched();
	first_iteration = false;
	goto again;

out:
	spin_unlock(&tree->lock);
	if (prealloc)
		free_extent_state(prealloc);

	return err;
}

/*
 * Find the first range that has @bits not set. This range could start before
 * @start.
 *
 * @tree:      the tree to search
 * @start:     offset at/after which the found extent should start
 * @start_ret: records the beginning of the range
 * @end_ret:   records the end of the range (inclusive)
 * @bits:      the set of bits which must be unset
 *
 * Since unallocated range is also considered one which doesn't have the bits
 * set it's possible that @end_ret contains -1, this happens in case the range
 * spans (last_range_end, end of device]. In this case it's up to the caller to
 * trim @end_ret to the appropriate size.
 */
void find_first_clear_extent_bit(struct extent_io_tree *tree, u64 start,
				 u64 *start_ret, u64 *end_ret, u32 bits)
{
	struct extent_state *state;
	struct extent_state *prev = NULL, *next = NULL;

	spin_lock(&tree->lock);

	/* Find first extent with bits cleared */
	while (1) {
		state = tree_search_prev_next(tree, start, &prev, &next);
		if (!state && !next && !prev) {
			/*
			 * Tree is completely empty, send full range and let
			 * caller deal with it
			 */
			*start_ret = 0;
			*end_ret = -1;
			goto out;
		} else if (!state && !next) {
			/*
			 * We are past the last allocated chunk, set start at
			 * the end of the last extent.
			 */
			*start_ret = prev->end + 1;
			*end_ret = -1;
			goto out;
		} else if (!state) {
			state = next;
		}

		/*
		 * At this point 'state' either contains 'start' or start is
		 * before 'state'
		 */
		if (in_range(start, state->start, state->end - state->start + 1)) {
			if (state->state & bits) {
				/*
				 * |--range with bits sets--|
				 *    |
				 *    start
				 */
				start = state->end + 1;
			} else {
				/*
				 * 'start' falls within a range that doesn't
				 * have the bits set, so take its start as the
				 * beginning of the desired range
				 *
				 * |--range with bits cleared----|
				 *      |
				 *      start
				 */
				*start_ret = state->start;
				break;
			}
		} else {
			/*
			 * |---prev range---|---hole/unset---|---node range---|
			 *                          |
			 *                        start
			 *
			 * or
			 *
			 * |---hole/unset--||--first node--|
			 *        |
			 *        start
			 */
			if (prev)
				*start_ret = prev->end + 1;
			else
				*start_ret = 0;
			break;
		}
	}

	/*
	 * Find the longest stretch from start until an entry which has the
	 * bits set
	 */
	while (state) {
		if (state->end >= start && !(state->state & bits)) {
			*end_ret = state->end;
		} else {
			*end_ret = state->start - 1;
			break;
		}
		state = next_state(state);
	}
out:
	spin_unlock(&tree->lock);
}

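/*
 * Worked example (illustrative numbers only, assuming the tree holds a single
 * record [64K, 128K - 1] with @bits set): a search starting at 192K returns
 * *start_ret == 128K (right after the last set range, so before @start) and
 * *end_ret == -1, because nothing with the bits set follows; the caller is
 * expected to trim the end as described above.
 */
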
/*
 * Count the number of bytes in the tree that have a given bit(s) set for a
 * given range.
 *
 * @tree:         The io tree to search.
 * @start:        The start offset of the range. This value is updated to the
 *                offset of the first byte found with the given bit(s), so it
 *                can end up being bigger than the initial value.
 * @search_end:   The end offset (inclusive value) of the search range.
 * @max_bytes:    The maximum byte count we are interested in. The search stops
 *                once it reaches this count.
 * @bits:         The bits the range must have in order to be accounted for.
 *                If multiple bits are set, then only subranges that have all
 *                the bits set are accounted for.
 * @contig:       Indicate if we should ignore holes in the range or not. If
 *                this is true, then stop once we find a hole.
 * @cached_state: A cached state to be used across multiple calls to this
 *                function in order to speed up searches. Use NULL if this is
 *                called only once or if each call does not start where the
 *                previous one ended.
 *
 * Returns the total number of bytes found within the given range that have
 * all given bits set. If the returned number of bytes is greater than zero
 * then @start is updated with the offset of the first byte with the bits set.
 */
u64 count_range_bits(struct extent_io_tree *tree,
		     u64 *start, u64 search_end, u64 max_bytes,
		     u32 bits, int contig,
		     struct extent_state **cached_state)
{
	struct extent_state *state = NULL;
	struct extent_state *cached;
	u64 cur_start = *start;
	u64 total_bytes = 0;
	u64 last = 0;
	int found = 0;

	if (WARN_ON(search_end < cur_start))
		return 0;

	spin_lock(&tree->lock);

	if (!cached_state || !*cached_state)
		goto search;

	cached = *cached_state;

	if (!extent_state_in_tree(cached))
		goto search;

	if (cached->start <= cur_start && cur_start <= cached->end) {
		state = cached;
	} else if (cached->start > cur_start) {
		struct extent_state *prev;

		/*
		 * The cached state starts after our search range's start. Check
		 * if the previous state record starts at or before the range we
		 * are looking for, and if so, use it - this is a common case
		 * when there are holes between records in the tree. If there is
		 * no previous state record, we can start from our cached state.
		 */
		prev = prev_state(cached);
		if (!prev)
			state = cached;
		else if (prev->start <= cur_start && cur_start <= prev->end)
			state = prev;
	}

	/*
	 * This search will find all the extents that end after our range
	 * starts.
	 */
search:
	if (!state)
		state = tree_search(tree, cur_start);

	while (state) {
		if (state->start > search_end)
			break;
		if (contig && found && state->start > last + 1)
			break;
		if (state->end >= cur_start && (state->state & bits) == bits) {
			total_bytes += min(search_end, state->end) + 1 -
				       max(cur_start, state->start);
			if (total_bytes >= max_bytes)
				break;
			if (!found) {
				*start = max(cur_start, state->start);
				found = 1;
			}
			last = state->end;
		} else if (contig && found) {
			break;
		}
		state = next_state(state);
	}

	if (cached_state) {
		free_extent_state(*cached_state);
		*cached_state = state;
		if (state)
			refcount_inc(&state->refs);
	}

	spin_unlock(&tree->lock);

	return total_bytes;
}

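/*
 * A minimal usage sketch (hypothetical caller, not part of this file):
 * counting delalloc bytes in a file range, with the caching optimization:
 *
 *	struct extent_state *cached = NULL;
 *	u64 range_start = start;
 *	u64 bytes;
 *
 *	bytes = count_range_bits(&inode->io_tree, &range_start, end, (u64)-1,
 *				 EXTENT_DELALLOC, 0, &cached);
 *	free_extent_state(cached);
 *
 * On return, range_start may have advanced to the first byte that actually
 * has the bit set.
 */
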
/*
 * Check if the single @bit exists in the given range.
 */
bool test_range_bit_exists(struct extent_io_tree *tree, u64 start, u64 end, u32 bit)
{
	struct extent_state *state = NULL;
	bool bitset = false;

	ASSERT(is_power_of_2(bit));

	spin_lock(&tree->lock);
	state = tree_search(tree, start);
	while (state && start <= end) {
		if (state->start > end)
			break;

		if (state->state & bit) {
			bitset = true;
			break;
		}

		/* If state->end is (u64)-1, start will overflow to 0 */
		start = state->end + 1;
		if (start > end || start == 0)
			break;
		state = next_state(state);
	}
	spin_unlock(&tree->lock);
	return bitset;
}

/*
 * Check if the whole range [@start, @end] contains the single @bit set.
 */
bool test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bit,
		    struct extent_state *cached)
{
	struct extent_state *state = NULL;
	bool bitset = true;

	ASSERT(is_power_of_2(bit));

	spin_lock(&tree->lock);
	if (cached && extent_state_in_tree(cached) && cached->start <= start &&
	    cached->end > start)
		state = cached;
	else
		state = tree_search(tree, start);
	while (state && start <= end) {
		if (state->start > start) {
			bitset = false;
			break;
		}

		if (state->start > end)
			break;

		if ((state->state & bit) == 0) {
			bitset = false;
			break;
		}

		if (state->end == (u64)-1)
			break;

		/*
		 * Last entry (if state->end is (u64)-1 and overflow happens),
		 * or next entry starts after the range.
		 */
		start = state->end + 1;
		if (start > end || start == 0)
			break;
		state = next_state(state);
	}

	/* We ran out of states and were still inside of our range. */
	if (!state)
		bitset = false;
	spin_unlock(&tree->lock);

	return bitset;
}

/* Wrappers around set/clear extent bit */
int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
			   u32 bits, struct extent_changeset *changeset)
{
	/*
	 * We don't support EXTENT_LOCKED yet, as current changeset will
	 * record any bits changed, so for EXTENT_LOCKED case, it will
	 * either fail with -EEXIST or changeset will record the whole
	 * range.
	 */
	ASSERT(!(bits & EXTENT_LOCKED));

	return __set_extent_bit(tree, start, end, bits, NULL, NULL, NULL, changeset);
}

int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
			     u32 bits, struct extent_changeset *changeset)
{
	/*
	 * Don't support EXTENT_LOCKED case, same reason as
	 * set_record_extent_bits().
	 */
	ASSERT(!(bits & EXTENT_LOCKED));

	return __clear_extent_bit(tree, start, end, bits, NULL, changeset);
}

int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
		    struct extent_state **cached)
{
	int err;
	u64 failed_start;

	err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, &failed_start,
			       NULL, cached, NULL);
	if (err == -EEXIST) {
		if (failed_start > start)
			clear_extent_bit(tree, start, failed_start - 1,
					 EXTENT_LOCKED, cached);
		return 0;
	}
	return 1;
}

/*
 * Either insert or lock state struct between start and end. Use mask to tell
 * us if waiting is desired.
 */
int lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
		struct extent_state **cached_state)
{
	struct extent_state *failed_state = NULL;
	int err;
	u64 failed_start;

	err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, &failed_start,
			       &failed_state, cached_state, NULL);
	while (err == -EEXIST) {
		if (failed_start != start)
			clear_extent_bit(tree, start, failed_start - 1,
					 EXTENT_LOCKED, cached_state);

		wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED,
				&failed_state);
		err = __set_extent_bit(tree, start, end, EXTENT_LOCKED,
				       &failed_start, &failed_state,
				       cached_state, NULL);
	}
	return err;
}

void __cold extent_state_free_cachep(void)
{
	btrfs_extent_state_leak_debug_check();
	kmem_cache_destroy(extent_state_cache);
}

int __init extent_state_init_cachep(void)
{
	extent_state_cache = kmem_cache_create("btrfs_extent_state",
					       sizeof(struct extent_state), 0,
					       SLAB_MEM_SPREAD, NULL);
	if (!extent_state_cache)
		return -ENOMEM;

	return 0;
}