1 // SPDX-License-Identifier: GPL-2.0
3 * Copyright (C) 2007 Oracle. All rights reserved.
6 #include <linux/sched.h>
9 #include "print-tree.h"
10 #include "transaction.h"
12 #include "accessors.h"
14 #include "delalloc-space.h"
17 #include "file-item.h"
/* Slab cache used to allocate struct inode_defrag entries for the autodefrag queue. */
static struct kmem_cache *btrfs_inode_defrag_cachep;
23 * When auto defrag is enabled we queue up these defrag structs to remember
24 * which inodes need defragging passes.
27 struct rb_node rb_node
;
31 * Transid where the defrag was added, we search for extents newer than
40 * The extent size threshold for autodefrag.
42 * This value is different for compressed/non-compressed extents, thus
43 * needs to be passed from higher layer.
44 * (aka, inode_should_defrag())
49 static int __compare_inode_defrag(struct inode_defrag
*defrag1
,
50 struct inode_defrag
*defrag2
)
52 if (defrag1
->root
> defrag2
->root
)
54 else if (defrag1
->root
< defrag2
->root
)
56 else if (defrag1
->ino
> defrag2
->ino
)
58 else if (defrag1
->ino
< defrag2
->ino
)
65 * Pop a record for an inode into the defrag tree. The lock must be held
68 * If you're inserting a record for an older transid than an existing record,
69 * the transid already in the tree is lowered.
71 * If an existing record is found the defrag item you pass in is freed.
73 static int __btrfs_add_inode_defrag(struct btrfs_inode
*inode
,
74 struct inode_defrag
*defrag
)
76 struct btrfs_fs_info
*fs_info
= inode
->root
->fs_info
;
77 struct inode_defrag
*entry
;
79 struct rb_node
*parent
= NULL
;
82 p
= &fs_info
->defrag_inodes
.rb_node
;
85 entry
= rb_entry(parent
, struct inode_defrag
, rb_node
);
87 ret
= __compare_inode_defrag(defrag
, entry
);
91 p
= &parent
->rb_right
;
94 * If we're reinserting an entry for an old defrag run,
95 * make sure to lower the transid of our existing
98 if (defrag
->transid
< entry
->transid
)
99 entry
->transid
= defrag
->transid
;
100 entry
->extent_thresh
= min(defrag
->extent_thresh
,
101 entry
->extent_thresh
);
105 set_bit(BTRFS_INODE_IN_DEFRAG
, &inode
->runtime_flags
);
106 rb_link_node(&defrag
->rb_node
, parent
, p
);
107 rb_insert_color(&defrag
->rb_node
, &fs_info
->defrag_inodes
);
111 static inline int __need_auto_defrag(struct btrfs_fs_info
*fs_info
)
113 if (!btrfs_test_opt(fs_info
, AUTO_DEFRAG
))
116 if (btrfs_fs_closing(fs_info
))
123 * Insert a defrag record for this inode if auto defrag is enabled.
125 int btrfs_add_inode_defrag(struct btrfs_trans_handle
*trans
,
126 struct btrfs_inode
*inode
, u32 extent_thresh
)
128 struct btrfs_root
*root
= inode
->root
;
129 struct btrfs_fs_info
*fs_info
= root
->fs_info
;
130 struct inode_defrag
*defrag
;
134 if (!__need_auto_defrag(fs_info
))
137 if (test_bit(BTRFS_INODE_IN_DEFRAG
, &inode
->runtime_flags
))
141 transid
= trans
->transid
;
143 transid
= inode
->root
->last_trans
;
145 defrag
= kmem_cache_zalloc(btrfs_inode_defrag_cachep
, GFP_NOFS
);
149 defrag
->ino
= btrfs_ino(inode
);
150 defrag
->transid
= transid
;
151 defrag
->root
= root
->root_key
.objectid
;
152 defrag
->extent_thresh
= extent_thresh
;
154 spin_lock(&fs_info
->defrag_inodes_lock
);
155 if (!test_bit(BTRFS_INODE_IN_DEFRAG
, &inode
->runtime_flags
)) {
157 * If we set IN_DEFRAG flag and evict the inode from memory,
158 * and then re-read this inode, this new inode doesn't have
159 * IN_DEFRAG flag. At the case, we may find the existed defrag.
161 ret
= __btrfs_add_inode_defrag(inode
, defrag
);
163 kmem_cache_free(btrfs_inode_defrag_cachep
, defrag
);
165 kmem_cache_free(btrfs_inode_defrag_cachep
, defrag
);
167 spin_unlock(&fs_info
->defrag_inodes_lock
);
172 * Pick the defragable inode that we want, if it doesn't exist, we will get the
175 static struct inode_defrag
*btrfs_pick_defrag_inode(
176 struct btrfs_fs_info
*fs_info
, u64 root
, u64 ino
)
178 struct inode_defrag
*entry
= NULL
;
179 struct inode_defrag tmp
;
181 struct rb_node
*parent
= NULL
;
187 spin_lock(&fs_info
->defrag_inodes_lock
);
188 p
= fs_info
->defrag_inodes
.rb_node
;
191 entry
= rb_entry(parent
, struct inode_defrag
, rb_node
);
193 ret
= __compare_inode_defrag(&tmp
, entry
);
197 p
= parent
->rb_right
;
202 if (parent
&& __compare_inode_defrag(&tmp
, entry
) > 0) {
203 parent
= rb_next(parent
);
205 entry
= rb_entry(parent
, struct inode_defrag
, rb_node
);
211 rb_erase(parent
, &fs_info
->defrag_inodes
);
212 spin_unlock(&fs_info
->defrag_inodes_lock
);
216 void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info
*fs_info
)
218 struct inode_defrag
*defrag
;
219 struct rb_node
*node
;
221 spin_lock(&fs_info
->defrag_inodes_lock
);
222 node
= rb_first(&fs_info
->defrag_inodes
);
224 rb_erase(node
, &fs_info
->defrag_inodes
);
225 defrag
= rb_entry(node
, struct inode_defrag
, rb_node
);
226 kmem_cache_free(btrfs_inode_defrag_cachep
, defrag
);
228 cond_resched_lock(&fs_info
->defrag_inodes_lock
);
230 node
= rb_first(&fs_info
->defrag_inodes
);
232 spin_unlock(&fs_info
->defrag_inodes_lock
);
235 #define BTRFS_DEFRAG_BATCH 1024
237 static int __btrfs_run_defrag_inode(struct btrfs_fs_info
*fs_info
,
238 struct inode_defrag
*defrag
)
240 struct btrfs_root
*inode_root
;
242 struct btrfs_ioctl_defrag_range_args range
;
247 if (test_bit(BTRFS_FS_STATE_REMOUNTING
, &fs_info
->fs_state
))
249 if (!__need_auto_defrag(fs_info
))
253 inode_root
= btrfs_get_fs_root(fs_info
, defrag
->root
, true);
254 if (IS_ERR(inode_root
)) {
255 ret
= PTR_ERR(inode_root
);
259 inode
= btrfs_iget(fs_info
->sb
, defrag
->ino
, inode_root
);
260 btrfs_put_root(inode_root
);
262 ret
= PTR_ERR(inode
);
266 if (cur
>= i_size_read(inode
)) {
271 /* Do a chunk of defrag */
272 clear_bit(BTRFS_INODE_IN_DEFRAG
, &BTRFS_I(inode
)->runtime_flags
);
273 memset(&range
, 0, sizeof(range
));
276 range
.extent_thresh
= defrag
->extent_thresh
;
278 sb_start_write(fs_info
->sb
);
279 ret
= btrfs_defrag_file(inode
, NULL
, &range
, defrag
->transid
,
281 sb_end_write(fs_info
->sb
);
287 cur
= max(cur
+ fs_info
->sectorsize
, range
.start
);
291 kmem_cache_free(btrfs_inode_defrag_cachep
, defrag
);
296 * Run through the list of inodes in the FS that need defragging.
298 int btrfs_run_defrag_inodes(struct btrfs_fs_info
*fs_info
)
300 struct inode_defrag
*defrag
;
302 u64 root_objectid
= 0;
304 atomic_inc(&fs_info
->defrag_running
);
306 /* Pause the auto defragger. */
307 if (test_bit(BTRFS_FS_STATE_REMOUNTING
, &fs_info
->fs_state
))
310 if (!__need_auto_defrag(fs_info
))
313 /* find an inode to defrag */
314 defrag
= btrfs_pick_defrag_inode(fs_info
, root_objectid
, first_ino
);
316 if (root_objectid
|| first_ino
) {
325 first_ino
= defrag
->ino
+ 1;
326 root_objectid
= defrag
->root
;
328 __btrfs_run_defrag_inode(fs_info
, defrag
);
330 atomic_dec(&fs_info
->defrag_running
);
333 * During unmount, we use the transaction_wait queue to wait for the
336 wake_up(&fs_info
->transaction_wait
);
341 * Defrag all the leaves in a given btree.
342 * Read all the leaves and try to get key order to
343 * better reflect disk order
346 static int btrfs_defrag_leaves(struct btrfs_trans_handle
*trans
,
347 struct btrfs_root
*root
)
349 struct btrfs_path
*path
= NULL
;
350 struct btrfs_key key
;
354 int next_key_ret
= 0;
357 if (!test_bit(BTRFS_ROOT_SHAREABLE
, &root
->state
))
360 path
= btrfs_alloc_path();
366 level
= btrfs_header_level(root
->node
);
371 if (root
->defrag_progress
.objectid
== 0) {
372 struct extent_buffer
*root_node
;
375 root_node
= btrfs_lock_root_node(root
);
376 nritems
= btrfs_header_nritems(root_node
);
377 root
->defrag_max
.objectid
= 0;
378 /* from above we know this is not a leaf */
379 btrfs_node_key_to_cpu(root_node
, &root
->defrag_max
,
381 btrfs_tree_unlock(root_node
);
382 free_extent_buffer(root_node
);
383 memset(&key
, 0, sizeof(key
));
385 memcpy(&key
, &root
->defrag_progress
, sizeof(key
));
388 path
->keep_locks
= 1;
390 ret
= btrfs_search_forward(root
, &key
, path
, BTRFS_OLDEST_GENERATION
);
397 btrfs_release_path(path
);
399 * We don't need a lock on a leaf. btrfs_realloc_node() will lock all
400 * leafs from path->nodes[1], so set lowest_level to 1 to avoid later
401 * a deadlock (attempting to write lock an already write locked leaf).
403 path
->lowest_level
= 1;
404 wret
= btrfs_search_slot(trans
, root
, &key
, path
, 0, 1);
410 if (!path
->nodes
[1]) {
415 * The node at level 1 must always be locked when our path has
416 * keep_locks set and lowest_level is 1, regardless of the value of
419 BUG_ON(path
->locks
[1] == 0);
420 ret
= btrfs_realloc_node(trans
, root
,
423 &root
->defrag_progress
);
425 WARN_ON(ret
== -EAGAIN
);
429 * Now that we reallocated the node we can find the next key. Note that
430 * btrfs_find_next_key() can release our path and do another search
431 * without COWing, this is because even with path->keep_locks = 1,
432 * btrfs_search_slot() / ctree.c:unlock_up() does not keeps a lock on a
433 * node when path->slots[node_level - 1] does not point to the last
434 * item or a slot beyond the last item (ctree.c:unlock_up()). Therefore
435 * we search for the next key after reallocating our node.
437 path
->slots
[1] = btrfs_header_nritems(path
->nodes
[1]);
438 next_key_ret
= btrfs_find_next_key(root
, path
, &key
, 1,
439 BTRFS_OLDEST_GENERATION
);
440 if (next_key_ret
== 0) {
441 memcpy(&root
->defrag_progress
, &key
, sizeof(key
));
445 btrfs_free_path(path
);
446 if (ret
== -EAGAIN
) {
447 if (root
->defrag_max
.objectid
> root
->defrag_progress
.objectid
)
449 if (root
->defrag_max
.type
> root
->defrag_progress
.type
)
451 if (root
->defrag_max
.offset
> root
->defrag_progress
.offset
)
457 memset(&root
->defrag_progress
, 0,
458 sizeof(root
->defrag_progress
));
464 * Defrag a given btree. Every leaf in the btree is read and defragmented.
466 int btrfs_defrag_root(struct btrfs_root
*root
)
468 struct btrfs_fs_info
*fs_info
= root
->fs_info
;
471 if (test_and_set_bit(BTRFS_ROOT_DEFRAG_RUNNING
, &root
->state
))
475 struct btrfs_trans_handle
*trans
;
477 trans
= btrfs_start_transaction(root
, 0);
479 ret
= PTR_ERR(trans
);
483 ret
= btrfs_defrag_leaves(trans
, root
);
485 btrfs_end_transaction(trans
);
486 btrfs_btree_balance_dirty(fs_info
);
489 if (btrfs_fs_closing(fs_info
) || ret
!= -EAGAIN
)
492 if (btrfs_defrag_cancelled(fs_info
)) {
493 btrfs_debug(fs_info
, "defrag_root cancelled");
498 clear_bit(BTRFS_ROOT_DEFRAG_RUNNING
, &root
->state
);
503 * Defrag specific helper to get an extent map.
505 * Differences between this and btrfs_get_extent() are:
507 * - No extent_map will be added to inode->extent_tree
508 * To reduce memory usage in the long run.
510 * - Extra optimization to skip file extents older than @newer_than
511 * By using btrfs_search_forward() we can skip entire file ranges that
512 * have extents created in past transactions, because btrfs_search_forward()
513 * will not visit leaves and nodes with a generation smaller than given
514 * minimal generation threshold (@newer_than).
516 * Return valid em if we find a file extent matching the requirement.
517 * Return NULL if we can not find a file extent matching the requirement.
519 * Return ERR_PTR() for error.
521 static struct extent_map
*defrag_get_extent(struct btrfs_inode
*inode
,
522 u64 start
, u64 newer_than
)
524 struct btrfs_root
*root
= inode
->root
;
525 struct btrfs_file_extent_item
*fi
;
526 struct btrfs_path path
= { 0 };
527 struct extent_map
*em
;
528 struct btrfs_key key
;
529 u64 ino
= btrfs_ino(inode
);
532 em
= alloc_extent_map();
539 key
.type
= BTRFS_EXTENT_DATA_KEY
;
543 ret
= btrfs_search_forward(root
, &key
, &path
, newer_than
);
546 /* Can't find anything newer */
550 ret
= btrfs_search_slot(NULL
, root
, &key
, &path
, 0, 0);
554 if (path
.slots
[0] >= btrfs_header_nritems(path
.nodes
[0])) {
556 * If btrfs_search_slot() makes path to point beyond nritems,
557 * we should not have an empty leaf, as this inode must at
558 * least have its INODE_ITEM.
560 ASSERT(btrfs_header_nritems(path
.nodes
[0]));
561 path
.slots
[0] = btrfs_header_nritems(path
.nodes
[0]) - 1;
563 btrfs_item_key_to_cpu(path
.nodes
[0], &key
, path
.slots
[0]);
564 /* Perfect match, no need to go one slot back */
565 if (key
.objectid
== ino
&& key
.type
== BTRFS_EXTENT_DATA_KEY
&&
569 /* We didn't find a perfect match, needs to go one slot back */
570 if (path
.slots
[0] > 0) {
571 btrfs_item_key_to_cpu(path
.nodes
[0], &key
, path
.slots
[0]);
572 if (key
.objectid
== ino
&& key
.type
== BTRFS_EXTENT_DATA_KEY
)
577 /* Iterate through the path to find a file extent covering @start */
581 if (path
.slots
[0] >= btrfs_header_nritems(path
.nodes
[0]))
584 btrfs_item_key_to_cpu(path
.nodes
[0], &key
, path
.slots
[0]);
587 * We may go one slot back to INODE_REF/XATTR item, then
588 * need to go forward until we reach an EXTENT_DATA.
589 * But we should still has the correct ino as key.objectid.
591 if (WARN_ON(key
.objectid
< ino
) || key
.type
< BTRFS_EXTENT_DATA_KEY
)
594 /* It's beyond our target range, definitely not extent found */
595 if (key
.objectid
> ino
|| key
.type
> BTRFS_EXTENT_DATA_KEY
)
599 * | |<- File extent ->|
602 * This means there is a hole between start and key.offset.
604 if (key
.offset
> start
) {
606 em
->orig_start
= start
;
607 em
->block_start
= EXTENT_MAP_HOLE
;
608 em
->len
= key
.offset
- start
;
612 fi
= btrfs_item_ptr(path
.nodes
[0], path
.slots
[0],
613 struct btrfs_file_extent_item
);
614 extent_end
= btrfs_file_extent_end(&path
);
617 * |<- file extent ->| |
620 * We haven't reached start, search next slot.
622 if (extent_end
<= start
)
625 /* Now this extent covers @start, convert it to em */
626 btrfs_extent_item_to_extent_map(inode
, &path
, fi
, em
);
629 ret
= btrfs_next_item(root
, &path
);
635 btrfs_release_path(&path
);
639 btrfs_release_path(&path
);
644 btrfs_release_path(&path
);
649 static struct extent_map
*defrag_lookup_extent(struct inode
*inode
, u64 start
,
650 u64 newer_than
, bool locked
)
652 struct extent_map_tree
*em_tree
= &BTRFS_I(inode
)->extent_tree
;
653 struct extent_io_tree
*io_tree
= &BTRFS_I(inode
)->io_tree
;
654 struct extent_map
*em
;
655 const u32 sectorsize
= BTRFS_I(inode
)->root
->fs_info
->sectorsize
;
658 * Hopefully we have this extent in the tree already, try without the
661 read_lock(&em_tree
->lock
);
662 em
= lookup_extent_mapping(em_tree
, start
, sectorsize
);
663 read_unlock(&em_tree
->lock
);
666 * We can get a merged extent, in that case, we need to re-search
667 * tree to get the original em for defrag.
669 * If @newer_than is 0 or em::generation < newer_than, we can trust
670 * this em, as either we don't care about the generation, or the
671 * merged extent map will be rejected anyway.
673 if (em
&& test_bit(EXTENT_FLAG_MERGED
, &em
->flags
) &&
674 newer_than
&& em
->generation
>= newer_than
) {
680 struct extent_state
*cached
= NULL
;
681 u64 end
= start
+ sectorsize
- 1;
683 /* Get the big lock and read metadata off disk. */
685 lock_extent(io_tree
, start
, end
, &cached
);
686 em
= defrag_get_extent(BTRFS_I(inode
), start
, newer_than
);
688 unlock_extent(io_tree
, start
, end
, &cached
);
697 static u32
get_extent_max_capacity(const struct btrfs_fs_info
*fs_info
,
698 const struct extent_map
*em
)
700 if (test_bit(EXTENT_FLAG_COMPRESSED
, &em
->flags
))
701 return BTRFS_MAX_COMPRESSED
;
702 return fs_info
->max_extent_size
;
705 static bool defrag_check_next_extent(struct inode
*inode
, struct extent_map
*em
,
706 u32 extent_thresh
, u64 newer_than
, bool locked
)
708 struct btrfs_fs_info
*fs_info
= btrfs_sb(inode
->i_sb
);
709 struct extent_map
*next
;
712 /* This is the last extent */
713 if (em
->start
+ em
->len
>= i_size_read(inode
))
717 * Here we need to pass @newer_then when checking the next extent, or
718 * we will hit a case we mark current extent for defrag, but the next
719 * one will not be a target.
720 * This will just cause extra IO without really reducing the fragments.
722 next
= defrag_lookup_extent(inode
, em
->start
+ em
->len
, newer_than
, locked
);
723 /* No more em or hole */
724 if (!next
|| next
->block_start
>= EXTENT_MAP_LAST_BYTE
)
726 if (test_bit(EXTENT_FLAG_PREALLOC
, &next
->flags
))
729 * If the next extent is at its max capacity, defragging current extent
730 * makes no sense, as the total number of extents won't change.
732 if (next
->len
>= get_extent_max_capacity(fs_info
, em
))
734 /* Skip older extent */
735 if (next
->generation
< newer_than
)
737 /* Also check extent size */
738 if (next
->len
>= extent_thresh
)
743 free_extent_map(next
);
748 * Prepare one page to be defragged.
752 * - Returned page is locked and has been set up properly.
753 * - No ordered extent exists in the page.
754 * - The page is uptodate.
756 * NOTE: Caller should also wait for page writeback after the cluster is
757 * prepared, here we don't do writeback wait for each page.
759 static struct page
*defrag_prepare_one_page(struct btrfs_inode
*inode
, pgoff_t index
)
761 struct address_space
*mapping
= inode
->vfs_inode
.i_mapping
;
762 gfp_t mask
= btrfs_alloc_write_mask(mapping
);
763 u64 page_start
= (u64
)index
<< PAGE_SHIFT
;
764 u64 page_end
= page_start
+ PAGE_SIZE
- 1;
765 struct extent_state
*cached_state
= NULL
;
770 page
= find_or_create_page(mapping
, index
, mask
);
772 return ERR_PTR(-ENOMEM
);
775 * Since we can defragment files opened read-only, we can encounter
776 * transparent huge pages here (see CONFIG_READ_ONLY_THP_FOR_FS). We
777 * can't do I/O using huge pages yet, so return an error for now.
778 * Filesystem transparent huge pages are typically only used for
779 * executables that explicitly enable them, so this isn't very
782 if (PageCompound(page
)) {
785 return ERR_PTR(-ETXTBSY
);
788 ret
= set_page_extent_mapped(page
);
795 /* Wait for any existing ordered extent in the range */
797 struct btrfs_ordered_extent
*ordered
;
799 lock_extent(&inode
->io_tree
, page_start
, page_end
, &cached_state
);
800 ordered
= btrfs_lookup_ordered_range(inode
, page_start
, PAGE_SIZE
);
801 unlock_extent(&inode
->io_tree
, page_start
, page_end
,
807 btrfs_start_ordered_extent(ordered
);
808 btrfs_put_ordered_extent(ordered
);
811 * We unlocked the page above, so we need check if it was
814 if (page
->mapping
!= mapping
|| !PagePrivate(page
)) {
822 * Now the page range has no ordered extent any more. Read the page to
825 if (!PageUptodate(page
)) {
826 btrfs_read_folio(NULL
, page_folio(page
));
828 if (page
->mapping
!= mapping
|| !PagePrivate(page
)) {
833 if (!PageUptodate(page
)) {
836 return ERR_PTR(-EIO
);
842 struct defrag_target_range
{
843 struct list_head list
;
849 * Collect all valid target extents.
851 * @start: file offset to lookup
852 * @len: length to lookup
853 * @extent_thresh: file extent size threshold, any extent size >= this value
855 * @newer_than: only defrag extents newer than this value
856 * @do_compress: whether the defrag is doing compression
857 * if true, @extent_thresh will be ignored and all regular
858 * file extents meeting @newer_than will be targets.
859 * @locked: if the range has already held extent lock
860 * @target_list: list of targets file extents
862 static int defrag_collect_targets(struct btrfs_inode
*inode
,
863 u64 start
, u64 len
, u32 extent_thresh
,
864 u64 newer_than
, bool do_compress
,
865 bool locked
, struct list_head
*target_list
,
866 u64
*last_scanned_ret
)
868 struct btrfs_fs_info
*fs_info
= inode
->root
->fs_info
;
869 bool last_is_target
= false;
873 while (cur
< start
+ len
) {
874 struct extent_map
*em
;
875 struct defrag_target_range
*new;
876 bool next_mergeable
= true;
879 last_is_target
= false;
880 em
= defrag_lookup_extent(&inode
->vfs_inode
, cur
, newer_than
, locked
);
885 * If the file extent is an inlined one, we may still want to
886 * defrag it (fallthrough) if it will cause a regular extent.
887 * This is for users who want to convert inline extents to
888 * regular ones through max_inline= mount option.
890 if (em
->block_start
== EXTENT_MAP_INLINE
&&
891 em
->len
<= inode
->root
->fs_info
->max_inline
)
894 /* Skip hole/delalloc/preallocated extents */
895 if (em
->block_start
== EXTENT_MAP_HOLE
||
896 em
->block_start
== EXTENT_MAP_DELALLOC
||
897 test_bit(EXTENT_FLAG_PREALLOC
, &em
->flags
))
900 /* Skip older extent */
901 if (em
->generation
< newer_than
)
904 /* This em is under writeback, no need to defrag */
905 if (em
->generation
== (u64
)-1)
909 * Our start offset might be in the middle of an existing extent
910 * map, so take that into account.
912 range_len
= em
->len
- (cur
- em
->start
);
914 * If this range of the extent map is already flagged for delalloc,
917 * 1) We could deadlock later, when trying to reserve space for
918 * delalloc, because in case we can't immediately reserve space
919 * the flusher can start delalloc and wait for the respective
920 * ordered extents to complete. The deadlock would happen
921 * because we do the space reservation while holding the range
922 * locked, and starting writeback, or finishing an ordered
923 * extent, requires locking the range;
925 * 2) If there's delalloc there, it means there's dirty pages for
926 * which writeback has not started yet (we clean the delalloc
927 * flag when starting writeback and after creating an ordered
928 * extent). If we mark pages in an adjacent range for defrag,
929 * then we will have a larger contiguous range for delalloc,
930 * very likely resulting in a larger extent after writeback is
931 * triggered (except in a case of free space fragmentation).
933 if (test_range_bit(&inode
->io_tree
, cur
, cur
+ range_len
- 1,
934 EXTENT_DELALLOC
, 0, NULL
))
938 * For do_compress case, we want to compress all valid file
939 * extents, thus no @extent_thresh or mergeable check.
944 /* Skip too large extent */
945 if (range_len
>= extent_thresh
)
949 * Skip extents already at its max capacity, this is mostly for
950 * compressed extents, which max cap is only 128K.
952 if (em
->len
>= get_extent_max_capacity(fs_info
, em
))
956 * Normally there are no more extents after an inline one, thus
957 * @next_mergeable will normally be false and not defragged.
958 * So if an inline extent passed all above checks, just add it
959 * for defrag, and be converted to regular extents.
961 if (em
->block_start
== EXTENT_MAP_INLINE
)
964 next_mergeable
= defrag_check_next_extent(&inode
->vfs_inode
, em
,
965 extent_thresh
, newer_than
, locked
);
966 if (!next_mergeable
) {
967 struct defrag_target_range
*last
;
969 /* Empty target list, no way to merge with last entry */
970 if (list_empty(target_list
))
972 last
= list_entry(target_list
->prev
,
973 struct defrag_target_range
, list
);
974 /* Not mergeable with last entry */
975 if (last
->start
+ last
->len
!= cur
)
978 /* Mergeable, fall through to add it to @target_list. */
982 last_is_target
= true;
983 range_len
= min(extent_map_end(em
), start
+ len
) - cur
;
985 * This one is a good target, check if it can be merged into
986 * last range of the target list.
988 if (!list_empty(target_list
)) {
989 struct defrag_target_range
*last
;
991 last
= list_entry(target_list
->prev
,
992 struct defrag_target_range
, list
);
993 ASSERT(last
->start
+ last
->len
<= cur
);
994 if (last
->start
+ last
->len
== cur
) {
995 /* Mergeable, enlarge the last entry */
996 last
->len
+= range_len
;
999 /* Fall through to allocate a new entry */
1002 /* Allocate new defrag_target_range */
1003 new = kmalloc(sizeof(*new), GFP_NOFS
);
1005 free_extent_map(em
);
1010 new->len
= range_len
;
1011 list_add_tail(&new->list
, target_list
);
1014 cur
= extent_map_end(em
);
1015 free_extent_map(em
);
1018 struct defrag_target_range
*entry
;
1019 struct defrag_target_range
*tmp
;
1021 list_for_each_entry_safe(entry
, tmp
, target_list
, list
) {
1022 list_del_init(&entry
->list
);
1026 if (!ret
&& last_scanned_ret
) {
1028 * If the last extent is not a target, the caller can skip to
1029 * the end of that extent.
1030 * Otherwise, we can only go the end of the specified range.
1032 if (!last_is_target
)
1033 *last_scanned_ret
= max(cur
, *last_scanned_ret
);
1035 *last_scanned_ret
= max(start
+ len
, *last_scanned_ret
);
1040 #define CLUSTER_SIZE (SZ_256K)
1041 static_assert(PAGE_ALIGNED(CLUSTER_SIZE
));
1044 * Defrag one contiguous target range.
1046 * @inode: target inode
1047 * @target: target range to defrag
1048 * @pages: locked pages covering the defrag range
1049 * @nr_pages: number of locked pages
1051 * Caller should ensure:
1053 * - Pages are prepared
1054 * Pages should be locked, no ordered extent in the pages range,
1057 * - Extent bits are locked
1059 static int defrag_one_locked_target(struct btrfs_inode
*inode
,
1060 struct defrag_target_range
*target
,
1061 struct page
**pages
, int nr_pages
,
1062 struct extent_state
**cached_state
)
1064 struct btrfs_fs_info
*fs_info
= inode
->root
->fs_info
;
1065 struct extent_changeset
*data_reserved
= NULL
;
1066 const u64 start
= target
->start
;
1067 const u64 len
= target
->len
;
1068 unsigned long last_index
= (start
+ len
- 1) >> PAGE_SHIFT
;
1069 unsigned long start_index
= start
>> PAGE_SHIFT
;
1070 unsigned long first_index
= page_index(pages
[0]);
1074 ASSERT(last_index
- first_index
+ 1 <= nr_pages
);
1076 ret
= btrfs_delalloc_reserve_space(inode
, &data_reserved
, start
, len
);
1079 clear_extent_bit(&inode
->io_tree
, start
, start
+ len
- 1,
1080 EXTENT_DELALLOC
| EXTENT_DO_ACCOUNTING
|
1081 EXTENT_DEFRAG
, cached_state
);
1082 set_extent_bit(&inode
->io_tree
, start
, start
+ len
- 1,
1083 EXTENT_DELALLOC
| EXTENT_DEFRAG
, cached_state
);
1085 /* Update the page status */
1086 for (i
= start_index
- first_index
; i
<= last_index
- first_index
; i
++) {
1087 ClearPageChecked(pages
[i
]);
1088 btrfs_page_clamp_set_dirty(fs_info
, pages
[i
], start
, len
);
1090 btrfs_delalloc_release_extents(inode
, len
);
1091 extent_changeset_free(data_reserved
);
1096 static int defrag_one_range(struct btrfs_inode
*inode
, u64 start
, u32 len
,
1097 u32 extent_thresh
, u64 newer_than
, bool do_compress
,
1098 u64
*last_scanned_ret
)
1100 struct extent_state
*cached_state
= NULL
;
1101 struct defrag_target_range
*entry
;
1102 struct defrag_target_range
*tmp
;
1103 LIST_HEAD(target_list
);
1104 struct page
**pages
;
1105 const u32 sectorsize
= inode
->root
->fs_info
->sectorsize
;
1106 u64 last_index
= (start
+ len
- 1) >> PAGE_SHIFT
;
1107 u64 start_index
= start
>> PAGE_SHIFT
;
1108 unsigned int nr_pages
= last_index
- start_index
+ 1;
1112 ASSERT(nr_pages
<= CLUSTER_SIZE
/ PAGE_SIZE
);
1113 ASSERT(IS_ALIGNED(start
, sectorsize
) && IS_ALIGNED(len
, sectorsize
));
1115 pages
= kcalloc(nr_pages
, sizeof(struct page
*), GFP_NOFS
);
1119 /* Prepare all pages */
1120 for (i
= 0; i
< nr_pages
; i
++) {
1121 pages
[i
] = defrag_prepare_one_page(inode
, start_index
+ i
);
1122 if (IS_ERR(pages
[i
])) {
1123 ret
= PTR_ERR(pages
[i
]);
1128 for (i
= 0; i
< nr_pages
; i
++)
1129 wait_on_page_writeback(pages
[i
]);
1131 /* Lock the pages range */
1132 lock_extent(&inode
->io_tree
, start_index
<< PAGE_SHIFT
,
1133 (last_index
<< PAGE_SHIFT
) + PAGE_SIZE
- 1,
1136 * Now we have a consistent view about the extent map, re-check
1137 * which range really needs to be defragged.
1139 * And this time we have extent locked already, pass @locked = true
1140 * so that we won't relock the extent range and cause deadlock.
1142 ret
= defrag_collect_targets(inode
, start
, len
, extent_thresh
,
1143 newer_than
, do_compress
, true,
1144 &target_list
, last_scanned_ret
);
1148 list_for_each_entry(entry
, &target_list
, list
) {
1149 ret
= defrag_one_locked_target(inode
, entry
, pages
, nr_pages
,
1155 list_for_each_entry_safe(entry
, tmp
, &target_list
, list
) {
1156 list_del_init(&entry
->list
);
1160 unlock_extent(&inode
->io_tree
, start_index
<< PAGE_SHIFT
,
1161 (last_index
<< PAGE_SHIFT
) + PAGE_SIZE
- 1,
1164 for (i
= 0; i
< nr_pages
; i
++) {
1166 unlock_page(pages
[i
]);
1174 static int defrag_one_cluster(struct btrfs_inode
*inode
,
1175 struct file_ra_state
*ra
,
1176 u64 start
, u32 len
, u32 extent_thresh
,
1177 u64 newer_than
, bool do_compress
,
1178 unsigned long *sectors_defragged
,
1179 unsigned long max_sectors
,
1180 u64
*last_scanned_ret
)
1182 const u32 sectorsize
= inode
->root
->fs_info
->sectorsize
;
1183 struct defrag_target_range
*entry
;
1184 struct defrag_target_range
*tmp
;
1185 LIST_HEAD(target_list
);
1188 ret
= defrag_collect_targets(inode
, start
, len
, extent_thresh
,
1189 newer_than
, do_compress
, false,
1190 &target_list
, NULL
);
1194 list_for_each_entry(entry
, &target_list
, list
) {
1195 u32 range_len
= entry
->len
;
1197 /* Reached or beyond the limit */
1198 if (max_sectors
&& *sectors_defragged
>= max_sectors
) {
1204 range_len
= min_t(u32
, range_len
,
1205 (max_sectors
- *sectors_defragged
) * sectorsize
);
1208 * If defrag_one_range() has updated last_scanned_ret,
1209 * our range may already be invalid (e.g. hole punched).
1210 * Skip if our range is before last_scanned_ret, as there is
1211 * no need to defrag the range anymore.
1213 if (entry
->start
+ range_len
<= *last_scanned_ret
)
1217 page_cache_sync_readahead(inode
->vfs_inode
.i_mapping
,
1218 ra
, NULL
, entry
->start
>> PAGE_SHIFT
,
1219 ((entry
->start
+ range_len
- 1) >> PAGE_SHIFT
) -
1220 (entry
->start
>> PAGE_SHIFT
) + 1);
1222 * Here we may not defrag any range if holes are punched before
1223 * we locked the pages.
1224 * But that's fine, it only affects the @sectors_defragged
1227 ret
= defrag_one_range(inode
, entry
->start
, range_len
,
1228 extent_thresh
, newer_than
, do_compress
,
1232 *sectors_defragged
+= range_len
>>
1233 inode
->root
->fs_info
->sectorsize_bits
;
1236 list_for_each_entry_safe(entry
, tmp
, &target_list
, list
) {
1237 list_del_init(&entry
->list
);
1241 *last_scanned_ret
= max(*last_scanned_ret
, start
+ len
);
1246 * Entry point to file defragmentation.
1248 * @inode: inode to be defragged
1249 * @ra: readahead state (can be NUL)
1250 * @range: defrag options including range and flags
1251 * @newer_than: minimum transid to defrag
1252 * @max_to_defrag: max number of sectors to be defragged, if 0, the whole inode
1253 * will be defragged.
1255 * Return <0 for error.
1256 * Return >=0 for the number of sectors defragged, and range->start will be updated
1257 * to indicate the file offset where next defrag should be started at.
1258 * (Mostly for autodefrag, which sets @max_to_defrag thus we may exit early without
1259 * defragging all the range).
1261 int btrfs_defrag_file(struct inode
*inode
, struct file_ra_state
*ra
,
1262 struct btrfs_ioctl_defrag_range_args
*range
,
1263 u64 newer_than
, unsigned long max_to_defrag
)
1265 struct btrfs_fs_info
*fs_info
= btrfs_sb(inode
->i_sb
);
1266 unsigned long sectors_defragged
= 0;
1267 u64 isize
= i_size_read(inode
);
1270 bool do_compress
= (range
->flags
& BTRFS_DEFRAG_RANGE_COMPRESS
);
1271 bool ra_allocated
= false;
1272 int compress_type
= BTRFS_COMPRESS_ZLIB
;
1274 u32 extent_thresh
= range
->extent_thresh
;
1275 pgoff_t start_index
;
1280 if (range
->start
>= isize
)
1284 if (range
->compress_type
>= BTRFS_NR_COMPRESS_TYPES
)
1286 if (range
->compress_type
)
1287 compress_type
= range
->compress_type
;
1290 if (extent_thresh
== 0)
1291 extent_thresh
= SZ_256K
;
1293 if (range
->start
+ range
->len
> range
->start
) {
1294 /* Got a specific range */
1295 last_byte
= min(isize
, range
->start
+ range
->len
);
1297 /* Defrag until file end */
1301 /* Align the range */
1302 cur
= round_down(range
->start
, fs_info
->sectorsize
);
1303 last_byte
= round_up(last_byte
, fs_info
->sectorsize
) - 1;
1306 * If we were not given a ra, allocate a readahead context. As
1307 * readahead is just an optimization, defrag will work without it so
1308 * we don't error out.
1311 ra_allocated
= true;
1312 ra
= kzalloc(sizeof(*ra
), GFP_KERNEL
);
1314 file_ra_state_init(ra
, inode
->i_mapping
);
1318 * Make writeback start from the beginning of the range, so that the
1319 * defrag range can be written sequentially.
1321 start_index
= cur
>> PAGE_SHIFT
;
1322 if (start_index
< inode
->i_mapping
->writeback_index
)
1323 inode
->i_mapping
->writeback_index
= start_index
;
1325 while (cur
< last_byte
) {
1326 const unsigned long prev_sectors_defragged
= sectors_defragged
;
1327 u64 last_scanned
= cur
;
1330 if (btrfs_defrag_cancelled(fs_info
)) {
1335 /* We want the cluster end at page boundary when possible */
1336 cluster_end
= (((cur
>> PAGE_SHIFT
) +
1337 (SZ_256K
>> PAGE_SHIFT
)) << PAGE_SHIFT
) - 1;
1338 cluster_end
= min(cluster_end
, last_byte
);
1340 btrfs_inode_lock(BTRFS_I(inode
), 0);
1341 if (IS_SWAPFILE(inode
)) {
1343 btrfs_inode_unlock(BTRFS_I(inode
), 0);
1346 if (!(inode
->i_sb
->s_flags
& SB_ACTIVE
)) {
1347 btrfs_inode_unlock(BTRFS_I(inode
), 0);
1351 BTRFS_I(inode
)->defrag_compress
= compress_type
;
1352 ret
= defrag_one_cluster(BTRFS_I(inode
), ra
, cur
,
1353 cluster_end
+ 1 - cur
, extent_thresh
,
1354 newer_than
, do_compress
, §ors_defragged
,
1355 max_to_defrag
, &last_scanned
);
1357 if (sectors_defragged
> prev_sectors_defragged
)
1358 balance_dirty_pages_ratelimited(inode
->i_mapping
);
1360 btrfs_inode_unlock(BTRFS_I(inode
), 0);
1363 cur
= max(cluster_end
+ 1, last_scanned
);
1374 * Update range.start for autodefrag, this will indicate where to start
1378 if (sectors_defragged
) {
1380 * We have defragged some sectors, for compression case they
1381 * need to be written back immediately.
1383 if (range
->flags
& BTRFS_DEFRAG_RANGE_START_IO
) {
1384 filemap_flush(inode
->i_mapping
);
1385 if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT
,
1386 &BTRFS_I(inode
)->runtime_flags
))
1387 filemap_flush(inode
->i_mapping
);
1389 if (range
->compress_type
== BTRFS_COMPRESS_LZO
)
1390 btrfs_set_fs_incompat(fs_info
, COMPRESS_LZO
);
1391 else if (range
->compress_type
== BTRFS_COMPRESS_ZSTD
)
1392 btrfs_set_fs_incompat(fs_info
, COMPRESS_ZSTD
);
1393 ret
= sectors_defragged
;
1396 btrfs_inode_lock(BTRFS_I(inode
), 0);
1397 BTRFS_I(inode
)->defrag_compress
= BTRFS_COMPRESS_NONE
;
1398 btrfs_inode_unlock(BTRFS_I(inode
), 0);
1403 void __cold
btrfs_auto_defrag_exit(void)
1405 kmem_cache_destroy(btrfs_inode_defrag_cachep
);
1408 int __init
btrfs_auto_defrag_init(void)
1410 btrfs_inode_defrag_cachep
= kmem_cache_create("btrfs_inode_defrag",
1411 sizeof(struct inode_defrag
), 0,
1414 if (!btrfs_inode_defrag_cachep
)