// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 */

#include <linux/sched.h>
#include "ctree.h"
#include "disk-io.h"
#include "print-tree.h"
#include "transaction.h"
#include "locking.h"
#include "accessors.h"
#include "messages.h"
#include "delalloc-space.h"
#include "subpage.h"
#include "defrag.h"
#include "file-item.h"
#include "super.h"

static struct kmem_cache *btrfs_inode_defrag_cachep;

/*
 * When auto defrag is enabled we queue up these defrag structs to remember
 * which inodes need defragging passes.
 */
struct inode_defrag {
	struct rb_node rb_node;
	/* Inode number */
	u64 ino;
	/*
	 * Transid where the defrag was added, we search for extents newer than
	 * this.
	 */
	u64 transid;

	/* Root objectid */
	u64 root;

	/*
	 * The extent size threshold for autodefrag.
	 *
	 * This value is different for compressed/non-compressed extents, thus
	 * needs to be passed from higher layer.
	 * (aka, inode_should_defrag())
	 */
	u32 extent_thresh;
};

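/*
 * These records live in fs_info->defrag_inodes, an rbtree keyed by root
 * objectid first and inode number second (see __compare_inode_defrag()
 * below), so there is at most one pending defrag record per inode.
 */
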
static int __compare_inode_defrag(struct inode_defrag *defrag1,
				  struct inode_defrag *defrag2)
{
	if (defrag1->root > defrag2->root)
		return 1;
	else if (defrag1->root < defrag2->root)
		return -1;
	else if (defrag1->ino > defrag2->ino)
		return 1;
	else if (defrag1->ino < defrag2->ino)
		return -1;
	else
		return 0;
}

/*
 * Pop a record for an inode into the defrag tree.  The lock must be held
 * already.
 *
 * If you're inserting a record for an older transid than an existing record,
 * the transid already in the tree is lowered.
 *
 * If an existing record is found the defrag item you pass in is freed.
 */
static int __btrfs_add_inode_defrag(struct btrfs_inode *inode,
				    struct inode_defrag *defrag)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	struct inode_defrag *entry;
	struct rb_node **p;
	struct rb_node *parent = NULL;
	int ret;

	p = &fs_info->defrag_inodes.rb_node;
	while (*p) {
		parent = *p;
		entry = rb_entry(parent, struct inode_defrag, rb_node);

		ret = __compare_inode_defrag(defrag, entry);
		if (ret < 0)
			p = &parent->rb_left;
		else if (ret > 0)
			p = &parent->rb_right;
		else {
			/*
			 * If we're reinserting an entry for an old defrag run,
			 * make sure to lower the transid of our existing
			 * record.
			 */
			if (defrag->transid < entry->transid)
				entry->transid = defrag->transid;
			entry->extent_thresh = min(defrag->extent_thresh,
						   entry->extent_thresh);
			return -EEXIST;
		}
	}
	set_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags);
	rb_link_node(&defrag->rb_node, parent, p);
	rb_insert_color(&defrag->rb_node, &fs_info->defrag_inodes);
	return 0;
}

static inline int __need_auto_defrag(struct btrfs_fs_info *fs_info)
{
	if (!btrfs_test_opt(fs_info, AUTO_DEFRAG))
		return 0;

	if (btrfs_fs_closing(fs_info))
		return 0;

	return 1;
}

/*
 * Insert a defrag record for this inode if auto defrag is enabled.
 */
int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
			   struct btrfs_inode *inode, u32 extent_thresh)
{
	struct btrfs_root *root = inode->root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct inode_defrag *defrag;
	u64 transid;
	int ret;

	if (!__need_auto_defrag(fs_info))
		return 0;

	if (test_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags))
		return 0;

	if (trans)
		transid = trans->transid;
	else
		transid = inode->root->last_trans;

	defrag = kmem_cache_zalloc(btrfs_inode_defrag_cachep, GFP_NOFS);
	if (!defrag)
		return -ENOMEM;

	defrag->ino = btrfs_ino(inode);
	defrag->transid = transid;
	defrag->root = root->root_key.objectid;
	defrag->extent_thresh = extent_thresh;

	spin_lock(&fs_info->defrag_inodes_lock);
	if (!test_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags)) {
		/*
		 * If we set the IN_DEFRAG flag and evict the inode from memory,
		 * and then re-read this inode, the new in-memory inode doesn't
		 * have IN_DEFRAG set.  In that case we may find an existing
		 * defrag record in the tree.
		 */
		ret = __btrfs_add_inode_defrag(inode, defrag);
		if (ret)
			kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
	} else {
		kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
	}
	spin_unlock(&fs_info->defrag_inodes_lock);
	return 0;
}

/*
 * Pick the defraggable inode that we want, if it doesn't exist, we will get
 * the next one.
 */
static struct inode_defrag *btrfs_pick_defrag_inode(
			struct btrfs_fs_info *fs_info, u64 root, u64 ino)
{
	struct inode_defrag *entry = NULL;
	struct inode_defrag tmp;
	struct rb_node *p;
	struct rb_node *parent = NULL;
	int ret;

	tmp.ino = ino;
	tmp.root = root;

	spin_lock(&fs_info->defrag_inodes_lock);
	p = fs_info->defrag_inodes.rb_node;
	while (p) {
		parent = p;
		entry = rb_entry(parent, struct inode_defrag, rb_node);

		ret = __compare_inode_defrag(&tmp, entry);
		if (ret < 0)
			p = parent->rb_left;
		else if (ret > 0)
			p = parent->rb_right;
		else
			goto out;
	}

	if (parent && __compare_inode_defrag(&tmp, entry) > 0) {
		parent = rb_next(parent);
		if (parent)
			entry = rb_entry(parent, struct inode_defrag, rb_node);
		else
			entry = NULL;
	}
out:
	if (entry)
		rb_erase(parent, &fs_info->defrag_inodes);
	spin_unlock(&fs_info->defrag_inodes_lock);
	return entry;
}

void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info)
{
	struct inode_defrag *defrag;
	struct rb_node *node;

	spin_lock(&fs_info->defrag_inodes_lock);
	node = rb_first(&fs_info->defrag_inodes);
	while (node) {
		rb_erase(node, &fs_info->defrag_inodes);
		defrag = rb_entry(node, struct inode_defrag, rb_node);
		kmem_cache_free(btrfs_inode_defrag_cachep, defrag);

		cond_resched_lock(&fs_info->defrag_inodes_lock);

		node = rb_first(&fs_info->defrag_inodes);
	}
	spin_unlock(&fs_info->defrag_inodes_lock);
}

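/*
 * Upper bound on the number of sectors handed to btrfs_defrag_file() in one
 * autodefrag pass (used as @max_to_defrag by __btrfs_run_defrag_inode()
 * below).
 */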
#define BTRFS_DEFRAG_BATCH	1024

static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,
				    struct inode_defrag *defrag)
{
	struct btrfs_root *inode_root;
	struct inode *inode;
	struct btrfs_ioctl_defrag_range_args range;
	int ret = 0;
	u64 cur = 0;

again:
	if (test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state))
		goto cleanup;
	if (!__need_auto_defrag(fs_info))
		goto cleanup;

	/* Get the inode */
	inode_root = btrfs_get_fs_root(fs_info, defrag->root, true);
	if (IS_ERR(inode_root)) {
		ret = PTR_ERR(inode_root);
		goto cleanup;
	}

	inode = btrfs_iget(fs_info->sb, defrag->ino, inode_root);
	btrfs_put_root(inode_root);
	if (IS_ERR(inode)) {
		ret = PTR_ERR(inode);
		goto cleanup;
	}

	if (cur >= i_size_read(inode)) {
		iput(inode);
		goto cleanup;
	}

	/* Do a chunk of defrag */
	clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags);
	memset(&range, 0, sizeof(range));
	range.len = (u64)-1;
	range.start = cur;
	range.extent_thresh = defrag->extent_thresh;

	sb_start_write(fs_info->sb);
	ret = btrfs_defrag_file(inode, NULL, &range, defrag->transid,
				BTRFS_DEFRAG_BATCH);
	sb_end_write(fs_info->sb);
	iput(inode);

	if (ret < 0)
		goto cleanup;

	cur = max(cur + fs_info->sectorsize, range.start);
	goto again;

cleanup:
	kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
	return ret;
}

/*
 * Run through the list of inodes in the FS that need defragging.
 */
int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
{
	struct inode_defrag *defrag;
	u64 first_ino = 0;
	u64 root_objectid = 0;

	atomic_inc(&fs_info->defrag_running);
	while (1) {
		/* Pause the auto defragger. */
		if (test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state))
			break;

		if (!__need_auto_defrag(fs_info))
			break;

		/* find an inode to defrag */
		defrag = btrfs_pick_defrag_inode(fs_info, root_objectid, first_ino);
		if (!defrag) {
			if (root_objectid || first_ino) {
				root_objectid = 0;
				first_ino = 0;
				continue;
			} else {
				break;
			}
		}

		first_ino = defrag->ino + 1;
		root_objectid = defrag->root;

		__btrfs_run_defrag_inode(fs_info, defrag);
	}
	atomic_dec(&fs_info->defrag_running);

	/*
	 * During unmount, we use the transaction_wait queue to wait for the
	 * defragger to stop.
	 */
	wake_up(&fs_info->transaction_wait);
	return 0;
}

/*
 * Check if two block addresses are close, used by defrag.
 */
static bool close_blocks(u64 blocknr, u64 other, u32 blocksize)
{
	if (blocknr < other && other - (blocknr + blocksize) < SZ_32K)
		return true;
	if (blocknr > other && blocknr - (other + blocksize) < SZ_32K)
		return true;
	return false;
}

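/*
 * Example: with a 16K nodesize, blocks at offsets X and X + 40K are "close"
 * (the gap between them is 40K - 16K = 24K, below SZ_32K), so
 * btrfs_realloc_node() below leaves them where they are instead of COWing
 * them to a new location.
 */
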
/*
 * Go through all the leaves pointed to by a node and reallocate them so that
 * disk order is close to key order.
 */
static int btrfs_realloc_node(struct btrfs_trans_handle *trans,
			      struct btrfs_root *root,
			      struct extent_buffer *parent,
			      int start_slot, u64 *last_ret,
			      struct btrfs_key *progress)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	const u32 blocksize = fs_info->nodesize;
	const int end_slot = btrfs_header_nritems(parent) - 1;
	u64 search_start = *last_ret;
	u64 last_block = 0;
	int ret = 0;
	bool progress_passed = false;

	/*
	 * COWing must happen through a running transaction, which always
	 * matches the current fs generation (it's a transaction with a state
	 * less than TRANS_STATE_UNBLOCKED). If it doesn't, then turn the fs
	 * into error state to prevent the commit of any transaction.
	 */
	if (unlikely(trans->transaction != fs_info->running_transaction ||
		     trans->transid != fs_info->generation)) {
		btrfs_abort_transaction(trans, -EUCLEAN);
		btrfs_crit(fs_info,
"unexpected transaction when attempting to reallocate parent %llu for root %llu, transaction %llu running transaction %llu fs generation %llu",
			   parent->start, btrfs_root_id(root), trans->transid,
			   fs_info->running_transaction->transid,
			   fs_info->generation);
		return -EUCLEAN;
	}

	if (btrfs_header_nritems(parent) <= 1)
		return 0;

	for (int i = start_slot; i <= end_slot; i++) {
		struct extent_buffer *cur;
		struct btrfs_disk_key disk_key;
		u64 blocknr;
		u64 other;
		bool close = true;

		btrfs_node_key(parent, &disk_key, i);
		if (!progress_passed && btrfs_comp_keys(&disk_key, progress) < 0)
			continue;

		progress_passed = true;
		blocknr = btrfs_node_blockptr(parent, i);
		if (last_block == 0)
			last_block = blocknr;

		if (i > 0) {
			other = btrfs_node_blockptr(parent, i - 1);
			close = close_blocks(blocknr, other, blocksize);
		}
		if (!close && i < end_slot) {
			other = btrfs_node_blockptr(parent, i + 1);
			close = close_blocks(blocknr, other, blocksize);
		}
		if (close) {
			last_block = blocknr;
			continue;
		}

		cur = btrfs_read_node_slot(parent, i);
		if (IS_ERR(cur))
			return PTR_ERR(cur);
		if (search_start == 0)
			search_start = last_block;

		btrfs_tree_lock(cur);
		ret = btrfs_force_cow_block(trans, root, cur, parent, i,
					    &cur, search_start,
					    min(16 * blocksize,
						(end_slot - i) * blocksize),
					    BTRFS_NESTING_COW);
		if (ret) {
			btrfs_tree_unlock(cur);
			free_extent_buffer(cur);
			break;
		}
		search_start = cur->start;
		last_block = cur->start;
		*last_ret = search_start;
		btrfs_tree_unlock(cur);
		free_extent_buffer(cur);
	}
	return ret;
}

/*
 * Defrag all the leaves in a given btree.
 * Read all the leaves and try to get key order to
 * better reflect disk order.
 */
static int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
			       struct btrfs_root *root)
{
	struct btrfs_path *path = NULL;
	struct btrfs_key key;
	int ret = 0;
	int wret;
	int level;
	int next_key_ret = 0;
	u64 last_ret = 0;

	if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
		goto out;

	path = btrfs_alloc_path();
	if (!path) {
		ret = -ENOMEM;
		goto out;
	}

	level = btrfs_header_level(root->node);

	if (level == 0)
		goto out;

	if (root->defrag_progress.objectid == 0) {
		struct extent_buffer *root_node;
		u32 nritems;

		root_node = btrfs_lock_root_node(root);
		nritems = btrfs_header_nritems(root_node);
		root->defrag_max.objectid = 0;
		/* from above we know this is not a leaf */
		btrfs_node_key_to_cpu(root_node, &root->defrag_max,
				      nritems - 1);
		btrfs_tree_unlock(root_node);
		free_extent_buffer(root_node);
		memset(&key, 0, sizeof(key));
	} else {
		memcpy(&key, &root->defrag_progress, sizeof(key));
	}

	path->keep_locks = 1;

	ret = btrfs_search_forward(root, &key, path, BTRFS_OLDEST_GENERATION);
	if (ret < 0)
		goto out;
	if (ret > 0) {
		ret = 0;
		goto out;
	}
	btrfs_release_path(path);
	/*
	 * We don't need a lock on a leaf. btrfs_realloc_node() will lock all
	 * leaves from path->nodes[1], so set lowest_level to 1 to avoid later
	 * a deadlock (attempting to write lock an already write locked leaf).
	 */
	path->lowest_level = 1;
	wret = btrfs_search_slot(trans, root, &key, path, 0, 1);

	if (wret < 0) {
		ret = wret;
		goto out;
	}
	if (!path->nodes[1]) {
		ret = 0;
		goto out;
	}
	/*
	 * The node at level 1 must always be locked when our path has
	 * keep_locks set and lowest_level is 1, regardless of the value of
	 * path->slots[1].
	 */
	BUG_ON(path->locks[1] == 0);
	ret = btrfs_realloc_node(trans, root,
				 path->nodes[1], 0,
				 &last_ret,
				 &root->defrag_progress);
	if (ret) {
		WARN_ON(ret == -EAGAIN);
		goto out;
	}
	/*
	 * Now that we reallocated the node we can find the next key. Note that
	 * btrfs_find_next_key() can release our path and do another search
	 * without COWing, this is because even with path->keep_locks = 1,
	 * btrfs_search_slot() / ctree.c:unlock_up() does not keep a lock on a
	 * node when path->slots[node_level - 1] does not point to the last
	 * item or a slot beyond the last item (ctree.c:unlock_up()). Therefore
	 * we search for the next key after reallocating our node.
	 */
	path->slots[1] = btrfs_header_nritems(path->nodes[1]);
	next_key_ret = btrfs_find_next_key(root, path, &key, 1,
					   BTRFS_OLDEST_GENERATION);
	if (next_key_ret == 0) {
		memcpy(&root->defrag_progress, &key, sizeof(key));
		ret = -EAGAIN;
	}
out:
	btrfs_free_path(path);
	if (ret == -EAGAIN) {
		if (root->defrag_max.objectid > root->defrag_progress.objectid)
			goto done;
		if (root->defrag_max.type > root->defrag_progress.type)
			goto done;
		if (root->defrag_max.offset > root->defrag_progress.offset)
			goto done;
		ret = 0;
	}
done:
	if (ret != -EAGAIN)
		memset(&root->defrag_progress, 0,
		       sizeof(root->defrag_progress));

	return ret;
}

/*
 * Defrag a given btree.  Every leaf in the btree is read and defragmented.
 */
int btrfs_defrag_root(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	int ret;

	if (test_and_set_bit(BTRFS_ROOT_DEFRAG_RUNNING, &root->state))
		return 0;

	while (1) {
		struct btrfs_trans_handle *trans;

		trans = btrfs_start_transaction(root, 0);
		if (IS_ERR(trans)) {
			ret = PTR_ERR(trans);
			break;
		}

		ret = btrfs_defrag_leaves(trans, root);

		btrfs_end_transaction(trans);
		btrfs_btree_balance_dirty(fs_info);
		cond_resched();

		if (btrfs_fs_closing(fs_info) || ret != -EAGAIN)
			break;

		if (btrfs_defrag_cancelled(fs_info)) {
			btrfs_debug(fs_info, "defrag_root cancelled");
			ret = -EAGAIN;
			break;
		}
	}
	clear_bit(BTRFS_ROOT_DEFRAG_RUNNING, &root->state);
	return ret;
}

/*
 * Defrag specific helper to get an extent map.
 *
 * Differences between this and btrfs_get_extent() are:
 *
 * - No extent_map will be added to inode->extent_tree
 *   To reduce memory usage in the long run.
 *
 * - Extra optimization to skip file extents older than @newer_than
 *   By using btrfs_search_forward() we can skip entire file ranges that
 *   have extents created in past transactions, because btrfs_search_forward()
 *   will not visit leaves and nodes with a generation smaller than given
 *   minimal generation threshold (@newer_than).
 *
 * Return valid em if we find a file extent matching the requirement.
 * Return NULL if we cannot find a file extent matching the requirement.
 *
 * Return ERR_PTR() for error.
 */
static struct extent_map *defrag_get_extent(struct btrfs_inode *inode,
					    u64 start, u64 newer_than)
{
	struct btrfs_root *root = inode->root;
	struct btrfs_file_extent_item *fi;
	struct btrfs_path path = { 0 };
	struct extent_map *em;
	struct btrfs_key key;
	u64 ino = btrfs_ino(inode);
	int ret;

	em = alloc_extent_map();
	if (!em) {
		ret = -ENOMEM;
		goto err;
	}

	key.objectid = ino;
	key.type = BTRFS_EXTENT_DATA_KEY;
	key.offset = start;

	if (newer_than) {
		ret = btrfs_search_forward(root, &key, &path, newer_than);
		if (ret < 0)
			goto err;
		/* Can't find anything newer */
		if (ret > 0)
			goto not_found;
	} else {
		ret = btrfs_search_slot(NULL, root, &key, &path, 0, 0);
		if (ret < 0)
			goto err;
	}
	if (path.slots[0] >= btrfs_header_nritems(path.nodes[0])) {
		/*
		 * If btrfs_search_slot() makes path to point beyond nritems,
		 * we should not have an empty leaf, as this inode must at
		 * least have its INODE_ITEM.
		 */
		ASSERT(btrfs_header_nritems(path.nodes[0]));
		path.slots[0] = btrfs_header_nritems(path.nodes[0]) - 1;
	}
	btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]);
	/* Perfect match, no need to go one slot back */
	if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY &&
	    key.offset == start)
		goto iterate;

	/* We didn't find a perfect match, needs to go one slot back */
	if (path.slots[0] > 0) {
		btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]);
		if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY)
			path.slots[0]--;
	}

iterate:
	/* Iterate through the path to find a file extent covering @start */
	while (1) {
		u64 extent_end;

		if (path.slots[0] >= btrfs_header_nritems(path.nodes[0]))
			goto next;

		btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]);

		/*
		 * We may go one slot back to INODE_REF/XATTR item, then
		 * need to go forward until we reach an EXTENT_DATA.
		 * But we should still have the correct ino as key.objectid.
		 */
		if (WARN_ON(key.objectid < ino) || key.type < BTRFS_EXTENT_DATA_KEY)
			goto next;

		/* It's beyond our target range, definitely not extent found */
		if (key.objectid > ino || key.type > BTRFS_EXTENT_DATA_KEY)
			goto not_found;

		/*
		 *	|	|<- File extent ->|
		 *	\- start
		 *
		 * This means there is a hole between start and key.offset.
		 */
		if (key.offset > start) {
			em->start = start;
			em->orig_start = start;
			em->block_start = EXTENT_MAP_HOLE;
			em->len = key.offset - start;
			break;
		}

		fi = btrfs_item_ptr(path.nodes[0], path.slots[0],
				    struct btrfs_file_extent_item);
		extent_end = btrfs_file_extent_end(&path);

		/*
		 *	|<- file extent ->|	|
		 *				\- start
		 *
		 * We haven't reached start, search next slot.
		 */
		if (extent_end <= start)
			goto next;

		/* Now this extent covers @start, convert it to em */
		btrfs_extent_item_to_extent_map(inode, &path, fi, em);
		break;
next:
		ret = btrfs_next_item(root, &path);
		if (ret < 0)
			goto err;
		if (ret > 0)
			goto not_found;
	}
	btrfs_release_path(&path);
	return em;

not_found:
	btrfs_release_path(&path);
	free_extent_map(em);
	return NULL;

err:
	btrfs_release_path(&path);
	free_extent_map(em);
	return ERR_PTR(ret);
}

static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start,
					       u64 newer_than, bool locked)
{
	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
	struct extent_map *em;
	const u32 sectorsize = BTRFS_I(inode)->root->fs_info->sectorsize;

	/*
	 * Hopefully we have this extent in the tree already, try without the
	 * full extent lock.
	 */
	read_lock(&em_tree->lock);
	em = lookup_extent_mapping(em_tree, start, sectorsize);
	read_unlock(&em_tree->lock);

	/*
	 * We can get a merged extent, in that case, we need to re-search
	 * tree to get the original em for defrag.
	 *
	 * If @newer_than is 0 or em::generation < newer_than, we can trust
	 * this em, as either we don't care about the generation, or the
	 * merged extent map will be rejected anyway.
	 */
	if (em && test_bit(EXTENT_FLAG_MERGED, &em->flags) &&
	    newer_than && em->generation >= newer_than) {
		free_extent_map(em);
		em = NULL;
	}

	if (!em) {
		struct extent_state *cached = NULL;
		u64 end = start + sectorsize - 1;

		/* Get the big lock and read metadata off disk. */
		if (!locked)
			lock_extent(io_tree, start, end, &cached);
		em = defrag_get_extent(BTRFS_I(inode), start, newer_than);
		if (!locked)
			unlock_extent(io_tree, start, end, &cached);

		if (IS_ERR(em))
			return NULL;
	}

	return em;
}

static u32 get_extent_max_capacity(const struct btrfs_fs_info *fs_info,
				   const struct extent_map *em)
{
	if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
		return BTRFS_MAX_COMPRESSED;
	return fs_info->max_extent_size;
}

static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em,
				     u32 extent_thresh, u64 newer_than, bool locked)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	struct extent_map *next;
	bool ret = false;

	/* This is the last extent */
	if (em->start + em->len >= i_size_read(inode))
		return false;

	/*
	 * Here we need to pass @newer_than when checking the next extent, or
	 * we will hit a case where we mark the current extent for defrag, but
	 * the next one will not be a target.
	 * This will just cause extra IO without really reducing the fragments.
	 */
	next = defrag_lookup_extent(inode, em->start + em->len, newer_than, locked);
	/* No more em or hole */
	if (!next || next->block_start >= EXTENT_MAP_LAST_BYTE)
		goto out;
	if (test_bit(EXTENT_FLAG_PREALLOC, &next->flags))
		goto out;
	/*
	 * If the next extent is at its max capacity, defragging current extent
	 * makes no sense, as the total number of extents won't change.
	 */
	if (next->len >= get_extent_max_capacity(fs_info, em))
		goto out;
	/* Skip older extent */
	if (next->generation < newer_than)
		goto out;
	/* Also check extent size */
	if (next->len >= extent_thresh)
		goto out;

	ret = true;
out:
	free_extent_map(next);
	return ret;
}

/*
 * Prepare one page to be defragged.
 *
 * This will ensure:
 *
 * - Returned page is locked and has been set up properly.
 * - No ordered extent exists in the page.
 * - The page is uptodate.
 *
 * NOTE: Caller should also wait for page writeback after the cluster is
 * prepared, here we don't do writeback wait for each page.
 */
static struct page *defrag_prepare_one_page(struct btrfs_inode *inode, pgoff_t index)
{
	struct address_space *mapping = inode->vfs_inode.i_mapping;
	gfp_t mask = btrfs_alloc_write_mask(mapping);
	u64 page_start = (u64)index << PAGE_SHIFT;
	u64 page_end = page_start + PAGE_SIZE - 1;
	struct extent_state *cached_state = NULL;
	struct page *page;
	int ret;

again:
	page = find_or_create_page(mapping, index, mask);
	if (!page)
		return ERR_PTR(-ENOMEM);

	/*
	 * Since we can defragment files opened read-only, we can encounter
	 * transparent huge pages here (see CONFIG_READ_ONLY_THP_FOR_FS). We
	 * can't do I/O using huge pages yet, so return an error for now.
	 * Filesystem transparent huge pages are typically only used for
	 * executables that explicitly enable them, so this isn't very
	 * important.
	 */
	if (PageCompound(page)) {
		unlock_page(page);
		put_page(page);
		return ERR_PTR(-ETXTBSY);
	}

	ret = set_page_extent_mapped(page);
	if (ret < 0) {
		unlock_page(page);
		put_page(page);
		return ERR_PTR(ret);
	}

	/* Wait for any existing ordered extent in the range */
	while (1) {
		struct btrfs_ordered_extent *ordered;

		lock_extent(&inode->io_tree, page_start, page_end, &cached_state);
		ordered = btrfs_lookup_ordered_range(inode, page_start, PAGE_SIZE);
		unlock_extent(&inode->io_tree, page_start, page_end,
			      &cached_state);
		if (!ordered)
			break;

		unlock_page(page);
		btrfs_start_ordered_extent(ordered);
		btrfs_put_ordered_extent(ordered);
		lock_page(page);
		/*
		 * We unlocked the page above, so we need to check whether it
		 * was released or not.
		 */
		if (page->mapping != mapping || !PagePrivate(page)) {
			unlock_page(page);
			put_page(page);
			goto again;
		}
	}

	/*
	 * Now the page range has no ordered extent any more.  Read the page to
	 * make it uptodate.
	 */
	if (!PageUptodate(page)) {
		btrfs_read_folio(NULL, page_folio(page));
		lock_page(page);
		if (page->mapping != mapping || !PagePrivate(page)) {
			unlock_page(page);
			put_page(page);
			goto again;
		}
		if (!PageUptodate(page)) {
			unlock_page(page);
			put_page(page);
			return ERR_PTR(-EIO);
		}
	}
	return page;
}

struct defrag_target_range {
	struct list_head list;
	u64 start;
	u64 len;
};

/*
 * Collect all valid target extents.
 *
 * @start:	   file offset to lookup
 * @len:	   length to lookup
 * @extent_thresh: file extent size threshold, any extent size >= this value
 *		   will be ignored
 * @newer_than:    only defrag extents newer than this value
 * @do_compress:   whether the defrag is doing compression
 *		   if true, @extent_thresh will be ignored and all regular
 *		   file extents meeting @newer_than will be targets.
 * @locked:	   if the range has already held extent lock
 * @target_list:   list of target file extents
 */
static int defrag_collect_targets(struct btrfs_inode *inode,
				  u64 start, u64 len, u32 extent_thresh,
				  u64 newer_than, bool do_compress,
				  bool locked, struct list_head *target_list,
				  u64 *last_scanned_ret)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	bool last_is_target = false;
	u64 cur = start;
	int ret = 0;

	while (cur < start + len) {
		struct extent_map *em;
		struct defrag_target_range *new;
		bool next_mergeable = true;
		u64 range_len;

		last_is_target = false;
		em = defrag_lookup_extent(&inode->vfs_inode, cur, newer_than, locked);
		if (!em)
			break;

		/*
		 * If the file extent is an inlined one, we may still want to
		 * defrag it (fallthrough) if it will cause a regular extent.
		 * This is for users who want to convert inline extents to
		 * regular ones through max_inline= mount option.
		 */
		if (em->block_start == EXTENT_MAP_INLINE &&
		    em->len <= inode->root->fs_info->max_inline)
			goto next;

		/* Skip hole/delalloc/preallocated extents */
		if (em->block_start == EXTENT_MAP_HOLE ||
		    em->block_start == EXTENT_MAP_DELALLOC ||
		    test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
			goto next;

		/* Skip older extent */
		if (em->generation < newer_than)
			goto next;

		/* This em is under writeback, no need to defrag */
		if (em->generation == (u64)-1)
			goto next;

		/*
		 * Our start offset might be in the middle of an existing extent
		 * map, so take that into account.
		 */
		range_len = em->len - (cur - em->start);
		/*
		 * If this range of the extent map is already flagged for delalloc,
		 * skip it, because:
		 *
		 * 1) We could deadlock later, when trying to reserve space for
		 *    delalloc, because in case we can't immediately reserve space
		 *    the flusher can start delalloc and wait for the respective
		 *    ordered extents to complete. The deadlock would happen
		 *    because we do the space reservation while holding the range
		 *    locked, and starting writeback, or finishing an ordered
		 *    extent, requires locking the range;
		 *
		 * 2) If there's delalloc there, it means there's dirty pages for
		 *    which writeback has not started yet (we clean the delalloc
		 *    flag when starting writeback and after creating an ordered
		 *    extent). If we mark pages in an adjacent range for defrag,
		 *    then we will have a larger contiguous range for delalloc,
		 *    very likely resulting in a larger extent after writeback is
		 *    triggered (except in a case of free space fragmentation).
		 */
		if (test_range_bit(&inode->io_tree, cur, cur + range_len - 1,
				   EXTENT_DELALLOC, 0, NULL))
			goto next;

		/*
		 * For do_compress case, we want to compress all valid file
		 * extents, thus no @extent_thresh or mergeable check.
		 */
		if (do_compress)
			goto add;

		/* Skip too large extent */
		if (range_len >= extent_thresh)
			goto next;

		/*
		 * Skip extents already at their max capacity, this is mostly
		 * for compressed extents, whose max capacity is only 128K.
		 */
		if (em->len >= get_extent_max_capacity(fs_info, em))
			goto next;

		/*
		 * Normally there are no more extents after an inline one, thus
		 * @next_mergeable will normally be false and not defragged.
		 * So if an inline extent passed all above checks, just add it
		 * for defrag, and be converted to regular extents.
		 */
		if (em->block_start == EXTENT_MAP_INLINE)
			goto add;

		next_mergeable = defrag_check_next_extent(&inode->vfs_inode, em,
						extent_thresh, newer_than, locked);
		if (!next_mergeable) {
			struct defrag_target_range *last;

			/* Empty target list, no way to merge with last entry */
			if (list_empty(target_list))
				goto next;
			last = list_entry(target_list->prev,
					  struct defrag_target_range, list);
			/* Not mergeable with last entry */
			if (last->start + last->len != cur)
				goto next;

			/* Mergeable, fall through to add it to @target_list. */
		}

add:
		last_is_target = true;
		range_len = min(extent_map_end(em), start + len) - cur;
		/*
		 * This one is a good target, check if it can be merged into
		 * last range of the target list.
		 */
		if (!list_empty(target_list)) {
			struct defrag_target_range *last;

			last = list_entry(target_list->prev,
					  struct defrag_target_range, list);
			ASSERT(last->start + last->len <= cur);
			if (last->start + last->len == cur) {
				/* Mergeable, enlarge the last entry */
				last->len += range_len;
				goto next;
			}
			/* Fall through to allocate a new entry */
		}

		/* Allocate new defrag_target_range */
		new = kmalloc(sizeof(*new), GFP_NOFS);
		if (!new) {
			free_extent_map(em);
			ret = -ENOMEM;
			break;
		}
		new->start = cur;
		new->len = range_len;
		list_add_tail(&new->list, target_list);

next:
		cur = extent_map_end(em);
		free_extent_map(em);
	}
	if (ret < 0) {
		struct defrag_target_range *entry;
		struct defrag_target_range *tmp;

		list_for_each_entry_safe(entry, tmp, target_list, list) {
			list_del_init(&entry->list);
			kfree(entry);
		}
	}
	if (!ret && last_scanned_ret) {
		/*
		 * If the last extent is not a target, the caller can skip to
		 * the end of that extent.
		 * Otherwise, we can only go to the end of the specified range.
		 */
		if (!last_is_target)
			*last_scanned_ret = max(cur, *last_scanned_ret);
		else
			*last_scanned_ret = max(start + len, *last_scanned_ret);
	}
	return ret;
}

#define CLUSTER_SIZE	(SZ_256K)
static_assert(PAGE_ALIGNED(CLUSTER_SIZE));

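/*
 * Defrag works on clusters of at most CLUSTER_SIZE (256K): defrag_one_range()
 * asserts that a range fits in one cluster, and btrfs_defrag_file() walks the
 * file one such cluster at a time.
 */
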
/*
 * Defrag one contiguous target range.
 *
 * @inode:	target inode
 * @target:	target range to defrag
 * @pages:	locked pages covering the defrag range
 * @nr_pages:	number of locked pages
 *
 * Caller should ensure:
 *
 * - Pages are prepared
 *   Pages should be locked, no ordered extent in the pages range,
 *   no writeback.
 *
 * - Extent bits are locked
 */
static int defrag_one_locked_target(struct btrfs_inode *inode,
				    struct defrag_target_range *target,
				    struct page **pages, int nr_pages,
				    struct extent_state **cached_state)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	struct extent_changeset *data_reserved = NULL;
	const u64 start = target->start;
	const u64 len = target->len;
	unsigned long last_index = (start + len - 1) >> PAGE_SHIFT;
	unsigned long start_index = start >> PAGE_SHIFT;
	unsigned long first_index = page_index(pages[0]);
	int ret = 0;
	int i;

	ASSERT(last_index - first_index + 1 <= nr_pages);

	ret = btrfs_delalloc_reserve_space(inode, &data_reserved, start, len);
	if (ret < 0)
		return ret;
	clear_extent_bit(&inode->io_tree, start, start + len - 1,
			 EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
			 EXTENT_DEFRAG, cached_state);
	set_extent_bit(&inode->io_tree, start, start + len - 1,
		       EXTENT_DELALLOC | EXTENT_DEFRAG, cached_state);

	/* Update the page status */
	for (i = start_index - first_index; i <= last_index - first_index; i++) {
		ClearPageChecked(pages[i]);
		btrfs_page_clamp_set_dirty(fs_info, pages[i], start, len);
	}
	btrfs_delalloc_release_extents(inode, len);
	extent_changeset_free(data_reserved);

	return ret;
}

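/*
 * Note that defrag_one_locked_target() only marks the range as delalloc with
 * EXTENT_DEFRAG set and dirties the pages; the actual rewrite into new
 * (ideally contiguous) extents is left to the subsequent writeback.
 */
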
static int defrag_one_range(struct btrfs_inode *inode, u64 start, u32 len,
			    u32 extent_thresh, u64 newer_than, bool do_compress,
			    u64 *last_scanned_ret)
{
	struct extent_state *cached_state = NULL;
	struct defrag_target_range *entry;
	struct defrag_target_range *tmp;
	LIST_HEAD(target_list);
	struct page **pages;
	const u32 sectorsize = inode->root->fs_info->sectorsize;
	u64 last_index = (start + len - 1) >> PAGE_SHIFT;
	u64 start_index = start >> PAGE_SHIFT;
	unsigned int nr_pages = last_index - start_index + 1;
	int ret = 0;
	int i;

	ASSERT(nr_pages <= CLUSTER_SIZE / PAGE_SIZE);
	ASSERT(IS_ALIGNED(start, sectorsize) && IS_ALIGNED(len, sectorsize));

	pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
	if (!pages)
		return -ENOMEM;

	/* Prepare all pages */
	for (i = 0; i < nr_pages; i++) {
		pages[i] = defrag_prepare_one_page(inode, start_index + i);
		if (IS_ERR(pages[i])) {
			ret = PTR_ERR(pages[i]);
			pages[i] = NULL;
			goto free_pages;
		}
	}
	for (i = 0; i < nr_pages; i++)
		wait_on_page_writeback(pages[i]);

	/* Lock the pages range */
	lock_extent(&inode->io_tree, start_index << PAGE_SHIFT,
		    (last_index << PAGE_SHIFT) + PAGE_SIZE - 1,
		    &cached_state);
	/*
	 * Now we have a consistent view about the extent map, re-check
	 * which range really needs to be defragged.
	 *
	 * And this time we have extent locked already, pass @locked = true
	 * so that we won't relock the extent range and cause deadlock.
	 */
	ret = defrag_collect_targets(inode, start, len, extent_thresh,
				     newer_than, do_compress, true,
				     &target_list, last_scanned_ret);
	if (ret < 0)
		goto unlock_extent;

	list_for_each_entry(entry, &target_list, list) {
		ret = defrag_one_locked_target(inode, entry, pages, nr_pages,
					       &cached_state);
		if (ret < 0)
			break;
	}

	list_for_each_entry_safe(entry, tmp, &target_list, list) {
		list_del_init(&entry->list);
		kfree(entry);
	}
unlock_extent:
	unlock_extent(&inode->io_tree, start_index << PAGE_SHIFT,
		      (last_index << PAGE_SHIFT) + PAGE_SIZE - 1,
		      &cached_state);
free_pages:
	for (i = 0; i < nr_pages; i++) {
		if (pages[i]) {
			unlock_page(pages[i]);
			put_page(pages[i]);
		}
	}
	kfree(pages);
	return ret;
}

static int defrag_one_cluster(struct btrfs_inode *inode,
			      struct file_ra_state *ra,
			      u64 start, u32 len, u32 extent_thresh,
			      u64 newer_than, bool do_compress,
			      unsigned long *sectors_defragged,
			      unsigned long max_sectors,
			      u64 *last_scanned_ret)
{
	const u32 sectorsize = inode->root->fs_info->sectorsize;
	struct defrag_target_range *entry;
	struct defrag_target_range *tmp;
	LIST_HEAD(target_list);
	int ret;

	ret = defrag_collect_targets(inode, start, len, extent_thresh,
				     newer_than, do_compress, false,
				     &target_list, NULL);
	if (ret < 0)
		goto out;

	list_for_each_entry(entry, &target_list, list) {
		u32 range_len = entry->len;

		/* Reached or beyond the limit */
		if (max_sectors && *sectors_defragged >= max_sectors) {
			ret = 1;
			break;
		}

		if (max_sectors)
			range_len = min_t(u32, range_len,
				(max_sectors - *sectors_defragged) * sectorsize);

		/*
		 * If defrag_one_range() has updated last_scanned_ret,
		 * our range may already be invalid (e.g. hole punched).
		 * Skip if our range is before last_scanned_ret, as there is
		 * no need to defrag the range anymore.
		 */
		if (entry->start + range_len <= *last_scanned_ret)
			continue;

		if (ra)
			page_cache_sync_readahead(inode->vfs_inode.i_mapping,
				ra, NULL, entry->start >> PAGE_SHIFT,
				((entry->start + range_len - 1) >> PAGE_SHIFT) -
				(entry->start >> PAGE_SHIFT) + 1);
		/*
		 * Here we may not defrag any range if holes are punched before
		 * we locked the pages.
		 * But that's fine, it only affects the @sectors_defragged
		 * accounting.
		 */
		ret = defrag_one_range(inode, entry->start, range_len,
				       extent_thresh, newer_than, do_compress,
				       last_scanned_ret);
		if (ret < 0)
			break;
		*sectors_defragged += range_len >>
				      inode->root->fs_info->sectorsize_bits;
	}
out:
	list_for_each_entry_safe(entry, tmp, &target_list, list) {
		list_del_init(&entry->list);
		kfree(entry);
	}
	if (ret >= 0)
		*last_scanned_ret = max(*last_scanned_ret, start + len);
	return ret;
}

/*
 * Entry point to file defragmentation.
 *
 * @inode:	   inode to be defragged
 * @ra:		   readahead state (can be NULL)
 * @range:	   defrag options including range and flags
 * @newer_than:	   minimum transid to defrag
 * @max_to_defrag: max number of sectors to be defragged, if 0, the whole inode
 *		   will be defragged.
 *
 * Return <0 for error.
 * Return >=0 for the number of sectors defragged, and range->start will be updated
 * to indicate the file offset where next defrag should be started at.
 * (Mostly for autodefrag, which sets @max_to_defrag thus we may exit early without
 *  defragging all the range).
 */
int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra,
		      struct btrfs_ioctl_defrag_range_args *range,
		      u64 newer_than, unsigned long max_to_defrag)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	unsigned long sectors_defragged = 0;
	u64 isize = i_size_read(inode);
	u64 cur;
	u64 last_byte;
	bool do_compress = (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS);
	bool ra_allocated = false;
	int compress_type = BTRFS_COMPRESS_ZLIB;
	int ret = 0;
	u32 extent_thresh = range->extent_thresh;
	pgoff_t start_index;

	if (isize == 0)
		return 0;

	if (range->start >= isize)
		return -EINVAL;

	if (do_compress) {
		if (range->compress_type >= BTRFS_NR_COMPRESS_TYPES)
			return -EINVAL;
		if (range->compress_type)
			compress_type = range->compress_type;
	}

	if (extent_thresh == 0)
		extent_thresh = SZ_256K;

	if (range->start + range->len > range->start) {
		/* Got a specific range */
		last_byte = min(isize, range->start + range->len);
	} else {
		/* Defrag until file end */
		last_byte = isize;
	}

	/* Align the range */
	cur = round_down(range->start, fs_info->sectorsize);
	last_byte = round_up(last_byte, fs_info->sectorsize) - 1;

	/*
	 * If we were not given a ra, allocate a readahead context. As
	 * readahead is just an optimization, defrag will work without it so
	 * we don't error out.
	 */
	if (!ra) {
		ra_allocated = true;
		ra = kzalloc(sizeof(*ra), GFP_KERNEL);
		if (ra)
			file_ra_state_init(ra, inode->i_mapping);
	}

	/*
	 * Make writeback start from the beginning of the range, so that the
	 * defrag range can be written sequentially.
	 */
	start_index = cur >> PAGE_SHIFT;
	if (start_index < inode->i_mapping->writeback_index)
		inode->i_mapping->writeback_index = start_index;

	while (cur < last_byte) {
		const unsigned long prev_sectors_defragged = sectors_defragged;
		u64 last_scanned = cur;
		u64 cluster_end;

		if (btrfs_defrag_cancelled(fs_info)) {
			ret = -EAGAIN;
			break;
		}

		/* We want the cluster end at page boundary when possible */
		cluster_end = (((cur >> PAGE_SHIFT) +
			       (SZ_256K >> PAGE_SHIFT)) << PAGE_SHIFT) - 1;
		cluster_end = min(cluster_end, last_byte);

		btrfs_inode_lock(BTRFS_I(inode), 0);
		if (IS_SWAPFILE(inode)) {
			ret = -ETXTBSY;
			btrfs_inode_unlock(BTRFS_I(inode), 0);
			break;
		}
		if (!(inode->i_sb->s_flags & SB_ACTIVE)) {
			btrfs_inode_unlock(BTRFS_I(inode), 0);
			break;
		}
		if (do_compress)
			BTRFS_I(inode)->defrag_compress = compress_type;
		ret = defrag_one_cluster(BTRFS_I(inode), ra, cur,
				cluster_end + 1 - cur, extent_thresh,
				newer_than, do_compress, &sectors_defragged,
				max_to_defrag, &last_scanned);

		if (sectors_defragged > prev_sectors_defragged)
			balance_dirty_pages_ratelimited(inode->i_mapping);

		btrfs_inode_unlock(BTRFS_I(inode), 0);
		if (ret < 0)
			break;
		cur = max(cluster_end + 1, last_scanned);
		if (ret > 0) {
			ret = 0;
			break;
		}
		cond_resched();
	}

	if (ra_allocated)
		kfree(ra);
	/*
	 * Update range.start for autodefrag, this will indicate where to start
	 * in next run.
	 */
	range->start = cur;
	if (sectors_defragged) {
		/*
		 * We have defragged some sectors, for compression case they
		 * need to be written back immediately.
		 */
		if (range->flags & BTRFS_DEFRAG_RANGE_START_IO) {
			filemap_flush(inode->i_mapping);
			if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
				     &BTRFS_I(inode)->runtime_flags))
				filemap_flush(inode->i_mapping);
		}
		if (range->compress_type == BTRFS_COMPRESS_LZO)
			btrfs_set_fs_incompat(fs_info, COMPRESS_LZO);
		else if (range->compress_type == BTRFS_COMPRESS_ZSTD)
			btrfs_set_fs_incompat(fs_info, COMPRESS_ZSTD);
		ret = sectors_defragged;
	}
	if (do_compress) {
		btrfs_inode_lock(BTRFS_I(inode), 0);
		BTRFS_I(inode)->defrag_compress = BTRFS_COMPRESS_NONE;
		btrfs_inode_unlock(BTRFS_I(inode), 0);
	}
	return ret;
}

void __cold btrfs_auto_defrag_exit(void)
{
	kmem_cache_destroy(btrfs_inode_defrag_cachep);
}

int __init btrfs_auto_defrag_init(void)
{
	btrfs_inode_defrag_cachep = kmem_cache_create("btrfs_inode_defrag",
					sizeof(struct inode_defrag), 0,
					SLAB_MEM_SPREAD,
					NULL);
	if (!btrfs_inode_defrag_cachep)
		return -ENOMEM;

	return 0;
}