// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2011 STRATO.  All rights reserved.
 */

#include <linux/mm.h>
#include <linux/rbtree.h>
#include <trace/events/btrfs.h>
#include "ctree.h"
#include "disk-io.h"
#include "backref.h"
#include "ulist.h"
#include "transaction.h"
#include "delayed-ref.h"
#include "locking.h"
#include "misc.h"
#include "tree-mod-log.h"
#include "fs.h"
#include "accessors.h"
#include "extent-tree.h"
#include "relocation.h"
#include "tree-checker.h"

/* Just arbitrary numbers so we can be sure one of these happened. */
#define BACKREF_FOUND_SHARED     6
#define BACKREF_FOUND_NOT_SHARED 7

struct extent_inode_elem {
	u64 inum;
	u64 offset;
	u64 num_bytes;
	struct extent_inode_elem *next;
};

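/*
 * Illustrative sketch, not part of the original file: check_extent_in_eb()
 * below prepends one element per matching file extent item, so a consumer
 * typically walks the singly linked list like this (the callback name is
 * hypothetical):
 *
 *	for (struct extent_inode_elem *e = inode_list; e; e = e->next)
 *		my_inode_cb(e->inum, e->offset, e->num_bytes);
 */
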
static int check_extent_in_eb(struct btrfs_backref_walk_ctx *ctx,
			      const struct btrfs_key *key,
			      const struct extent_buffer *eb,
			      const struct btrfs_file_extent_item *fi,
			      struct extent_inode_elem **eie)
{
	const u64 data_len = btrfs_file_extent_num_bytes(eb, fi);
	u64 offset = key->offset;
	struct extent_inode_elem *e;
	const u64 *root_ids;
	int root_count;
	bool cached;

	if (!ctx->ignore_extent_item_pos &&
	    !btrfs_file_extent_compression(eb, fi) &&
	    !btrfs_file_extent_encryption(eb, fi) &&
	    !btrfs_file_extent_other_encoding(eb, fi)) {
		u64 data_offset;

		data_offset = btrfs_file_extent_offset(eb, fi);

		if (ctx->extent_item_pos < data_offset ||
		    ctx->extent_item_pos >= data_offset + data_len)
			return 1;
		offset += ctx->extent_item_pos - data_offset;
	}

	if (!ctx->indirect_ref_iterator || !ctx->cache_lookup)
		goto add_inode_elem;

	cached = ctx->cache_lookup(eb->start, ctx->user_ctx, &root_ids,
				   &root_count);
	if (!cached)
		goto add_inode_elem;

	for (int i = 0; i < root_count; i++) {
		int ret;

		ret = ctx->indirect_ref_iterator(key->objectid, offset,
						 data_len, root_ids[i],
						 ctx->user_ctx);
		if (ret)
			return ret;
	}

add_inode_elem:
	e = kmalloc(sizeof(*e), GFP_NOFS);
	if (!e)
		return -ENOMEM;

	e->next = *eie;
	e->inum = key->objectid;
	e->offset = offset;
	e->num_bytes = data_len;
	*eie = e;

	return 0;
}

static void free_inode_elem_list(struct extent_inode_elem *eie)
{
	struct extent_inode_elem *eie_next;

	for (; eie; eie = eie_next) {
		eie_next = eie->next;
		kfree(eie);
	}
}

static int find_extent_in_eb(struct btrfs_backref_walk_ctx *ctx,
			     const struct extent_buffer *eb,
			     struct extent_inode_elem **eie)
{
	u64 disk_byte;
	struct btrfs_key key;
	struct btrfs_file_extent_item *fi;
	int slot;
	int nritems;
	int extent_type;
	int ret;

	/*
	 * from the shared data ref, we only have the leaf but we need
	 * the key. thus, we must look into all items and see that we
	 * find one (some) with a reference to our extent item.
	 */
	nritems = btrfs_header_nritems(eb);
	for (slot = 0; slot < nritems; ++slot) {
		btrfs_item_key_to_cpu(eb, &key, slot);
		if (key.type != BTRFS_EXTENT_DATA_KEY)
			continue;
		fi = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
		extent_type = btrfs_file_extent_type(eb, fi);
		if (extent_type == BTRFS_FILE_EXTENT_INLINE)
			continue;
		/* don't skip BTRFS_FILE_EXTENT_PREALLOC, we can handle that */
		disk_byte = btrfs_file_extent_disk_bytenr(eb, fi);
		if (disk_byte != ctx->bytenr)
			continue;

		ret = check_extent_in_eb(ctx, &key, eb, fi, eie);
		if (ret == BTRFS_ITERATE_EXTENT_INODES_STOP || ret < 0)
			return ret;
	}

	return 0;
}

struct preftree {
	struct rb_root_cached root;
	unsigned int count;
};

#define PREFTREE_INIT	{ .root = RB_ROOT_CACHED, .count = 0 }

struct preftrees {
	struct preftree direct;    /* BTRFS_SHARED_[DATA|BLOCK]_REF_KEY */
	struct preftree indirect;  /* BTRFS_[TREE_BLOCK|EXTENT_DATA]_REF_KEY */
	struct preftree indirect_missing_keys;
};

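/*
 * Overview sketch (illustration only, mirroring the comment above
 * resolve_indirect_refs() further below): a backref walk initializes all
 * three trees and funnels every ref into the direct tree before iterating:
 *
 *	struct preftrees preftrees = {
 *		.direct = PREFTREE_INIT,
 *		.indirect = PREFTREE_INIT,
 *		.indirect_missing_keys = PREFTREE_INIT
 *	};
 *
 * 1) add_direct_ref()/add_indirect_ref() fill the trees
 * 2) add_missing_keys() moves indirect_missing_keys -> indirect
 * 3) resolve_indirect_refs() moves indirect -> direct
 * 4) the direct tree is iterated, then all three are prelim_release()d
 */
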
/*
 * Checks for a shared extent during backref search.
 *
 * The share_count tracks prelim_refs (direct and indirect) having a
 * ref->count > 0:
 *  - incremented when a ref->count transitions to > 0
 *  - decremented when a ref->count transitions to < 1
 */
struct share_check {
	struct btrfs_backref_share_check_ctx *ctx;
	struct btrfs_root *root;
	u64 inum;
	u64 data_bytenr;
	u64 data_extent_gen;
	/*
	 * Counts number of inodes that refer to an extent (different inodes in
	 * the same root or different roots) that we could find. The sharedness
	 * check typically stops once this counter gets greater than 1, so it
	 * may not reflect the total number of inodes.
	 */
	int share_count;
	/*
	 * The number of times we found our inode refers to the data extent we
	 * are determining the sharedness. In other words, how many file extent
	 * items we could find for our inode that point to our target data
	 * extent. The value we get here after finishing the extent sharedness
	 * check may be smaller than reality, but if it ends up being greater
	 * than 1, then we know for sure the inode has multiple file extent
	 * items that point to our data extent, and we can safely assume it's
	 * useful to cache the sharedness check result.
	 */
	int self_ref_count;
	bool have_delayed_delete_refs;
};

static inline int extent_is_shared(struct share_check *sc)
{
	return (sc && sc->share_count > 1) ? BACKREF_FOUND_SHARED : 0;
}

static struct kmem_cache *btrfs_prelim_ref_cache;

int __init btrfs_prelim_ref_init(void)
{
	btrfs_prelim_ref_cache = kmem_cache_create("btrfs_prelim_ref",
					sizeof(struct prelim_ref),
					0, SLAB_MEM_SPREAD, NULL);
	if (!btrfs_prelim_ref_cache)
		return -ENOMEM;
	return 0;
}

void __cold btrfs_prelim_ref_exit(void)
{
	kmem_cache_destroy(btrfs_prelim_ref_cache);
}

static void free_pref(struct prelim_ref *ref)
{
	kmem_cache_free(btrfs_prelim_ref_cache, ref);
}

/*
 * Return 0 when both refs are for the same block (and can be merged).
 * A -1 return indicates ref1 is a 'lower' block than ref2, while 1
 * indicates a 'higher' block.
 */
static int prelim_ref_compare(struct prelim_ref *ref1,
			      struct prelim_ref *ref2)
{
	if (ref1->level < ref2->level)
		return -1;
	if (ref1->level > ref2->level)
		return 1;
	if (ref1->root_id < ref2->root_id)
		return -1;
	if (ref1->root_id > ref2->root_id)
		return 1;
	if (ref1->key_for_search.type < ref2->key_for_search.type)
		return -1;
	if (ref1->key_for_search.type > ref2->key_for_search.type)
		return 1;
	if (ref1->key_for_search.objectid < ref2->key_for_search.objectid)
		return -1;
	if (ref1->key_for_search.objectid > ref2->key_for_search.objectid)
		return 1;
	if (ref1->key_for_search.offset < ref2->key_for_search.offset)
		return -1;
	if (ref1->key_for_search.offset > ref2->key_for_search.offset)
		return 1;
	if (ref1->parent < ref2->parent)
		return -1;
	if (ref1->parent > ref2->parent)
		return 1;

	return 0;
}

static void update_share_count(struct share_check *sc, int oldcount,
			       int newcount, struct prelim_ref *newref)
{
	if ((!sc) || (oldcount == 0 && newcount < 1))
		return;

	if (oldcount > 0 && newcount < 1)
		sc->share_count--;
	else if (oldcount < 1 && newcount > 0)
		sc->share_count++;

	if (newref->root_id == sc->root->root_key.objectid &&
	    newref->wanted_disk_byte == sc->data_bytenr &&
	    newref->key_for_search.objectid == sc->inum)
		sc->self_ref_count += newref->count;
}

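/*
 * Worked example of the transitions (illustration, not from the original
 * file): a delayed BTRFS_ADD_DELAYED_REF merged first and a matching
 * BTRFS_DROP_DELAYED_REF merged afterwards make the ref's count go
 * 0 -> 1 -> 0, so share_count is incremented and then decremented again:
 *
 *	update_share_count(sc, 0, 1, newref);	// 0 -> 1: share_count++
 *	update_share_count(sc, 1, 0, newref);	// 1 -> 0: share_count--
 */
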
/*
 * Add @newref to the @root rbtree, merging identical refs.
 *
 * Callers should assume that newref has been freed after calling.
 */
static void prelim_ref_insert(const struct btrfs_fs_info *fs_info,
			      struct preftree *preftree,
			      struct prelim_ref *newref,
			      struct share_check *sc)
{
	struct rb_root_cached *root;
	struct rb_node **p;
	struct rb_node *parent = NULL;
	struct prelim_ref *ref;
	int result;
	bool leftmost = true;

	root = &preftree->root;
	p = &root->rb_root.rb_node;

	while (*p) {
		parent = *p;
		ref = rb_entry(parent, struct prelim_ref, rbnode);
		result = prelim_ref_compare(ref, newref);
		if (result < 0) {
			p = &(*p)->rb_left;
		} else if (result > 0) {
			p = &(*p)->rb_right;
			leftmost = false;
		} else {
			/* Identical refs, merge them and free @newref */
			struct extent_inode_elem *eie = ref->inode_list;

			while (eie && eie->next)
				eie = eie->next;

			if (!eie)
				ref->inode_list = newref->inode_list;
			else
				eie->next = newref->inode_list;
			trace_btrfs_prelim_ref_merge(fs_info, ref, newref,
						     preftree->count);
			/*
			 * A delayed ref can have newref->count < 0.
			 * The ref->count is updated to follow any
			 * BTRFS_[ADD|DROP]_DELAYED_REF actions.
			 */
			update_share_count(sc, ref->count,
					   ref->count + newref->count, newref);
			ref->count += newref->count;
			free_pref(newref);
			return;
		}
	}

	update_share_count(sc, 0, newref->count, newref);
	preftree->count++;
	trace_btrfs_prelim_ref_insert(fs_info, newref, NULL, preftree->count);
	rb_link_node(&newref->rbnode, parent, p);
	rb_insert_color_cached(&newref->rbnode, root, leftmost);
}

/*
 * Release the entire tree.  We don't care about internal consistency so
 * just free everything and then reset the tree root.
 */
static void prelim_release(struct preftree *preftree)
{
	struct prelim_ref *ref, *next_ref;

	rbtree_postorder_for_each_entry_safe(ref, next_ref,
					     &preftree->root.rb_root, rbnode) {
		free_inode_elem_list(ref->inode_list);
		free_pref(ref);
	}

	preftree->root = RB_ROOT_CACHED;
	preftree->count = 0;
}

/*
 * the rules for all callers of this function are:
 * - obtaining the parent is the goal
 * - if you add a key, you must know that it is a correct key
 * - if you cannot add the parent or a correct key, then we will look into the
 *   block later to set a correct key
 *
 * delayed refs
 * ============
 *        backref type | shared | indirect | shared | indirect
 * information         |   tree |     tree |   data |     data
 * --------------------+--------+----------+--------+----------
 *      parent logical |    y   |     -    |    -   |     -
 *      key to resolve |    -   |     y    |    y   |     y
 *  tree block logical |    -   |     -    |    -   |     -
 *  root for resolving |    y   |     y    |    y   |     y
 *
 * - column 1:       we've the parent -> done
 * - column 2, 3, 4: we use the key to find the parent
 *
 * on disk refs (inline or keyed)
 * ==============================
 *        backref type | shared | indirect | shared | indirect
 * information         |   tree |     tree |   data |     data
 * --------------------+--------+----------+--------+----------
 *      parent logical |    y   |     -    |    y   |     -
 *      key to resolve |    -   |     -    |    -   |     y
 *  tree block logical |    y   |     y    |    y   |     y
 *  root for resolving |    -   |     y    |    y   |     y
 *
 * - column 1, 3: we've the parent -> done
 * - column 2:    we take the first key from the block to find the parent
 *                (see add_missing_keys)
 * - column 4:    we use the key to find the parent
 *
 * additional information that's available but not required to find the parent
 * block might help in merging entries to gain some speed.
 */

static int add_prelim_ref(const struct btrfs_fs_info *fs_info,
			  struct preftree *preftree, u64 root_id,
			  const struct btrfs_key *key, int level, u64 parent,
			  u64 wanted_disk_byte, int count,
			  struct share_check *sc, gfp_t gfp_mask)
{
	struct prelim_ref *ref;

	if (root_id == BTRFS_DATA_RELOC_TREE_OBJECTID)
		return 0;

	ref = kmem_cache_alloc(btrfs_prelim_ref_cache, gfp_mask);
	if (!ref)
		return -ENOMEM;

	ref->root_id = root_id;
	if (key)
		ref->key_for_search = *key;
	else
		memset(&ref->key_for_search, 0, sizeof(ref->key_for_search));

	ref->inode_list = NULL;
	ref->level = level;
	ref->count = count;
	ref->parent = parent;
	ref->wanted_disk_byte = wanted_disk_byte;
	prelim_ref_insert(fs_info, preftree, ref, sc);
	return extent_is_shared(sc);
}

/* direct refs use root == 0, key == NULL */
static int add_direct_ref(const struct btrfs_fs_info *fs_info,
			  struct preftrees *preftrees, int level, u64 parent,
			  u64 wanted_disk_byte, int count,
			  struct share_check *sc, gfp_t gfp_mask)
{
	return add_prelim_ref(fs_info, &preftrees->direct, 0, NULL, level,
			      parent, wanted_disk_byte, count, sc, gfp_mask);
}

/* indirect refs use parent == 0 */
static int add_indirect_ref(const struct btrfs_fs_info *fs_info,
			    struct preftrees *preftrees, u64 root_id,
			    const struct btrfs_key *key, int level,
			    u64 wanted_disk_byte, int count,
			    struct share_check *sc, gfp_t gfp_mask)
{
	struct preftree *tree = &preftrees->indirect;

	if (!key)
		tree = &preftrees->indirect_missing_keys;
	return add_prelim_ref(fs_info, tree, root_id, key, level, 0,
			      wanted_disk_byte, count, sc, gfp_mask);
}

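/*
 * For illustration only (not part of the original file): a shared tree
 * block backref resolves immediately to a parent block, while a tree root
 * backref must be resolved later through a key lookup in that root:
 *
 *	add_direct_ref(fs_info, preftrees, level + 1, parent_bytenr,
 *		       bytenr, 1, NULL, GFP_NOFS);
 *	add_indirect_ref(fs_info, preftrees, root_objectid, NULL,
 *			 level + 1, bytenr, 1, NULL, GFP_NOFS);
 *
 * The second call lands in indirect_missing_keys because key == NULL, and
 * add_missing_keys() later reads the block to fill in the key.
 */
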
static int is_shared_data_backref(struct preftrees *preftrees, u64 bytenr)
{
	struct rb_node **p = &preftrees->direct.root.rb_root.rb_node;
	struct rb_node *parent = NULL;
	struct prelim_ref *ref = NULL;
	struct prelim_ref target = {};
	int result;

	target.parent = bytenr;

	while (*p) {
		parent = *p;
		ref = rb_entry(parent, struct prelim_ref, rbnode);
		result = prelim_ref_compare(ref, &target);

		if (result < 0)
			p = &(*p)->rb_left;
		else if (result > 0)
			p = &(*p)->rb_right;
		else
			return 1;
	}
	return 0;
}

static int add_all_parents(struct btrfs_backref_walk_ctx *ctx,
			   struct btrfs_root *root, struct btrfs_path *path,
			   struct ulist *parents,
			   struct preftrees *preftrees, struct prelim_ref *ref,
			   int level)
{
	int ret = 0;
	int slot;
	struct extent_buffer *eb;
	struct btrfs_key key;
	struct btrfs_key *key_for_search = &ref->key_for_search;
	struct btrfs_file_extent_item *fi;
	struct extent_inode_elem *eie = NULL, *old = NULL;
	u64 disk_byte;
	u64 wanted_disk_byte = ref->wanted_disk_byte;
	u64 count = 0;
	u64 data_offset;
	u8 type;

	if (level != 0) {
		eb = path->nodes[level];
		ret = ulist_add(parents, eb->start, 0, GFP_NOFS);
		if (ret < 0)
			return ret;
		return 0;
	}

	/*
	 * 1. We normally enter this function with the path already pointing to
	 *    the first item to check. But sometimes, we may enter it with
	 *    slot == nritems.
	 * 2. We are searching for normal backref but bytenr of this leaf
	 *    matches shared data backref
	 * 3. The leaf owner is not equal to the root we are searching
	 *
	 * For these cases, go to the next leaf before we continue.
	 */
	eb = path->nodes[0];
	if (path->slots[0] >= btrfs_header_nritems(eb) ||
	    is_shared_data_backref(preftrees, eb->start) ||
	    ref->root_id != btrfs_header_owner(eb)) {
		if (ctx->time_seq == BTRFS_SEQ_LAST)
			ret = btrfs_next_leaf(root, path);
		else
			ret = btrfs_next_old_leaf(root, path, ctx->time_seq);
	}

	while (!ret && count < ref->count) {
		eb = path->nodes[0];
		slot = path->slots[0];

		btrfs_item_key_to_cpu(eb, &key, slot);

		if (key.objectid != key_for_search->objectid ||
		    key.type != BTRFS_EXTENT_DATA_KEY)
			break;

		/*
		 * We are searching for normal backref but bytenr of this leaf
		 * matches shared data backref, OR
		 * the leaf owner is not equal to the root we are searching for
		 */
		if (slot == 0 &&
		    (is_shared_data_backref(preftrees, eb->start) ||
		     ref->root_id != btrfs_header_owner(eb))) {
			if (ctx->time_seq == BTRFS_SEQ_LAST)
				ret = btrfs_next_leaf(root, path);
			else
				ret = btrfs_next_old_leaf(root, path, ctx->time_seq);
			continue;
		}
		fi = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
		type = btrfs_file_extent_type(eb, fi);
		if (type == BTRFS_FILE_EXTENT_INLINE)
			goto next;
		disk_byte = btrfs_file_extent_disk_bytenr(eb, fi);
		data_offset = btrfs_file_extent_offset(eb, fi);

		if (disk_byte == wanted_disk_byte) {
			eie = NULL;
			old = NULL;
			if (ref->key_for_search.offset == key.offset - data_offset)
				count++;
			else
				goto next;
			if (!ctx->skip_inode_ref_list) {
				ret = check_extent_in_eb(ctx, &key, eb, fi, &eie);
				if (ret == BTRFS_ITERATE_EXTENT_INODES_STOP ||
				    ret < 0)
					break;
			}
			if (ret > 0)
				goto next;
			ret = ulist_add_merge_ptr(parents, eb->start,
						  eie, (void **)&old, GFP_NOFS);
			if (ret < 0)
				break;
			if (!ret && !ctx->skip_inode_ref_list) {
				while (old->next)
					old = old->next;
				old->next = eie;
			}
			eie = NULL;
		}
next:
		if (ctx->time_seq == BTRFS_SEQ_LAST)
			ret = btrfs_next_item(root, path);
		else
			ret = btrfs_next_old_item(root, path, ctx->time_seq);
	}

	if (ret == BTRFS_ITERATE_EXTENT_INODES_STOP || ret < 0)
		free_inode_elem_list(eie);
	else if (ret > 0)
		ret = 0;

	return ret;
}

/*
 * resolve an indirect backref in the form (root_id, key, level)
 * to a logical address
 */
static int resolve_indirect_ref(struct btrfs_backref_walk_ctx *ctx,
				struct btrfs_path *path,
				struct preftrees *preftrees,
				struct prelim_ref *ref, struct ulist *parents)
{
	struct btrfs_root *root;
	struct extent_buffer *eb;
	int ret = 0;
	int root_level;
	int level = ref->level;
	struct btrfs_key search_key = ref->key_for_search;

	/*
	 * If we're search_commit_root we could possibly be holding locks on
	 * other tree nodes.  This happens when qgroups does backref walks when
	 * adding new delayed refs.  To deal with this we need to look in cache
	 * for the root, and if we don't find it then we need to search the
	 * tree_root's commit root, thus the btrfs_get_fs_root_commit_root usage
	 * here.
	 */
	if (path->search_commit_root)
		root = btrfs_get_fs_root_commit_root(ctx->fs_info, path, ref->root_id);
	else
		root = btrfs_get_fs_root(ctx->fs_info, ref->root_id, false);
	if (IS_ERR(root)) {
		ret = PTR_ERR(root);
		goto out_free_path;
	}

	if (!path->search_commit_root &&
	    test_bit(BTRFS_ROOT_DELETING, &root->state)) {
		ret = -ENOENT;
		goto out;
	}

	if (btrfs_is_testing(ctx->fs_info)) {
		ret = -ENOENT;
		goto out;
	}

	if (path->search_commit_root)
		root_level = btrfs_header_level(root->commit_root);
	else if (ctx->time_seq == BTRFS_SEQ_LAST)
		root_level = btrfs_header_level(root->node);
	else
		root_level = btrfs_old_root_level(root, ctx->time_seq);

	if (root_level + 1 == level)
		goto out;

	/*
	 * We can often find data backrefs with an offset that is too large
	 * (>= LLONG_MAX, maximum allowed file offset) due to underflows when
	 * subtracting a file's offset with the data offset of its
	 * corresponding extent data item. This can happen for example in the
	 * clone ioctl.
	 *
	 * So if we detect such case we set the search key's offset to zero to
	 * make sure we will find the matching file extent item at
	 * add_all_parents(), otherwise we will miss it because the offset
	 * taken from the backref is much larger than the offset of the file
	 * extent item. This can make us scan a very large number of file
	 * extent items, but at least it will not make us miss any.
	 *
	 * This is an ugly workaround for a behaviour that should have never
	 * existed, but it does and a fix for the clone ioctl would touch a lot
	 * of places, cause backwards incompatibility and would not fix the
	 * problem for extents cloned with older kernels.
	 */
	if (search_key.type == BTRFS_EXTENT_DATA_KEY &&
	    search_key.offset >= LLONG_MAX)
		search_key.offset = 0;
	path->lowest_level = level;
	if (ctx->time_seq == BTRFS_SEQ_LAST)
		ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
	else
		ret = btrfs_search_old_slot(root, &search_key, path, ctx->time_seq);

	btrfs_debug(ctx->fs_info,
		"search slot in root %llu (level %d, ref count %d) returned %d for key (%llu %u %llu)",
		 ref->root_id, level, ref->count, ret,
		 ref->key_for_search.objectid, ref->key_for_search.type,
		 ref->key_for_search.offset);
	if (ret < 0)
		goto out;

	eb = path->nodes[level];
	while (!eb) {
		if (WARN_ON(!level)) {
			ret = 1;
			goto out;
		}
		level--;
		eb = path->nodes[level];
	}

	ret = add_all_parents(ctx, root, path, parents, preftrees, ref, level);
out:
	btrfs_put_root(root);
out_free_path:
	path->lowest_level = 0;
	btrfs_release_path(path);
	return ret;
}

static struct extent_inode_elem *
unode_aux_to_inode_list(struct ulist_node *node)
{
	if (!node)
		return NULL;
	return (struct extent_inode_elem *)(uintptr_t)node->aux;
}

static void free_leaf_list(struct ulist *ulist)
{
	struct ulist_node *node;
	struct ulist_iterator uiter;

	ULIST_ITER_INIT(&uiter);
	while ((node = ulist_next(ulist, &uiter)))
		free_inode_elem_list(unode_aux_to_inode_list(node));

	ulist_free(ulist);
}

/*
 * We maintain three separate rbtrees: one for direct refs, one for
 * indirect refs which have a key, and one for indirect refs which do not
 * have a key. Each tree does merge on insertion.
 *
 * Once all of the references are located, we iterate over the tree of
 * indirect refs with missing keys. An appropriate key is located and
 * the ref is moved onto the tree for indirect refs. After all missing
 * keys are thus located, we iterate over the indirect ref tree, resolve
 * each reference, and then insert the resolved reference onto the
 * direct tree (merging there too).
 *
 * New backrefs (i.e., for parent nodes) are added to the appropriate
 * rbtree as they are encountered. The new backrefs are subsequently
 * resolved as above.
 */
static int resolve_indirect_refs(struct btrfs_backref_walk_ctx *ctx,
				 struct btrfs_path *path,
				 struct preftrees *preftrees,
				 struct share_check *sc)
{
	int err;
	int ret = 0;
	struct ulist *parents;
	struct ulist_node *node;
	struct ulist_iterator uiter;
	struct rb_node *rnode;

	parents = ulist_alloc(GFP_NOFS);
	if (!parents)
		return -ENOMEM;

	/*
	 * We could trade memory usage for performance here by iterating
	 * the tree, allocating new refs for each insertion, and then
	 * freeing the entire indirect tree when we're done.  In some test
	 * cases, the tree can grow quite large (~200k objects).
	 */
	while ((rnode = rb_first_cached(&preftrees->indirect.root))) {
		struct prelim_ref *ref;

		ref = rb_entry(rnode, struct prelim_ref, rbnode);
		if (WARN(ref->parent,
			 "BUG: direct ref found in indirect tree")) {
			ret = -EINVAL;
			goto out;
		}

		rb_erase_cached(&ref->rbnode, &preftrees->indirect.root);
		preftrees->indirect.count--;

		if (ref->count == 0) {
			free_pref(ref);
			continue;
		}

		if (sc && ref->root_id != sc->root->root_key.objectid) {
			free_pref(ref);
			ret = BACKREF_FOUND_SHARED;
			goto out;
		}
		err = resolve_indirect_ref(ctx, path, preftrees, ref, parents);
		/*
		 * we can only tolerate ENOENT, otherwise we should catch the
		 * error and return directly.
		 */
		if (err == -ENOENT) {
			prelim_ref_insert(ctx->fs_info, &preftrees->direct, ref,
					  NULL);
			continue;
		} else if (err) {
			free_pref(ref);
			ret = err;
			goto out;
		}

		/* we put the first parent into the ref at hand */
		ULIST_ITER_INIT(&uiter);
		node = ulist_next(parents, &uiter);
		ref->parent = node ? node->val : 0;
		ref->inode_list = unode_aux_to_inode_list(node);

		/* Add a prelim_ref(s) for any other parent(s). */
		while ((node = ulist_next(parents, &uiter))) {
			struct prelim_ref *new_ref;

			new_ref = kmem_cache_alloc(btrfs_prelim_ref_cache,
						   GFP_NOFS);
			if (!new_ref) {
				free_pref(ref);
				ret = -ENOMEM;
				goto out;
			}
			memcpy(new_ref, ref, sizeof(*ref));
			new_ref->parent = node->val;
			new_ref->inode_list = unode_aux_to_inode_list(node);
			prelim_ref_insert(ctx->fs_info, &preftrees->direct,
					  new_ref, NULL);
		}

		/*
		 * Now it's a direct ref, put it in the direct tree. We must
		 * do this last because the ref could be merged/freed here.
		 */
		prelim_ref_insert(ctx->fs_info, &preftrees->direct, ref, NULL);

		ulist_reinit(parents);
		cond_resched();
	}
out:
	/*
	 * We may have inode lists attached to refs in the parents ulist, so we
	 * must free them before freeing the ulist and its refs.
	 */
	free_leaf_list(parents);
	return ret;
}

/*
 * read tree blocks and add keys where required.
 */
static int add_missing_keys(struct btrfs_fs_info *fs_info,
			    struct preftrees *preftrees, bool lock)
{
	struct prelim_ref *ref;
	struct extent_buffer *eb;
	struct preftree *tree = &preftrees->indirect_missing_keys;
	struct rb_node *node;

	while ((node = rb_first_cached(&tree->root))) {
		struct btrfs_tree_parent_check check = { 0 };

		ref = rb_entry(node, struct prelim_ref, rbnode);
		rb_erase_cached(node, &tree->root);

		BUG_ON(ref->parent);	/* should not be a direct ref */
		BUG_ON(ref->key_for_search.type);
		BUG_ON(!ref->wanted_disk_byte);

		check.level = ref->level - 1;
		check.owner_root = ref->root_id;

		eb = read_tree_block(fs_info, ref->wanted_disk_byte, &check);
		if (IS_ERR(eb)) {
			free_pref(ref);
			return PTR_ERR(eb);
		}
		if (!extent_buffer_uptodate(eb)) {
			free_pref(ref);
			free_extent_buffer(eb);
			return -EIO;
		}

		if (lock)
			btrfs_tree_read_lock(eb);
		if (btrfs_header_level(eb) == 0)
			btrfs_item_key_to_cpu(eb, &ref->key_for_search, 0);
		else
			btrfs_node_key_to_cpu(eb, &ref->key_for_search, 0);
		if (lock)
			btrfs_tree_read_unlock(eb);
		free_extent_buffer(eb);
		prelim_ref_insert(fs_info, &preftrees->indirect, ref, NULL);
		cond_resched();
	}
	return 0;
}

/*
 * add all currently queued delayed refs from this head whose seq nr is
 * smaller or equal that seq to the list
 */
static int add_delayed_refs(const struct btrfs_fs_info *fs_info,
			    struct btrfs_delayed_ref_head *head, u64 seq,
			    struct preftrees *preftrees, struct share_check *sc)
{
	struct btrfs_delayed_ref_node *node;
	struct btrfs_key key;
	struct rb_node *n;
	int count;
	int ret = 0;

	spin_lock(&head->lock);
	for (n = rb_first_cached(&head->ref_tree); n; n = rb_next(n)) {
		node = rb_entry(n, struct btrfs_delayed_ref_node,
				ref_node);
		if (node->seq > seq)
			continue;

		switch (node->action) {
		case BTRFS_ADD_DELAYED_EXTENT:
		case BTRFS_UPDATE_DELAYED_HEAD:
			WARN_ON(1);
			continue;
		case BTRFS_ADD_DELAYED_REF:
			count = node->ref_mod;
			break;
		case BTRFS_DROP_DELAYED_REF:
			count = node->ref_mod * -1;
			break;
		default:
			BUG();
		}
		switch (node->type) {
		case BTRFS_TREE_BLOCK_REF_KEY: {
			/* NORMAL INDIRECT METADATA backref */
			struct btrfs_delayed_tree_ref *ref;
			struct btrfs_key *key_ptr = NULL;

			if (head->extent_op && head->extent_op->update_key) {
				btrfs_disk_key_to_cpu(&key, &head->extent_op->key);
				key_ptr = &key;
			}

			ref = btrfs_delayed_node_to_tree_ref(node);
			ret = add_indirect_ref(fs_info, preftrees, ref->root,
					       key_ptr, ref->level + 1,
					       node->bytenr, count, sc,
					       GFP_ATOMIC);
			break;
		}
		case BTRFS_SHARED_BLOCK_REF_KEY: {
			/* SHARED DIRECT METADATA backref */
			struct btrfs_delayed_tree_ref *ref;

			ref = btrfs_delayed_node_to_tree_ref(node);

			ret = add_direct_ref(fs_info, preftrees, ref->level + 1,
					     ref->parent, node->bytenr, count,
					     sc, GFP_ATOMIC);
			break;
		}
		case BTRFS_EXTENT_DATA_REF_KEY: {
			/* NORMAL INDIRECT DATA backref */
			struct btrfs_delayed_data_ref *ref;

			ref = btrfs_delayed_node_to_data_ref(node);

			key.objectid = ref->objectid;
			key.type = BTRFS_EXTENT_DATA_KEY;
			key.offset = ref->offset;

			/*
			 * If we have a share check context and a reference for
			 * another inode, we can't exit immediately. This is
			 * because even if this is a BTRFS_ADD_DELAYED_REF
			 * reference we may find next a BTRFS_DROP_DELAYED_REF
			 * which cancels out this ADD reference.
			 *
			 * If this is a DROP reference and there was no previous
			 * ADD reference, then we need to signal that when we
			 * process references from the extent tree (through
			 * add_inline_refs() and add_keyed_refs()), we should
			 * not exit early if we find a reference for another
			 * inode, because one of the delayed DROP references
			 * may cancel that reference in the extent tree.
			 */
			if (sc && count < 0)
				sc->have_delayed_delete_refs = true;

			ret = add_indirect_ref(fs_info, preftrees, ref->root,
					       &key, 0, node->bytenr, count, sc,
					       GFP_ATOMIC);
			break;
		}
		case BTRFS_SHARED_DATA_REF_KEY: {
			/* SHARED DIRECT FULL backref */
			struct btrfs_delayed_data_ref *ref;

			ref = btrfs_delayed_node_to_data_ref(node);

			ret = add_direct_ref(fs_info, preftrees, 0, ref->parent,
					     node->bytenr, count, sc,
					     GFP_ATOMIC);
			break;
		}
		default:
			WARN_ON(1);
		}
		/*
		 * We must ignore BACKREF_FOUND_SHARED until all delayed
		 * refs have been checked.
		 */
		if (ret && (ret != BACKREF_FOUND_SHARED))
			break;
	}
	if (!ret)
		ret = extent_is_shared(sc);

	spin_unlock(&head->lock);
	return ret;
}

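/*
 * Worked example (illustration only): a head holding an ADD ref with
 * ref_mod 2 and a DROP ref with ref_mod 1 for the same (root, key)
 * contributes counts +2 and -1; prelim_ref_insert() merges them into a
 * single prelim_ref with count 1. This cancellation is exactly why
 * BACKREF_FOUND_SHARED must be ignored above until the whole ref_tree
 * has been scanned.
 */
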
/*
 * add all inline backrefs for bytenr to the list
 *
 * Returns 0 on success, <0 on error, or BACKREF_FOUND_SHARED.
 */
static int add_inline_refs(struct btrfs_backref_walk_ctx *ctx,
			   struct btrfs_path *path,
			   int *info_level, struct preftrees *preftrees,
			   struct share_check *sc)
{
	int ret = 0;
	int slot;
	struct extent_buffer *leaf;
	struct btrfs_key key;
	struct btrfs_key found_key;
	unsigned long ptr;
	unsigned long end;
	struct btrfs_extent_item *ei;
	u64 flags;
	u64 item_size;

	/*
	 * enumerate all inline refs
	 */
	leaf = path->nodes[0];
	slot = path->slots[0];

	item_size = btrfs_item_size(leaf, slot);
	ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item);

	if (ctx->check_extent_item) {
		ret = ctx->check_extent_item(ctx->bytenr, ei, leaf, ctx->user_ctx);
		if (ret)
			return ret;
	}

	flags = btrfs_extent_flags(leaf, ei);
	btrfs_item_key_to_cpu(leaf, &found_key, slot);

	ptr = (unsigned long)(ei + 1);
	end = (unsigned long)ei + item_size;

	if (found_key.type == BTRFS_EXTENT_ITEM_KEY &&
	    flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
		struct btrfs_tree_block_info *info;

		info = (struct btrfs_tree_block_info *)ptr;
		*info_level = btrfs_tree_block_level(leaf, info);
		ptr += sizeof(struct btrfs_tree_block_info);
		BUG_ON(ptr > end);
	} else if (found_key.type == BTRFS_METADATA_ITEM_KEY) {
		*info_level = found_key.offset;
	} else {
		BUG_ON(!(flags & BTRFS_EXTENT_FLAG_DATA));
	}

	while (ptr < end) {
		struct btrfs_extent_inline_ref *iref;
		u64 offset;
		int type;

		iref = (struct btrfs_extent_inline_ref *)ptr;
		type = btrfs_get_extent_inline_ref_type(leaf, iref,
							BTRFS_REF_TYPE_ANY);
		if (type == BTRFS_REF_TYPE_INVALID)
			return -EUCLEAN;

		offset = btrfs_extent_inline_ref_offset(leaf, iref);

		switch (type) {
		case BTRFS_SHARED_BLOCK_REF_KEY:
			ret = add_direct_ref(ctx->fs_info, preftrees,
					     *info_level + 1, offset,
					     ctx->bytenr, 1, NULL, GFP_NOFS);
			break;
		case BTRFS_SHARED_DATA_REF_KEY: {
			struct btrfs_shared_data_ref *sdref;
			int count;

			sdref = (struct btrfs_shared_data_ref *)(iref + 1);
			count = btrfs_shared_data_ref_count(leaf, sdref);

			ret = add_direct_ref(ctx->fs_info, preftrees, 0, offset,
					     ctx->bytenr, count, sc, GFP_NOFS);
			break;
		}
		case BTRFS_TREE_BLOCK_REF_KEY:
			ret = add_indirect_ref(ctx->fs_info, preftrees, offset,
					       NULL, *info_level + 1,
					       ctx->bytenr, 1, NULL, GFP_NOFS);
			break;
		case BTRFS_EXTENT_DATA_REF_KEY: {
			struct btrfs_extent_data_ref *dref;
			int count;
			u64 root;

			dref = (struct btrfs_extent_data_ref *)(&iref->offset);
			count = btrfs_extent_data_ref_count(leaf, dref);
			key.objectid = btrfs_extent_data_ref_objectid(leaf,
								      dref);
			key.type = BTRFS_EXTENT_DATA_KEY;
			key.offset = btrfs_extent_data_ref_offset(leaf, dref);

			if (sc && key.objectid != sc->inum &&
			    !sc->have_delayed_delete_refs) {
				ret = BACKREF_FOUND_SHARED;
				break;
			}

			root = btrfs_extent_data_ref_root(leaf, dref);

			if (!ctx->skip_data_ref ||
			    !ctx->skip_data_ref(root, key.objectid, key.offset,
						ctx->user_ctx))
				ret = add_indirect_ref(ctx->fs_info, preftrees,
						       root, &key, 0, ctx->bytenr,
						       count, sc, GFP_NOFS);
			break;
		}
		case BTRFS_EXTENT_OWNER_REF_KEY:
			ASSERT(btrfs_fs_incompat(ctx->fs_info, SIMPLE_QUOTA));
			break;
		default:
			WARN_ON(1);
		}
		if (ret)
			return ret;
		ptr += btrfs_extent_inline_ref_size(type);
	}

	return 0;
}

/*
 * add all non-inline backrefs for bytenr to the list
 *
 * Returns 0 on success, <0 on error, or BACKREF_FOUND_SHARED.
 */
static int add_keyed_refs(struct btrfs_backref_walk_ctx *ctx,
			  struct btrfs_root *extent_root,
			  struct btrfs_path *path,
			  int info_level, struct preftrees *preftrees,
			  struct share_check *sc)
{
	struct btrfs_fs_info *fs_info = extent_root->fs_info;
	int ret;
	int slot;
	struct extent_buffer *leaf;
	struct btrfs_key key;

	while (1) {
		ret = btrfs_next_item(extent_root, path);
		if (ret < 0)
			break;
		if (ret) {
			ret = 0;
			break;
		}

		slot = path->slots[0];
		leaf = path->nodes[0];
		btrfs_item_key_to_cpu(leaf, &key, slot);

		if (key.objectid != ctx->bytenr)
			break;
		if (key.type < BTRFS_TREE_BLOCK_REF_KEY)
			continue;
		if (key.type > BTRFS_SHARED_DATA_REF_KEY)
			break;

		switch (key.type) {
		case BTRFS_SHARED_BLOCK_REF_KEY:
			/* SHARED DIRECT METADATA backref */
			ret = add_direct_ref(fs_info, preftrees,
					     info_level + 1, key.offset,
					     ctx->bytenr, 1, NULL, GFP_NOFS);
			break;
		case BTRFS_SHARED_DATA_REF_KEY: {
			/* SHARED DIRECT FULL backref */
			struct btrfs_shared_data_ref *sdref;
			int count;

			sdref = btrfs_item_ptr(leaf, slot,
					       struct btrfs_shared_data_ref);
			count = btrfs_shared_data_ref_count(leaf, sdref);
			ret = add_direct_ref(fs_info, preftrees, 0,
					     key.offset, ctx->bytenr, count,
					     sc, GFP_NOFS);
			break;
		}
		case BTRFS_TREE_BLOCK_REF_KEY:
			/* NORMAL INDIRECT METADATA backref */
			ret = add_indirect_ref(fs_info, preftrees, key.offset,
					       NULL, info_level + 1, ctx->bytenr,
					       1, NULL, GFP_NOFS);
			break;
		case BTRFS_EXTENT_DATA_REF_KEY: {
			/* NORMAL INDIRECT DATA backref */
			struct btrfs_extent_data_ref *dref;
			int count;
			u64 root;

			dref = btrfs_item_ptr(leaf, slot,
					      struct btrfs_extent_data_ref);
			count = btrfs_extent_data_ref_count(leaf, dref);
			key.objectid = btrfs_extent_data_ref_objectid(leaf,
								      dref);
			key.type = BTRFS_EXTENT_DATA_KEY;
			key.offset = btrfs_extent_data_ref_offset(leaf, dref);

			if (sc && key.objectid != sc->inum &&
			    !sc->have_delayed_delete_refs) {
				ret = BACKREF_FOUND_SHARED;
				break;
			}

			root = btrfs_extent_data_ref_root(leaf, dref);

			if (!ctx->skip_data_ref ||
			    !ctx->skip_data_ref(root, key.objectid, key.offset,
						ctx->user_ctx))
				ret = add_indirect_ref(fs_info, preftrees, root,
						       &key, 0, ctx->bytenr,
						       count, sc, GFP_NOFS);
			break;
		}
		default:
			WARN_ON(1);
		}
		if (ret)
			return ret;
	}

	return ret;
}

/*
 * The caller has joined a transaction or is holding a read lock on the
 * fs_info->commit_root_sem semaphore, so no need to worry about the root's last
 * snapshot field changing while updating or checking the cache.
 */
static bool lookup_backref_shared_cache(struct btrfs_backref_share_check_ctx *ctx,
					struct btrfs_root *root,
					u64 bytenr, int level, bool *is_shared)
{
	const struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_backref_shared_cache_entry *entry;

	if (!current->journal_info)
		lockdep_assert_held(&fs_info->commit_root_sem);

	if (!ctx->use_path_cache)
		return false;

	if (WARN_ON_ONCE(level >= BTRFS_MAX_LEVEL))
		return false;

	/*
	 * Level -1 is used for the data extent, which is not reliable to cache
	 * because its reference count can increase or decrease without us
	 * realizing. We cache results only for extent buffers that lead from
	 * the root node down to the leaf with the file extent item.
	 */
	ASSERT(level >= 0);

	entry = &ctx->path_cache_entries[level];

	/* Unused cache entry or being used for some other extent buffer. */
	if (entry->bytenr != bytenr)
		return false;

	/*
	 * We cached a false result, but the last snapshot generation of the
	 * root changed, so we now have a snapshot. Don't trust the result.
	 */
	if (!entry->is_shared &&
	    entry->gen != btrfs_root_last_snapshot(&root->root_item))
		return false;

	/*
	 * If we cached a true result and the last generation used for dropping
	 * a root changed, we can not trust the result, because the dropped root
	 * could be a snapshot sharing this extent buffer.
	 */
	if (entry->is_shared &&
	    entry->gen != btrfs_get_last_root_drop_gen(fs_info))
		return false;

	*is_shared = entry->is_shared;
	/*
	 * If the node at this level is shared, then all nodes below are also
	 * shared. Currently some of the nodes below may be marked as not shared
	 * because we have just switched from one leaf to another, and switched
	 * also other nodes above the leaf and below the current level, so mark
	 * them as shared.
	 */
	if (*is_shared) {
		for (int i = 0; i < level; i++) {
			ctx->path_cache_entries[i].is_shared = true;
			ctx->path_cache_entries[i].gen = entry->gen;
		}
	}

	return true;
}

/*
 * The caller has joined a transaction or is holding a read lock on the
 * fs_info->commit_root_sem semaphore, so no need to worry about the root's last
 * snapshot field changing while updating or checking the cache.
 */
static void store_backref_shared_cache(struct btrfs_backref_share_check_ctx *ctx,
				       struct btrfs_root *root,
				       u64 bytenr, int level, bool is_shared)
{
	const struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_backref_shared_cache_entry *entry;
	u64 gen;

	if (!current->journal_info)
		lockdep_assert_held(&fs_info->commit_root_sem);

	if (!ctx->use_path_cache)
		return;

	if (WARN_ON_ONCE(level >= BTRFS_MAX_LEVEL))
		return;

	/*
	 * Level -1 is used for the data extent, which is not reliable to cache
	 * because its reference count can increase or decrease without us
	 * realizing. We cache results only for extent buffers that lead from
	 * the root node down to the leaf with the file extent item.
	 */
	ASSERT(level >= 0);

	if (is_shared)
		gen = btrfs_get_last_root_drop_gen(fs_info);
	else
		gen = btrfs_root_last_snapshot(&root->root_item);

	entry = &ctx->path_cache_entries[level];
	entry->bytenr = bytenr;
	entry->is_shared = is_shared;
	entry->gen = gen;

	/*
	 * If we found an extent buffer is shared, set the cache result for all
	 * extent buffers below it to true. As nodes in the path are COWed,
	 * their sharedness is moved to their children, and if a leaf is COWed,
	 * then the sharedness of the data extent becomes direct: the refcount
	 * of the data extent is increased in the extent item in the extent tree.
	 */
	if (is_shared) {
		for (int i = 0; i < level; i++) {
			entry = &ctx->path_cache_entries[i];
			entry->is_shared = is_shared;
			entry->gen = gen;
		}
	}
}

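/*
 * Example (illustration, not from the original file): after walking a path
 * with root node A (level 2), node B (level 1) and leaf C (level 0), where
 * A was found to be shared, the cache ends up as
 *
 *	path_cache_entries[2] = { .bytenr = A, .is_shared = true, .gen = last_drop_gen };
 *	path_cache_entries[1] = { .bytenr = B, .is_shared = true, .gen = last_drop_gen };
 *	path_cache_entries[0] = { .bytenr = C, .is_shared = true, .gen = last_drop_gen };
 *
 * because a shared node implies that every node below it is shared too.
 */
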
/*
 * this adds all existing backrefs (inline backrefs, backrefs and delayed
 * refs) for the given bytenr to the refs list, merges duplicates and resolves
 * indirect refs to their parent bytenr.
 * When roots are found, they're added to the roots list
 *
 * @ctx:     Backref walking context object, must be not NULL.
 * @sc:      If !NULL, then immediately return BACKREF_FOUND_SHARED when a
 *           shared extent is detected.
 *
 * Otherwise this returns 0 for success and <0 for an error.
 *
 * FIXME some caching might speed things up
 */
static int find_parent_nodes(struct btrfs_backref_walk_ctx *ctx,
			     struct share_check *sc)
{
	struct btrfs_root *root = btrfs_extent_root(ctx->fs_info, ctx->bytenr);
	struct btrfs_key key;
	struct btrfs_path *path;
	struct btrfs_delayed_ref_root *delayed_refs = NULL;
	struct btrfs_delayed_ref_head *head;
	int info_level = 0;
	int ret;
	struct prelim_ref *ref;
	struct rb_node *node;
	struct extent_inode_elem *eie = NULL;
	struct preftrees preftrees = {
		.direct = PREFTREE_INIT,
		.indirect = PREFTREE_INIT,
		.indirect_missing_keys = PREFTREE_INIT
	};

	/* Roots ulist is not needed when using a sharedness check context. */
	if (sc)
		ASSERT(ctx->roots == NULL);

	key.objectid = ctx->bytenr;
	key.offset = (u64)-1;
	if (btrfs_fs_incompat(ctx->fs_info, SKINNY_METADATA))
		key.type = BTRFS_METADATA_ITEM_KEY;
	else
		key.type = BTRFS_EXTENT_ITEM_KEY;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
	if (!ctx->trans) {
		path->search_commit_root = 1;
		path->skip_locking = 1;
	}

	if (ctx->time_seq == BTRFS_SEQ_LAST)
		path->skip_locking = 1;

again:
	head = NULL;

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		goto out;
	if (ret == 0) {
		/*
		 * Key with offset -1 found, there would have to exist an extent
		 * item with such offset, but this is out of the valid range.
		 */
		ret = -EUCLEAN;
		goto out;
	}

	if (ctx->trans && likely(ctx->trans->type != __TRANS_DUMMY) &&
	    ctx->time_seq != BTRFS_SEQ_LAST) {
		/*
		 * We have a specific time_seq we care about and trans which
		 * means we have the path lock, we need to grab the ref head and
		 * lock it so we have a consistent view of the refs at the given
		 * time.
		 */
		delayed_refs = &ctx->trans->transaction->delayed_refs;
		spin_lock(&delayed_refs->lock);
		head = btrfs_find_delayed_ref_head(delayed_refs, ctx->bytenr);
		if (head) {
			if (!mutex_trylock(&head->mutex)) {
				refcount_inc(&head->refs);
				spin_unlock(&delayed_refs->lock);

				btrfs_release_path(path);

				/*
				 * Mutex was contended, block until it's
				 * released and try again
				 */
				mutex_lock(&head->mutex);
				mutex_unlock(&head->mutex);
				btrfs_put_delayed_ref_head(head);
				goto again;
			}
			spin_unlock(&delayed_refs->lock);
			ret = add_delayed_refs(ctx->fs_info, head, ctx->time_seq,
					       &preftrees, sc);
			mutex_unlock(&head->mutex);
			if (ret)
				goto out;
		} else {
			spin_unlock(&delayed_refs->lock);
		}
	}

	if (path->slots[0]) {
		struct extent_buffer *leaf;
		int slot;

		path->slots[0]--;
		leaf = path->nodes[0];
		slot = path->slots[0];
		btrfs_item_key_to_cpu(leaf, &key, slot);
		if (key.objectid == ctx->bytenr &&
		    (key.type == BTRFS_EXTENT_ITEM_KEY ||
		     key.type == BTRFS_METADATA_ITEM_KEY)) {
			ret = add_inline_refs(ctx, path, &info_level,
					      &preftrees, sc);
			if (ret)
				goto out;
			ret = add_keyed_refs(ctx, root, path, info_level,
					     &preftrees, sc);
			if (ret)
				goto out;
		}
	}

	/*
	 * If we have a share context and we reached here, it means the extent
	 * is not directly shared (no multiple reference items for it),
	 * otherwise we would have exited earlier with a return value of
	 * BACKREF_FOUND_SHARED after processing delayed references or while
	 * processing inline or keyed references from the extent tree.
	 * The extent may however be indirectly shared through shared subtrees
	 * as a result from creating snapshots, so we determine below what is
	 * its parent node, in case we are dealing with a metadata extent, or
	 * what's the leaf (or leaves), from a fs tree, that has a file extent
	 * item pointing to it in case we are dealing with a data extent.
	 */
	ASSERT(extent_is_shared(sc) == 0);

	/*
	 * If we are here for a data extent and we have a share_check structure
	 * it means the data extent is not directly shared (does not have
	 * multiple reference items), so we have to check if a path in the fs
	 * tree (going from the root node down to the leaf that has the file
	 * extent item pointing to the data extent) is shared, that is, if any
	 * of the extent buffers in the path is referenced by other trees.
	 */
	if (sc && ctx->bytenr == sc->data_bytenr) {
		/*
		 * If our data extent is from a generation more recent than the
		 * last generation used to snapshot the root, then we know that
		 * it can not be shared through subtrees, so we can skip
		 * resolving indirect references, there's no point in
		 * determining the extent buffers for the path from the fs tree
		 * root node down to the leaf that has the file extent item that
		 * points to the data extent.
		 */
		if (sc->data_extent_gen >
		    btrfs_root_last_snapshot(&sc->root->root_item)) {
			ret = BACKREF_FOUND_NOT_SHARED;
			goto out;
		}

		/*
		 * If we are only determining if a data extent is shared or not
		 * and the corresponding file extent item is located in the same
		 * leaf as the previous file extent item, we can skip resolving
		 * indirect references for a data extent, since the fs tree path
		 * is the same (same leaf, so same path). We skip as long as the
		 * cached result for the leaf is valid and only if there's only
		 * one file extent item pointing to the data extent, because in
		 * the case of multiple file extent items, they may be located
		 * in different leaves and therefore we have multiple paths.
		 */
		if (sc->ctx->curr_leaf_bytenr == sc->ctx->prev_leaf_bytenr &&
		    sc->self_ref_count == 1) {
			bool cached;
			bool is_shared;

			cached = lookup_backref_shared_cache(sc->ctx, sc->root,
						sc->ctx->curr_leaf_bytenr,
						0, &is_shared);
			if (cached) {
				if (is_shared)
					ret = BACKREF_FOUND_SHARED;
				else
					ret = BACKREF_FOUND_NOT_SHARED;
				goto out;
			}
		}
	}

	btrfs_release_path(path);

	ret = add_missing_keys(ctx->fs_info, &preftrees, path->skip_locking == 0);
	if (ret)
		goto out;

	WARN_ON(!RB_EMPTY_ROOT(&preftrees.indirect_missing_keys.root.rb_root));

	ret = resolve_indirect_refs(ctx, path, &preftrees, sc);
	if (ret)
		goto out;

	WARN_ON(!RB_EMPTY_ROOT(&preftrees.indirect.root.rb_root));

	/*
	 * This walks the tree of merged and resolved refs. Tree blocks are
	 * read in as needed. Unique entries are added to the ulist, and
	 * the list of found roots is updated.
	 *
	 * We release the entire tree in one go before returning.
	 */
	node = rb_first_cached(&preftrees.direct.root);
	while (node) {
		ref = rb_entry(node, struct prelim_ref, rbnode);
		node = rb_next(&ref->rbnode);
		/*
		 * ref->count < 0 can happen here if there are delayed
		 * refs with a node->action of BTRFS_DROP_DELAYED_REF.
		 * prelim_ref_insert() relies on this when merging
		 * identical refs to keep the overall count correct.
		 * prelim_ref_insert() will merge only those refs
		 * which compare identically.  Any refs having
		 * e.g. different offsets would not be merged,
		 * and would retain their original ref->count < 0.
		 */
		if (ctx->roots && ref->count && ref->root_id && ref->parent == 0) {
			/* no parent == root of tree */
			ret = ulist_add(ctx->roots, ref->root_id, 0, GFP_NOFS);
			if (ret < 0)
				goto out;
		}
		if (ref->count && ref->parent) {
			if (!ctx->skip_inode_ref_list && !ref->inode_list &&
			    ref->level == 0) {
				struct btrfs_tree_parent_check check = { 0 };
				struct extent_buffer *eb;

				check.level = ref->level;

				eb = read_tree_block(ctx->fs_info, ref->parent,
						     &check);
				if (IS_ERR(eb)) {
					ret = PTR_ERR(eb);
					goto out;
				}
				if (!extent_buffer_uptodate(eb)) {
					free_extent_buffer(eb);
					ret = -EIO;
					goto out;
				}

				if (!path->skip_locking)
					btrfs_tree_read_lock(eb);
				ret = find_extent_in_eb(ctx, eb, &eie);
				if (!path->skip_locking)
					btrfs_tree_read_unlock(eb);
				free_extent_buffer(eb);
				if (ret == BTRFS_ITERATE_EXTENT_INODES_STOP ||
				    ret < 0)
					goto out;
				ref->inode_list = eie;
				/*
				 * We transferred the list ownership to the ref,
				 * so set to NULL to avoid a double free in case
				 * an error happens after this.
				 */
				eie = NULL;
			}
			ret = ulist_add_merge_ptr(ctx->refs, ref->parent,
						  ref->inode_list,
						  (void **)&eie, GFP_NOFS);
			if (ret < 0)
				goto out;
			if (!ret && !ctx->skip_inode_ref_list) {
				/*
				 * We've recorded that parent, so we must extend
				 * its inode list here.
				 *
				 * However if there was corruption we may not
				 * have found an eie, return an error in this
				 * case.
				 */
				ASSERT(eie);
				if (!eie) {
					ret = -EUCLEAN;
					goto out;
				}
				while (eie->next)
					eie = eie->next;
				eie->next = ref->inode_list;
				eie = NULL;
			}
			/*
			 * We have transferred the inode list ownership from
			 * this ref to the ref we added to the 'refs' ulist.
			 * So set this ref's inode list to NULL to avoid
			 * use-after-free when our caller uses it or double
			 * frees in case an error happens before we return.
			 */
			ref->inode_list = NULL;
		}
		cond_resched();
	}

out:
	btrfs_free_path(path);

	prelim_release(&preftrees.direct);
	prelim_release(&preftrees.indirect);
	prelim_release(&preftrees.indirect_missing_keys);

	if (ret == BTRFS_ITERATE_EXTENT_INODES_STOP || ret < 0)
		free_inode_elem_list(eie);
	return ret;
}

1699 * @ctx->bytenr and @ctx->extent_item_pos. The bytenr of the found leaves are
1700 * added to the ulist at @ctx->refs, and that ulist is allocated by this
1701 * function. The caller should free the ulist with free_leaf_list() if
1702 * @ctx->ignore_extent_item_pos is false, otherwise a fimple ulist_free() is
1705 * Returns 0 on success and < 0 on error. On error @ctx->refs is not allocated.
1707 int btrfs_find_all_leafs(struct btrfs_backref_walk_ctx
*ctx
)
1711 ASSERT(ctx
->refs
== NULL
);
1713 ctx
->refs
= ulist_alloc(GFP_NOFS
);
1717 ret
= find_parent_nodes(ctx
, NULL
);
1718 if (ret
== BTRFS_ITERATE_EXTENT_INODES_STOP
||
1719 (ret
< 0 && ret
!= -ENOENT
)) {
1720 free_leaf_list(ctx
->refs
);
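/*
 * Usage sketch (illustrative, assuming the caller already has fs_info,
 * bytenr and extent_item_pos; error handling trimmed):
 *
 *	struct btrfs_backref_walk_ctx walk_ctx = { 0 };
 *
 *	walk_ctx.fs_info = fs_info;
 *	walk_ctx.bytenr = bytenr;
 *	walk_ctx.extent_item_pos = extent_item_pos;
 *	ret = btrfs_find_all_leafs(&walk_ctx);
 *	if (!ret) {
 *		... iterate walk_ctx.refs with ULIST_ITER_INIT()/ulist_next(),
 *		    then release it with free_leaf_list(walk_ctx.refs) ...
 *	}
 */
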
/*
 * Walk all backrefs for a given extent to find all roots that reference this
 * extent. Walking a backref means finding all extents that reference this
 * extent and in turn walk the backrefs of those, too. Naturally this is a
 * recursive process, but here it is implemented in an iterative fashion: We
 * find all referencing extents for the extent in question and put them on a
 * list. In turn, we find all referencing extents for those, further appending
 * to the list. The way we iterate the list allows adding more elements after
 * the current while iterating. The process stops when we reach the end of the
 * list.
 *
 * Found roots are added to @ctx->roots, which is allocated by this function if
 * it points to NULL, in which case the caller is responsible for freeing it
 * after it's not needed anymore.
 * This function requires @ctx->refs to be NULL, as it uses it for allocating a
 * ulist to do temporary work, and frees it before returning.
 *
 * Returns 0 on success, < 0 on error.
 */
static int btrfs_find_all_roots_safe(struct btrfs_backref_walk_ctx *ctx)
{
	const u64 orig_bytenr = ctx->bytenr;
	const bool orig_skip_inode_ref_list = ctx->skip_inode_ref_list;
	bool roots_ulist_allocated = false;
	struct ulist_iterator uiter;
	int ret = 0;

	ASSERT(ctx->refs == NULL);

	ctx->refs = ulist_alloc(GFP_NOFS);
	if (!ctx->refs)
		return -ENOMEM;

	if (!ctx->roots) {
		ctx->roots = ulist_alloc(GFP_NOFS);
		if (!ctx->roots) {
			ulist_free(ctx->refs);
			ctx->refs = NULL;
			return -ENOMEM;
		}
		roots_ulist_allocated = true;
	}

	ctx->skip_inode_ref_list = true;

	ULIST_ITER_INIT(&uiter);
	while (1) {
		struct ulist_node *node;

		ret = find_parent_nodes(ctx, NULL);
		if (ret < 0 && ret != -ENOENT) {
			if (roots_ulist_allocated) {
				ulist_free(ctx->roots);
				ctx->roots = NULL;
			}
			break;
		}
		ret = 0;
		node = ulist_next(ctx->refs, &uiter);
		if (!node)
			break;
		ctx->bytenr = node->val;
		cond_resched();
	}

	ulist_free(ctx->refs);
	ctx->refs = NULL;
	ctx->bytenr = orig_bytenr;
	ctx->skip_inode_ref_list = orig_skip_inode_ref_list;

	return ret;
}

int btrfs_find_all_roots(struct btrfs_backref_walk_ctx *ctx,
			 bool skip_commit_root_sem)
{
	int ret;

	if (!ctx->trans && !skip_commit_root_sem)
		down_read(&ctx->fs_info->commit_root_sem);
	ret = btrfs_find_all_roots_safe(ctx);
	if (!ctx->trans && !skip_commit_root_sem)
		up_read(&ctx->fs_info->commit_root_sem);
	return ret;
}

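/*
 * Usage sketch (illustrative; qgroup accounting is one real caller, error
 * handling trimmed). The caller owns ctx.roots afterwards:
 *
 *	struct btrfs_backref_walk_ctx ctx = { 0 };
 *
 *	ctx.bytenr = bytenr;
 *	ctx.fs_info = fs_info;
 *	ret = btrfs_find_all_roots(&ctx, false);
 *	if (!ret) {
 *		... iterate ctx.roots with ULIST_ITER_INIT()/ulist_next() ...
 *		ulist_free(ctx.roots);
 *	}
 */
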
struct btrfs_backref_share_check_ctx *btrfs_alloc_backref_share_check_ctx(void)
{
	struct btrfs_backref_share_check_ctx *ctx;

	ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
	if (!ctx)
		return NULL;

	ulist_init(&ctx->refs);

	return ctx;
}

void btrfs_free_backref_share_ctx(struct btrfs_backref_share_check_ctx *ctx)
{
	if (!ctx)
		return;

	ulist_release(&ctx->refs);
	kfree(ctx);
}

/*
 * Check if a data extent is shared or not.
 *
 * @inode:       The inode whose extent we are checking.
 * @bytenr:      Logical bytenr of the extent we are checking.
 * @extent_gen:  Generation of the extent (file extent item) or 0 if it is
 *               unknown.
 * @ctx:         A backref sharedness check context.
 *
 * btrfs_is_data_extent_shared uses the backref walking code but will short
 * circuit as soon as it finds a root or inode that doesn't match the
 * one passed in. This provides a significant performance benefit for
 * callers (such as fiemap) which want to know whether the extent is
 * shared but do not need a ref count.
 *
 * This attempts to attach to the running transaction in order to account for
 * delayed refs, but continues on even when no running transaction exists.
 *
 * Return: 0 if extent is not shared, 1 if it is shared, < 0 on error.
 */
int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr,
				u64 extent_gen,
				struct btrfs_backref_share_check_ctx *ctx)
{
	struct btrfs_backref_walk_ctx walk_ctx = { 0 };
	struct btrfs_root *root = inode->root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_trans_handle *trans;
	struct ulist_iterator uiter;
	struct ulist_node *node;
	struct btrfs_seq_list elem = BTRFS_SEQ_LIST_INIT(elem);
	int ret = 0;
	struct share_check shared = {
		.ctx = ctx,
		.root = root,
		.inum = btrfs_ino(inode),
		.data_bytenr = bytenr,
		.data_extent_gen = extent_gen,
		.share_count = 0,
		.self_ref_count = 0,
		.have_delayed_delete_refs = false,
	};
	int level;
	bool leaf_cached;
	bool leaf_is_shared;

	for (int i = 0; i < BTRFS_BACKREF_CTX_PREV_EXTENTS_SIZE; i++) {
		if (ctx->prev_extents_cache[i].bytenr == bytenr)
			return ctx->prev_extents_cache[i].is_shared;
	}

	ulist_init(&ctx->refs);

	trans = btrfs_join_transaction_nostart(root);
	if (IS_ERR(trans)) {
		if (PTR_ERR(trans) != -ENOENT && PTR_ERR(trans) != -EROFS) {
			ret = PTR_ERR(trans);
			goto out;
		}
		trans = NULL;
		down_read(&fs_info->commit_root_sem);
	} else {
		btrfs_get_tree_mod_seq(fs_info, &elem);
		walk_ctx.time_seq = elem.seq;
	}

	ctx->use_path_cache = true;

	/*
	 * We may have previously determined that the current leaf is shared.
	 * If it is, then we have a data extent that is shared due to a shared
	 * subtree (caused by snapshotting) and we don't need to check for data
	 * backrefs. If the leaf is not shared, then we must do backref walking
	 * to determine if the data extent is shared through reflinks.
	 */
	leaf_cached = lookup_backref_shared_cache(ctx, root,
						  ctx->curr_leaf_bytenr, 0,
						  &leaf_is_shared);
	if (leaf_cached && leaf_is_shared) {
		ret = 1;
		goto out_trans;
	}

	walk_ctx.skip_inode_ref_list = true;
	walk_ctx.trans = trans;
	walk_ctx.fs_info = fs_info;
	walk_ctx.refs = &ctx->refs;

	/* -1 means we are in the bytenr of the data extent. */
	level = -1;
	ULIST_ITER_INIT(&uiter);
	while (1) {
		const unsigned long prev_ref_count = ctx->refs.nnodes;

		walk_ctx.bytenr = bytenr;
		ret = find_parent_nodes(&walk_ctx, &shared);
		if (ret == BACKREF_FOUND_SHARED ||
		    ret == BACKREF_FOUND_NOT_SHARED) {
			/* If shared must return 1, otherwise return 0. */
			ret = (ret == BACKREF_FOUND_SHARED) ? 1 : 0;
			if (level == -1)
				store_backref_shared_cache(ctx, root, bytenr,
							   level, ret == 1);
			break;
		}
		if (ret < 0 && ret != -ENOENT)
			break;
		ret = 0;

		/*
		 * More than one extent buffer (bytenr) may have been added to
		 * the ctx->refs ulist, in which case we have to check multiple
		 * tree paths in case the first one is not shared, so we can not
		 * use the path cache which is made for a single path. Multiple
		 * extent buffers at the current level happen when:
		 *
		 * 1) level -1, the data extent: If our data extent was not
		 *    directly shared (without multiple reference items), then
		 *    it might have a single reference item with a count > 1 for
		 *    the same offset, which means there are 2 (or more) file
		 *    extent items that point to the data extent - this happens
		 *    when a file extent item needs to be split and then one
		 *    item gets moved to another leaf due to a b+tree leaf split
		 *    when inserting some item. In this case the file extent
		 *    items may be located in different leaves and therefore
		 *    some of the leaves may be referenced through shared
		 *    subtrees while others are not. Since our extent buffer
		 *    cache only works for a single path (by far the most common
		 *    case and simpler to deal with), we can not use it if we
		 *    have multiple leaves (which implies multiple paths).
		 *
		 * 2) level >= 0, a tree node/leaf: We can have a mix of direct
		 *    and indirect references on a b+tree node/leaf, so we have
		 *    to check multiple paths, and the extent buffer (the
		 *    current bytenr) may be shared or not. One example is
		 *    during relocation as we may get a shared tree block ref
		 *    (direct ref) and a non-shared tree block ref (indirect
		 *    ref) for the same node/leaf.
		 */
		if ((ctx->refs.nnodes - prev_ref_count) > 1)
			ctx->use_path_cache = false;

		if (level >= 0)
			store_backref_shared_cache(ctx, root, bytenr,
						   level, false);
		node = ulist_next(&ctx->refs, &uiter);
		if (!node)
			break;
		bytenr = node->val;
		level++;
		if (ctx->use_path_cache) {
			bool is_shared;
			bool cached;

			cached = lookup_backref_shared_cache(ctx, root, bytenr,
							     level, &is_shared);
			if (cached) {
				ret = (is_shared ? 1 : 0);
				break;
			}
		}
		shared.share_count = 0;
		shared.have_delayed_delete_refs = false;
		cond_resched();
	}

	/*
	 * If the path cache is disabled, then it means at some tree level we
	 * got multiple parents due to a mix of direct and indirect backrefs or
	 * multiple leaves with file extent items pointing to the same data
	 * extent. We have to invalidate the cache and cache only the sharedness
	 * result for the levels where we got only one node/reference.
	 */
	if (!ctx->use_path_cache) {
		int i = 0;

		level--;
		if (ret >= 0 && level >= 0) {
			bytenr = ctx->path_cache_entries[level].bytenr;
			ctx->use_path_cache = true;
			store_backref_shared_cache(ctx, root, bytenr, level, ret);
			i = level + 1;
		}

		for ( ; i < BTRFS_MAX_LEVEL; i++)
			ctx->path_cache_entries[i].bytenr = 0;
	}

	/*
	 * Cache the sharedness result for the data extent if we know our inode
	 * has more than 1 file extent item that refers to the data extent.
	 */
	if (ret >= 0 && shared.self_ref_count > 1) {
		int slot = ctx->prev_extents_cache_slot;

		ctx->prev_extents_cache[slot].bytenr = shared.data_bytenr;
		ctx->prev_extents_cache[slot].is_shared = (ret == 1);

		slot = (slot + 1) % BTRFS_BACKREF_CTX_PREV_EXTENTS_SIZE;
		ctx->prev_extents_cache_slot = slot;
	}

out_trans:
	if (trans) {
		btrfs_put_tree_mod_seq(fs_info, &elem);
		btrfs_end_transaction(trans);
	} else {
		up_read(&fs_info->commit_root_sem);
	}
out:
	ulist_release(&ctx->refs);
	ctx->prev_leaf_bytenr = ctx->curr_leaf_bytenr;

	return ret;
}

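/*
 * Usage sketch (illustrative, in the spirit of a fiemap caller; error
 * handling trimmed):
 *
 *	struct btrfs_backref_share_check_ctx *ctx;
 *
 *	ctx = btrfs_alloc_backref_share_check_ctx();
 *	...
 *	ret = btrfs_is_data_extent_shared(inode, disk_bytenr, extent_gen, ctx);
 *	if (ret == 1)
 *		flags |= FIEMAP_EXTENT_SHARED;
 *	...
 *	btrfs_free_backref_share_ctx(ctx);
 */
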
int btrfs_find_one_extref(struct btrfs_root *root, u64 inode_objectid,
			  u64 start_off, struct btrfs_path *path,
			  struct btrfs_inode_extref **ret_extref,
			  u64 *found_off)
{
	int ret, slot;
	struct btrfs_key key;
	struct btrfs_key found_key;
	struct btrfs_inode_extref *extref;
	const struct extent_buffer *leaf;
	unsigned long ptr;

	key.objectid = inode_objectid;
	key.type = BTRFS_INODE_EXTREF_KEY;
	key.offset = start_off;

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		return ret;

	while (1) {
		leaf = path->nodes[0];
		slot = path->slots[0];
		if (slot >= btrfs_header_nritems(leaf)) {
			/*
			 * If the item at offset is not found,
			 * btrfs_search_slot will point us to the slot
			 * where it should be inserted. In our case
			 * that will be the slot directly before the
			 * next INODE_REF_KEY_V2 item. In the case
			 * that we're pointing to the last slot in a
			 * leaf, we must move one leaf over.
			 */
			ret = btrfs_next_leaf(root, path);
			if (ret) {
				if (ret >= 1)
					ret = -ENOENT;
				break;
			}
			continue;
		}

		btrfs_item_key_to_cpu(leaf, &found_key, slot);

		/*
		 * Check that we're still looking at an extended ref key for
		 * this particular objectid. If we have different
		 * objectid or type then there are no more to be found
		 * in the tree and we can exit.
		 */
		ret = -ENOENT;
		if (found_key.objectid != inode_objectid)
			break;
		if (found_key.type != BTRFS_INODE_EXTREF_KEY)
			break;

		ret = 0;
		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
		extref = (struct btrfs_inode_extref *)ptr;
		*ret_extref = extref;
		if (found_off)
			*found_off = found_key.offset;
		break;
	}

	return ret;
}

/*
 * this iterates to turn a name (from iref/extref) into a full filesystem path.
 * Elements of the path are separated by '/' and the path is guaranteed to be
 * 0-terminated. the path is only given within the current file system.
 * Therefore, it never starts with a '/'. the caller is responsible to provide
 * "size" bytes in "dest". the dest buffer will be filled backwards. finally,
 * the start point of the resulting string is returned. this pointer is within
 * dest, normally.
 * in case the path buffer would overflow, the pointer is decremented further
 * as if output was written to the buffer, though no more output is actually
 * generated. that way, the caller can determine how much space would be
 * required for the path to fit into the buffer. in that case, the returned
 * value will be smaller than dest. callers must check this!
 */
char *btrfs_ref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
			u32 name_len, unsigned long name_off,
			struct extent_buffer *eb_in, u64 parent,
			char *dest, u32 size)
{
	int slot;
	u64 next_inum;
	int ret;
	s64 bytes_left = ((s64)size) - 1;
	struct extent_buffer *eb = eb_in;
	struct btrfs_key found_key;
	struct btrfs_inode_ref *iref;

	if (bytes_left >= 0)
		dest[bytes_left] = '\0';

	while (1) {
		bytes_left -= name_len;
		if (bytes_left >= 0)
			read_extent_buffer(eb, dest + bytes_left,
					   name_off, name_len);
		if (eb != eb_in) {
			if (!path->skip_locking)
				btrfs_tree_read_unlock(eb);
			free_extent_buffer(eb);
		}
		ret = btrfs_find_item(fs_root, path, parent, 0,
				      BTRFS_INODE_REF_KEY, &found_key);
		if (ret > 0)
			ret = -ENOENT;
		if (ret)
			break;

		next_inum = found_key.offset;

		/* regular exit ahead */
		if (parent == next_inum)
			break;

		slot = path->slots[0];
		eb = path->nodes[0];
		/* make sure we can use eb after releasing the path */
		if (eb != eb_in) {
			path->nodes[0] = NULL;
			path->locks[0] = 0;
		}
		btrfs_release_path(path);
		iref = btrfs_item_ptr(eb, slot, struct btrfs_inode_ref);

		name_len = btrfs_inode_ref_name_len(eb, iref);
		name_off = (unsigned long)(iref + 1);

		parent = next_inum;
		--bytes_left;
		if (bytes_left >= 0)
			dest[bytes_left] = '/';
	}

	btrfs_release_path(path);

	if (ret)
		return ERR_PTR(ret);

	return dest + bytes_left;
}
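/*
 * A minimal sketch of the overflow check the comment above demands
 * (variable names are illustrative only):
 *
 *	char *p = btrfs_ref_to_path(fs_root, path, name_len, name_off,
 *				    eb, parent, dest, size);
 *	if (IS_ERR(p))
 *		return PTR_ERR(p);
 *	if (p < dest)
 *		needed = dest + size - p;   (buffer was too small)
 *	else
 *		(use the 0-terminated path starting at p)
 */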
/*
 * this makes the path point to (logical EXTENT_ITEM *)
 * returns BTRFS_EXTENT_FLAG_DATA for data, BTRFS_EXTENT_FLAG_TREE_BLOCK for
 * tree blocks and <0 on error.
 */
int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
			struct btrfs_path *path, struct btrfs_key *found_key,
			u64 *flags_ret)
{
	struct btrfs_root *extent_root = btrfs_extent_root(fs_info, logical);
	int ret;
	u64 flags;
	u64 size = 0;
	u32 item_size;
	const struct extent_buffer *eb;
	struct btrfs_extent_item *ei;
	struct btrfs_key key;

	if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
		key.type = BTRFS_METADATA_ITEM_KEY;
	else
		key.type = BTRFS_EXTENT_ITEM_KEY;
	key.objectid = logical;
	key.offset = (u64)-1;

	ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
	if (ret < 0)
		return ret;
	if (ret == 0) {
		/*
		 * Key with offset -1 found, there would have to exist an
		 * extent item with such offset, but this is out of the valid
		 * range.
		 */
		return -EUCLEAN;
	}

	ret = btrfs_previous_extent_item(extent_root, path, 0);
	if (ret) {
		if (ret > 0)
			ret = -ENOENT;
		return ret;
	}
	btrfs_item_key_to_cpu(path->nodes[0], found_key, path->slots[0]);
	if (found_key->type == BTRFS_METADATA_ITEM_KEY)
		size = fs_info->nodesize;
	else if (found_key->type == BTRFS_EXTENT_ITEM_KEY)
		size = found_key->offset;

	if (found_key->objectid > logical ||
	    found_key->objectid + size <= logical) {
		btrfs_debug(fs_info,
			"logical %llu is not within any extent", logical);
		return -ENOENT;
	}

	eb = path->nodes[0];
	item_size = btrfs_item_size(eb, path->slots[0]);

	ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
	flags = btrfs_extent_flags(eb, ei);

	btrfs_debug(fs_info,
		"logical %llu is at position %llu within the extent (%llu EXTENT_ITEM %llu) flags %#llx size %u",
		logical, logical - found_key->objectid, found_key->objectid,
		found_key->offset, flags, item_size);

	WARN_ON(!flags_ret);
	if (flags_ret) {
		if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
			*flags_ret = BTRFS_EXTENT_FLAG_TREE_BLOCK;
		else if (flags & BTRFS_EXTENT_FLAG_DATA)
			*flags_ret = BTRFS_EXTENT_FLAG_DATA;
		else
			BUG();
		return 0;
	}

	return -EIO;
}
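/*
 * A typical caller releases the path right after the call and only
 * consumes found_key and the returned flags, as
 * iterate_inodes_from_logical() below does:
 *
 *	ret = extent_from_logical(fs_info, logical, path, &found_key, &flags);
 *	btrfs_release_path(path);
 *	if (ret < 0)
 *		return ret;
 *	if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
 *		(it's a tree block, not a data extent)
 */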
/*
 * helper function to iterate extent inline refs. ptr must point to a 0 value
 * for the first call and may be modified. it is used to track state.
 * if more refs exist, 0 is returned and the next call to
 * get_extent_inline_ref must pass the modified ptr parameter to get the
 * next ref. after the last ref was processed, 1 is returned.
 * returns <0 on error
 */
static int get_extent_inline_ref(unsigned long *ptr,
				 const struct extent_buffer *eb,
				 const struct btrfs_key *key,
				 const struct btrfs_extent_item *ei,
				 u32 item_size,
				 struct btrfs_extent_inline_ref **out_eiref,
				 int *out_type)
{
	unsigned long end;
	u64 flags;
	struct btrfs_tree_block_info *info;

	if (!*ptr) {
		/* first call */
		flags = btrfs_extent_flags(eb, ei);
		if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
			if (key->type == BTRFS_METADATA_ITEM_KEY) {
				/* a skinny metadata extent */
				*out_eiref =
				     (struct btrfs_extent_inline_ref *)(ei + 1);
			} else {
				WARN_ON(key->type != BTRFS_EXTENT_ITEM_KEY);
				info = (struct btrfs_tree_block_info *)(ei + 1);
				*out_eiref =
				   (struct btrfs_extent_inline_ref *)(info + 1);
			}
		} else {
			*out_eiref = (struct btrfs_extent_inline_ref *)(ei + 1);
		}
		*ptr = (unsigned long)*out_eiref;
		if ((unsigned long)(*ptr) >= (unsigned long)ei + item_size)
			return -ENOENT;
	}

	end = (unsigned long)ei + item_size;
	*out_eiref = (struct btrfs_extent_inline_ref *)(*ptr);
	*out_type = btrfs_get_extent_inline_ref_type(eb, *out_eiref,
						     BTRFS_REF_TYPE_ANY);
	if (*out_type == BTRFS_REF_TYPE_INVALID)
		return -EUCLEAN;

	*ptr += btrfs_extent_inline_ref_size(*out_type);
	WARN_ON(*ptr > end);
	if (*ptr == end)
		return 1; /* last */

	return 0;
}
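/*
 * The iteration protocol, as used by tree_backref_for_extent() below:
 * start with *ptr == 0, keep calling while 0 is returned, and treat a
 * return value of 1 as "this was the last ref":
 *
 *	unsigned long ptr = 0;
 *	do {
 *		ret = get_extent_inline_ref(&ptr, eb, key, ei, item_size,
 *					    &eiref, &type);
 *		if (ret < 0)
 *			break;
 *		(use eiref and type)
 *	} while (ret == 0);
 */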
/*
 * reads the tree block backref for an extent. tree level and root are returned
 * through out_level and out_root. ptr must point to a 0 value for the first
 * call and may be modified (see get_extent_inline_ref comment).
 * returns 0 if data was provided, 1 if there was no more data to provide or
 * <0 on error.
 */
int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb,
			    struct btrfs_key *key, struct btrfs_extent_item *ei,
			    u32 item_size, u64 *out_root, u8 *out_level)
{
	int ret;
	int type;
	struct btrfs_extent_inline_ref *eiref;

	if (*ptr == (unsigned long)-1)
		return 1;

	while (1) {
		ret = get_extent_inline_ref(ptr, eb, key, ei, item_size,
					    &eiref, &type);
		if (ret < 0)
			return ret;

		if (type == BTRFS_TREE_BLOCK_REF_KEY ||
		    type == BTRFS_SHARED_BLOCK_REF_KEY)
			break;

		if (ret == 1)
			return 1;
	}

	/* we can treat both ref types equally here */
	*out_root = btrfs_extent_inline_ref_offset(eb, eiref);

	if (key->type == BTRFS_EXTENT_ITEM_KEY) {
		struct btrfs_tree_block_info *info;

		info = (struct btrfs_tree_block_info *)(ei + 1);
		*out_level = btrfs_tree_block_level(eb, info);
	} else {
		ASSERT(key->type == BTRFS_METADATA_ITEM_KEY);
		*out_level = (u8)key->offset;
	}

	if (ret == 1)
		*ptr = (unsigned long)-1;

	return 0;
}
static int iterate_leaf_refs(struct btrfs_fs_info *fs_info,
			     struct extent_inode_elem *inode_list,
			     u64 root, u64 extent_item_objectid,
			     iterate_extent_inodes_t *iterate, void *ctx)
{
	struct extent_inode_elem *eie;
	int ret = 0;

	for (eie = inode_list; eie; eie = eie->next) {
		btrfs_debug(fs_info,
			    "ref for %llu resolved, key (%llu EXTENT_DATA %llu), root %llu",
			    extent_item_objectid, eie->inum,
			    eie->offset, root);
		ret = iterate(eie->inum, eie->offset, eie->num_bytes, root, ctx);
		if (ret) {
			btrfs_debug(fs_info,
				    "stopping iteration for %llu due to ret=%d",
				    extent_item_objectid, ret);
			break;
		}
	}

	return ret;
}
/*
 * calls iterate() for every inode that references the extent identified by
 * the given parameters.
 * when the iterator function returns a non-zero value, iteration stops.
 */
int iterate_extent_inodes(struct btrfs_backref_walk_ctx *ctx,
			  bool search_commit_root,
			  iterate_extent_inodes_t *iterate, void *user_ctx)
{
	int ret;
	struct ulist *refs;
	struct ulist_node *ref_node;
	struct btrfs_seq_list seq_elem = BTRFS_SEQ_LIST_INIT(seq_elem);
	struct ulist_iterator ref_uiter;

	btrfs_debug(ctx->fs_info, "resolving all inodes for extent %llu",
		    ctx->bytenr);

	ASSERT(ctx->trans == NULL);
	ASSERT(ctx->roots == NULL);

	if (!search_commit_root) {
		struct btrfs_trans_handle *trans;

		trans = btrfs_attach_transaction(ctx->fs_info->tree_root);
		if (IS_ERR(trans)) {
			if (PTR_ERR(trans) != -ENOENT &&
			    PTR_ERR(trans) != -EROFS)
				return PTR_ERR(trans);
			trans = NULL;
		}
		ctx->trans = trans;
	}

	if (ctx->trans) {
		btrfs_get_tree_mod_seq(ctx->fs_info, &seq_elem);
		ctx->time_seq = seq_elem.seq;
	} else {
		down_read(&ctx->fs_info->commit_root_sem);
	}

	ret = btrfs_find_all_leafs(ctx);
	if (ret)
		goto out;
	refs = ctx->refs;
	ctx->refs = NULL;

	ULIST_ITER_INIT(&ref_uiter);
	while (!ret && (ref_node = ulist_next(refs, &ref_uiter))) {
		const u64 leaf_bytenr = ref_node->val;
		struct ulist_node *root_node;
		struct ulist_iterator root_uiter;
		struct extent_inode_elem *inode_list;

		inode_list = (struct extent_inode_elem *)(uintptr_t)ref_node->aux;

		if (ctx->cache_lookup) {
			const u64 *root_ids;
			int root_count;
			bool cached;

			cached = ctx->cache_lookup(leaf_bytenr, ctx->user_ctx,
						   &root_ids, &root_count);
			if (cached) {
				for (int i = 0; i < root_count; i++) {
					ret = iterate_leaf_refs(ctx->fs_info,
								inode_list,
								root_ids[i],
								leaf_bytenr,
								iterate,
								user_ctx);
					if (ret)
						break;
				}
				continue;
			}
		}

		if (!ctx->roots) {
			ctx->roots = ulist_alloc(GFP_NOFS);
			if (!ctx->roots) {
				ret = -ENOMEM;
				break;
			}
		}

		ctx->bytenr = leaf_bytenr;
		ret = btrfs_find_all_roots_safe(ctx);
		if (ret)
			break;

		if (ctx->cache_store)
			ctx->cache_store(leaf_bytenr, ctx->roots, ctx->user_ctx);

		ULIST_ITER_INIT(&root_uiter);
		while (!ret && (root_node = ulist_next(ctx->roots, &root_uiter))) {
			btrfs_debug(ctx->fs_info,
				    "root %llu references leaf %llu, data list %#llx",
				    root_node->val, ref_node->val,
				    ref_node->aux);
			ret = iterate_leaf_refs(ctx->fs_info, inode_list,
						root_node->val, ctx->bytenr,
						iterate, user_ctx);
		}
		ulist_reinit(ctx->roots);
	}

	free_leaf_list(refs);
out:
	if (ctx->trans) {
		btrfs_put_tree_mod_seq(ctx->fs_info, &seq_elem);
		btrfs_end_transaction(ctx->trans);
		ctx->trans = NULL;
	} else {
		up_read(&ctx->fs_info->commit_root_sem);
	}

	ulist_free(ctx->roots);
	ctx->roots = NULL;

	if (ret == BTRFS_ITERATE_EXTENT_INODES_STOP)
		ret = 0;

	return ret;
}
static int build_ino_list(u64 inum, u64 offset, u64 num_bytes, u64 root, void *ctx)
{
	struct btrfs_data_container *inodes = ctx;
	const size_t c = 3 * sizeof(u64);

	if (inodes->bytes_left >= c) {
		inodes->bytes_left -= c;
		inodes->val[inodes->elem_cnt] = inum;
		inodes->val[inodes->elem_cnt + 1] = offset;
		inodes->val[inodes->elem_cnt + 2] = root;
		inodes->elem_cnt += 3;
	} else {
		inodes->bytes_missing += c - inodes->bytes_left;
		inodes->bytes_left = 0;
		inodes->elem_missed += 3;
	}

	return 0;
}
int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info,
				struct btrfs_path *path,
				void *ctx, bool ignore_offset)
{
	struct btrfs_backref_walk_ctx walk_ctx = { 0 };
	int ret;
	u64 flags = 0;
	struct btrfs_key found_key;
	int search_commit_root = path->search_commit_root;

	ret = extent_from_logical(fs_info, logical, path, &found_key, &flags);
	btrfs_release_path(path);
	if (ret < 0)
		return ret;
	if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
		return -EINVAL;

	walk_ctx.bytenr = found_key.objectid;
	if (ignore_offset)
		walk_ctx.ignore_extent_item_pos = true;
	else
		walk_ctx.extent_item_pos = logical - found_key.objectid;
	walk_ctx.fs_info = fs_info;

	return iterate_extent_inodes(&walk_ctx, search_commit_root,
				     build_ino_list, ctx);
}
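/*
 * This is the backend of the LOGICAL_INO ioctl; roughly, the caller pairs
 * it with init_data_container() and lets build_ino_list() above fill the
 * container (sketch, error handling trimmed):
 *
 *	inodes = init_data_container(size);
 *	ret = iterate_inodes_from_logical(logical, fs_info, path,
 *					  inodes, ignore_offset);
 *	(inodes->val[] now holds (inum, offset, root) triples,
 *	 inodes->elem_cnt values in total)
 */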
static int inode_to_path(u64 inum, u32 name_len, unsigned long name_off,
			 struct extent_buffer *eb, struct inode_fs_paths *ipath);

static int iterate_inode_refs(u64 inum, struct inode_fs_paths *ipath)
{
	int ret = 0;
	int slot;
	u32 cur;
	u32 len;
	u32 name_len;
	u64 parent = 0;
	int found = 0;
	struct btrfs_root *fs_root = ipath->fs_root;
	struct btrfs_path *path = ipath->btrfs_path;
	struct extent_buffer *eb;
	struct btrfs_inode_ref *iref;
	struct btrfs_key found_key;

	while (!ret) {
		ret = btrfs_find_item(fs_root, path, inum,
				      parent ? parent + 1 : 0, BTRFS_INODE_REF_KEY,
				      &found_key);
		if (ret < 0)
			break;
		if (ret) {
			ret = found ? 0 : -ENOENT;
			break;
		}
		++found;

		parent = found_key.offset;
		slot = path->slots[0];
		eb = btrfs_clone_extent_buffer(path->nodes[0]);
		if (!eb) {
			ret = -ENOMEM;
			break;
		}
		btrfs_release_path(path);

		iref = btrfs_item_ptr(eb, slot, struct btrfs_inode_ref);

		for (cur = 0; cur < btrfs_item_size(eb, slot); cur += len) {
			name_len = btrfs_inode_ref_name_len(eb, iref);
			/* path must be released before calling iterate()! */
			btrfs_debug(fs_root->fs_info,
				"following ref at offset %u for inode %llu in tree %llu",
				cur, found_key.objectid,
				fs_root->root_key.objectid);
			ret = inode_to_path(parent, name_len,
					    (unsigned long)(iref + 1), eb, ipath);
			if (ret)
				break;
			len = sizeof(*iref) + name_len;
			iref = (struct btrfs_inode_ref *)((char *)iref + len);
		}
		free_extent_buffer(eb);
	}

	btrfs_release_path(path);

	return ret;
}
static int iterate_inode_extrefs(u64 inum, struct inode_fs_paths *ipath)
{
	int ret;
	int slot;
	u64 offset = 0;
	u64 parent;
	int found = 0;
	struct btrfs_root *fs_root = ipath->fs_root;
	struct btrfs_path *path = ipath->btrfs_path;
	struct extent_buffer *eb;
	struct btrfs_inode_extref *extref;
	u32 item_size;
	u32 cur_offset;
	unsigned long ptr;

	while (1) {
		ret = btrfs_find_one_extref(fs_root, inum, offset, path, &extref,
					    &offset);
		if (ret < 0)
			break;
		if (ret) {
			ret = found ? 0 : -ENOENT;
			break;
		}
		++found;

		slot = path->slots[0];
		eb = btrfs_clone_extent_buffer(path->nodes[0]);
		if (!eb) {
			ret = -ENOMEM;
			break;
		}
		btrfs_release_path(path);

		item_size = btrfs_item_size(eb, slot);
		ptr = btrfs_item_ptr_offset(eb, slot);
		cur_offset = 0;

		while (cur_offset < item_size) {
			u32 name_len;

			extref = (struct btrfs_inode_extref *)(ptr + cur_offset);
			parent = btrfs_inode_extref_parent(eb, extref);
			name_len = btrfs_inode_extref_name_len(eb, extref);
			ret = inode_to_path(parent, name_len,
					    (unsigned long)&extref->name, eb, ipath);
			if (ret)
				break;

			cur_offset += btrfs_inode_extref_name_len(eb, extref);
			cur_offset += sizeof(*extref);
		}
		free_extent_buffer(eb);

		offset++;
	}

	btrfs_release_path(path);

	return ret;
}
/*
 * returns 0 if the path could be dumped (probably truncated)
 * returns <0 in case of an error
 */
static int inode_to_path(u64 inum, u32 name_len, unsigned long name_off,
			 struct extent_buffer *eb, struct inode_fs_paths *ipath)
{
	char *fspath;
	char *fspath_min;
	int i = ipath->fspath->elem_cnt;
	const int s_ptr = sizeof(char *);
	u32 bytes_left;

	bytes_left = ipath->fspath->bytes_left > s_ptr ?
					ipath->fspath->bytes_left - s_ptr : 0;

	fspath_min = (char *)ipath->fspath->val + (i + 1) * s_ptr;
	fspath = btrfs_ref_to_path(ipath->fs_root, ipath->btrfs_path, name_len,
				   name_off, eb, inum, fspath_min, bytes_left);
	if (IS_ERR(fspath))
		return PTR_ERR(fspath);

	if (fspath > fspath_min) {
		ipath->fspath->val[i] = (u64)(unsigned long)fspath;
		++ipath->fspath->elem_cnt;
		ipath->fspath->bytes_left = fspath - fspath_min;
	} else {
		++ipath->fspath->elem_missed;
		ipath->fspath->bytes_missing += fspath_min - fspath;
		ipath->fspath->bytes_left = 0;
	}

	return 0;
}
/*
 * this dumps all file system paths to the inode into the ipath struct, provided
 * it has been created large enough. each path is zero-terminated and accessed
 * from ipath->fspath->val[i].
 * when it returns, there are ipath->fspath->elem_cnt number of paths available
 * in ipath->fspath->val[]. when the allocated space wasn't sufficient, the
 * number of missed paths is recorded in ipath->fspath->elem_missed, otherwise,
 * it's zero. ipath->fspath->bytes_missing holds the number of bytes that would
 * have been needed to return all paths.
 */
int paths_from_inode(u64 inum, struct inode_fs_paths *ipath)
{
	int ret;
	int found_refs = 0;

	ret = iterate_inode_refs(inum, ipath);
	if (!ret)
		++found_refs;
	else if (ret != -ENOENT)
		return ret;

	ret = iterate_inode_extrefs(inum, ipath);
	if (ret == -ENOENT && found_refs)
		return 0;

	return ret;
}
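/*
 * The usual life cycle around paths_from_inode(), as in the INO_PATHS
 * ioctl (sketch, error handling trimmed):
 *
 *	ipath = init_ipath(size, fs_root, path);
 *	ret = paths_from_inode(inum, ipath);
 *	(each path is at (char *)(unsigned long)ipath->fspath->val[i])
 *	free_ipath(ipath);
 *
 * init_ipath() and free_ipath() are defined below.
 */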
struct btrfs_data_container *init_data_container(u32 total_bytes)
{
	struct btrfs_data_container *data;
	size_t alloc_bytes;

	alloc_bytes = max_t(size_t, total_bytes, sizeof(*data));
	data = kvmalloc(alloc_bytes, GFP_KERNEL);
	if (!data)
		return ERR_PTR(-ENOMEM);

	if (total_bytes >= sizeof(*data)) {
		data->bytes_left = total_bytes - sizeof(*data);
		data->bytes_missing = 0;
	} else {
		data->bytes_missing = sizeof(*data) - total_bytes;
		data->bytes_left = 0;
	}

	data->elem_cnt = 0;
	data->elem_missed = 0;

	return data;
}
/*
 * allocates space to return multiple file system paths for an inode.
 * total_bytes to allocate are passed, note that space usable for actual path
 * information will be total_bytes - sizeof(struct inode_fs_paths).
 * the returned pointer must be freed with free_ipath() in the end.
 */
struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root,
					struct btrfs_path *path)
{
	struct inode_fs_paths *ifp;
	struct btrfs_data_container *fspath;

	fspath = init_data_container(total_bytes);
	if (IS_ERR(fspath))
		return ERR_CAST(fspath);

	ifp = kmalloc(sizeof(*ifp), GFP_KERNEL);
	if (!ifp) {
		kvfree(fspath);
		return ERR_PTR(-ENOMEM);
	}

	ifp->btrfs_path = path;
	ifp->fspath = fspath;
	ifp->fs_root = fs_root;

	return ifp;
}

void free_ipath(struct inode_fs_paths *ipath)
{
	if (!ipath)
		return;
	kvfree(ipath->fspath);
	kfree(ipath);
}
struct btrfs_backref_iter *btrfs_backref_iter_alloc(struct btrfs_fs_info *fs_info)
{
	struct btrfs_backref_iter *ret;

	ret = kzalloc(sizeof(*ret), GFP_NOFS);
	if (!ret)
		return NULL;

	ret->path = btrfs_alloc_path();
	if (!ret->path) {
		kfree(ret);
		return NULL;
	}

	/* Current backref iterator only supports iteration in commit root */
	ret->path->search_commit_root = 1;
	ret->path->skip_locking = 1;
	ret->fs_info = fs_info;

	return ret;
}
static void btrfs_backref_iter_release(struct btrfs_backref_iter *iter)
{
	iter->bytenr = 0;
	iter->item_ptr = 0;
	iter->cur_ptr = 0;
	iter->end_ptr = 0;
	btrfs_release_path(iter->path);
	memset(&iter->cur_key, 0, sizeof(iter->cur_key));
}
int btrfs_backref_iter_start(struct btrfs_backref_iter *iter, u64 bytenr)
{
	struct btrfs_fs_info *fs_info = iter->fs_info;
	struct btrfs_root *extent_root = btrfs_extent_root(fs_info, bytenr);
	struct btrfs_path *path = iter->path;
	struct btrfs_extent_item *ei;
	struct btrfs_key key;
	int ret;

	key.objectid = bytenr;
	key.type = BTRFS_METADATA_ITEM_KEY;
	key.offset = (u64)-1;
	iter->bytenr = bytenr;

	ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
	if (ret < 0)
		return ret;
	if (ret == 0) {
		/*
		 * Key with offset -1 found, there would have to exist an
		 * extent item with such offset, but this is out of the valid
		 * range.
		 */
		ret = -EUCLEAN;
		goto release;
	}
	if (path->slots[0] == 0) {
		WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
		ret = -EUCLEAN;
		goto release;
	}
	path->slots[0]--;

	btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
	if ((key.type != BTRFS_EXTENT_ITEM_KEY &&
	     key.type != BTRFS_METADATA_ITEM_KEY) || key.objectid != bytenr) {
		ret = -ENOENT;
		goto release;
	}
	memcpy(&iter->cur_key, &key, sizeof(key));
	iter->item_ptr = (u32)btrfs_item_ptr_offset(path->nodes[0],
						    path->slots[0]);
	iter->end_ptr = (u32)(iter->item_ptr +
			btrfs_item_size(path->nodes[0], path->slots[0]));
	ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
			    struct btrfs_extent_item);

	/*
	 * Only support iteration on tree backref yet.
	 *
	 * This is an extra precaution for non skinny-metadata, where
	 * EXTENT_ITEM is also used for tree blocks, that we can only use
	 * extent flags to determine if it's a tree block.
	 */
	if (btrfs_extent_flags(path->nodes[0], ei) & BTRFS_EXTENT_FLAG_DATA) {
		ret = -ENOTSUPP;
		goto release;
	}
	iter->cur_ptr = (u32)(iter->item_ptr + sizeof(*ei));

	/* If there is no inline backref, go search for keyed backref */
	if (iter->cur_ptr >= iter->end_ptr) {
		ret = btrfs_next_item(extent_root, path);

		/* No inline nor keyed ref */
		if (ret > 0) {
			ret = -ENOENT;
			goto release;
		}
		if (ret < 0)
			goto release;

		btrfs_item_key_to_cpu(path->nodes[0], &iter->cur_key,
				      path->slots[0]);
		if (iter->cur_key.objectid != bytenr ||
		    (iter->cur_key.type != BTRFS_SHARED_BLOCK_REF_KEY &&
		     iter->cur_key.type != BTRFS_TREE_BLOCK_REF_KEY)) {
			ret = -ENOENT;
			goto release;
		}
		iter->cur_ptr = (u32)btrfs_item_ptr_offset(path->nodes[0],
							   path->slots[0]);
		iter->item_ptr = iter->cur_ptr;
		iter->end_ptr = (u32)(iter->item_ptr + btrfs_item_size(
				      path->nodes[0], path->slots[0]));
	}

	return 0;
release:
	btrfs_backref_iter_release(iter);
	return ret;
}
static bool btrfs_backref_iter_is_inline_ref(struct btrfs_backref_iter *iter)
{
	if (iter->cur_key.type == BTRFS_EXTENT_ITEM_KEY ||
	    iter->cur_key.type == BTRFS_METADATA_ITEM_KEY)
		return true;
	return false;
}
/*
 * Go to the next backref item of current bytenr, can be either inlined or
 * keyed.
 *
 * Caller needs to check whether it's inline ref or not by iter->cur_key.
 *
 * Return 0 if we get next backref without problem.
 * Return >0 if there is no extra backref for this bytenr.
 * Return <0 if something went wrong.
 */
int btrfs_backref_iter_next(struct btrfs_backref_iter *iter)
{
	struct extent_buffer *eb = iter->path->nodes[0];
	struct btrfs_root *extent_root;
	struct btrfs_path *path = iter->path;
	struct btrfs_extent_inline_ref *iref;
	int ret;
	u32 size;

	if (btrfs_backref_iter_is_inline_ref(iter)) {
		/* We're still inside the inline refs */
		ASSERT(iter->cur_ptr < iter->end_ptr);

		if (btrfs_backref_has_tree_block_info(iter)) {
			/* First tree block info */
			size = sizeof(struct btrfs_tree_block_info);
		} else {
			/* Use inline ref type to determine the size */
			int type;

			iref = (struct btrfs_extent_inline_ref *)
				((unsigned long)iter->cur_ptr);
			type = btrfs_extent_inline_ref_type(eb, iref);

			size = btrfs_extent_inline_ref_size(type);
		}
		iter->cur_ptr += size;
		if (iter->cur_ptr < iter->end_ptr)
			return 0;

		/* All inline items iterated, fall through */
	}

	/* We're at keyed items, there is no inline item, go to the next one */
	extent_root = btrfs_extent_root(iter->fs_info, iter->bytenr);
	ret = btrfs_next_item(extent_root, iter->path);
	if (ret)
		return ret;

	btrfs_item_key_to_cpu(path->nodes[0], &iter->cur_key, path->slots[0]);
	if (iter->cur_key.objectid != iter->bytenr ||
	    (iter->cur_key.type != BTRFS_TREE_BLOCK_REF_KEY &&
	     iter->cur_key.type != BTRFS_SHARED_BLOCK_REF_KEY))
		return 1;
	iter->item_ptr = (u32)btrfs_item_ptr_offset(path->nodes[0],
						    path->slots[0]);
	iter->cur_ptr = iter->item_ptr;
	iter->end_ptr = iter->item_ptr + (u32)btrfs_item_size(path->nodes[0],
							      path->slots[0]);
	return 0;
}
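/*
 * Putting the iterator together, the pattern used by
 * btrfs_backref_add_tree_node() below:
 *
 *	ret = btrfs_backref_iter_start(iter, bytenr);
 *	while (ret == 0) {
 *		(inspect iter->cur_key and iter->cur_ptr)
 *		ret = btrfs_backref_iter_next(iter);
 *	}
 *	btrfs_backref_iter_release(iter);
 *
 * A negative ret reports an error, 1 means all backref items of the
 * bytenr were visited.
 */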
void btrfs_backref_init_cache(struct btrfs_fs_info *fs_info,
			      struct btrfs_backref_cache *cache, bool is_reloc)
{
	int i;

	cache->rb_root = RB_ROOT;
	for (i = 0; i < BTRFS_MAX_LEVEL; i++)
		INIT_LIST_HEAD(&cache->pending[i]);
	INIT_LIST_HEAD(&cache->changed);
	INIT_LIST_HEAD(&cache->detached);
	INIT_LIST_HEAD(&cache->leaves);
	INIT_LIST_HEAD(&cache->pending_edge);
	INIT_LIST_HEAD(&cache->useless_node);
	cache->fs_info = fs_info;
	cache->is_reloc = is_reloc;
}
struct btrfs_backref_node *btrfs_backref_alloc_node(
		struct btrfs_backref_cache *cache, u64 bytenr, int level)
{
	struct btrfs_backref_node *node;

	ASSERT(level >= 0 && level < BTRFS_MAX_LEVEL);
	node = kzalloc(sizeof(*node), GFP_NOFS);
	if (!node)
		return node;

	INIT_LIST_HEAD(&node->list);
	INIT_LIST_HEAD(&node->upper);
	INIT_LIST_HEAD(&node->lower);
	RB_CLEAR_NODE(&node->rb_node);
	cache->nr_nodes++;
	node->level = level;
	node->bytenr = bytenr;

	return node;
}
void btrfs_backref_free_node(struct btrfs_backref_cache *cache,
			     struct btrfs_backref_node *node)
{
	if (node) {
		ASSERT(list_empty(&node->list));
		ASSERT(list_empty(&node->lower));
		ASSERT(node->eb == NULL);
		cache->nr_nodes--;
		btrfs_put_root(node->root);
		kfree(node);
	}
}
struct btrfs_backref_edge *btrfs_backref_alloc_edge(
		struct btrfs_backref_cache *cache)
{
	struct btrfs_backref_edge *edge;

	edge = kzalloc(sizeof(*edge), GFP_NOFS);
	if (edge)
		cache->nr_edges++;
	return edge;
}

void btrfs_backref_free_edge(struct btrfs_backref_cache *cache,
			     struct btrfs_backref_edge *edge)
{
	if (edge) {
		cache->nr_edges--;
		kfree(edge);
	}
}
void btrfs_backref_unlock_node_buffer(struct btrfs_backref_node *node)
{
	if (node->locked) {
		btrfs_tree_unlock(node->eb);
		node->locked = 0;
	}
}

void btrfs_backref_drop_node_buffer(struct btrfs_backref_node *node)
{
	if (node->eb) {
		btrfs_backref_unlock_node_buffer(node);
		free_extent_buffer(node->eb);
		node->eb = NULL;
	}
}
/*
 * Drop the backref node from cache without cleaning up its children
 * edges.
 *
 * This can only be called on node without parent edges.
 * The children edges are still kept as is.
 */
void btrfs_backref_drop_node(struct btrfs_backref_cache *tree,
			     struct btrfs_backref_node *node)
{
	ASSERT(list_empty(&node->upper));

	btrfs_backref_drop_node_buffer(node);
	list_del_init(&node->list);
	list_del_init(&node->lower);
	if (!RB_EMPTY_NODE(&node->rb_node))
		rb_erase(&node->rb_node, &tree->rb_root);
	btrfs_backref_free_node(tree, node);
}
/*
 * Drop the backref node from cache, also cleaning up all its
 * upper edges and any uncached nodes in the path.
 *
 * This cleanup happens bottom up, thus the node should either
 * be the lowest node in the cache or a detached node.
 */
void btrfs_backref_cleanup_node(struct btrfs_backref_cache *cache,
				struct btrfs_backref_node *node)
{
	struct btrfs_backref_node *upper;
	struct btrfs_backref_edge *edge;

	if (!node)
		return;

	BUG_ON(!node->lowest && !node->detached);
	while (!list_empty(&node->upper)) {
		edge = list_entry(node->upper.next, struct btrfs_backref_edge,
				  list[LOWER]);
		upper = edge->node[UPPER];
		list_del(&edge->list[LOWER]);
		list_del(&edge->list[UPPER]);
		btrfs_backref_free_edge(cache, edge);

		/*
		 * Add the node to leaf node list if no other child block
		 * cached.
		 */
		if (list_empty(&upper->lower)) {
			list_add_tail(&upper->lower, &cache->leaves);
			upper->lowest = 1;
		}
	}

	btrfs_backref_drop_node(cache, node);
}
/*
 * Release all nodes/edges from current cache
 */
void btrfs_backref_release_cache(struct btrfs_backref_cache *cache)
{
	struct btrfs_backref_node *node;
	int i;

	while (!list_empty(&cache->detached)) {
		node = list_entry(cache->detached.next,
				  struct btrfs_backref_node, list);
		btrfs_backref_cleanup_node(cache, node);
	}

	while (!list_empty(&cache->leaves)) {
		node = list_entry(cache->leaves.next,
				  struct btrfs_backref_node, lower);
		btrfs_backref_cleanup_node(cache, node);
	}

	cache->last_trans = 0;

	for (i = 0; i < BTRFS_MAX_LEVEL; i++)
		ASSERT(list_empty(&cache->pending[i]));
	ASSERT(list_empty(&cache->pending_edge));
	ASSERT(list_empty(&cache->useless_node));
	ASSERT(list_empty(&cache->changed));
	ASSERT(list_empty(&cache->detached));
	ASSERT(RB_EMPTY_ROOT(&cache->rb_root));
	ASSERT(!cache->nr_nodes);
	ASSERT(!cache->nr_edges);
}
void btrfs_backref_link_edge(struct btrfs_backref_edge *edge,
			     struct btrfs_backref_node *lower,
			     struct btrfs_backref_node *upper,
			     int link_which)
{
	ASSERT(upper && lower && upper->level == lower->level + 1);
	edge->node[LOWER] = lower;
	edge->node[UPPER] = upper;
	if (link_which & LINK_LOWER)
		list_add_tail(&edge->list[LOWER], &lower->upper);
	if (link_which & LINK_UPPER)
		list_add_tail(&edge->list[UPPER], &upper->lower);
}
/*
 * Handle direct tree backref
 *
 * Direct tree backref means, the backref item shows its parent bytenr
 * directly. This is for SHARED_BLOCK_REF backref (keyed or inlined).
 *
 * @ref_key:	The converted backref key.
 *		For keyed backref, it's the item key.
 *		For inlined backref, objectid is the bytenr,
 *		type is btrfs_inline_ref_type, offset is
 *		btrfs_inline_ref_offset.
 */
static int handle_direct_tree_backref(struct btrfs_backref_cache *cache,
				      struct btrfs_key *ref_key,
				      struct btrfs_backref_node *cur)
{
	struct btrfs_backref_edge *edge;
	struct btrfs_backref_node *upper;
	struct rb_node *rb_node;

	ASSERT(ref_key->type == BTRFS_SHARED_BLOCK_REF_KEY);

	/* Only reloc root uses backref pointing to itself */
	if (ref_key->objectid == ref_key->offset) {
		struct btrfs_root *root;

		cur->is_reloc_root = 1;
		/* Only reloc backref cache cares about a specific root */
		if (cache->is_reloc) {
			root = find_reloc_root(cache->fs_info, cur->bytenr);
			if (!root)
				return -ENOENT;
			cur->root = root;
		} else {
			/*
			 * For generic purpose backref cache, reloc root node
			 * is useless.
			 */
			list_add(&cur->list, &cache->useless_node);
		}
		return 0;
	}

	edge = btrfs_backref_alloc_edge(cache);
	if (!edge)
		return -ENOMEM;

	rb_node = rb_simple_search(&cache->rb_root, ref_key->offset);
	if (!rb_node) {
		/* Parent node not yet cached */
		upper = btrfs_backref_alloc_node(cache, ref_key->offset,
						 cur->level + 1);
		if (!upper) {
			btrfs_backref_free_edge(cache, edge);
			return -ENOMEM;
		}

		/*
		 * Backrefs for the upper level block isn't cached, add the
		 * block to pending list
		 */
		list_add_tail(&edge->list[UPPER], &cache->pending_edge);
	} else {
		/* Parent node already cached */
		upper = rb_entry(rb_node, struct btrfs_backref_node, rb_node);
		ASSERT(upper->checked);
		INIT_LIST_HEAD(&edge->list[UPPER]);
	}
	btrfs_backref_link_edge(edge, cur, upper, LINK_LOWER);
	return 0;
}
/*
 * Handle indirect tree backref
 *
 * Indirect tree backref means, we only know which tree the node belongs to.
 * We still need to do a tree search to find out the parents. This is for
 * TREE_BLOCK_REF backref (keyed or inlined).
 *
 * @trans:	Transaction handle.
 * @ref_key:	The same as @ref_key in handle_direct_tree_backref()
 * @tree_key:	The first key of this tree block.
 * @path:	A clean (released) path, to avoid allocating a path every time
 *		the function gets called.
 */
static int handle_indirect_tree_backref(struct btrfs_trans_handle *trans,
					struct btrfs_backref_cache *cache,
					struct btrfs_path *path,
					struct btrfs_key *ref_key,
					struct btrfs_key *tree_key,
					struct btrfs_backref_node *cur)
{
	struct btrfs_fs_info *fs_info = cache->fs_info;
	struct btrfs_backref_node *upper;
	struct btrfs_backref_node *lower;
	struct btrfs_backref_edge *edge;
	struct extent_buffer *eb;
	struct btrfs_root *root;
	struct rb_node *rb_node;
	int level;
	bool need_check = true;
	int ret;

	root = btrfs_get_fs_root(fs_info, ref_key->offset, false);
	if (IS_ERR(root))
		return PTR_ERR(root);
	if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
		cur->cowonly = 1;

	if (btrfs_root_level(&root->root_item) == cur->level) {
		/* Tree root */
		ASSERT(btrfs_root_bytenr(&root->root_item) == cur->bytenr);
		/*
		 * For reloc backref cache, we may ignore reloc root. But for
		 * general purpose backref cache, we can't rely on
		 * btrfs_should_ignore_reloc_root() as it may conflict with
		 * current running relocation and lead to missing root.
		 *
		 * For general purpose backref cache, reloc root detection is
		 * completely relying on direct backref (key->offset is parent
		 * bytenr), thus only do such check for reloc cache.
		 */
		if (btrfs_should_ignore_reloc_root(root) && cache->is_reloc) {
			btrfs_put_root(root);
			list_add(&cur->list, &cache->useless_node);
		} else {
			cur->root = root;
		}
		return 0;
	}

	level = cur->level + 1;

	/* Search the tree to find parent blocks referring to the block */
	path->search_commit_root = 1;
	path->skip_locking = 1;
	path->lowest_level = level;
	ret = btrfs_search_slot(NULL, root, tree_key, path, 0, 0);
	path->lowest_level = 0;
	if (ret < 0) {
		btrfs_put_root(root);
		return ret;
	}
	if (ret > 0 && path->slots[level] > 0)
		path->slots[level]--;
	ret = 0;

	eb = path->nodes[level];
	if (btrfs_node_blockptr(eb, path->slots[level]) != cur->bytenr) {
		btrfs_err(fs_info,
"couldn't find block (%llu) (level %d) in tree (%llu) with key (%llu %u %llu)",
			  cur->bytenr, level - 1, root->root_key.objectid,
			  tree_key->objectid, tree_key->type, tree_key->offset);
		btrfs_put_root(root);
		ret = -ENOENT;
		goto out;
	}
	lower = cur;

	/* Add all nodes and edges in the path */
	for (; level < BTRFS_MAX_LEVEL; level++) {
		if (!path->nodes[level]) {
			ASSERT(btrfs_root_bytenr(&root->root_item) ==
			       lower->bytenr);
			/* Same as previous should_ignore_reloc_root() call */
			if (btrfs_should_ignore_reloc_root(root) &&
			    cache->is_reloc) {
				btrfs_put_root(root);
				list_add(&lower->list, &cache->useless_node);
			} else {
				lower->root = root;
			}
			break;
		}

		edge = btrfs_backref_alloc_edge(cache);
		if (!edge) {
			btrfs_put_root(root);
			ret = -ENOMEM;
			goto out;
		}

		eb = path->nodes[level];
		rb_node = rb_simple_search(&cache->rb_root, eb->start);
		if (!rb_node) {
			upper = btrfs_backref_alloc_node(cache, eb->start,
							 lower->level + 1);
			if (!upper) {
				btrfs_put_root(root);
				btrfs_backref_free_edge(cache, edge);
				ret = -ENOMEM;
				goto out;
			}
			upper->owner = btrfs_header_owner(eb);
			if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
				upper->cowonly = 1;

			/*
			 * If we know the block isn't shared we can avoid
			 * checking its backrefs.
			 */
			if (btrfs_block_can_be_shared(trans, root, eb))
				upper->checked = 0;
			else
				upper->checked = 1;

			/*
			 * Add the block to pending list if we need to check its
			 * backrefs, we only do this once while walking up a
			 * tree as we will catch anything else later on.
			 */
			if (!upper->checked && need_check) {
				need_check = false;
				list_add_tail(&edge->list[UPPER],
					      &cache->pending_edge);
			} else {
				if (upper->checked)
					need_check = true;
				INIT_LIST_HEAD(&edge->list[UPPER]);
			}
		} else {
			upper = rb_entry(rb_node, struct btrfs_backref_node,
					 rb_node);
			ASSERT(upper->checked);
			INIT_LIST_HEAD(&edge->list[UPPER]);
			if (!upper->owner)
				upper->owner = btrfs_header_owner(eb);
		}
		btrfs_backref_link_edge(edge, lower, upper, LINK_LOWER);

		if (rb_node) {
			btrfs_put_root(root);
			break;
		}
		lower = upper;
		upper = NULL;
	}
out:
	btrfs_release_path(path);
	return ret;
}
/*
 * Add backref node @cur into @cache.
 *
 * NOTE: Even if the function returned 0, @cur is not yet cached as its upper
 *	 links aren't yet bi-directional. The caller needs to finish those
 *	 links with btrfs_backref_finish_upper_links().
 *
 * @trans:	Transaction handle.
 * @path:	Released path for indirect tree backref lookup
 * @iter:	Released backref iter for extent tree search
 * @node_key:	The first key of the tree block
 */
int btrfs_backref_add_tree_node(struct btrfs_trans_handle *trans,
				struct btrfs_backref_cache *cache,
				struct btrfs_path *path,
				struct btrfs_backref_iter *iter,
				struct btrfs_key *node_key,
				struct btrfs_backref_node *cur)
{
	struct btrfs_backref_edge *edge;
	struct btrfs_backref_node *exist;
	int ret;

	ret = btrfs_backref_iter_start(iter, cur->bytenr);
	if (ret < 0)
		return ret;
	/*
	 * We skip the first btrfs_tree_block_info, as we don't use the key
	 * stored in it, but fetch it from the tree block
	 */
	if (btrfs_backref_has_tree_block_info(iter)) {
		ret = btrfs_backref_iter_next(iter);
		if (ret < 0)
			goto out;
		/* No extra backref? This means the tree block is corrupted */
		if (ret > 0) {
			ret = -EUCLEAN;
			goto out;
		}
	}
	WARN_ON(cur->checked);
	if (!list_empty(&cur->upper)) {
		/*
		 * The backref was added previously when processing backref of
		 * type BTRFS_TREE_BLOCK_REF_KEY
		 */
		ASSERT(list_is_singular(&cur->upper));
		edge = list_entry(cur->upper.next, struct btrfs_backref_edge,
				  list[LOWER]);
		ASSERT(list_empty(&edge->list[UPPER]));
		exist = edge->node[UPPER];
		/*
		 * Add the upper level block to pending list if we need check
		 * its backrefs
		 */
		if (!exist->checked)
			list_add_tail(&edge->list[UPPER], &cache->pending_edge);
	} else {
		exist = NULL;
	}

	for (; ret == 0; ret = btrfs_backref_iter_next(iter)) {
		struct extent_buffer *eb;
		struct btrfs_key key;
		int type;

		cond_resched();
		eb = iter->path->nodes[0];

		key.objectid = iter->bytenr;
		if (btrfs_backref_iter_is_inline_ref(iter)) {
			struct btrfs_extent_inline_ref *iref;

			/* Update key for inline backref */
			iref = (struct btrfs_extent_inline_ref *)
				((unsigned long)iter->cur_ptr);
			type = btrfs_get_extent_inline_ref_type(eb, iref,
							BTRFS_REF_TYPE_BLOCK);
			if (type == BTRFS_REF_TYPE_INVALID) {
				ret = -EUCLEAN;
				goto out;
			}
			key.type = type;
			key.offset = btrfs_extent_inline_ref_offset(eb, iref);
		} else {
			key.type = iter->cur_key.type;
			key.offset = iter->cur_key.offset;
		}

		/*
		 * Parent node found and matches current inline ref, no need to
		 * rebuild this node for this inline ref
		 */
		if (exist &&
		    ((key.type == BTRFS_TREE_BLOCK_REF_KEY &&
		      exist->owner == key.offset) ||
		     (key.type == BTRFS_SHARED_BLOCK_REF_KEY &&
		      exist->bytenr == key.offset))) {
			exist = NULL;
			continue;
		}

		/* SHARED_BLOCK_REF means key.offset is the parent bytenr */
		if (key.type == BTRFS_SHARED_BLOCK_REF_KEY) {
			ret = handle_direct_tree_backref(cache, &key, cur);
			if (ret < 0)
				goto out;
		} else if (key.type == BTRFS_TREE_BLOCK_REF_KEY) {
			/*
			 * key.type == BTRFS_TREE_BLOCK_REF_KEY, inline ref
			 * offset means the root objectid. We need to search
			 * the tree to get its parent bytenr.
			 */
			ret = handle_indirect_tree_backref(trans, cache, path,
							   &key, node_key, cur);
			if (ret < 0)
				goto out;
		}
		/*
		 * Unrecognized tree backref items (if it can pass tree-checker)
		 * would be ignored.
		 */
	}
	ret = 0;
	cur->checked = 1;
	WARN_ON(exist);
out:
	btrfs_backref_iter_release(iter);
	return ret;
}
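/*
 * Callers such as build_backref_tree() in relocation.c pair the function
 * above with the helpers below, roughly:
 *
 *	ret = btrfs_backref_add_tree_node(trans, cache, path, iter,
 *					  node_key, cur);
 *	if (ret < 0)
 *		goto error;
 *	ret = btrfs_backref_finish_upper_links(cache, node);
 *	if (ret < 0)
 *		goto error;
 *	...
 * error:
 *	btrfs_backref_error_cleanup(cache, node);
 */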
/*
 * Finish the upwards linkage created by btrfs_backref_add_tree_node()
 */
int btrfs_backref_finish_upper_links(struct btrfs_backref_cache *cache,
				     struct btrfs_backref_node *start)
{
	struct list_head *useless_node = &cache->useless_node;
	struct btrfs_backref_edge *edge;
	struct rb_node *rb_node;
	LIST_HEAD(pending_edge);

	ASSERT(start->checked);

	/* Insert this node to cache if it's not COW-only */
	if (!start->cowonly) {
		rb_node = rb_simple_insert(&cache->rb_root, start->bytenr,
					   &start->rb_node);
		if (rb_node)
			btrfs_backref_panic(cache->fs_info, start->bytenr,
					    -EEXIST);
		list_add_tail(&start->lower, &cache->leaves);
	}

	/*
	 * Use breadth first search to iterate all related edges.
	 *
	 * The starting points are all the edges of this node
	 */
	list_for_each_entry(edge, &start->upper, list[LOWER])
		list_add_tail(&edge->list[UPPER], &pending_edge);

	while (!list_empty(&pending_edge)) {
		struct btrfs_backref_node *upper;
		struct btrfs_backref_node *lower;

		edge = list_first_entry(&pending_edge,
					struct btrfs_backref_edge, list[UPPER]);
		list_del_init(&edge->list[UPPER]);
		upper = edge->node[UPPER];
		lower = edge->node[LOWER];

		/* Parent is detached, no need to keep any edges */
		if (upper->detached) {
			list_del(&edge->list[LOWER]);
			btrfs_backref_free_edge(cache, edge);

			/* Lower node is orphan, queue for cleanup */
			if (list_empty(&lower->upper))
				list_add(&lower->list, useless_node);
			continue;
		}

		/*
		 * All new nodes added in current build_backref_tree() haven't
		 * been linked to the cache rb tree.
		 * So if we have upper->rb_node populated, this means a cache
		 * hit. We only need to link the edge, as @upper and all its
		 * parents have already been linked.
		 */
		if (!RB_EMPTY_NODE(&upper->rb_node)) {
			if (upper->lowest) {
				list_del_init(&upper->lower);
				upper->lowest = 0;
			}

			list_add_tail(&edge->list[UPPER], &upper->lower);
			continue;
		}

		/* Sanity check, we shouldn't have any unchecked nodes */
		if (!upper->checked) {
			ASSERT(0);
			return -EUCLEAN;
		}

		/* Sanity check, COW-only node has non-COW-only parent */
		if (start->cowonly != upper->cowonly) {
			ASSERT(0);
			return -EUCLEAN;
		}

		/* Only cache non-COW-only (subvolume trees) tree blocks */
		if (!upper->cowonly) {
			rb_node = rb_simple_insert(&cache->rb_root, upper->bytenr,
						   &upper->rb_node);
			if (rb_node) {
				btrfs_backref_panic(cache->fs_info,
						    upper->bytenr, -EEXIST);
				return -EUCLEAN;
			}
		}

		list_add_tail(&edge->list[UPPER], &upper->lower);

		/*
		 * Also queue all the parent edges of this uncached node
		 * to finish the upper linkage
		 */
		list_for_each_entry(edge, &upper->upper, list[LOWER])
			list_add_tail(&edge->list[UPPER], &pending_edge);
	}
	return 0;
}
void btrfs_backref_error_cleanup(struct btrfs_backref_cache *cache,
				 struct btrfs_backref_node *node)
{
	struct btrfs_backref_node *lower;
	struct btrfs_backref_node *upper;
	struct btrfs_backref_edge *edge;

	while (!list_empty(&cache->useless_node)) {
		lower = list_first_entry(&cache->useless_node,
					 struct btrfs_backref_node, list);
		list_del_init(&lower->list);
	}
	while (!list_empty(&cache->pending_edge)) {
		edge = list_first_entry(&cache->pending_edge,
					struct btrfs_backref_edge, list[UPPER]);
		list_del(&edge->list[UPPER]);
		list_del(&edge->list[LOWER]);
		lower = edge->node[LOWER];
		upper = edge->node[UPPER];
		btrfs_backref_free_edge(cache, edge);

		/*
		 * Lower is no longer linked to any upper backref nodes and
		 * isn't in the cache, we can free it ourselves.
		 */
		if (list_empty(&lower->upper) &&
		    RB_EMPTY_NODE(&lower->rb_node))
			list_add(&lower->list, &cache->useless_node);

		if (!RB_EMPTY_NODE(&upper->rb_node))
			continue;

		/* Add this guy's upper edges to the list to process */
		list_for_each_entry(edge, &upper->upper, list[LOWER])
			list_add_tail(&edge->list[UPPER],
				      &cache->pending_edge);
		if (list_empty(&upper->upper))
			list_add(&upper->list, &cache->useless_node);
	}

	while (!list_empty(&cache->useless_node)) {
		lower = list_first_entry(&cache->useless_node,
					 struct btrfs_backref_node, list);
		list_del_init(&lower->list);
		if (lower == node)
			node = NULL;
		btrfs_backref_drop_node(cache, lower);
	}

	btrfs_backref_cleanup_node(cache, node);
	ASSERT(list_empty(&cache->useless_node) &&
	       list_empty(&cache->pending_edge));
}