// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007,2008 Oracle.  All rights reserved.
 */

#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/rbtree.h>
#include <linux/error-injection.h>
#include "transaction.h"
#include "print-tree.h"
#include "tree-mod-log.h"
#include "tree-checker.h"
#include "accessors.h"
#include "extent-tree.h"
#include "relocation.h"
#include "file-item.h"

static struct kmem_cache *btrfs_path_cachep;

static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		      struct btrfs_path *path, int level);
static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		      const struct btrfs_key *ins_key, struct btrfs_path *path,
		      int data_size, int extend);
static int push_node_left(struct btrfs_trans_handle *trans,
			  struct extent_buffer *dst,
			  struct extent_buffer *src, int empty);
static int balance_node_right(struct btrfs_trans_handle *trans,
			      struct extent_buffer *dst_buf,
			      struct extent_buffer *src_buf);

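/*
 * Map each on-disk checksum type to its digest size, its user visible name
 * and, when it differs from the name, the separate driver string returned by
 * btrfs_super_csum_driver().
 */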
static const struct btrfs_csums {
	u16		size;
	const char	name[10];
	const char	driver[12];
} btrfs_csums[] = {
	[BTRFS_CSUM_TYPE_CRC32] = { .size = 4, .name = "crc32c" },
	[BTRFS_CSUM_TYPE_XXHASH] = { .size = 8, .name = "xxhash64" },
	[BTRFS_CSUM_TYPE_SHA256] = { .size = 32, .name = "sha256" },
	[BTRFS_CSUM_TYPE_BLAKE2] = { .size = 32, .name = "blake2b",
				     .driver = "blake2b-256" },
};

/*
 * The leaf data grows from end-to-front in the node.  This returns the address
 * of the start of the last item, which is the stop of the leaf data stack.
 */
static unsigned int leaf_data_end(const struct extent_buffer *leaf)
{
	u32 nr = btrfs_header_nritems(leaf);

	if (nr == 0)
		return BTRFS_LEAF_DATA_SIZE(leaf->fs_info);
	return btrfs_item_offset(leaf, nr - 1);
}

/*
 * Move data in a @leaf (using memmove, safe for overlapping ranges).
 *
 * @leaf:	leaf that we're doing a memmove on
 * @dst_offset:	item data offset we're moving to
 * @src_offset:	item data offset we're moving from
 * @len:	length of the data we're moving
 *
 * Wrapper around memmove_extent_buffer() that takes into account the header on
 * the leaf.  The btrfs_item offsets start directly after the header, so we
 * have to adjust any offsets to account for the header in the leaf.  This
 * handles that math to simplify the callers.
 */
static inline void memmove_leaf_data(const struct extent_buffer *leaf,
				     unsigned long dst_offset,
				     unsigned long src_offset,
				     unsigned long len)
{
	memmove_extent_buffer(leaf, btrfs_item_nr_offset(leaf, 0) + dst_offset,
			      btrfs_item_nr_offset(leaf, 0) + src_offset, len);
}

/*
 * Copy item data from @src into @dst at the given @offset.
 *
 * @dst:	destination leaf that we're copying into
 * @src:	source leaf that we're copying from
 * @dst_offset:	item data offset we're copying to
 * @src_offset:	item data offset we're copying from
 * @len:	length of the data we're copying
 *
 * Wrapper around copy_extent_buffer() that takes into account the header on
 * the leaf.  The btrfs_item offsets start directly after the header, so we
 * have to adjust any offsets to account for the header in the leaf.  This
 * handles that math to simplify the callers.
 */
static inline void copy_leaf_data(const struct extent_buffer *dst,
				  const struct extent_buffer *src,
				  unsigned long dst_offset,
				  unsigned long src_offset, unsigned long len)
{
	copy_extent_buffer(dst, src, btrfs_item_nr_offset(dst, 0) + dst_offset,
			   btrfs_item_nr_offset(src, 0) + src_offset, len);
}

/*
 * Move items in a @leaf (using memmove).
 *
 * @leaf:	leaf that we're moving the items in
 * @dst_item:	the item nr we're copying into
 * @src_item:	the item nr we're copying from
 * @nr_items:	the number of items to copy
 *
 * Wrapper around memmove_extent_buffer() that does the math to get the
 * appropriate offsets into the leaf from the item numbers.
 */
static inline void memmove_leaf_items(const struct extent_buffer *leaf,
				      int dst_item, int src_item, int nr_items)
{
	memmove_extent_buffer(leaf, btrfs_item_nr_offset(leaf, dst_item),
			      btrfs_item_nr_offset(leaf, src_item),
			      nr_items * sizeof(struct btrfs_item));
}

/*
 * Copy items from @src into @dst at the given @offset.
 *
 * @dst:	destination leaf for the items
 * @src:	source leaf for the items
 * @dst_item:	the item nr we're copying into
 * @src_item:	the item nr we're copying from
 * @nr_items:	the number of items to copy
 *
 * Wrapper around copy_extent_buffer() that does the math to get the
 * appropriate offsets into the leaf from the item numbers.
 */
static inline void copy_leaf_items(const struct extent_buffer *dst,
				   const struct extent_buffer *src,
				   int dst_item, int src_item, int nr_items)
{
	copy_extent_buffer(dst, src, btrfs_item_nr_offset(dst, dst_item),
			   btrfs_item_nr_offset(src, src_item),
			   nr_items * sizeof(struct btrfs_item));
}

/* This exists for btrfs-progs usages. */
u16 btrfs_csum_type_size(u16 type)
{
	return btrfs_csums[type].size;
}

int btrfs_super_csum_size(const struct btrfs_super_block *s)
{
	u16 t = btrfs_super_csum_type(s);

	/* csum type is validated at mount time */
	return btrfs_csum_type_size(t);
}

const char *btrfs_super_csum_name(u16 csum_type)
{
	/* csum type is validated at mount time */
	return btrfs_csums[csum_type].name;
}

/*
 * Return driver name if defined, otherwise the name that's also a valid driver
 * name.
 */
const char *btrfs_super_csum_driver(u16 csum_type)
{
	/* csum type is validated at mount time */
	return btrfs_csums[csum_type].driver[0] ?
		btrfs_csums[csum_type].driver :
		btrfs_csums[csum_type].name;
}

size_t __attribute_const__ btrfs_get_num_csums(void)
{
	return ARRAY_SIZE(btrfs_csums);
}

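/*
 * Allocate a zeroed path from the dedicated slab cache.  GFP_NOFS is used so
 * that memory reclaim triggered by this allocation cannot recurse back into
 * the filesystem.
 */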
struct btrfs_path *btrfs_alloc_path(void)
{
	return kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS);
}

/* this also releases the path */
void btrfs_free_path(struct btrfs_path *p)
{
	if (!p)
		return;
	btrfs_release_path(p);
	kmem_cache_free(btrfs_path_cachep, p);
}

/*
 * path release drops references on the extent buffers in the path
 * and it drops any locks held by this path
 *
 * It is safe to call this on paths that have no locks or extent buffers held.
 */
noinline void btrfs_release_path(struct btrfs_path *p)
{
	int i;

	for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
		p->slots[i] = 0;
		if (!p->nodes[i])
			continue;
		if (p->locks[i]) {
			btrfs_tree_unlock_rw(p->nodes[i], p->locks[i]);
			p->locks[i] = 0;
		}
		free_extent_buffer(p->nodes[i]);
		p->nodes[i] = NULL;
	}
}

/*
 * We want the transaction abort to print stack trace only for errors where the
 * cause could be a bug, e.g. due to ENOSPC, and not for common errors that are
 * caused by external factors.
 */
bool __cold abort_should_print_stack(int error)
{
	switch (error) {
	case -EIO:
	case -EROFS:
	case -ENOMEM:
		return false;
	}
	return true;
}

/*
 * safely gets a reference on the root node of a tree.  A lock
 * is not taken, so a concurrent writer may put a different node
 * at the root of the tree.  See btrfs_lock_root_node for the
 * looping required.
 *
 * The extent buffer returned by this has a reference taken, so
 * it won't disappear.  It may stop being the root of the tree
 * at any time because there are no locks held.
 */
struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
{
	struct extent_buffer *eb;

	while (1) {
		rcu_read_lock();
		eb = rcu_dereference(root->node);

		/*
		 * RCU really hurts here, we could free up the root node because
		 * it was COWed but we may not get the new root node yet so do
		 * the inc_not_zero dance and if it doesn't work then
		 * synchronize_rcu and try again.
		 */
		if (atomic_inc_not_zero(&eb->refs)) {
			rcu_read_unlock();
			break;
		}
		rcu_read_unlock();
		synchronize_rcu();
	}
	return eb;
}

/*
 * Cowonly root (not-shareable trees, everything not subvolume or reloc roots),
 * just get put onto a simple dirty list.  Transaction walks this list to make
 * sure they get properly updated on disk.
 */
static void add_root_to_dirty_list(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;

	if (test_bit(BTRFS_ROOT_DIRTY, &root->state) ||
	    !test_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state))
		return;

	spin_lock(&fs_info->trans_lock);
	if (!test_and_set_bit(BTRFS_ROOT_DIRTY, &root->state)) {
		/* Want the extent tree to be the last on the list */
		if (root->root_key.objectid == BTRFS_EXTENT_TREE_OBJECTID)
			list_move_tail(&root->dirty_list,
				       &fs_info->dirty_cowonly_roots);
		else
			list_move(&root->dirty_list,
				  &fs_info->dirty_cowonly_roots);
	}
	spin_unlock(&fs_info->trans_lock);
}

/*
 * used by snapshot creation to make a copy of a root for a tree with
 * a given objectid.  The buffer with the new root node is returned in
 * cow_ret, and this func returns zero on success or a negative error code.
 */
int btrfs_copy_root(struct btrfs_trans_handle *trans,
		    struct btrfs_root *root,
		    struct extent_buffer *buf,
		    struct extent_buffer **cow_ret, u64 new_root_objectid)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *cow;
	int ret = 0;
	int level;
	struct btrfs_disk_key disk_key;
	u64 reloc_src_root = 0;

	WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
		trans->transid != fs_info->running_transaction->transid);
	WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
		trans->transid != root->last_trans);

	level = btrfs_header_level(buf);
	if (level == 0)
		btrfs_item_key(buf, &disk_key, 0);
	else
		btrfs_node_key(buf, &disk_key, 0);

	if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
		reloc_src_root = btrfs_header_owner(buf);
	cow = btrfs_alloc_tree_block(trans, root, 0, new_root_objectid,
				     &disk_key, level, buf->start, 0,
				     reloc_src_root, BTRFS_NESTING_NEW_ROOT);
	if (IS_ERR(cow))
		return PTR_ERR(cow);

	copy_extent_buffer_full(cow, buf);
	btrfs_set_header_bytenr(cow, cow->start);
	btrfs_set_header_generation(cow, trans->transid);
	btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
				     BTRFS_HEADER_FLAG_RELOC);
	if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
		btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
	else
		btrfs_set_header_owner(cow, new_root_objectid);

	write_extent_buffer_fsid(cow, fs_info->fs_devices->metadata_uuid);

	WARN_ON(btrfs_header_generation(buf) > trans->transid);
	if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
		ret = btrfs_inc_ref(trans, root, cow, 1);
	else
		ret = btrfs_inc_ref(trans, root, cow, 0);
	if (ret) {
		btrfs_tree_unlock(cow);
		free_extent_buffer(cow);
		btrfs_abort_transaction(trans, ret);
		return ret;
	}

	btrfs_mark_buffer_dirty(trans, cow);
	*cow_ret = cow;
	return 0;
}

/*
 * check if the tree block can be shared by multiple trees
 */
int btrfs_block_can_be_shared(struct btrfs_root *root,
			      struct extent_buffer *buf)
{
	/*
	 * Tree blocks not in shareable trees and tree roots are never shared.
	 * If a block was allocated after the last snapshot and the block was
	 * not allocated by tree relocation, we know the block is not shared.
	 */
	if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
	    buf != root->node && buf != root->commit_root &&
	    (btrfs_header_generation(buf) <=
	     btrfs_root_last_snapshot(&root->root_item) ||
	     btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)))
		return 1;

	return 0;
}

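/*
 * Update the backrefs of a tree block that is about to be COWed: take or drop
 * references on the old and the new copy according to the rules documented
 * inside, and report through @last_ref whether the old buffer lost its last
 * reference.
 */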
static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
				       struct btrfs_root *root,
				       struct extent_buffer *buf,
				       struct extent_buffer *cow,
				       int *last_ref)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	u64 refs;
	u64 owner;
	u64 flags;
	u64 new_flags = 0;
	int ret;

	/*
	 * Backrefs update rules:
	 *
	 * Always use full backrefs for extent pointers in tree block
	 * allocated by tree relocation.
	 *
	 * If a shared tree block is no longer referenced by its owner
	 * tree (btrfs_header_owner(buf) == root->root_key.objectid),
	 * use full backrefs for extent pointers in tree block.
	 *
	 * If a tree block is being relocated
	 * (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID),
	 * use full backrefs for extent pointers in tree block.
	 * The reason for this is some operations (such as drop tree)
	 * are only allowed for blocks that use full backrefs.
	 */

	if (btrfs_block_can_be_shared(root, buf)) {
		ret = btrfs_lookup_extent_info(trans, fs_info, buf->start,
					       btrfs_header_level(buf), 1,
					       &refs, &flags);
		if (ret)
			return ret;
		if (unlikely(refs == 0)) {
			btrfs_crit(fs_info,
		"found 0 references for tree block at bytenr %llu level %d root %llu",
				   buf->start, btrfs_header_level(buf),
				   btrfs_root_id(root));
			ret = -EUCLEAN;
			btrfs_abort_transaction(trans, ret);
			return ret;
		}
	} else {
		refs = 1;
		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
		    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
			flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
		else
			flags = 0;
	}

	owner = btrfs_header_owner(buf);
	BUG_ON(owner == BTRFS_TREE_RELOC_OBJECTID &&
	       !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));

	if (refs > 1) {
		if ((owner == root->root_key.objectid ||
		     root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) &&
		    !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) {
			ret = btrfs_inc_ref(trans, root, buf, 1);
			if (ret)
				return ret;

			if (root->root_key.objectid ==
			    BTRFS_TREE_RELOC_OBJECTID) {
				ret = btrfs_dec_ref(trans, root, buf, 0);
				if (ret)
					return ret;
				ret = btrfs_inc_ref(trans, root, cow, 1);
				if (ret)
					return ret;
			}
			new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
		} else {
			if (root->root_key.objectid ==
			    BTRFS_TREE_RELOC_OBJECTID)
				ret = btrfs_inc_ref(trans, root, cow, 1);
			else
				ret = btrfs_inc_ref(trans, root, cow, 0);
			if (ret)
				return ret;
		}
		if (new_flags != 0) {
			ret = btrfs_set_disk_extent_flags(trans, buf, new_flags);
			if (ret)
				return ret;
		}
	} else {
		if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
			if (root->root_key.objectid ==
			    BTRFS_TREE_RELOC_OBJECTID)
				ret = btrfs_inc_ref(trans, root, cow, 1);
			else
				ret = btrfs_inc_ref(trans, root, cow, 0);
			if (ret)
				return ret;
			ret = btrfs_dec_ref(trans, root, buf, 1);
			if (ret)
				return ret;
		}
		btrfs_clear_buffer_dirty(trans, buf);
		*last_ref = 1;
	}
	return 0;
}

/*
 * does the dirty work in cow of a single block.  The parent block (if
 * supplied) is updated to point to the new cow copy.  The new buffer is marked
 * dirty and returned locked.  If you modify the block it needs to be marked
 * dirty again.
 *
 * search_start -- an allocation hint for the new block
 *
 * empty_size -- a hint that you plan on doing more cow.  This is the size in
 * bytes the allocator should try to find free next to the block it returns.
 * This is just a hint and may be ignored by the allocator.
 */
int btrfs_force_cow_block(struct btrfs_trans_handle *trans,
			  struct btrfs_root *root,
			  struct extent_buffer *buf,
			  struct extent_buffer *parent, int parent_slot,
			  struct extent_buffer **cow_ret,
			  u64 search_start, u64 empty_size,
			  enum btrfs_lock_nesting nest)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_disk_key disk_key;
	struct extent_buffer *cow;
	int level, ret;
	int last_ref = 0;
	int unlock_orig = 0;
	u64 parent_start = 0;
	u64 reloc_src_root = 0;

	if (*cow_ret == buf)
		unlock_orig = 1;

	btrfs_assert_tree_write_locked(buf);

	WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
		trans->transid != fs_info->running_transaction->transid);
	WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
		trans->transid != root->last_trans);

	level = btrfs_header_level(buf);

	if (level == 0)
		btrfs_item_key(buf, &disk_key, 0);
	else
		btrfs_node_key(buf, &disk_key, 0);

	if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
		if (parent)
			parent_start = parent->start;
		reloc_src_root = btrfs_header_owner(buf);
	}
	cow = btrfs_alloc_tree_block(trans, root, parent_start,
				     root->root_key.objectid, &disk_key, level,
				     search_start, empty_size, reloc_src_root, nest);
	if (IS_ERR(cow))
		return PTR_ERR(cow);

	/* cow is set to blocking by btrfs_init_new_buffer */

	copy_extent_buffer_full(cow, buf);
	btrfs_set_header_bytenr(cow, cow->start);
	btrfs_set_header_generation(cow, trans->transid);
	btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
				     BTRFS_HEADER_FLAG_RELOC);
	if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
		btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
	else
		btrfs_set_header_owner(cow, root->root_key.objectid);

	write_extent_buffer_fsid(cow, fs_info->fs_devices->metadata_uuid);

	ret = update_ref_for_cow(trans, root, buf, cow, &last_ref);
	if (ret) {
		btrfs_tree_unlock(cow);
		free_extent_buffer(cow);
		btrfs_abort_transaction(trans, ret);
		return ret;
	}

	if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) {
		ret = btrfs_reloc_cow_block(trans, root, buf, cow);
		if (ret) {
			btrfs_tree_unlock(cow);
			free_extent_buffer(cow);
			btrfs_abort_transaction(trans, ret);
			return ret;
		}
	}

	if (buf == root->node) {
		WARN_ON(parent && parent != buf);
		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
		    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
			parent_start = buf->start;

		ret = btrfs_tree_mod_log_insert_root(root->node, cow, true);
		if (ret < 0) {
			btrfs_tree_unlock(cow);
			free_extent_buffer(cow);
			btrfs_abort_transaction(trans, ret);
			return ret;
		}
		atomic_inc(&cow->refs);
		rcu_assign_pointer(root->node, cow);

		btrfs_free_tree_block(trans, btrfs_root_id(root), buf,
				      parent_start, last_ref);
		free_extent_buffer(buf);
		add_root_to_dirty_list(root);
	} else {
		WARN_ON(trans->transid != btrfs_header_generation(parent));
		ret = btrfs_tree_mod_log_insert_key(parent, parent_slot,
						    BTRFS_MOD_LOG_KEY_REPLACE);
		if (ret) {
			btrfs_tree_unlock(cow);
			free_extent_buffer(cow);
			btrfs_abort_transaction(trans, ret);
			return ret;
		}
		btrfs_set_node_blockptr(parent, parent_slot,
					cow->start);
		btrfs_set_node_ptr_generation(parent, parent_slot,
					      trans->transid);
		btrfs_mark_buffer_dirty(trans, parent);
		if (last_ref) {
			ret = btrfs_tree_mod_log_free_eb(buf);
			if (ret) {
				btrfs_tree_unlock(cow);
				free_extent_buffer(cow);
				btrfs_abort_transaction(trans, ret);
				return ret;
			}
		}
		btrfs_free_tree_block(trans, btrfs_root_id(root), buf,
				      parent_start, last_ref);
	}
	if (unlock_orig)
		btrfs_tree_unlock(buf);
	free_extent_buffer_stale(buf);
	btrfs_mark_buffer_dirty(trans, cow);
	*cow_ret = cow;
	return 0;
}

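/*
 * Check whether @buf has to be COWed before it may be modified in the current
 * transaction.  The exact rules are spelled out in the comment inside.
 */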
static inline int should_cow_block(struct btrfs_trans_handle *trans,
				   struct btrfs_root *root,
				   struct extent_buffer *buf)
{
	if (btrfs_is_testing(root->fs_info))
		return 0;

	/* Ensure we can see the FORCE_COW bit */
	smp_mb__before_atomic();

	/*
	 * We do not need to cow a block if
	 * 1) this block is not created or changed in this transaction;
	 * 2) this block does not belong to TREE_RELOC tree;
	 * 3) the root is not forced COW.
	 *
	 * What is forced COW:
	 *    when we create snapshot during committing the transaction,
	 *    after we've finished copying src root, we must COW the shared
	 *    block to ensure the metadata consistency.
	 */
	if (btrfs_header_generation(buf) == trans->transid &&
	    !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) &&
	    !(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
	      btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) &&
	    !test_bit(BTRFS_ROOT_FORCE_COW, &root->state))
		return 0;
	return 1;
}

/*
 * COWs a single block, see btrfs_force_cow_block() for the real work.
 * This version of it has extra checks so that a block isn't COWed more than
 * once per transaction, as long as it hasn't been written yet.
 */
int btrfs_cow_block(struct btrfs_trans_handle *trans,
		    struct btrfs_root *root, struct extent_buffer *buf,
		    struct extent_buffer *parent, int parent_slot,
		    struct extent_buffer **cow_ret,
		    enum btrfs_lock_nesting nest)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	u64 search_start;
	int ret;

	if (unlikely(test_bit(BTRFS_ROOT_DELETING, &root->state))) {
		btrfs_abort_transaction(trans, -EUCLEAN);
		btrfs_crit(fs_info,
		   "attempt to COW block %llu on root %llu that is being deleted",
			   buf->start, btrfs_root_id(root));
		return -EUCLEAN;
	}

	/*
	 * COWing must happen through a running transaction, which always
	 * matches the current fs generation (it's a transaction with a state
	 * less than TRANS_STATE_UNBLOCKED). If it doesn't, then turn the fs
	 * into error state to prevent the commit of any transaction.
	 */
	if (unlikely(trans->transaction != fs_info->running_transaction ||
		     trans->transid != fs_info->generation)) {
		btrfs_abort_transaction(trans, -EUCLEAN);
		btrfs_crit(fs_info,
"unexpected transaction when attempting to COW block %llu on root %llu, transaction %llu running transaction %llu fs generation %llu",
			   buf->start, btrfs_root_id(root), trans->transid,
			   fs_info->running_transaction->transid,
			   fs_info->generation);
		return -EUCLEAN;
	}

	if (!should_cow_block(trans, root, buf)) {
		*cow_ret = buf;
		return 0;
	}

	search_start = round_down(buf->start, SZ_1G);

	/*
	 * Before CoWing this block for later modification, check if it's
	 * the subtree root and do the delayed subtree trace if needed.
	 *
	 * Also, we don't care about the error, as it's handled internally.
	 */
	btrfs_qgroup_trace_subtree_after_cow(trans, root, buf);
	ret = btrfs_force_cow_block(trans, root, buf, parent, parent_slot,
				    cow_ret, search_start, 0, nest);

	trace_btrfs_cow_block(root, buf, *cow_ret);

	return ret;
}
ALLOW_ERROR_INJECTION(btrfs_cow_block, ERRNO);

/*
 * same as btrfs_comp_keys() only with two btrfs_key's
 */
int __pure btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_key *k2)
{
	if (k1->objectid > k2->objectid)
		return 1;
	if (k1->objectid < k2->objectid)
		return -1;
	if (k1->type > k2->type)
		return 1;
	if (k1->type < k2->type)
		return -1;
	if (k1->offset > k2->offset)
		return 1;
	if (k1->offset < k2->offset)
		return -1;
	return 0;
}

/*
 * Search for a key in the given extent_buffer.
 *
 * The lower boundary for the search is specified by the slot number @first_slot.
 * Use a value of 0 to search over the whole extent buffer.  Works for both
 * leaves and nodes.
 *
 * The slot in the extent buffer is returned via @slot. If the key exists in the
 * extent buffer, then @slot will point to the slot where the key is, otherwise
 * it points to the slot where you would insert the key.
 *
 * Slot may point to the total number of items (i.e. one position beyond the last
 * key) if the key is bigger than the last key in the extent buffer.
 */
int btrfs_bin_search(struct extent_buffer *eb, int first_slot,
		     const struct btrfs_key *key, int *slot)
{
	unsigned long p;
	int item_size;
	/*
	 * Use unsigned types for the low and high slots, so that we get a more
	 * efficient division in the search loop below.
	 */
	u32 low = first_slot;
	u32 high = btrfs_header_nritems(eb);
	int ret;
	const int key_size = sizeof(struct btrfs_disk_key);

	if (unlikely(low > high)) {
		btrfs_err(eb->fs_info,
		 "%s: low (%u) > high (%u) eb %llu owner %llu level %d",
			  __func__, low, high, eb->start,
			  btrfs_header_owner(eb), btrfs_header_level(eb));
		return -EINVAL;
	}

	if (btrfs_header_level(eb) == 0) {
		p = offsetof(struct btrfs_leaf, items);
		item_size = sizeof(struct btrfs_item);
	} else {
		p = offsetof(struct btrfs_node, ptrs);
		item_size = sizeof(struct btrfs_key_ptr);
	}

	while (low < high) {
		unsigned long oip;
		unsigned long offset;
		struct btrfs_disk_key *tmp;
		struct btrfs_disk_key unaligned;
		int mid;

		mid = (low + high) / 2;
		offset = p + mid * item_size;
		oip = offset_in_page(offset);

		if (oip + key_size <= PAGE_SIZE) {
			const unsigned long idx = get_eb_page_index(offset);
			char *kaddr = page_address(eb->pages[idx]);

			oip = get_eb_offset_in_page(eb, offset);
			tmp = (struct btrfs_disk_key *)(kaddr + oip);
		} else {
			read_extent_buffer(eb, &unaligned, offset, key_size);
			tmp = &unaligned;
		}

		ret = btrfs_comp_keys(tmp, key);

		if (ret < 0)
			low = mid + 1;
		else if (ret > 0)
			high = mid;
		else {
			*slot = mid;
			return 0;
		}
	}
	*slot = low;
	return 1;
}

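/*
 * Adjust the root item's used-bytes accounting by one nodesize when a tree
 * block is allocated for (or freed from) the root, serialized by the root's
 * accounting_lock.
 */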
static void root_add_used_bytes(struct btrfs_root *root)
{
	spin_lock(&root->accounting_lock);
	btrfs_set_root_used(&root->root_item,
		btrfs_root_used(&root->root_item) + root->fs_info->nodesize);
	spin_unlock(&root->accounting_lock);
}

static void root_sub_used_bytes(struct btrfs_root *root)
{
	spin_lock(&root->accounting_lock);
	btrfs_set_root_used(&root->root_item,
		btrfs_root_used(&root->root_item) - root->fs_info->nodesize);
	spin_unlock(&root->accounting_lock);
}

/*
 * given a node and slot number, this reads the block it points to.  The
 * extent buffer is returned with a reference taken (but unlocked).
 */
struct extent_buffer *btrfs_read_node_slot(struct extent_buffer *parent,
					   int slot)
{
	int level = btrfs_header_level(parent);
	struct btrfs_tree_parent_check check = { 0 };
	struct extent_buffer *eb;

	if (slot < 0 || slot >= btrfs_header_nritems(parent))
		return ERR_PTR(-ENOENT);

	ASSERT(level);

	check.level = level - 1;
	check.transid = btrfs_node_ptr_generation(parent, slot);
	check.owner_root = btrfs_header_owner(parent);
	check.has_first_key = true;
	btrfs_node_key_to_cpu(parent, &check.first_key, slot);

	eb = read_tree_block(parent->fs_info, btrfs_node_blockptr(parent, slot),
			     &check);
	if (IS_ERR(eb))
		return eb;
	if (!extent_buffer_uptodate(eb)) {
		free_extent_buffer(eb);
		return ERR_PTR(-EIO);
	}

	return eb;
}

/*
 * node level balancing, used to make sure nodes are in proper order for
 * item deletion.  We balance from the top down, so we have to make sure
 * that a deletion won't leave a node completely empty later on.
 */
static noinline int balance_level(struct btrfs_trans_handle *trans,
				  struct btrfs_root *root,
				  struct btrfs_path *path, int level)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *right = NULL;
	struct extent_buffer *mid;
	struct extent_buffer *left = NULL;
	struct extent_buffer *parent = NULL;
	int ret = 0;
	int wret;
	int pslot;
	int orig_slot = path->slots[level];
	u64 orig_ptr;

	ASSERT(level > 0);

	mid = path->nodes[level];

	WARN_ON(path->locks[level] != BTRFS_WRITE_LOCK);
	WARN_ON(btrfs_header_generation(mid) != trans->transid);

	orig_ptr = btrfs_node_blockptr(mid, orig_slot);

	if (level < BTRFS_MAX_LEVEL - 1) {
		parent = path->nodes[level + 1];
		pslot = path->slots[level + 1];
	}

	/*
	 * deal with the case where there is only one pointer in the root
	 * by promoting the node below to a root
	 */
	if (!parent) {
		struct extent_buffer *child;

		if (btrfs_header_nritems(mid) != 1)
			return 0;

		/* promote the child to a root */
		child = btrfs_read_node_slot(mid, 0);
		if (IS_ERR(child)) {
			ret = PTR_ERR(child);
			goto out;
		}

		btrfs_tree_lock(child);
		ret = btrfs_cow_block(trans, root, child, mid, 0, &child,
				      BTRFS_NESTING_COW);
		if (ret) {
			btrfs_tree_unlock(child);
			free_extent_buffer(child);
			goto out;
		}

		ret = btrfs_tree_mod_log_insert_root(root->node, child, true);
		if (ret < 0) {
			btrfs_tree_unlock(child);
			free_extent_buffer(child);
			btrfs_abort_transaction(trans, ret);
			goto out;
		}
		rcu_assign_pointer(root->node, child);

		add_root_to_dirty_list(root);
		btrfs_tree_unlock(child);

		path->locks[level] = 0;
		path->nodes[level] = NULL;
		btrfs_clear_buffer_dirty(trans, mid);
		btrfs_tree_unlock(mid);
		/* once for the path */
		free_extent_buffer(mid);

		root_sub_used_bytes(root);
		btrfs_free_tree_block(trans, btrfs_root_id(root), mid, 0, 1);
		/* once for the root ptr */
		free_extent_buffer_stale(mid);
		return 0;
	}
	if (btrfs_header_nritems(mid) >
	    BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 4)
		return 0;

	if (pslot) {
		left = btrfs_read_node_slot(parent, pslot - 1);
		if (IS_ERR(left)) {
			ret = PTR_ERR(left);
			left = NULL;
			goto out;
		}

		__btrfs_tree_lock(left, BTRFS_NESTING_LEFT);
		wret = btrfs_cow_block(trans, root, left,
				       parent, pslot - 1, &left,
				       BTRFS_NESTING_LEFT_COW);
		if (wret) {
			ret = wret;
			goto out;
		}
	}

	if (pslot + 1 < btrfs_header_nritems(parent)) {
		right = btrfs_read_node_slot(parent, pslot + 1);
		if (IS_ERR(right)) {
			ret = PTR_ERR(right);
			right = NULL;
			goto out;
		}

		__btrfs_tree_lock(right, BTRFS_NESTING_RIGHT);
		wret = btrfs_cow_block(trans, root, right,
				       parent, pslot + 1, &right,
				       BTRFS_NESTING_RIGHT_COW);
		if (wret) {
			ret = wret;
			goto out;
		}
	}

	/* first, try to make some room in the middle buffer */
	if (left) {
		orig_slot += btrfs_header_nritems(left);
		wret = push_node_left(trans, left, mid, 1);
		if (wret < 0)
			ret = wret;
	}

	/*
	 * then try to empty the right most buffer into the middle
	 */
	if (right) {
		wret = push_node_left(trans, mid, right, 1);
		if (wret < 0 && wret != -ENOSPC)
			ret = wret;
		if (btrfs_header_nritems(right) == 0) {
			btrfs_clear_buffer_dirty(trans, right);
			btrfs_tree_unlock(right);
			ret = btrfs_del_ptr(trans, root, path, level + 1, pslot + 1);
			if (ret < 0) {
				free_extent_buffer_stale(right);
				right = NULL;
				goto out;
			}
			root_sub_used_bytes(root);
			btrfs_free_tree_block(trans, btrfs_root_id(root), right,
					      0, 1);
			free_extent_buffer_stale(right);
			right = NULL;
		} else {
			struct btrfs_disk_key right_key;

			btrfs_node_key(right, &right_key, 0);
			ret = btrfs_tree_mod_log_insert_key(parent, pslot + 1,
					BTRFS_MOD_LOG_KEY_REPLACE);
			if (ret < 0) {
				btrfs_abort_transaction(trans, ret);
				goto out;
			}
			btrfs_set_node_key(parent, &right_key, pslot + 1);
			btrfs_mark_buffer_dirty(trans, parent);
		}
	}
	if (btrfs_header_nritems(mid) == 1) {
		/*
		 * we're not allowed to leave a node with one item in the
		 * tree during a delete.  A deletion from lower in the tree
		 * could try to delete the only pointer in this node.
		 * So, pull some keys from the left.
		 * There has to be a left pointer at this point because
		 * otherwise we would have pulled some pointers from the
		 * right
		 */
		if (unlikely(!left)) {
			btrfs_crit(fs_info,
"missing left child when middle child only has 1 item, parent bytenr %llu level %d mid bytenr %llu root %llu",
				   parent->start, btrfs_header_level(parent),
				   mid->start, btrfs_root_id(root));
			ret = -EUCLEAN;
			btrfs_abort_transaction(trans, ret);
			goto out;
		}
		wret = balance_node_right(trans, mid, left);
		if (wret < 0) {
			ret = wret;
			goto out;
		}
		if (wret == 1) {
			wret = push_node_left(trans, left, mid, 1);
			if (wret < 0)
				ret = wret;
		}
		BUG_ON(wret == 1);
	}
	if (btrfs_header_nritems(mid) == 0) {
		btrfs_clear_buffer_dirty(trans, mid);
		btrfs_tree_unlock(mid);
		ret = btrfs_del_ptr(trans, root, path, level + 1, pslot);
		if (ret < 0) {
			free_extent_buffer_stale(mid);
			mid = NULL;
			goto out;
		}
		root_sub_used_bytes(root);
		btrfs_free_tree_block(trans, btrfs_root_id(root), mid, 0, 1);
		free_extent_buffer_stale(mid);
		mid = NULL;
	} else {
		/* update the parent key to reflect our changes */
		struct btrfs_disk_key mid_key;

		btrfs_node_key(mid, &mid_key, 0);
		ret = btrfs_tree_mod_log_insert_key(parent, pslot,
						    BTRFS_MOD_LOG_KEY_REPLACE);
		if (ret < 0) {
			btrfs_abort_transaction(trans, ret);
			goto out;
		}
		btrfs_set_node_key(parent, &mid_key, pslot);
		btrfs_mark_buffer_dirty(trans, parent);
	}

	/* update the path */
	if (left) {
		if (btrfs_header_nritems(left) > orig_slot) {
			atomic_inc(&left->refs);
			/* left was locked after cow */
			path->nodes[level] = left;
			path->slots[level + 1] -= 1;
			path->slots[level] = orig_slot;
			if (mid) {
				btrfs_tree_unlock(mid);
				free_extent_buffer(mid);
			}
		} else {
			orig_slot -= btrfs_header_nritems(left);
			path->slots[level] = orig_slot;
		}
	}
	/* double check we haven't messed things up */
	if (orig_ptr !=
	    btrfs_node_blockptr(path->nodes[level], path->slots[level]))
		BUG();
out:
	if (right) {
		btrfs_tree_unlock(right);
		free_extent_buffer(right);
	}
	if (left) {
		if (path->nodes[level] != left)
			btrfs_tree_unlock(left);
		free_extent_buffer(left);
	}
	return ret;
}

/*
 * Node balancing for insertion.  Here we only split or push nodes around
 * when they are completely full.  This is also done top down, so we
 * have to be pessimistic.
 */
static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
					  struct btrfs_root *root,
					  struct btrfs_path *path, int level)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *right = NULL;
	struct extent_buffer *mid;
	struct extent_buffer *left = NULL;
	struct extent_buffer *parent = NULL;
	int ret = 0;
	int wret;
	int pslot;
	int orig_slot = path->slots[level];

	if (level == 0)
		return 1;

	mid = path->nodes[level];
	WARN_ON(btrfs_header_generation(mid) != trans->transid);

	if (level < BTRFS_MAX_LEVEL - 1) {
		parent = path->nodes[level + 1];
		pslot = path->slots[level + 1];
	}

	if (!parent)
		return 1;

	/* first, try to make some room in the middle buffer */
	if (pslot) {
		u32 left_nr;

		left = btrfs_read_node_slot(parent, pslot - 1);
		if (IS_ERR(left))
			return PTR_ERR(left);

		__btrfs_tree_lock(left, BTRFS_NESTING_LEFT);

		left_nr = btrfs_header_nritems(left);
		if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
			wret = 1;
		} else {
			ret = btrfs_cow_block(trans, root, left, parent,
					      pslot - 1, &left,
					      BTRFS_NESTING_LEFT_COW);
			if (ret)
				wret = 1;
			else
				wret = push_node_left(trans, left, mid, 0);
		}
		if (wret < 0)
			ret = wret;
		if (wret == 0) {
			struct btrfs_disk_key disk_key;

			orig_slot += left_nr;
			btrfs_node_key(mid, &disk_key, 0);
			ret = btrfs_tree_mod_log_insert_key(parent, pslot,
					BTRFS_MOD_LOG_KEY_REPLACE);
			if (ret < 0) {
				btrfs_tree_unlock(left);
				free_extent_buffer(left);
				btrfs_abort_transaction(trans, ret);
				return ret;
			}
			btrfs_set_node_key(parent, &disk_key, pslot);
			btrfs_mark_buffer_dirty(trans, parent);
			if (btrfs_header_nritems(left) > orig_slot) {
				path->nodes[level] = left;
				path->slots[level + 1] -= 1;
				path->slots[level] = orig_slot;
				btrfs_tree_unlock(mid);
				free_extent_buffer(mid);
			} else {
				orig_slot -=
					btrfs_header_nritems(left);
				path->slots[level] = orig_slot;
				btrfs_tree_unlock(left);
				free_extent_buffer(left);
			}
			return 0;
		}
		btrfs_tree_unlock(left);
		free_extent_buffer(left);
	}

	/*
	 * then try to empty the right most buffer into the middle
	 */
	if (pslot + 1 < btrfs_header_nritems(parent)) {
		u32 right_nr;

		right = btrfs_read_node_slot(parent, pslot + 1);
		if (IS_ERR(right))
			return PTR_ERR(right);

		__btrfs_tree_lock(right, BTRFS_NESTING_RIGHT);

		right_nr = btrfs_header_nritems(right);
		if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
			wret = 1;
		} else {
			ret = btrfs_cow_block(trans, root, right,
					      parent, pslot + 1,
					      &right, BTRFS_NESTING_RIGHT_COW);
			if (ret)
				wret = 1;
			else
				wret = balance_node_right(trans, right, mid);
		}
		if (wret < 0)
			ret = wret;
		if (wret == 0) {
			struct btrfs_disk_key disk_key;

			btrfs_node_key(right, &disk_key, 0);
			ret = btrfs_tree_mod_log_insert_key(parent, pslot + 1,
					BTRFS_MOD_LOG_KEY_REPLACE);
			if (ret < 0) {
				btrfs_tree_unlock(right);
				free_extent_buffer(right);
				btrfs_abort_transaction(trans, ret);
				return ret;
			}
			btrfs_set_node_key(parent, &disk_key, pslot + 1);
			btrfs_mark_buffer_dirty(trans, parent);

			if (btrfs_header_nritems(mid) <= orig_slot) {
				path->nodes[level] = right;
				path->slots[level + 1] += 1;
				path->slots[level] = orig_slot -
					btrfs_header_nritems(mid);
				btrfs_tree_unlock(mid);
				free_extent_buffer(mid);
			} else {
				btrfs_tree_unlock(right);
				free_extent_buffer(right);
			}
			return 0;
		}
		btrfs_tree_unlock(right);
		free_extent_buffer(right);
	}
	return 1;
}

/*
 * readahead one full node of leaves, finding things that are close
 * to the block in 'slot', and triggering ra on them.
 */
static void reada_for_search(struct btrfs_fs_info *fs_info,
			     struct btrfs_path *path,
			     int level, int slot, u64 objectid)
{
	struct extent_buffer *node;
	struct btrfs_disk_key disk_key;
	u32 nritems;
	u64 search;
	u64 target;
	u64 nread = 0;
	u64 nread_max;
	u32 nr;
	u32 blocksize;
	u32 nscan = 0;

	if (level != 1 && path->reada != READA_FORWARD_ALWAYS)
		return;

	if (!path->nodes[level])
		return;

	node = path->nodes[level];

	/*
	 * Since the time between visiting leaves is much shorter than the time
	 * between visiting nodes, limit read ahead of nodes to 1, to avoid too
	 * much IO at once (possibly random).
	 */
	if (path->reada == READA_FORWARD_ALWAYS) {
		if (level > 1)
			nread_max = node->fs_info->nodesize;
		else
			nread_max = SZ_128K;
	} else {
		nread_max = SZ_64K;
	}

	search = btrfs_node_blockptr(node, slot);
	blocksize = fs_info->nodesize;
	if (path->reada != READA_FORWARD_ALWAYS) {
		struct extent_buffer *eb;

		eb = find_extent_buffer(fs_info, search);
		if (eb) {
			free_extent_buffer(eb);
			return;
		}
	}

	target = search;

	nritems = btrfs_header_nritems(node);
	nr = slot;

	while (1) {
		if (path->reada == READA_BACK) {
			if (nr == 0)
				break;
			nr--;
		} else if (path->reada == READA_FORWARD ||
			   path->reada == READA_FORWARD_ALWAYS) {
			nr++;
			if (nr >= nritems)
				break;
		}
		if (path->reada == READA_BACK && objectid) {
			btrfs_node_key(node, &disk_key, nr);
			if (btrfs_disk_key_objectid(&disk_key) != objectid)
				break;
		}
		search = btrfs_node_blockptr(node, nr);
		if (path->reada == READA_FORWARD_ALWAYS ||
		    (search <= target && target - search <= 65536) ||
		    (search > target && search - target <= 65536)) {
			btrfs_readahead_node_child(node, nr);
			nread += blocksize;
		}
		nscan++;
		if (nread > nread_max || nscan > 32)
			break;
	}
}

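/*
 * Read ahead the two siblings of the node at @level, as the balancing that
 * follows is likely to touch them.
 */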
static noinline void reada_for_balance(struct btrfs_path *path, int level)
{
	struct extent_buffer *parent;
	int slot;
	int nritems;

	parent = path->nodes[level + 1];
	if (!parent)
		return;

	nritems = btrfs_header_nritems(parent);
	slot = path->slots[level + 1];

	if (slot > 0)
		btrfs_readahead_node_child(parent, slot - 1);
	if (slot + 1 < nritems)
		btrfs_readahead_node_child(parent, slot + 1);
}

/*
 * when we walk down the tree, it is usually safe to unlock the higher layers
 * in the tree.  The exceptions are when our path goes through slot 0, because
 * operations on the tree might require changing key pointers higher up in the
 * tree.
 *
 * callers might also have set path->keep_locks, which tells this code to keep
 * the lock if the path points to the last slot in the block.  This is part of
 * walking through the tree, and selecting the next slot in the higher block.
 *
 * lowest_unlock sets the lowest level in the tree we're allowed to unlock.  so
 * if lowest_unlock is 1, level 0 won't be unlocked
 */
static noinline void unlock_up(struct btrfs_path *path, int level,
			       int lowest_unlock, int min_write_lock_level,
			       int *write_lock_level)
{
	int i;
	int skip_level = level;
	bool check_skip = true;

	for (i = level; i < BTRFS_MAX_LEVEL; i++) {
		if (!path->nodes[i])
			break;
		if (!path->locks[i])
			break;

		if (check_skip) {
			if (path->slots[i] == 0) {
				skip_level = i + 1;
				continue;
			}

			if (path->keep_locks) {
				u32 nritems;

				nritems = btrfs_header_nritems(path->nodes[i]);
				if (nritems < 1 || path->slots[i] >= nritems - 1) {
					skip_level = i + 1;
					continue;
				}
			}
		}

		if (i >= lowest_unlock && i > skip_level) {
			check_skip = false;
			btrfs_tree_unlock_rw(path->nodes[i], path->locks[i]);
			path->locks[i] = 0;
			if (write_lock_level &&
			    i > min_write_lock_level &&
			    i <= *write_lock_level) {
				*write_lock_level = i - 1;
			}
		}
	}
}

/*
 * Helper function for btrfs_search_slot() and other functions that do a search
 * on a btree. The goal is to find a tree block in the cache (the radix tree at
 * fs_info->buffer_radix), but if we can't find it, or it's not up to date, read
 * its pages from disk.
 *
 * Returns -EAGAIN, with the path unlocked, if the caller needs to repeat the
 * whole btree search, starting again from the current root node.
 */
static int
read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
		      struct extent_buffer **eb_ret, int level, int slot,
		      const struct btrfs_key *key)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_tree_parent_check check = { 0 };
	u64 blocknr;
	u64 gen;
	struct extent_buffer *tmp;
	int ret;
	int parent_level;
	bool unlock_up;

	unlock_up = ((level + 1 < BTRFS_MAX_LEVEL) && p->locks[level + 1]);
	blocknr = btrfs_node_blockptr(*eb_ret, slot);
	gen = btrfs_node_ptr_generation(*eb_ret, slot);
	parent_level = btrfs_header_level(*eb_ret);
	btrfs_node_key_to_cpu(*eb_ret, &check.first_key, slot);
	check.has_first_key = true;
	check.level = parent_level - 1;
	check.transid = gen;
	check.owner_root = root->root_key.objectid;

	/*
	 * If we need to read an extent buffer from disk and we are holding locks
	 * on upper level nodes, we unlock all the upper nodes before reading the
	 * extent buffer, and then return -EAGAIN to the caller as it needs to
	 * restart the search. We don't release the lock on the current level
	 * because we need to walk this node to figure out which blocks to read.
	 */
	tmp = find_extent_buffer(fs_info, blocknr);
	if (tmp) {
		if (p->reada == READA_FORWARD_ALWAYS)
			reada_for_search(fs_info, p, level, slot, key->objectid);

		/* first we do an atomic uptodate check */
		if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) {
			/*
			 * Do extra check for first_key, eb can be stale due to
			 * being cached, read from scrub, or have multiple
			 * parents (shared tree blocks).
			 */
			if (btrfs_verify_level_key(tmp,
					parent_level - 1, &check.first_key, gen)) {
				free_extent_buffer(tmp);
				return -EUCLEAN;
			}
			*eb_ret = tmp;
			return 0;
		}

		if (p->nowait) {
			free_extent_buffer(tmp);
			return -EAGAIN;
		}

		if (unlock_up)
			btrfs_unlock_up_safe(p, level + 1);

		/* now we're allowed to do a blocking uptodate check */
		ret = btrfs_read_extent_buffer(tmp, &check);
		if (ret) {
			free_extent_buffer(tmp);
			btrfs_release_path(p);
			return -EIO;
		}
		if (btrfs_check_eb_owner(tmp, root->root_key.objectid)) {
			free_extent_buffer(tmp);
			btrfs_release_path(p);
			return -EUCLEAN;
		}

		if (unlock_up)
			ret = -EAGAIN;

		goto out;
	} else if (p->nowait) {
		return -EAGAIN;
	}

	if (unlock_up) {
		btrfs_unlock_up_safe(p, level + 1);
		ret = -EAGAIN;
	} else {
		ret = 0;
	}

	if (p->reada != READA_NONE)
		reada_for_search(fs_info, p, level, slot, key->objectid);

	tmp = read_tree_block(fs_info, blocknr, &check);
	if (IS_ERR(tmp)) {
		btrfs_release_path(p);
		return PTR_ERR(tmp);
	}
	/*
	 * If the read above didn't mark this buffer up to date,
	 * it will never end up being up to date.  Set ret to EIO now
	 * and give up so that our caller doesn't loop forever
	 * on our EAGAINs.
	 */
	if (!extent_buffer_uptodate(tmp))
		ret = -EIO;

out:
	if (ret == 0) {
		*eb_ret = tmp;
	} else {
		free_extent_buffer(tmp);
		btrfs_release_path(p);
	}

	return ret;
}

/*
 * helper function for btrfs_search_slot.  This does all of the checks
 * for node-level blocks and does any balancing required based on
 * the ins_len.
 *
 * If no extra work was required, zero is returned.  If we had to
 * drop the path, -EAGAIN is returned and btrfs_search_slot must
 * start over.
 */
static int
setup_nodes_for_search(struct btrfs_trans_handle *trans,
		       struct btrfs_root *root, struct btrfs_path *p,
		       struct extent_buffer *b, int level, int ins_len,
		       int *write_lock_level)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	int ret = 0;

	if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >=
	    BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 3) {

		if (*write_lock_level < level + 1) {
			*write_lock_level = level + 1;
			btrfs_release_path(p);
			return -EAGAIN;
		}

		reada_for_balance(p, level);
		ret = split_node(trans, root, p, level);
		if (ret)
			return ret;

		b = p->nodes[level];
	} else if (ins_len < 0 && btrfs_header_nritems(b) <
		   BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 2) {

		if (*write_lock_level < level + 1) {
			*write_lock_level = level + 1;
			btrfs_release_path(p);
			return -EAGAIN;
		}

		reada_for_balance(p, level);
		ret = balance_level(trans, root, p, level);
		if (ret)
			return ret;

		b = p->nodes[level];
		if (!b) {
			btrfs_release_path(p);
			return -EAGAIN;
		}
		BUG_ON(btrfs_header_nritems(b) == 1);
	}
	return ret;
}

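/*
 * Look up the item with the given objectid, offset and type in @fs_root,
 * stepping to the next leaf when the search lands past the last slot.
 * Returns 0 and fills @found_key if an item with matching objectid and type
 * exists, 1 if not, and a negative errno on error.
 */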
int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *path,
		    u64 iobjectid, u64 ioff, u8 key_type,
		    struct btrfs_key *found_key)
{
	int ret;
	struct btrfs_key key;
	struct extent_buffer *eb;

	ASSERT(path);
	ASSERT(found_key);

	key.type = key_type;
	key.objectid = iobjectid;
	key.offset = ioff;

	ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0);
	if (ret < 0)
		return ret;

	eb = path->nodes[0];
	if (ret && path->slots[0] >= btrfs_header_nritems(eb)) {
		ret = btrfs_next_leaf(fs_root, path);
		if (ret)
			return ret;
		eb = path->nodes[0];
	}

	btrfs_item_key_to_cpu(eb, found_key, path->slots[0]);
	if (found_key->type != key.type ||
	    found_key->objectid != key.objectid)
		return 1;

	return 0;
}

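/*
 * Get the root node to start a search from: the commit root, or the current
 * root node either unlocked, read locked or write locked, depending on the
 * path flags and @write_lock_level.
 */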
static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root,
							struct btrfs_path *p,
							int write_lock_level)
{
	struct extent_buffer *b;
	int root_lock = 0;
	int level = 0;

	if (p->search_commit_root) {
		b = root->commit_root;
		atomic_inc(&b->refs);
		level = btrfs_header_level(b);
		/*
		 * Ensure that all callers have set skip_locking when
		 * p->search_commit_root = 1.
		 */
		ASSERT(p->skip_locking == 1);

		goto out;
	}

	if (p->skip_locking) {
		b = btrfs_root_node(root);
		level = btrfs_header_level(b);
		goto out;
	}

	/* We try very hard to do read locks on the root */
	root_lock = BTRFS_READ_LOCK;

	/*
	 * If the level is set to maximum, we can skip trying to get the read
	 * lock.
	 */
	if (write_lock_level < BTRFS_MAX_LEVEL) {
		/*
		 * We don't know the level of the root node until we actually
		 * have it read locked
		 */
		if (p->nowait) {
			b = btrfs_try_read_lock_root_node(root);
			if (IS_ERR(b))
				return b;
		} else {
			b = btrfs_read_lock_root_node(root);
		}
		level = btrfs_header_level(b);
		if (level > write_lock_level)
			goto out;

		/* Whoops, must trade for write lock */
		btrfs_tree_read_unlock(b);
		free_extent_buffer(b);
	}

	b = btrfs_lock_root_node(root);
	root_lock = BTRFS_WRITE_LOCK;

	/* The level might have changed, check again */
	level = btrfs_header_level(b);

out:
	/*
	 * The root may have failed to write out at some point, and thus is no
	 * longer valid, return an error in this case.
	 */
	if (!extent_buffer_uptodate(b)) {
		if (root_lock)
			btrfs_tree_unlock_rw(b, root_lock);
		free_extent_buffer(b);
		return ERR_PTR(-EIO);
	}

	p->nodes[level] = b;
	if (!p->skip_locking)
		p->locks[level] = root_lock;
	/*
	 * Callers are responsible for dropping b's references.
	 */
	return b;
}

/*
 * Replace the extent buffer at the lowest level of the path with a cloned
 * version. The purpose is to be able to use it safely, after releasing the
 * commit root semaphore, even if relocation is happening in parallel, the
 * transaction used for relocation is committed and the extent buffer is
 * reallocated in the next transaction.
 *
 * This is used in a context where the caller does not prevent transaction
 * commits from happening, either by holding a transaction handle or holding
 * some lock, while it's doing searches through a commit root.
 * At the moment it's only used for send operations.
 */
static int finish_need_commit_sem_search(struct btrfs_path *path)
{
	const int i = path->lowest_level;
	const int slot = path->slots[i];
	struct extent_buffer *lowest = path->nodes[i];
	struct extent_buffer *clone;

	ASSERT(path->need_commit_sem);

	if (!lowest)
		return 0;

	lockdep_assert_held_read(&lowest->fs_info->commit_root_sem);

	clone = btrfs_clone_extent_buffer(lowest);
	if (!clone)
		return -ENOMEM;

	btrfs_release_path(path);
	path->nodes[i] = clone;
	path->slots[i] = slot;

	return 0;
}

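/*
 * Thin wrapper around btrfs_bin_search() that can skip the binary search when
 * a previous search at an upper level already had an exact key match.
 */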
static inline int search_for_key_slot(struct extent_buffer *eb,
				      int search_low_slot,
				      const struct btrfs_key *key,
				      int prev_cmp,
				      int *slot)
{
	/*
	 * If a previous call to btrfs_bin_search() on a parent node returned an
	 * exact match (prev_cmp == 0), we can safely assume the target key will
	 * always be at slot 0 on lower levels, since each key pointer
	 * (struct btrfs_key_ptr) refers to the lowest key accessible from the
	 * subtree it points to. Thus we can skip searching lower levels.
	 */
	if (prev_cmp == 0) {
		*slot = 0;
		return 0;
	}

	return btrfs_bin_search(eb, search_low_slot, key, slot);
}

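/*
 * Handle the leaf level of btrfs_search_slot(): find the slot for @key and,
 * for insertions (ins_len > 0), split the leaf if it does not have enough
 * free space, unlocking upper nodes early when it is safe to do so.
 */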
static int search_leaf(struct btrfs_trans_handle *trans,
		       struct btrfs_root *root,
		       const struct btrfs_key *key,
		       struct btrfs_path *path,
		       int ins_len,
		       int prev_cmp)
{
	struct extent_buffer *leaf = path->nodes[0];
	int leaf_free_space = -1;
	int search_low_slot = 0;
	int ret;
	bool do_bin_search = true;

	/*
	 * If we are doing an insertion, the leaf has enough free space and the
	 * destination slot for the key is not slot 0, then we can unlock our
	 * write lock on the parent, and any other upper nodes, before doing the
	 * binary search on the leaf (with search_for_key_slot()), allowing other
	 * tasks to lock the parent and any other upper nodes.
	 */
	if (ins_len > 0) {
		/*
		 * Cache the leaf free space, since we will need it later and it
		 * will not change until then.
		 */
		leaf_free_space = btrfs_leaf_free_space(leaf);

		/*
		 * !path->locks[1] means we have a single node tree, the leaf is
		 * the root of the tree.
		 */
		if (path->locks[1] && leaf_free_space >= ins_len) {
			struct btrfs_disk_key first_key;

			ASSERT(btrfs_header_nritems(leaf) > 0);
			btrfs_item_key(leaf, &first_key, 0);

			/*
			 * Doing the extra comparison with the first key is cheap,
			 * taking into account that the first key is very likely
			 * already in a cache line because it immediately follows
			 * the extent buffer's header and we have recently accessed
			 * the header's level field.
			 */
			ret = btrfs_comp_keys(&first_key, key);
			if (ret < 0) {
				/*
				 * The first key is smaller than the key we want
				 * to insert, so we are safe to unlock all upper
				 * nodes and we have to do the binary search.
				 *
				 * We do use btrfs_unlock_up_safe() and not
				 * unlock_up() because the latter does not unlock
				 * nodes with a slot of 0 - we can safely unlock
				 * any node even if its slot is 0 since in this
				 * case the key does not end up at slot 0 of the
				 * leaf and there's no need to split the leaf.
				 */
				btrfs_unlock_up_safe(path, 1);
				search_low_slot = 1;
			} else {
				/*
				 * The first key is >= than the key we want to
				 * insert, so we can skip the binary search as
				 * the target key will be at slot 0.
				 *
				 * We can not unlock upper nodes when the key is
				 * less than the first key, because we will need
				 * to update the key at slot 0 of the parent node
				 * and possibly of other upper nodes too.
				 * If the key matches the first key, then we can
				 * unlock all the upper nodes, using
				 * btrfs_unlock_up_safe() instead of unlock_up()
				 * as stated above.
				 */
				if (ret == 0)
					btrfs_unlock_up_safe(path, 1);
				/*
				 * ret is already 0 or 1, matching the result of
				 * a btrfs_bin_search() call, so there is no need
				 * to adjust it.
				 */
				do_bin_search = false;
				path->slots[0] = 0;
			}
		}
	}

	if (do_bin_search) {
		ret = search_for_key_slot(leaf, search_low_slot, key,
					  prev_cmp, &path->slots[0]);
		if (ret < 0)
			return ret;
	}

	if (ins_len > 0) {
		/*
		 * Item key already exists. In this case, if we are allowed to
		 * insert the item (for example, in dir_item case, item key
		 * collision is allowed), it will be merged with the original
		 * item. Only the item size grows, no new btrfs item will be
		 * added. If search_for_extension is not set, ins_len already
		 * accounts for the size of a btrfs_item, deduct it here so the
		 * leaf space check will be correct.
		 */
		if (ret == 0 && !path->search_for_extension) {
			ASSERT(ins_len >= sizeof(struct btrfs_item));
			ins_len -= sizeof(struct btrfs_item);
		}

		ASSERT(leaf_free_space >= 0);

		if (leaf_free_space < ins_len) {
			int err;

			err = split_leaf(trans, root, key, path, ins_len,
					 (ret == 0));
			ASSERT(err <= 0);
			if (WARN_ON(err > 0))
				err = -EUCLEAN;
			if (err)
				ret = err;
		}
	}

	return ret;
}

/*
 * Look for a key in a tree and perform necessary modifications to preserve
 * tree invariants.
 *
 * @trans:	Handle of transaction, used when modifying the tree
 * @p:		Holds all btree nodes along the search path
 * @root:	The root node of the tree
 * @key:	The key we are looking for
 * @ins_len:	Indicates purpose of search:
 *              >0  for inserts it's size of item inserted (*)
 *              <0  for deletions
 *               0  for plain searches, not modifying the tree
 *
 *              (*) If size of item inserted doesn't include
 *              sizeof(struct btrfs_item), then p->search_for_extension must
 *              be set.
 * @cow:	boolean should CoW operations be performed. Must always be 1
 *		when modifying the tree.
 *
 * If @ins_len > 0, nodes and leaves will be split as we walk down the tree.
 * If @ins_len < 0, nodes will be merged as we walk down the tree (if possible)
 *
 * If @key is found, 0 is returned and you can find the item in the leaf level
 * of the path (level 0)
 *
 * If @key isn't found, 1 is returned and the leaf level of the path (level 0)
 * points to the slot where it should be inserted
 *
 * If an error is encountered while searching the tree a negative error number
 * is returned.
 */
int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		      const struct btrfs_key *key, struct btrfs_path *p,
		      int ins_len, int cow)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *b;
	int slot;
	int ret;
	int err;
	int level;
	int lowest_unlock = 1;
	/* everything at write_lock_level or lower must be write locked */
	int write_lock_level = 0;
	u8 lowest_level = 0;
	int min_write_lock_level;
	int prev_cmp;

	might_sleep();

	lowest_level = p->lowest_level;
	WARN_ON(lowest_level && ins_len > 0);
	WARN_ON(p->nodes[0] != NULL);
	BUG_ON(!cow && ins_len);

	/*
	 * For now only allow nowait for read only operations.  There's no
	 * strict reason why we can't, we just only need it for reads so it's
	 * only implemented for reads.
	 */
	ASSERT(!p->nowait || !cow);

	if (ins_len < 0) {
		lowest_unlock = 2;

		/* when we are removing items, we might have to go up to level
		 * two as we update tree pointers.  Make sure we keep write
		 * for those levels as well
		 */
		write_lock_level = 2;
	} else if (ins_len > 0) {
		/*
		 * for inserting items, make sure we have a write lock on
		 * level 1 so we can update keys
		 */
		write_lock_level = 1;
	}

	if (!cow)
		write_lock_level = -1;

	if (cow && (p->keep_locks || p->lowest_level))
		write_lock_level = BTRFS_MAX_LEVEL;

	min_write_lock_level = write_lock_level;

	if (p->need_commit_sem) {
		ASSERT(p->search_commit_root);
		if (p->nowait) {
			if (!down_read_trylock(&fs_info->commit_root_sem))
				return -EAGAIN;
		} else {
			down_read(&fs_info->commit_root_sem);
		}
	}

again:
	prev_cmp = -1;
	b = btrfs_search_slot_get_root(root, p, write_lock_level);
	if (IS_ERR(b)) {
		ret = PTR_ERR(b);
		goto done;
	}

	while (b) {
		int dec = 0;

		level = btrfs_header_level(b);

		if (cow) {
			bool last_level = (level == (BTRFS_MAX_LEVEL - 1));

			/*
			 * if we don't really need to cow this block
			 * then we don't want to set the path blocking,
			 * so we test it here
			 */
			if (!should_cow_block(trans, root, b))
				goto cow_done;

			/*
			 * must have write locks on this node and the
			 * parent
			 */
			if (level > write_lock_level ||
			    (level + 1 > write_lock_level &&
			    level + 1 < BTRFS_MAX_LEVEL &&
			    p->nodes[level + 1])) {
				write_lock_level = level + 1;
				btrfs_release_path(p);
				goto again;
			}

			if (last_level)
				err = btrfs_cow_block(trans, root, b, NULL, 0,
						      &b,
						      BTRFS_NESTING_COW);
			else
				err = btrfs_cow_block(trans, root, b,
						      p->nodes[level + 1],
						      p->slots[level + 1], &b,
						      BTRFS_NESTING_COW);
			if (err) {
				ret = err;
				goto done;
			}
		}
cow_done:
		p->nodes[level] = b;

		/*
		 * we have a lock on b and as long as we aren't changing
		 * the tree, there is no way for the items in b to change.
		 * It is safe to drop the lock on our parent before we
		 * go through the expensive btree search on b.
		 *
		 * If we're inserting or deleting (ins_len != 0), then we might
		 * be changing slot zero, which may require changing the parent.
		 * So, we can't drop the lock until after we know which slot
		 * we're operating on.
		 */
		if (!ins_len && !p->keep_locks) {
			int u = level + 1;

			if (u < BTRFS_MAX_LEVEL && p->locks[u]) {
				btrfs_tree_unlock_rw(p->nodes[u], p->locks[u]);
				p->locks[u] = 0;
			}
		}

		if (level == 0) {
			if (ins_len > 0)
				ASSERT(write_lock_level >= 1);

			ret = search_leaf(trans, root, key, p, ins_len, prev_cmp);
			if (!p->search_for_split)
				unlock_up(p, level, lowest_unlock,
					  min_write_lock_level, NULL);
			goto done;
		}

		ret = search_for_key_slot(b, 0, key, prev_cmp, &slot);
		if (ret < 0)
			goto done;
		prev_cmp = ret;

		if (ret && slot > 0) {
			dec = 1;
			slot--;
		}
		p->slots[level] = slot;
		err = setup_nodes_for_search(trans, root, p, b, level, ins_len,
					     &write_lock_level);
		if (err == -EAGAIN)
			goto again;
		if (err) {
			ret = err;
			goto done;
		}
		b = p->nodes[level];
		slot = p->slots[level];

		/*
		 * Slot 0 is special, if we change the key we have to update
		 * the parent pointer which means we must have a write lock on
		 * the parent
		 */
		if (slot == 0 && ins_len && write_lock_level < level + 1) {
			write_lock_level = level + 1;
			btrfs_release_path(p);
			goto again;
		}

		unlock_up(p, level, lowest_unlock, min_write_lock_level,
			  &write_lock_level);

		if (level == lowest_level) {
			if (dec)
				p->slots[level]++;
			goto done;
		}

		err = read_block_for_search(root, p, &b, level, slot, key);
		if (err == -EAGAIN)
			goto again;
		if (err) {
			ret = err;
			goto done;
		}

		if (!p->skip_locking) {
			level = btrfs_header_level(b);

			btrfs_maybe_reset_lockdep_class(root, b);

			if (level <= write_lock_level) {
				btrfs_tree_lock(b);
				p->locks[level] = BTRFS_WRITE_LOCK;
			} else {
				if (p->nowait) {
					if (!btrfs_try_tree_read_lock(b)) {
						free_extent_buffer(b);
						ret = -EAGAIN;
						goto done;
					}
				} else {
					btrfs_tree_read_lock(b);
				}
				p->locks[level] = BTRFS_READ_LOCK;
			}
			p->nodes[level] = b;
		}
	}
	ret = 1;
done:
	if (ret < 0 && !p->skip_release_on_error)
		btrfs_release_path(p);

	if (p->need_commit_sem) {
		int ret2;

		ret2 = finish_need_commit_sem_search(p);
		up_read(&fs_info->commit_root_sem);
		if (ret2)
			ret = ret2;
	}

	return ret;
}
ALLOW_ERROR_INJECTION(btrfs_search_slot, ERRNO);

/*
 * Like btrfs_search_slot, this looks for a key in the given tree. It uses the
 * current state of the tree together with the operations recorded in the tree
 * modification log to search for the key in a previous version of this tree, as
 * denoted by the time_seq parameter.
 *
 * Naturally, there is no support for insert, delete or cow operations.
 *
 * The resulting path and return value will be set up as if we called
 * btrfs_search_slot at that point in time with ins_len and cow both set to 0.
 */
int btrfs_search_old_slot(struct btrfs_root *root, const struct btrfs_key *key,
			  struct btrfs_path *p, u64 time_seq)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *b;
	int slot;
	int ret;
	int err;
	int level;
	int lowest_unlock = 1;
	u8 lowest_level = 0;

	lowest_level = p->lowest_level;
	WARN_ON(p->nodes[0] != NULL);
	ASSERT(!p->nowait);

	if (p->search_commit_root) {
		BUG_ON(time_seq);
		return btrfs_search_slot(NULL, root, key, p, 0, 0);
	}

again:
	b = btrfs_get_old_root(root, time_seq);
	if (!b) {
		ret = -EIO;
		goto done;
	}
	level = btrfs_header_level(b);
	p->locks[level] = BTRFS_READ_LOCK;

	while (b) {
		int dec = 0;

		level = btrfs_header_level(b);
		p->nodes[level] = b;

		/*
		 * we have a lock on b and as long as we aren't changing
		 * the tree, there is no way for the items in b to change.
		 * It is safe to drop the lock on our parent before we
		 * go through the expensive btree search on b.
		 */
		btrfs_unlock_up_safe(p, level + 1);

		ret = btrfs_bin_search(b, 0, key, &slot);
		if (ret < 0)
			goto done;

		if (level == 0) {
			p->slots[level] = slot;
			unlock_up(p, level, lowest_unlock, 0, NULL);
			goto done;
		}

		if (ret && slot > 0) {
			dec = 1;
			slot--;
		}
		p->slots[level] = slot;
		unlock_up(p, level, lowest_unlock, 0, NULL);

		if (level == lowest_level) {
			if (dec)
				p->slots[level]++;
			goto done;
		}

		err = read_block_for_search(root, p, &b, level, slot, key);
		if (err == -EAGAIN)
			goto again;
		if (err) {
			ret = err;
			goto done;
		}

		level = btrfs_header_level(b);
		btrfs_tree_read_lock(b);
		b = btrfs_tree_mod_log_rewind(fs_info, p, b, time_seq);
		if (!b) {
			ret = -ENOMEM;
			goto done;
		}
		p->locks[level] = BTRFS_READ_LOCK;
		p->nodes[level] = b;
	}
	ret = 1;
done:
	if (ret < 0)
		btrfs_release_path(p);

	return ret;
}

/*
 * Search the tree again to find a leaf with smaller keys.
 * Returns 0 if it found something.
 * Returns 1 if there are no smaller keys.
 * Returns < 0 on error.
 *
 * This may release the path, and so you may lose any locks held at the
 * time you call it.
 */
static int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
{
	struct btrfs_key key;
	struct btrfs_key orig_key;
	struct btrfs_disk_key found_key;
	int ret;

	btrfs_item_key_to_cpu(path->nodes[0], &key, 0);
	orig_key = key;

	if (key.offset > 0) {
		key.offset--;
	} else if (key.type > 0) {
		key.type--;
		key.offset = (u64)-1;
	} else if (key.objectid > 0) {
		key.objectid--;
		key.type = (u8)-1;
		key.offset = (u64)-1;
	} else {
		return 1;
	}

	btrfs_release_path(path);
	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret <= 0)
		return ret;

	/*
	 * Previous key not found. Even if we were at slot 0 of the leaf we had
	 * before releasing the path and calling btrfs_search_slot(), we now may
	 * be in a slot pointing to the same original key - this can happen if
	 * after we released the path, one or more items were moved from a
	 * sibling leaf into the front of the leaf we had due to an insertion
	 * (see push_leaf_right()).
	 * If we hit this case and our slot is > 0, just decrement the slot
	 * so that the caller does not process the same key again, which may or
	 * may not break the caller, depending on its logic.
	 */
	if (path->slots[0] < btrfs_header_nritems(path->nodes[0])) {
		btrfs_item_key(path->nodes[0], &found_key, path->slots[0]);
		ret = btrfs_comp_keys(&found_key, &orig_key);
		if (ret == 0) {
			if (path->slots[0] > 0) {
				path->slots[0]--;
				return 0;
			}
			/*
			 * At slot 0, same key as before, it means orig_key is
			 * the lowest, leftmost, key in the tree. We're done.
			 */
			return 1;
		}
	}

	btrfs_item_key(path->nodes[0], &found_key, 0);
	ret = btrfs_comp_keys(&found_key, &key);
	/*
	 * We might have had an item with the previous key in the tree right
	 * before we released our path. And after we released our path, that
	 * item might have been pushed to the first slot (0) of the leaf we
	 * were holding due to a tree balance. Alternatively, an item with the
	 * previous key can exist as the only element of a leaf (big fat item).
	 * Therefore account for these 2 cases, so that our callers (like
	 * btrfs_previous_item) don't miss an existing item with a key matching
	 * the previous key we computed above.
	 */
	if (ret <= 0)
		return 0;
	return 1;
}
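
/*
 * Worked example of the key decrement above: starting from (256 1 0), the
 * offset is already 0, so we borrow from the type and search for
 * (256 0 (u64)-1); starting from (256 0 0) we borrow from the objectid and
 * search for (255 (u8)-1 (u64)-1).
 */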
/*
 * helper to use instead of search slot if no exact match is needed but
 * instead the next or previous item should be returned.
 * When find_higher is true, the next higher item is returned, the next lower
 * otherwise.
 * When return_any and find_higher are both true, and no higher item is found,
 * return the next lower instead.
 * When return_any is true and find_higher is false, and no lower item is found,
 * return the next higher instead.
 * It returns 0 if any item is found, 1 if none is found (tree empty), and
 * < 0 on error
 */
int btrfs_search_slot_for_read(struct btrfs_root *root,
			       const struct btrfs_key *key,
			       struct btrfs_path *p, int find_higher,
			       int return_any)
{
	int ret;
	struct extent_buffer *leaf;

again:
	ret = btrfs_search_slot(NULL, root, key, p, 0, 0);
	if (ret <= 0)
		return ret;
	/*
	 * a return value of 1 means the path is at the position where the
	 * item should be inserted. Normally this is the next bigger item,
	 * but in case the previous item is the last in a leaf, path points
	 * to the first free slot in the previous leaf, i.e. at an invalid
	 * item.
	 */
	leaf = p->nodes[0];

	if (find_higher) {
		if (p->slots[0] >= btrfs_header_nritems(leaf)) {
			ret = btrfs_next_leaf(root, p);
			if (ret <= 0)
				return ret;
			if (!return_any)
				return 1;
			/*
			 * no higher item found, return the next
			 * lower instead
			 */
			return_any = 0;
			find_higher = 0;
			btrfs_release_path(p);
			goto again;
		}
	} else {
		if (p->slots[0] == 0) {
			ret = btrfs_prev_leaf(root, p);
			if (ret < 0)
				return ret;
			if (ret == 0) {
				leaf = p->nodes[0];
				if (p->slots[0] == btrfs_header_nritems(leaf))
					p->slots[0]--;
				return 0;
			}
			if (!return_any)
				return 1;
			/*
			 * no lower item found, return the next
			 * higher instead
			 */
			return_any = 0;
			find_higher = 1;
			btrfs_release_path(p);
			goto again;
		} else {
			--p->slots[0];
		}
	}
	return 0;
}
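
/*
 * Example (hedged sketch of a hypothetical caller): find the first item at
 * or after (ino, BTRFS_EXTENT_DATA_KEY, 0), falling back to the next lower
 * item if nothing higher exists:
 *
 *	key.objectid = ino;
 *	key.type = BTRFS_EXTENT_DATA_KEY;
 *	key.offset = 0;
 *	ret = btrfs_search_slot_for_read(root, &key, path, 1, 1);
 *	if (ret == 0)
 *		btrfs_item_key_to_cpu(path->nodes[0], &found, path->slots[0]);
 */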
/*
 * Execute search and call btrfs_previous_item to traverse backwards if the item
 * was not found.
 *
 * Return 0 if found, 1 if not found and < 0 if error.
 */
int btrfs_search_backwards(struct btrfs_root *root, struct btrfs_key *key,
			   struct btrfs_path *path)
{
	int ret;

	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
	if (ret > 0)
		ret = btrfs_previous_item(root, path, key->objectid, key->type);

	if (ret == 0)
		btrfs_item_key_to_cpu(path->nodes[0], key, path->slots[0]);

	return ret;
}
/*
 * Search for a valid slot for the given path.
 *
 * @root:	The root node of the tree.
 * @key:	Will contain a valid item if found.
 * @path:	The starting point to validate the slot.
 *
 * Return: 0  if the item is valid
 *         1  if not found
 *         <0 if error.
 */
int btrfs_get_next_valid_item(struct btrfs_root *root, struct btrfs_key *key,
			      struct btrfs_path *path)
{
	if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
		int ret;

		ret = btrfs_next_leaf(root, path);
		if (ret)
			return ret;
	}

	btrfs_item_key_to_cpu(path->nodes[0], key, path->slots[0]);
	return 0;
}
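
/*
 * Typical iteration pattern (sketch, variable names are illustrative):
 * advance slot by slot and let this helper hop to the next leaf when the
 * current one is exhausted:
 *
 *	while (btrfs_get_next_valid_item(root, &key, path) == 0) {
 *		if (key.objectid != objectid || key.type != wanted_type)
 *			break;
 *		... process the item at path->slots[0] ...
 *		path->slots[0]++;
 *	}
 */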
/*
 * adjust the pointers going up the tree, starting at level
 * making sure the right key of each node points to 'key'.
 * This is used after shifting pointers to the left, so it stops
 * fixing up pointers when a given leaf/node is not in slot 0 of the
 * higher levels
 */
static void fixup_low_keys(struct btrfs_trans_handle *trans,
			   struct btrfs_path *path,
			   struct btrfs_disk_key *key, int level)
{
	int i;
	struct extent_buffer *t;
	int ret;

	for (i = level; i < BTRFS_MAX_LEVEL; i++) {
		int tslot = path->slots[i];

		if (!path->nodes[i])
			break;
		t = path->nodes[i];
		ret = btrfs_tree_mod_log_insert_key(t, tslot,
						    BTRFS_MOD_LOG_KEY_REPLACE);
		BUG_ON(ret < 0);
		btrfs_set_node_key(t, key, tslot);
		btrfs_mark_buffer_dirty(trans, path->nodes[i]);
		if (tslot != 0)
			break;
	}
}
/*
 * update item key.
 *
 * This function isn't completely safe. It's the caller's responsibility
 * that the new key won't break the order
 */
void btrfs_set_item_key_safe(struct btrfs_trans_handle *trans,
			     struct btrfs_path *path,
			     const struct btrfs_key *new_key)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_disk_key disk_key;
	struct extent_buffer *eb;
	int slot;

	eb = path->nodes[0];
	slot = path->slots[0];
	if (slot > 0) {
		btrfs_item_key(eb, &disk_key, slot - 1);
		if (unlikely(btrfs_comp_keys(&disk_key, new_key) >= 0)) {
			btrfs_print_leaf(eb);
			btrfs_crit(fs_info,
		"slot %u key (%llu %u %llu) new key (%llu %u %llu)",
				   slot, btrfs_disk_key_objectid(&disk_key),
				   btrfs_disk_key_type(&disk_key),
				   btrfs_disk_key_offset(&disk_key),
				   new_key->objectid, new_key->type,
				   new_key->offset);
			BUG();
		}
	}
	if (slot < btrfs_header_nritems(eb) - 1) {
		btrfs_item_key(eb, &disk_key, slot + 1);
		if (unlikely(btrfs_comp_keys(&disk_key, new_key) <= 0)) {
			btrfs_print_leaf(eb);
			btrfs_crit(fs_info,
		"slot %u key (%llu %u %llu) new key (%llu %u %llu)",
				   slot, btrfs_disk_key_objectid(&disk_key),
				   btrfs_disk_key_type(&disk_key),
				   btrfs_disk_key_offset(&disk_key),
				   new_key->objectid, new_key->type,
				   new_key->offset);
			BUG();
		}
	}

	btrfs_cpu_key_to_disk(&disk_key, new_key);
	btrfs_set_item_key(eb, &disk_key, slot);
	btrfs_mark_buffer_dirty(trans, eb);
	if (slot == 0)
		fixup_low_keys(trans, path, &disk_key, 1);
}
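
/*
 * Example (hedged sketch): callers such as the file extent code use this to
 * move an item's key forward without deleting and reinserting the item,
 * e.g. when trimming the front of an extent:
 *
 *	new_key.objectid = ino;
 *	new_key.type = BTRFS_EXTENT_DATA_KEY;
 *	new_key.offset = old_offset + trimmed_bytes;
 *	btrfs_set_item_key_safe(trans, path, &new_key);
 *
 * The checks above BUG() if the new key no longer sorts between the
 * neighbouring items.
 */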
/*
 * Check key order of two sibling extent buffers.
 *
 * Return true if something is wrong.
 * Return false if everything is fine.
 *
 * Tree-checker only works inside one tree block, thus the following
 * corruption can not be detected by tree-checker:
 *
 * Leaf @left			| Leaf @right
 * --------------------------------------------------------------
 * | 1 | 2 | 3 | 4 | 5 | f6 |   | 7 | 8 |
 *
 * Key f6 in leaf @left itself is valid, but not valid when the next
 * key in leaf @right is 7.
 * This can only be checked at tree block merge time.
 * And since tree checker has ensured all key order in each tree block
 * is correct, we only need to bother the last key of @left and the first
 * key of @right.
 */
static bool check_sibling_keys(struct extent_buffer *left,
			       struct extent_buffer *right)
{
	struct btrfs_key left_last;
	struct btrfs_key right_first;
	int level = btrfs_header_level(left);
	int nr_left = btrfs_header_nritems(left);
	int nr_right = btrfs_header_nritems(right);

	/* No key to check in one of the tree blocks */
	if (!nr_left || !nr_right)
		return false;

	if (level) {
		btrfs_node_key_to_cpu(left, &left_last, nr_left - 1);
		btrfs_node_key_to_cpu(right, &right_first, 0);
	} else {
		btrfs_item_key_to_cpu(left, &left_last, nr_left - 1);
		btrfs_item_key_to_cpu(right, &right_first, 0);
	}

	if (unlikely(btrfs_comp_cpu_keys(&left_last, &right_first) >= 0)) {
		btrfs_crit(left->fs_info, "left extent buffer:");
		btrfs_print_tree(left, false);
		btrfs_crit(left->fs_info, "right extent buffer:");
		btrfs_print_tree(right, false);
		btrfs_crit(left->fs_info,
"bad key order, sibling blocks, left last (%llu %u %llu) right first (%llu %u %llu)",
			   left_last.objectid, left_last.type,
			   left_last.offset, right_first.objectid,
			   right_first.type, right_first.offset);
		return true;
	}
	return false;
}
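
/*
 * Concrete instance of the corruption described above (illustrative): if
 * @left ends with key (257 108 8192) and @right starts with (257 108 4096),
 * each block passes the tree-checker on its own, yet the pair is inverted;
 * only this sibling comparison at merge/push time can catch it.
 */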
/*
 * try to push data from one node into the next node left in the
 * tree.
 *
 * returns 0 if some ptrs were pushed left, < 0 if there was some horrible
 * error, and > 0 if there was no room in the left hand block.
 */
static int push_node_left(struct btrfs_trans_handle *trans,
			  struct extent_buffer *dst,
			  struct extent_buffer *src, int empty)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	int push_items = 0;
	int src_nritems;
	int dst_nritems;
	int ret = 0;

	src_nritems = btrfs_header_nritems(src);
	dst_nritems = btrfs_header_nritems(dst);
	push_items = BTRFS_NODEPTRS_PER_BLOCK(fs_info) - dst_nritems;
	WARN_ON(btrfs_header_generation(src) != trans->transid);
	WARN_ON(btrfs_header_generation(dst) != trans->transid);

	if (!empty && src_nritems <= 8)
		return 1;

	if (push_items <= 0)
		return 1;

	if (empty) {
		push_items = min(src_nritems, push_items);
		if (push_items < src_nritems) {
			/* leave at least 8 pointers in the node if
			 * we aren't going to empty it
			 */
			if (src_nritems - push_items < 8) {
				if (push_items <= 8)
					return 1;
				push_items -= 8;
			}
		}
	} else {
		push_items = min(src_nritems - 8, push_items);
	}

	/* dst is the left eb, src is the middle eb */
	if (check_sibling_keys(dst, src)) {
		ret = -EUCLEAN;
		btrfs_abort_transaction(trans, ret);
		return ret;
	}
	ret = btrfs_tree_mod_log_eb_copy(dst, src, dst_nritems, 0, push_items);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		return ret;
	}
	copy_extent_buffer(dst, src,
			   btrfs_node_key_ptr_offset(dst, dst_nritems),
			   btrfs_node_key_ptr_offset(src, 0),
			   push_items * sizeof(struct btrfs_key_ptr));

	if (push_items < src_nritems) {
		/*
		 * btrfs_tree_mod_log_eb_copy handles logging the move, so we
		 * don't need to do an explicit tree mod log operation for it.
		 */
		memmove_extent_buffer(src, btrfs_node_key_ptr_offset(src, 0),
				      btrfs_node_key_ptr_offset(src, push_items),
				      (src_nritems - push_items) *
				      sizeof(struct btrfs_key_ptr));
	}
	btrfs_set_header_nritems(src, src_nritems - push_items);
	btrfs_set_header_nritems(dst, dst_nritems + push_items);
	btrfs_mark_buffer_dirty(trans, src);
	btrfs_mark_buffer_dirty(trans, dst);

	return ret;
}
/*
 * try to push data from one node into the next node right in the
 * tree.
 *
 * returns 0 if some ptrs were pushed, < 0 if there was some horrible
 * error, and > 0 if there was no room in the right hand block.
 *
 * this will only push up to 1/2 the contents of the left node over
 */
static int balance_node_right(struct btrfs_trans_handle *trans,
			      struct extent_buffer *dst,
			      struct extent_buffer *src)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	int push_items = 0;
	int max_push;
	int src_nritems;
	int dst_nritems;
	int ret = 0;

	WARN_ON(btrfs_header_generation(src) != trans->transid);
	WARN_ON(btrfs_header_generation(dst) != trans->transid);

	src_nritems = btrfs_header_nritems(src);
	dst_nritems = btrfs_header_nritems(dst);
	push_items = BTRFS_NODEPTRS_PER_BLOCK(fs_info) - dst_nritems;
	if (push_items <= 0)
		return 1;

	if (src_nritems < 4)
		return 1;

	max_push = src_nritems / 2 + 1;
	/* don't try to empty the node */
	if (max_push >= src_nritems)
		return 1;

	if (max_push < push_items)
		push_items = max_push;

	/* dst is the right eb, src is the middle eb */
	if (check_sibling_keys(src, dst)) {
		ret = -EUCLEAN;
		btrfs_abort_transaction(trans, ret);
		return ret;
	}

	/*
	 * btrfs_tree_mod_log_eb_copy handles logging the move, so we don't
	 * need to do an explicit tree mod log operation for it.
	 */
	memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(dst, push_items),
			      btrfs_node_key_ptr_offset(dst, 0),
			      (dst_nritems) *
			      sizeof(struct btrfs_key_ptr));

	ret = btrfs_tree_mod_log_eb_copy(dst, src, 0, src_nritems - push_items,
					 push_items);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		return ret;
	}
	copy_extent_buffer(dst, src,
			   btrfs_node_key_ptr_offset(dst, 0),
			   btrfs_node_key_ptr_offset(src, src_nritems - push_items),
			   push_items * sizeof(struct btrfs_key_ptr));

	btrfs_set_header_nritems(src, src_nritems - push_items);
	btrfs_set_header_nritems(dst, dst_nritems + push_items);

	btrfs_mark_buffer_dirty(trans, src);
	btrfs_mark_buffer_dirty(trans, dst);

	return ret;
}
/*
 * helper function to insert a new root level in the tree.
 * A new node is allocated, and a single item is inserted to
 * point to the existing root
 *
 * returns zero on success or < 0 on failure.
 */
static noinline int insert_new_root(struct btrfs_trans_handle *trans,
				    struct btrfs_root *root,
				    struct btrfs_path *path, int level)
{
	u64 lower_gen;
	struct extent_buffer *lower;
	struct extent_buffer *c;
	struct extent_buffer *old;
	struct btrfs_disk_key lower_key;
	int ret;

	BUG_ON(path->nodes[level]);
	BUG_ON(path->nodes[level-1] != root->node);

	lower = path->nodes[level-1];
	if (level == 1)
		btrfs_item_key(lower, &lower_key, 0);
	else
		btrfs_node_key(lower, &lower_key, 0);

	c = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
				   &lower_key, level, root->node->start, 0,
				   0, BTRFS_NESTING_NEW_ROOT);
	if (IS_ERR(c))
		return PTR_ERR(c);

	root_add_used_bytes(root);

	btrfs_set_header_nritems(c, 1);
	btrfs_set_node_key(c, &lower_key, 0);
	btrfs_set_node_blockptr(c, 0, lower->start);
	lower_gen = btrfs_header_generation(lower);
	WARN_ON(lower_gen != trans->transid);

	btrfs_set_node_ptr_generation(c, 0, lower_gen);

	btrfs_mark_buffer_dirty(trans, c);

	old = root->node;
	ret = btrfs_tree_mod_log_insert_root(root->node, c, false);
	if (ret < 0) {
		btrfs_free_tree_block(trans, btrfs_root_id(root), c, 0, 1);
		btrfs_tree_unlock(c);
		free_extent_buffer(c);
		return ret;
	}
	rcu_assign_pointer(root->node, c);

	/* the super has an extra ref to root->node */
	free_extent_buffer(old);

	add_root_to_dirty_list(root);
	atomic_inc(&c->refs);
	path->nodes[level] = c;
	path->locks[level] = BTRFS_WRITE_LOCK;
	path->slots[level] = 0;
	return 0;
}
/*
 * worker function to insert a single pointer in a node.
 * the node should have enough room for the pointer already
 *
 * slot and level indicate where you want the key to go, and
 * blocknr is the block the key points to.
 */
static int insert_ptr(struct btrfs_trans_handle *trans,
		      struct btrfs_path *path,
		      struct btrfs_disk_key *key, u64 bytenr,
		      int slot, int level)
{
	struct extent_buffer *lower;
	int nritems;
	int ret;

	BUG_ON(!path->nodes[level]);
	btrfs_assert_tree_write_locked(path->nodes[level]);
	lower = path->nodes[level];
	nritems = btrfs_header_nritems(lower);
	BUG_ON(slot > nritems);
	BUG_ON(nritems == BTRFS_NODEPTRS_PER_BLOCK(trans->fs_info));
	if (slot != nritems) {
		if (level) {
			ret = btrfs_tree_mod_log_insert_move(lower, slot + 1,
					slot, nritems - slot);
			if (ret < 0) {
				btrfs_abort_transaction(trans, ret);
				return ret;
			}
		}
		memmove_extent_buffer(lower,
			      btrfs_node_key_ptr_offset(lower, slot + 1),
			      btrfs_node_key_ptr_offset(lower, slot),
			      (nritems - slot) * sizeof(struct btrfs_key_ptr));
	}
	if (level) {
		ret = btrfs_tree_mod_log_insert_key(lower, slot,
						    BTRFS_MOD_LOG_KEY_ADD);
		if (ret < 0) {
			btrfs_abort_transaction(trans, ret);
			return ret;
		}
	}
	btrfs_set_node_key(lower, key, slot);
	btrfs_set_node_blockptr(lower, slot, bytenr);
	WARN_ON(trans->transid == 0);
	btrfs_set_node_ptr_generation(lower, slot, trans->transid);
	btrfs_set_header_nritems(lower, nritems + 1);
	btrfs_mark_buffer_dirty(trans, lower);

	return 0;
}
/*
 * split the node at the specified level in path in two.
 * The path is corrected to point to the appropriate node after the split
 *
 * Before splitting this tries to make some room in the node by pushing
 * left and right, if either one works, it returns right away.
 *
 * returns 0 on success and < 0 on failure
 */
static noinline int split_node(struct btrfs_trans_handle *trans,
			       struct btrfs_root *root,
			       struct btrfs_path *path, int level)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *c;
	struct extent_buffer *split;
	struct btrfs_disk_key disk_key;
	int mid;
	int ret;
	u32 c_nritems;

	c = path->nodes[level];
	WARN_ON(btrfs_header_generation(c) != trans->transid);
	if (c == root->node) {
		/*
		 * trying to split the root, lets make a new one
		 *
		 * tree mod log: We don't log_removal old root in
		 * insert_new_root, because that root buffer will be kept as a
		 * normal node. We are going to log removal of half of the
		 * elements below with btrfs_tree_mod_log_eb_copy(). We're
		 * holding a tree lock on the buffer, which is why we cannot
		 * race with other tree_mod_log users.
		 */
		ret = insert_new_root(trans, root, path, level + 1);
		if (ret)
			return ret;
	} else {
		ret = push_nodes_for_insert(trans, root, path, level);
		c = path->nodes[level];
		if (!ret && btrfs_header_nritems(c) <
		    BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 3)
			return 0;
		if (ret < 0)
			return ret;
	}

	c_nritems = btrfs_header_nritems(c);
	mid = (c_nritems + 1) / 2;
	btrfs_node_key(c, &disk_key, mid);

	split = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
				       &disk_key, level, c->start, 0,
				       0, BTRFS_NESTING_SPLIT);
	if (IS_ERR(split))
		return PTR_ERR(split);

	root_add_used_bytes(root);
	ASSERT(btrfs_header_level(c) == level);

	ret = btrfs_tree_mod_log_eb_copy(split, c, 0, mid, c_nritems - mid);
	if (ret) {
		btrfs_tree_unlock(split);
		free_extent_buffer(split);
		btrfs_abort_transaction(trans, ret);
		return ret;
	}
	copy_extent_buffer(split, c,
			   btrfs_node_key_ptr_offset(split, 0),
			   btrfs_node_key_ptr_offset(c, mid),
			   (c_nritems - mid) * sizeof(struct btrfs_key_ptr));
	btrfs_set_header_nritems(split, c_nritems - mid);
	btrfs_set_header_nritems(c, mid);

	btrfs_mark_buffer_dirty(trans, c);
	btrfs_mark_buffer_dirty(trans, split);

	ret = insert_ptr(trans, path, &disk_key, split->start,
			 path->slots[level + 1] + 1, level + 1);
	if (ret < 0) {
		btrfs_tree_unlock(split);
		free_extent_buffer(split);
		return ret;
	}

	if (path->slots[level] >= mid) {
		path->slots[level] -= mid;
		btrfs_tree_unlock(c);
		free_extent_buffer(c);
		path->nodes[level] = split;
		path->slots[level + 1] += 1;
	} else {
		btrfs_tree_unlock(split);
		free_extent_buffer(split);
	}
	return 0;
}
/*
 * how many bytes are required to store the items in a leaf.  start
 * and nr indicate which items in the leaf to check.  This totals up the
 * space used both by the item structs and the item data
 */
static int leaf_space_used(const struct extent_buffer *l, int start, int nr)
{
	int data_len;
	int nritems = btrfs_header_nritems(l);
	int end = min(nritems, start + nr) - 1;

	if (!nr)
		return 0;
	data_len = btrfs_item_offset(l, start) + btrfs_item_size(l, start);
	data_len = data_len - btrfs_item_offset(l, end);
	data_len += sizeof(struct btrfs_item) * nr;
	WARN_ON(data_len < 0);
	return data_len;
}
/*
 * The space between the end of the leaf items and
 * the start of the leaf data.  IOW, how much room
 * the leaf has left for both items and data
 */
int btrfs_leaf_free_space(const struct extent_buffer *leaf)
{
	struct btrfs_fs_info *fs_info = leaf->fs_info;
	int nritems = btrfs_header_nritems(leaf);
	int ret;

	ret = BTRFS_LEAF_DATA_SIZE(fs_info) - leaf_space_used(leaf, 0, nritems);
	if (ret < 0) {
		btrfs_crit(fs_info,
			   "leaf free space ret %d, leaf data size %lu, used %d nritems %d",
			   ret,
			   (unsigned long) BTRFS_LEAF_DATA_SIZE(fs_info),
			   leaf_space_used(leaf, 0, nritems), nritems);
	}
	return ret;
}
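
/*
 * Worked example (sizes are assumptions for illustration: a 16K node with
 * a 101 byte header gives a 16283 byte data area, and sizeof(struct
 * btrfs_item) is 25): a leaf holding 10 items whose data totals 1000 bytes
 * uses 10 * 25 + 1000 = 1250 bytes, so btrfs_leaf_free_space() returns
 * 16283 - 1250 = 15033 bytes available for new items and their data.
 */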
/*
 * min slot controls the lowest index we're willing to push to the
 * right.  We'll push up to and including min_slot, but no lower
 */
static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
				      struct btrfs_path *path,
				      int data_size, int empty,
				      struct extent_buffer *right,
				      int free_space, u32 left_nritems,
				      u32 min_slot)
{
	struct btrfs_fs_info *fs_info = right->fs_info;
	struct extent_buffer *left = path->nodes[0];
	struct extent_buffer *upper = path->nodes[1];
	struct btrfs_map_token token;
	struct btrfs_disk_key disk_key;
	int slot;
	u32 i;
	int push_space = 0;
	int push_items = 0;
	u32 nr;
	u32 right_nritems;
	u32 data_end;
	u32 this_item_size;

	if (empty)
		nr = 0;
	else
		nr = max_t(u32, 1, min_slot);

	if (path->slots[0] >= left_nritems)
		push_space += data_size;

	slot = path->slots[1];
	i = left_nritems - 1;
	while (i >= nr) {
		if (!empty && push_items > 0) {
			if (path->slots[0] > i)
				break;
			if (path->slots[0] == i) {
				int space = btrfs_leaf_free_space(left);

				if (space + push_space * 2 > free_space)
					break;
			}
		}

		if (path->slots[0] == i)
			push_space += data_size;

		this_item_size = btrfs_item_size(left, i);
		if (this_item_size + sizeof(struct btrfs_item) +
		    push_space > free_space)
			break;

		push_items++;
		push_space += this_item_size + sizeof(struct btrfs_item);
		if (i == 0)
			break;
		i--;
	}

	if (push_items == 0)
		goto out_unlock;

	WARN_ON(!empty && push_items == left_nritems);

	/* push left to right */
	right_nritems = btrfs_header_nritems(right);

	push_space = btrfs_item_data_end(left, left_nritems - push_items);
	push_space -= leaf_data_end(left);

	/* make room in the right data area */
	data_end = leaf_data_end(right);
	memmove_leaf_data(right, data_end - push_space, data_end,
			  BTRFS_LEAF_DATA_SIZE(fs_info) - data_end);

	/* copy from the left data area */
	copy_leaf_data(right, left, BTRFS_LEAF_DATA_SIZE(fs_info) - push_space,
		       leaf_data_end(left), push_space);

	memmove_leaf_items(right, push_items, 0, right_nritems);

	/* copy the items from left to right */
	copy_leaf_items(right, left, 0, left_nritems - push_items, push_items);

	/* update the item pointers */
	btrfs_init_map_token(&token, right);
	right_nritems += push_items;
	btrfs_set_header_nritems(right, right_nritems);
	push_space = BTRFS_LEAF_DATA_SIZE(fs_info);
	for (i = 0; i < right_nritems; i++) {
		push_space -= btrfs_token_item_size(&token, i);
		btrfs_set_token_item_offset(&token, i, push_space);
	}

	left_nritems -= push_items;
	btrfs_set_header_nritems(left, left_nritems);

	if (left_nritems)
		btrfs_mark_buffer_dirty(trans, left);
	else
		btrfs_clear_buffer_dirty(trans, left);

	btrfs_mark_buffer_dirty(trans, right);

	btrfs_item_key(right, &disk_key, 0);
	btrfs_set_node_key(upper, &disk_key, slot + 1);
	btrfs_mark_buffer_dirty(trans, upper);

	/* then fixup the leaf pointer in the path */
	if (path->slots[0] >= left_nritems) {
		path->slots[0] -= left_nritems;
		if (btrfs_header_nritems(path->nodes[0]) == 0)
			btrfs_clear_buffer_dirty(trans, path->nodes[0]);
		btrfs_tree_unlock(path->nodes[0]);
		free_extent_buffer(path->nodes[0]);
		path->nodes[0] = right;
		path->slots[1] += 1;
	} else {
		btrfs_tree_unlock(right);
		free_extent_buffer(right);
	}
	return 0;

out_unlock:
	btrfs_tree_unlock(right);
	free_extent_buffer(right);
	return 1;
}
/*
 * push some data in the path leaf to the right, trying to free up at
 * least data_size bytes.  returns zero if the push worked, nonzero otherwise
 *
 * returns 1 if the push failed because the other node didn't have enough
 * room, 0 if everything worked out and < 0 if there were major errors.
 *
 * this will push starting from min_slot to the end of the leaf.  It won't
 * push any slot lower than min_slot
 */
static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
			   *root, struct btrfs_path *path,
			   int min_data_size, int data_size,
			   int empty, u32 min_slot)
{
	struct extent_buffer *left = path->nodes[0];
	struct extent_buffer *right;
	struct extent_buffer *upper;
	int slot;
	int free_space;
	u32 left_nritems;
	int ret;

	if (!path->nodes[1])
		return 1;

	slot = path->slots[1];
	upper = path->nodes[1];
	if (slot >= btrfs_header_nritems(upper) - 1)
		return 1;

	btrfs_assert_tree_write_locked(path->nodes[1]);

	right = btrfs_read_node_slot(upper, slot + 1);
	if (IS_ERR(right))
		return PTR_ERR(right);

	__btrfs_tree_lock(right, BTRFS_NESTING_RIGHT);

	free_space = btrfs_leaf_free_space(right);
	if (free_space < data_size)
		goto out_unlock;

	ret = btrfs_cow_block(trans, root, right, upper,
			      slot + 1, &right, BTRFS_NESTING_RIGHT_COW);
	if (ret)
		goto out_unlock;

	left_nritems = btrfs_header_nritems(left);
	if (left_nritems == 0)
		goto out_unlock;

	if (check_sibling_keys(left, right)) {
		ret = -EUCLEAN;
		btrfs_abort_transaction(trans, ret);
		btrfs_tree_unlock(right);
		free_extent_buffer(right);
		return ret;
	}
	if (path->slots[0] == left_nritems && !empty) {
		/* Key greater than all keys in the leaf, right neighbor has
		 * enough room for it and we're not emptying our leaf to delete
		 * it, therefore use right neighbor to insert the new item and
		 * no need to touch/dirty our left leaf. */
		btrfs_tree_unlock(left);
		free_extent_buffer(left);
		path->nodes[0] = right;
		path->slots[0] = 0;
		path->slots[1]++;
		return 0;
	}

	return __push_leaf_right(trans, path, min_data_size, empty, right,
				 free_space, left_nritems, min_slot);
out_unlock:
	btrfs_tree_unlock(right);
	free_extent_buffer(right);
	return 1;
}
/*
 * push some data in the path leaf to the left, trying to free up at
 * least data_size bytes.  returns zero if the push worked, nonzero otherwise
 *
 * max_slot can put a limit on how far into the leaf we'll push items.  The
 * item at 'max_slot' won't be touched.  Use (u32)-1 to make us do all the
 * items
 */
static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
				     struct btrfs_path *path, int data_size,
				     int empty, struct extent_buffer *left,
				     int free_space, u32 right_nritems,
				     u32 max_slot)
{
	struct btrfs_fs_info *fs_info = left->fs_info;
	struct btrfs_disk_key disk_key;
	struct extent_buffer *right = path->nodes[0];
	int i;
	int push_space = 0;
	int push_items = 0;
	u32 old_left_nritems;
	u32 nr;
	int ret = 0;
	u32 this_item_size;
	u32 old_left_item_size;
	struct btrfs_map_token token;

	if (empty)
		nr = min(right_nritems, max_slot);
	else
		nr = min(right_nritems - 1, max_slot);

	for (i = 0; i < nr; i++) {
		if (!empty && push_items > 0) {
			if (path->slots[0] < i)
				break;
			if (path->slots[0] == i) {
				int space = btrfs_leaf_free_space(right);

				if (space + push_space * 2 > free_space)
					break;
			}
		}

		if (path->slots[0] == i)
			push_space += data_size;

		this_item_size = btrfs_item_size(right, i);
		if (this_item_size + sizeof(struct btrfs_item) + push_space >
		    free_space)
			break;

		push_items++;
		push_space += this_item_size + sizeof(struct btrfs_item);
	}

	if (push_items == 0) {
		ret = 1;
		goto out;
	}
	WARN_ON(!empty && push_items == btrfs_header_nritems(right));

	/* push data from right to left */
	copy_leaf_items(left, right, btrfs_header_nritems(left), 0, push_items);

	push_space = BTRFS_LEAF_DATA_SIZE(fs_info) -
		     btrfs_item_offset(right, push_items - 1);

	copy_leaf_data(left, right, leaf_data_end(left) - push_space,
		       btrfs_item_offset(right, push_items - 1), push_space);
	old_left_nritems = btrfs_header_nritems(left);
	BUG_ON(old_left_nritems <= 0);

	btrfs_init_map_token(&token, left);
	old_left_item_size = btrfs_item_offset(left, old_left_nritems - 1);
	for (i = old_left_nritems; i < old_left_nritems + push_items; i++) {
		u32 ioff;

		ioff = btrfs_token_item_offset(&token, i);
		btrfs_set_token_item_offset(&token, i,
		      ioff - (BTRFS_LEAF_DATA_SIZE(fs_info) - old_left_item_size));
	}
	btrfs_set_header_nritems(left, old_left_nritems + push_items);

	/* fixup right node */
	if (push_items > right_nritems)
		WARN(1, KERN_CRIT "push items %d nr %u\n", push_items,
		     right_nritems);

	if (push_items < right_nritems) {
		push_space = btrfs_item_offset(right, push_items - 1) -
			     leaf_data_end(right);
		memmove_leaf_data(right,
				  BTRFS_LEAF_DATA_SIZE(fs_info) - push_space,
				  leaf_data_end(right), push_space);

		memmove_leaf_items(right, 0, push_items,
				   btrfs_header_nritems(right) - push_items);
	}

	btrfs_init_map_token(&token, right);
	right_nritems -= push_items;
	btrfs_set_header_nritems(right, right_nritems);
	push_space = BTRFS_LEAF_DATA_SIZE(fs_info);
	for (i = 0; i < right_nritems; i++) {
		push_space = push_space - btrfs_token_item_size(&token, i);
		btrfs_set_token_item_offset(&token, i, push_space);
	}

	btrfs_mark_buffer_dirty(trans, left);
	if (right_nritems)
		btrfs_mark_buffer_dirty(trans, right);
	else
		btrfs_clear_buffer_dirty(trans, right);

	btrfs_item_key(right, &disk_key, 0);
	fixup_low_keys(trans, path, &disk_key, 1);

	/* then fixup the leaf pointer in the path */
	if (path->slots[0] < push_items) {
		path->slots[0] += old_left_nritems;
		btrfs_tree_unlock(path->nodes[0]);
		free_extent_buffer(path->nodes[0]);
		path->nodes[0] = left;
		path->slots[1] -= 1;
	} else {
		btrfs_tree_unlock(left);
		free_extent_buffer(left);
		path->slots[0] -= push_items;
	}
	BUG_ON(path->slots[0] < 0);
	return ret;
out:
	btrfs_tree_unlock(left);
	free_extent_buffer(left);
	return ret;
}
/*
 * push some data in the path leaf to the left, trying to free up at
 * least data_size bytes.  returns zero if the push worked, nonzero otherwise
 *
 * max_slot can put a limit on how far into the leaf we'll push items.  The
 * item at 'max_slot' won't be touched.  Use (u32)-1 to make us push all the
 * items
 */
static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
			  *root, struct btrfs_path *path, int min_data_size,
			  int data_size, int empty, u32 max_slot)
{
	struct extent_buffer *right = path->nodes[0];
	struct extent_buffer *left;
	int slot;
	int free_space;
	u32 right_nritems;
	int ret = 0;

	slot = path->slots[1];
	if (slot == 0)
		return 1;
	if (!path->nodes[1])
		return 1;

	right_nritems = btrfs_header_nritems(right);
	if (right_nritems == 0)
		return 1;

	btrfs_assert_tree_write_locked(path->nodes[1]);

	left = btrfs_read_node_slot(path->nodes[1], slot - 1);
	if (IS_ERR(left))
		return PTR_ERR(left);

	__btrfs_tree_lock(left, BTRFS_NESTING_LEFT);

	free_space = btrfs_leaf_free_space(left);
	if (free_space < data_size) {
		ret = 1;
		goto out;
	}

	ret = btrfs_cow_block(trans, root, left,
			      path->nodes[1], slot - 1, &left,
			      BTRFS_NESTING_LEFT_COW);
	if (ret) {
		/* we hit -ENOSPC, but it isn't fatal here */
		if (ret == -ENOSPC)
			ret = 1;
		goto out;
	}

	if (check_sibling_keys(left, right)) {
		ret = -EUCLEAN;
		btrfs_abort_transaction(trans, ret);
		goto out;
	}
	return __push_leaf_left(trans, path, min_data_size, empty, left,
				free_space, right_nritems, max_slot);
out:
	btrfs_tree_unlock(left);
	free_extent_buffer(left);
	return ret;
}
/*
 * split the path's leaf in two, making sure there is at least data_size
 * available for the resulting leaf level of the path.
 */
static noinline int copy_for_split(struct btrfs_trans_handle *trans,
				   struct btrfs_path *path,
				   struct extent_buffer *l,
				   struct extent_buffer *right,
				   int slot, int mid, int nritems)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	int data_copy_size;
	int rt_data_off;
	int i;
	int ret;
	struct btrfs_disk_key disk_key;
	struct btrfs_map_token token;

	nritems = nritems - mid;
	btrfs_set_header_nritems(right, nritems);
	data_copy_size = btrfs_item_data_end(l, mid) - leaf_data_end(l);

	copy_leaf_items(right, l, 0, mid, nritems);

	copy_leaf_data(right, l, BTRFS_LEAF_DATA_SIZE(fs_info) - data_copy_size,
		       leaf_data_end(l), data_copy_size);

	rt_data_off = BTRFS_LEAF_DATA_SIZE(fs_info) - btrfs_item_data_end(l, mid);

	btrfs_init_map_token(&token, right);
	for (i = 0; i < nritems; i++) {
		u32 ioff;

		ioff = btrfs_token_item_offset(&token, i);
		btrfs_set_token_item_offset(&token, i, ioff + rt_data_off);
	}

	btrfs_set_header_nritems(l, mid);
	btrfs_item_key(right, &disk_key, 0);
	ret = insert_ptr(trans, path, &disk_key, right->start, path->slots[1] + 1, 1);
	if (ret < 0)
		return ret;

	btrfs_mark_buffer_dirty(trans, right);
	btrfs_mark_buffer_dirty(trans, l);
	BUG_ON(path->slots[0] != slot);

	if (mid <= slot) {
		btrfs_tree_unlock(path->nodes[0]);
		free_extent_buffer(path->nodes[0]);
		path->nodes[0] = right;
		path->slots[0] -= mid;
		path->slots[1] += 1;
	} else {
		btrfs_tree_unlock(right);
		free_extent_buffer(right);
	}

	BUG_ON(path->slots[0] < 0);

	return 0;
}
/*
 * double splits happen when we need to insert a big item in the middle
 * of a leaf.  A double split can leave us with 3 mostly empty leaves:
 * leaf: [ slots 0 - N] [ our target ] [ N + 1 - total in leaf ]
 *          A                 B                 C
 *
 * We avoid this by trying to push the items on either side of our target
 * into the adjacent leaves.  If all goes well we can avoid the double split
 * completely.
 */
static noinline int push_for_double_split(struct btrfs_trans_handle *trans,
					  struct btrfs_root *root,
					  struct btrfs_path *path,
					  int data_size)
{
	int ret;
	int progress = 0;
	int slot;
	u32 nritems;
	int space_needed = data_size;

	slot = path->slots[0];
	if (slot < btrfs_header_nritems(path->nodes[0]))
		space_needed -= btrfs_leaf_free_space(path->nodes[0]);

	/*
	 * try to push all the items after our slot into the
	 * right leaf
	 */
	ret = push_leaf_right(trans, root, path, 1, space_needed, 0, slot);
	if (ret < 0)
		return ret;

	if (ret == 0)
		progress++;

	nritems = btrfs_header_nritems(path->nodes[0]);
	/*
	 * our goal is to get our slot at the start or end of a leaf.  If
	 * we've done so we're done
	 */
	if (path->slots[0] == 0 || path->slots[0] == nritems)
		return 0;

	if (btrfs_leaf_free_space(path->nodes[0]) >= data_size)
		return 0;

	/* try to push all the items before our slot into the next leaf */
	slot = path->slots[0];
	space_needed = data_size;
	if (slot > 0)
		space_needed -= btrfs_leaf_free_space(path->nodes[0]);
	ret = push_leaf_left(trans, root, path, 1, space_needed, 0, slot);
	if (ret < 0)
		return ret;

	if (ret == 0)
		progress++;

	if (progress)
		return 0;
	return 1;
}
/*
 * split the path's leaf in two, making sure there is at least data_size
 * available for the resulting leaf level of the path.
 *
 * returns 0 if all went well and < 0 on failure.
 */
static noinline int split_leaf(struct btrfs_trans_handle *trans,
			       struct btrfs_root *root,
			       const struct btrfs_key *ins_key,
			       struct btrfs_path *path, int data_size,
			       int extend)
{
	struct btrfs_disk_key disk_key;
	struct extent_buffer *l;
	u32 nritems;
	int mid;
	int slot;
	struct extent_buffer *right;
	struct btrfs_fs_info *fs_info = root->fs_info;
	int ret = 0;
	int wret;
	int split;
	int num_doubles = 0;
	int tried_avoid_double = 0;

	l = path->nodes[0];
	slot = path->slots[0];
	if (extend && data_size + btrfs_item_size(l, slot) +
	    sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(fs_info))
		return -EOVERFLOW;

	/* first try to make some room by pushing left and right */
	if (data_size && path->nodes[1]) {
		int space_needed = data_size;

		if (slot < btrfs_header_nritems(l))
			space_needed -= btrfs_leaf_free_space(l);

		wret = push_leaf_right(trans, root, path, space_needed,
				       space_needed, 0, 0);
		if (wret < 0)
			return wret;
		if (wret) {
			space_needed = data_size;
			if (slot > 0)
				space_needed -= btrfs_leaf_free_space(l);
			wret = push_leaf_left(trans, root, path, space_needed,
					      space_needed, 0, (u32)-1);
			if (wret < 0)
				return wret;
		}
		l = path->nodes[0];

		/* did the pushes work? */
		if (btrfs_leaf_free_space(l) >= data_size)
			return 0;
	}

	if (!path->nodes[1]) {
		ret = insert_new_root(trans, root, path, 1);
		if (ret)
			return ret;
	}
again:
	split = 1;
	l = path->nodes[0];
	slot = path->slots[0];
	nritems = btrfs_header_nritems(l);
	mid = (nritems + 1) / 2;

	if (mid <= slot) {
		if (nritems == 1 ||
		    leaf_space_used(l, mid, nritems - mid) + data_size >
			BTRFS_LEAF_DATA_SIZE(fs_info)) {
			if (slot >= nritems) {
				split = 0;
			} else {
				mid = slot;
				if (mid != nritems &&
				    leaf_space_used(l, mid, nritems - mid) +
				    data_size > BTRFS_LEAF_DATA_SIZE(fs_info)) {
					if (data_size && !tried_avoid_double)
						goto push_for_double;
					split = 2;
				}
			}
		}
	} else {
		if (leaf_space_used(l, 0, mid) + data_size >
			BTRFS_LEAF_DATA_SIZE(fs_info)) {
			if (!extend && data_size && slot == 0) {
				split = 0;
			} else if ((extend || !data_size) && slot == 0) {
				mid = 1;
			} else {
				mid = slot;
				if (mid != nritems &&
				    leaf_space_used(l, mid, nritems - mid) +
				    data_size > BTRFS_LEAF_DATA_SIZE(fs_info)) {
					if (data_size && !tried_avoid_double)
						goto push_for_double;
					split = 2;
				}
			}
		}
	}

	if (split == 0)
		btrfs_cpu_key_to_disk(&disk_key, ins_key);
	else
		btrfs_item_key(l, &disk_key, mid);

	/*
	 * We have to use BTRFS_NESTING_NEW_ROOT here if we've done a double
	 * split, because we're only allowed to have MAX_LOCKDEP_SUBCLASSES
	 * subclasses, which is 8 at the time of this patch, and we've maxed it
	 * out.  In the future we could add a
	 * BTRFS_NESTING_SPLIT_THE_SPLITTENING if we need to, but for now just
	 * use BTRFS_NESTING_NEW_ROOT.
	 */
	right = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
				       &disk_key, 0, l->start, 0, 0,
				       num_doubles ? BTRFS_NESTING_NEW_ROOT :
				       BTRFS_NESTING_SPLIT);
	if (IS_ERR(right))
		return PTR_ERR(right);

	root_add_used_bytes(root);

	if (split == 0) {
		if (mid <= slot) {
			btrfs_set_header_nritems(right, 0);
			ret = insert_ptr(trans, path, &disk_key,
					 right->start, path->slots[1] + 1, 1);
			if (ret < 0) {
				btrfs_tree_unlock(right);
				free_extent_buffer(right);
				return ret;
			}
			btrfs_tree_unlock(path->nodes[0]);
			free_extent_buffer(path->nodes[0]);
			path->nodes[0] = right;
			path->slots[0] = 0;
			path->slots[1] += 1;
		} else {
			btrfs_set_header_nritems(right, 0);
			ret = insert_ptr(trans, path, &disk_key,
					 right->start, path->slots[1], 1);
			if (ret < 0) {
				btrfs_tree_unlock(right);
				free_extent_buffer(right);
				return ret;
			}
			btrfs_tree_unlock(path->nodes[0]);
			free_extent_buffer(path->nodes[0]);
			path->nodes[0] = right;
			path->slots[0] = 0;
			if (path->slots[1] == 0)
				fixup_low_keys(trans, path, &disk_key, 1);
		}
		/*
		 * We create a new leaf 'right' for the required ins_len and
		 * we'll do btrfs_mark_buffer_dirty() on this leaf after copying
		 * the content of ins_len to 'right'.
		 */
		return ret;
	}

	ret = copy_for_split(trans, path, l, right, slot, mid, nritems);
	if (ret < 0) {
		btrfs_tree_unlock(right);
		free_extent_buffer(right);
		return ret;
	}

	if (split == 2) {
		BUG_ON(num_doubles != 0);
		num_doubles++;
		goto again;
	}

	return 0;

push_for_double:
	push_for_double_split(trans, root, path, data_size);
	tried_avoid_double = 1;
	if (btrfs_leaf_free_space(path->nodes[0]) >= data_size)
		return 0;
	goto again;
}
static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
					 struct btrfs_root *root,
					 struct btrfs_path *path, int ins_len)
{
	struct btrfs_key key;
	struct extent_buffer *leaf;
	struct btrfs_file_extent_item *fi;
	u64 extent_len = 0;
	u32 item_size;
	int ret;

	leaf = path->nodes[0];
	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);

	BUG_ON(key.type != BTRFS_EXTENT_DATA_KEY &&
	       key.type != BTRFS_EXTENT_CSUM_KEY);

	if (btrfs_leaf_free_space(leaf) >= ins_len)
		return 0;

	item_size = btrfs_item_size(leaf, path->slots[0]);
	if (key.type == BTRFS_EXTENT_DATA_KEY) {
		fi = btrfs_item_ptr(leaf, path->slots[0],
				    struct btrfs_file_extent_item);
		extent_len = btrfs_file_extent_num_bytes(leaf, fi);
	}
	btrfs_release_path(path);

	path->keep_locks = 1;
	path->search_for_split = 1;
	ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
	path->search_for_split = 0;
	if (ret > 0)
		ret = -EAGAIN;
	if (ret < 0)
		goto err;

	ret = -EAGAIN;
	leaf = path->nodes[0];
	/* if our item isn't there, return now */
	if (item_size != btrfs_item_size(leaf, path->slots[0]))
		goto err;

	/* the leaf has changed, it now has room.  return now */
	if (btrfs_leaf_free_space(path->nodes[0]) >= ins_len)
		goto err;

	if (key.type == BTRFS_EXTENT_DATA_KEY) {
		fi = btrfs_item_ptr(leaf, path->slots[0],
				    struct btrfs_file_extent_item);
		if (extent_len != btrfs_file_extent_num_bytes(leaf, fi))
			goto err;
	}

	ret = split_leaf(trans, root, &key, path, ins_len, 1);
	if (ret)
		goto err;

	path->keep_locks = 0;
	btrfs_unlock_up_safe(path, 1);
	return 0;
err:
	path->keep_locks = 0;
	return ret;
}
static noinline int split_item(struct btrfs_trans_handle *trans,
			       struct btrfs_path *path,
			       const struct btrfs_key *new_key,
			       unsigned long split_offset)
{
	struct extent_buffer *leaf;
	int orig_slot, slot;
	char *buf;
	u32 nritems;
	u32 item_size;
	u32 orig_offset;
	struct btrfs_disk_key disk_key;

	leaf = path->nodes[0];
	/*
	 * Shouldn't happen because the caller must have previously called
	 * setup_leaf_for_split() to make room for the new item in the leaf.
	 */
	if (WARN_ON(btrfs_leaf_free_space(leaf) < sizeof(struct btrfs_item)))
		return -ENOSPC;

	orig_slot = path->slots[0];
	orig_offset = btrfs_item_offset(leaf, path->slots[0]);
	item_size = btrfs_item_size(leaf, path->slots[0]);

	buf = kmalloc(item_size, GFP_NOFS);
	if (!buf)
		return -ENOMEM;

	read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf,
			   path->slots[0]), item_size);

	slot = path->slots[0] + 1;
	nritems = btrfs_header_nritems(leaf);
	if (slot != nritems) {
		/* shift the items */
		memmove_leaf_items(leaf, slot + 1, slot, nritems - slot);
	}

	btrfs_cpu_key_to_disk(&disk_key, new_key);
	btrfs_set_item_key(leaf, &disk_key, slot);

	btrfs_set_item_offset(leaf, slot, orig_offset);
	btrfs_set_item_size(leaf, slot, item_size - split_offset);

	btrfs_set_item_offset(leaf, orig_slot,
			      orig_offset + item_size - split_offset);
	btrfs_set_item_size(leaf, orig_slot, split_offset);

	btrfs_set_header_nritems(leaf, nritems + 1);

	/* write the data for the start of the original item */
	write_extent_buffer(leaf, buf,
			    btrfs_item_ptr_offset(leaf, path->slots[0]),
			    split_offset);

	/* write the data for the new item */
	write_extent_buffer(leaf, buf + split_offset,
			    btrfs_item_ptr_offset(leaf, slot),
			    item_size - split_offset);
	btrfs_mark_buffer_dirty(trans, leaf);

	BUG_ON(btrfs_leaf_free_space(leaf) < 0);
	kfree(buf);
	return 0;
}
/*
 * This function splits a single item into two items,
 * giving 'new_key' to the new item and splitting the
 * old one at split_offset (from the start of the item).
 *
 * The path may be released by this operation.  After
 * the split, the path is pointing to the old item.  The
 * new item is going to be in the same node as the old one.
 *
 * Note, the item being split must be small enough to live alone on
 * a tree block with room for one extra struct btrfs_item
 *
 * This allows us to split the item in place, keeping a lock on the
 * leaf the entire time.
 */
int btrfs_split_item(struct btrfs_trans_handle *trans,
		     struct btrfs_root *root,
		     struct btrfs_path *path,
		     const struct btrfs_key *new_key,
		     unsigned long split_offset)
{
	int ret;

	ret = setup_leaf_for_split(trans, root, path,
				   sizeof(struct btrfs_item));
	if (ret)
		return ret;

	ret = split_item(trans, path, new_key, split_offset);
	return ret;
}
/*
 * make the item pointed to by the path smaller.  new_size indicates
 * how small to make it, and from_end tells us if we just chop bytes
 * off the end of the item or if we shift the item to chop bytes off
 * the front.
 */
void btrfs_truncate_item(struct btrfs_trans_handle *trans,
			 struct btrfs_path *path, u32 new_size, int from_end)
{
	int slot;
	struct extent_buffer *leaf;
	u32 nritems;
	unsigned int data_end;
	unsigned int old_data_start;
	unsigned int old_size;
	unsigned int size_diff;
	int i;
	struct btrfs_map_token token;

	leaf = path->nodes[0];
	slot = path->slots[0];

	old_size = btrfs_item_size(leaf, slot);
	if (old_size == new_size)
		return;

	nritems = btrfs_header_nritems(leaf);
	data_end = leaf_data_end(leaf);

	old_data_start = btrfs_item_offset(leaf, slot);

	size_diff = old_size - new_size;

	BUG_ON(slot < 0);
	BUG_ON(slot >= nritems);

	/*
	 * item0..itemN ... dataN.offset..dataN.size .. data0.size
	 */
	/* first correct the data pointers */
	btrfs_init_map_token(&token, leaf);
	for (i = slot; i < nritems; i++) {
		u32 ioff;

		ioff = btrfs_token_item_offset(&token, i);
		btrfs_set_token_item_offset(&token, i, ioff + size_diff);
	}

	/* shift the data */
	if (from_end) {
		memmove_leaf_data(leaf, data_end + size_diff, data_end,
				  old_data_start + new_size - data_end);
	} else {
		struct btrfs_disk_key disk_key;
		u64 offset;

		btrfs_item_key(leaf, &disk_key, slot);

		if (btrfs_disk_key_type(&disk_key) == BTRFS_EXTENT_DATA_KEY) {
			unsigned long ptr;
			struct btrfs_file_extent_item *fi;

			fi = btrfs_item_ptr(leaf, slot,
					    struct btrfs_file_extent_item);
			fi = (struct btrfs_file_extent_item *)(
			     (unsigned long)fi - size_diff);

			if (btrfs_file_extent_type(leaf, fi) ==
			    BTRFS_FILE_EXTENT_INLINE) {
				ptr = btrfs_item_ptr_offset(leaf, slot);
				memmove_extent_buffer(leaf, ptr,
				      (unsigned long)fi,
				      BTRFS_FILE_EXTENT_INLINE_DATA_START);
			}
		}

		memmove_leaf_data(leaf, data_end + size_diff, data_end,
				  old_data_start - data_end);

		offset = btrfs_disk_key_offset(&disk_key);
		btrfs_set_disk_key_offset(&disk_key, offset + size_diff);
		btrfs_set_item_key(leaf, &disk_key, slot);
		if (slot == 0)
			fixup_low_keys(trans, path, &disk_key, 1);
	}

	btrfs_set_item_size(leaf, slot, new_size);
	btrfs_mark_buffer_dirty(trans, leaf);

	if (btrfs_leaf_free_space(leaf) < 0) {
		btrfs_print_leaf(leaf);
		BUG();
	}
}
/*
 * make the item pointed to by the path bigger, data_size is the added size.
 */
void btrfs_extend_item(struct btrfs_trans_handle *trans,
		       struct btrfs_path *path, u32 data_size)
{
	int slot;
	struct extent_buffer *leaf;
	u32 nritems;
	unsigned int data_end;
	unsigned int old_data;
	unsigned int old_size;
	int i;
	struct btrfs_map_token token;

	leaf = path->nodes[0];

	nritems = btrfs_header_nritems(leaf);
	data_end = leaf_data_end(leaf);

	if (btrfs_leaf_free_space(leaf) < data_size) {
		btrfs_print_leaf(leaf);
		BUG();
	}
	slot = path->slots[0];
	old_data = btrfs_item_data_end(leaf, slot);

	BUG_ON(slot < 0);
	if (slot >= nritems) {
		btrfs_print_leaf(leaf);
		btrfs_crit(leaf->fs_info, "slot %d too large, nritems %d",
			   slot, nritems);
		BUG();
	}

	/*
	 * item0..itemN ... dataN.offset..dataN.size .. data0.size
	 */
	/* first correct the data pointers */
	btrfs_init_map_token(&token, leaf);
	for (i = slot; i < nritems; i++) {
		u32 ioff;

		ioff = btrfs_token_item_offset(&token, i);
		btrfs_set_token_item_offset(&token, i, ioff - data_size);
	}

	/* shift the data */
	memmove_leaf_data(leaf, data_end - data_size, data_end,
			  old_data - data_end);

	data_end = old_data;
	old_size = btrfs_item_size(leaf, slot);
	btrfs_set_item_size(leaf, slot, old_size + data_size);
	btrfs_mark_buffer_dirty(trans, leaf);

	if (btrfs_leaf_free_space(leaf) < 0) {
		btrfs_print_leaf(leaf);
		BUG();
	}
}
/*
 * Make space in the node before inserting one or more items.
 *
 * @trans:	transaction handle
 * @root:	root we are inserting items to
 * @path:	points to the leaf/slot where we are going to insert new items
 * @batch:      information about the batch of items to insert
 *
 * Main purpose is to save stack depth by doing the bulk of the work in a
 * function that doesn't call btrfs_search_slot
 */
static void setup_items_for_insert(struct btrfs_trans_handle *trans,
				   struct btrfs_root *root, struct btrfs_path *path,
				   const struct btrfs_item_batch *batch)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	int i;
	u32 nritems;
	unsigned int data_end;
	struct btrfs_disk_key disk_key;
	struct extent_buffer *leaf;
	int slot;
	struct btrfs_map_token token;
	u32 total_size;

	/*
	 * Before anything else, update keys in the parent and other ancestors
	 * if needed, then release the write locks on them, so that other tasks
	 * can use them while we modify the leaf.
	 */
	if (path->slots[0] == 0) {
		btrfs_cpu_key_to_disk(&disk_key, &batch->keys[0]);
		fixup_low_keys(trans, path, &disk_key, 1);
	}
	btrfs_unlock_up_safe(path, 1);

	leaf = path->nodes[0];
	slot = path->slots[0];

	nritems = btrfs_header_nritems(leaf);
	data_end = leaf_data_end(leaf);
	total_size = batch->total_data_size + (batch->nr * sizeof(struct btrfs_item));

	if (btrfs_leaf_free_space(leaf) < total_size) {
		btrfs_print_leaf(leaf);
		btrfs_crit(fs_info, "not enough freespace need %u have %d",
			   total_size, btrfs_leaf_free_space(leaf));
		BUG();
	}

	btrfs_init_map_token(&token, leaf);
	if (slot != nritems) {
		unsigned int old_data = btrfs_item_data_end(leaf, slot);

		if (old_data < data_end) {
			btrfs_print_leaf(leaf);
			btrfs_crit(fs_info,
		"item at slot %d with data offset %u beyond data end of leaf %u",
				   slot, old_data, data_end);
			BUG();
		}
		/*
		 * item0..itemN ... dataN.offset..dataN.size .. data0.size
		 */
		/* first correct the data pointers */
		for (i = slot; i < nritems; i++) {
			u32 ioff;

			ioff = btrfs_token_item_offset(&token, i);
			btrfs_set_token_item_offset(&token, i,
						    ioff - batch->total_data_size);
		}
		/* shift the items */
		memmove_leaf_items(leaf, slot + batch->nr, slot, nritems - slot);

		/* shift the data */
		memmove_leaf_data(leaf, data_end - batch->total_data_size,
				  data_end, old_data - data_end);
		data_end = old_data;
	}

	/* setup the item for the new data */
	for (i = 0; i < batch->nr; i++) {
		btrfs_cpu_key_to_disk(&disk_key, &batch->keys[i]);
		btrfs_set_item_key(leaf, &disk_key, slot + i);
		data_end -= batch->data_sizes[i];
		btrfs_set_token_item_offset(&token, slot + i, data_end);
		btrfs_set_token_item_size(&token, slot + i, batch->data_sizes[i]);
	}

	btrfs_set_header_nritems(leaf, nritems + batch->nr);
	btrfs_mark_buffer_dirty(trans, leaf);

	if (btrfs_leaf_free_space(leaf) < 0) {
		btrfs_print_leaf(leaf);
		BUG();
	}
}
/*
 * Insert a new item into a leaf.
 *
 * @trans:     Transaction handle.
 * @root:      The root of the btree.
 * @path:      A path pointing to the target leaf and slot.
 * @key:       The key of the new item.
 * @data_size: The size of the data associated with the new key.
 */
void btrfs_setup_item_for_insert(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root,
				 struct btrfs_path *path,
				 const struct btrfs_key *key,
				 u32 data_size)
{
	struct btrfs_item_batch batch;

	batch.keys = key;
	batch.data_sizes = &data_size;
	batch.total_data_size = data_size;
	batch.nr = 1;

	setup_items_for_insert(trans, root, path, &batch);
}
/*
 * Given a key and some data, insert items into the tree.
 * This does all the path init required, making room in the tree if needed.
 */
int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
			     struct btrfs_root *root,
			     struct btrfs_path *path,
			     const struct btrfs_item_batch *batch)
{
	int ret = 0;
	int slot;
	u32 total_size;

	total_size = batch->total_data_size + (batch->nr * sizeof(struct btrfs_item));
	ret = btrfs_search_slot(trans, root, &batch->keys[0], path, total_size, 1);
	if (ret == 0)
		return -EEXIST;
	if (ret < 0)
		return ret;

	slot = path->slots[0];
	BUG_ON(slot < 0);

	setup_items_for_insert(trans, root, path, batch);
	return 0;
}
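
/*
 * Example (illustrative): insert two items in one batch. The arrays are
 * parallel, the keys must already be in sorted order, and total_data_size
 * must equal the sum of data_sizes:
 *
 *	struct btrfs_key keys[2];
 *	u32 sizes[2] = { 16, 32 };
 *	struct btrfs_item_batch batch = {
 *		.keys = keys,
 *		.data_sizes = sizes,
 *		.total_data_size = 48,
 *		.nr = 2,
 *	};
 *
 *	... fill keys[] ...
 *	ret = btrfs_insert_empty_items(trans, root, path, &batch);
 *
 * On success the items exist with uninitialized data; write each one via
 * btrfs_item_ptr_offset() and write_extent_buffer().
 */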
/*
 * Given a key and some data, insert an item into the tree.
 * This does all the path init required, making room in the tree if needed.
 */
int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		      const struct btrfs_key *cpu_key, void *data,
		      u32 data_size)
{
	int ret = 0;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	unsigned long ptr;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
	ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size);
	if (!ret) {
		leaf = path->nodes[0];
		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
		write_extent_buffer(leaf, data, ptr, data_size);
		btrfs_mark_buffer_dirty(trans, leaf);
	}
	btrfs_free_path(path);
	return ret;
}
/*
 * This function duplicates an item, giving 'new_key' to the new item.
 * It guarantees both items live in the same tree leaf and the new item is
 * contiguous with the original item.
 *
 * This allows us to split a file extent in place, keeping a lock on the leaf
 * the entire time.
 */
int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
			 struct btrfs_root *root,
			 struct btrfs_path *path,
			 const struct btrfs_key *new_key)
{
	struct extent_buffer *leaf;
	int ret;
	u32 item_size;

	leaf = path->nodes[0];
	item_size = btrfs_item_size(leaf, path->slots[0]);
	ret = setup_leaf_for_split(trans, root, path,
				   item_size + sizeof(struct btrfs_item));
	if (ret)
		return ret;

	path->slots[0]++;
	btrfs_setup_item_for_insert(trans, root, path, new_key, item_size);
	leaf = path->nodes[0];
	memcpy_extent_buffer(leaf,
			     btrfs_item_ptr_offset(leaf, path->slots[0]),
			     btrfs_item_ptr_offset(leaf, path->slots[0] - 1),
			     item_size);
	return 0;
}
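
/*
 * Example (hedged sketch): duplicate a file extent item before adjusting
 * the two copies differently. On success the path points at the new copy,
 * since path->slots[0] was advanced before the insert:
 *
 *	new_key = old_key;
 *	new_key.offset = split_pos;
 *	ret = btrfs_duplicate_item(trans, root, path, &new_key);
 */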
/*
 * delete the pointer from a given node.
 *
 * the tree should have been previously balanced so the deletion does not
 * empty a node.
 *
 * This is exported for use inside btrfs-progs, don't un-export it.
 */
int btrfs_del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		  struct btrfs_path *path, int level, int slot)
{
	struct extent_buffer *parent = path->nodes[level];
	u32 nritems;
	int ret;

	nritems = btrfs_header_nritems(parent);
	if (slot != nritems - 1) {
		if (level) {
			ret = btrfs_tree_mod_log_insert_move(parent, slot,
					slot + 1, nritems - slot - 1);
			if (ret < 0) {
				btrfs_abort_transaction(trans, ret);
				return ret;
			}
		}
		memmove_extent_buffer(parent,
			      btrfs_node_key_ptr_offset(parent, slot),
			      btrfs_node_key_ptr_offset(parent, slot + 1),
			      sizeof(struct btrfs_key_ptr) *
			      (nritems - slot - 1));
	} else if (level) {
		ret = btrfs_tree_mod_log_insert_key(parent, slot,
						    BTRFS_MOD_LOG_KEY_REMOVE);
		if (ret < 0) {
			btrfs_abort_transaction(trans, ret);
			return ret;
		}
	}

	nritems--;
	btrfs_set_header_nritems(parent, nritems);
	if (nritems == 0 && parent == root->node) {
		BUG_ON(btrfs_header_level(root->node) != 1);
		/* just turn the root into a leaf and break */
		btrfs_set_header_level(root->node, 0);
	} else if (slot == 0) {
		struct btrfs_disk_key disk_key;

		btrfs_node_key(parent, &disk_key, 0);
		fixup_low_keys(trans, path, &disk_key, level + 1);
	}
	btrfs_mark_buffer_dirty(trans, parent);
	return 0;
}
/*
 * a helper function to delete the leaf pointed to by path->slots[1] and
 * path->nodes[1].
 *
 * This deletes the pointer in path->nodes[1] and frees the leaf
 * block extent.  zero is returned if it all worked out, < 0 otherwise.
 *
 * The path must have already been setup for deleting the leaf, including
 * all the proper balancing.  path->nodes[1] must be locked.
 */
static noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans,
				   struct btrfs_root *root,
				   struct btrfs_path *path,
				   struct extent_buffer *leaf)
{
	int ret;

	WARN_ON(btrfs_header_generation(leaf) != trans->transid);
	ret = btrfs_del_ptr(trans, root, path, 1, path->slots[1]);
	if (ret < 0)
		return ret;

	/*
	 * btrfs_free_extent is expensive, we want to make sure we
	 * aren't holding any locks when we call it
	 */
	btrfs_unlock_up_safe(path, 0);

	root_sub_used_bytes(root);

	atomic_inc(&leaf->refs);
	btrfs_free_tree_block(trans, btrfs_root_id(root), leaf, 0, 1);
	free_extent_buffer_stale(leaf);
	return 0;
}
/*
 * delete the item at the leaf level in path.  If that empties
 * the leaf, remove it from the tree
 */
int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		    struct btrfs_path *path, int slot, int nr)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *leaf;
	int ret = 0;
	int wret;
	u32 nritems;

	leaf = path->nodes[0];
	nritems = btrfs_header_nritems(leaf);

	if (slot + nr != nritems) {
		const u32 last_off = btrfs_item_offset(leaf, slot + nr - 1);
		const int data_end = leaf_data_end(leaf);
		struct btrfs_map_token token;
		u32 dsize = 0;
		int i;

		for (i = 0; i < nr; i++)
			dsize += btrfs_item_size(leaf, slot + i);

		memmove_leaf_data(leaf, data_end + dsize, data_end,
				  last_off - data_end);

		btrfs_init_map_token(&token, leaf);
		for (i = slot + nr; i < nritems; i++) {
			u32 ioff;

			ioff = btrfs_token_item_offset(&token, i);
			btrfs_set_token_item_offset(&token, i, ioff + dsize);
		}

		memmove_leaf_items(leaf, slot, slot + nr, nritems - slot - nr);
	}
	btrfs_set_header_nritems(leaf, nritems - nr);
	nritems -= nr;

	/* delete the leaf if we've emptied it */
	if (nritems == 0) {
		if (leaf == root->node) {
			btrfs_set_header_level(leaf, 0);
		} else {
			btrfs_clear_buffer_dirty(trans, leaf);
			ret = btrfs_del_leaf(trans, root, path, leaf);
			if (ret < 0)
				return ret;
		}
	} else {
		int used = leaf_space_used(leaf, 0, nritems);

		if (slot == 0) {
			struct btrfs_disk_key disk_key;

			btrfs_item_key(leaf, &disk_key, 0);
			fixup_low_keys(trans, path, &disk_key, 1);
		}

		/*
		 * Try to delete the leaf if it is mostly empty. We do this by
		 * trying to move all its items into its left and right neighbours.
		 * If we can't move all the items, then we don't delete it - it's
		 * not ideal, but future insertions might fill the leaf with more
		 * items, or items from other leaves might be moved later into our
		 * leaf due to deletions on those leaves.
		 */
		if (used < BTRFS_LEAF_DATA_SIZE(fs_info) / 3) {
			u32 min_push_space;

			/* push_leaf_left fixes the path.
			 * make sure the path still points to our leaf
			 * for possible call to btrfs_del_ptr below
			 */
			slot = path->slots[1];
			atomic_inc(&leaf->refs);
			/*
			 * We want to be able to at least push one item to the
			 * left neighbour leaf, and that's the first item.
			 */
			min_push_space = sizeof(struct btrfs_item) +
				btrfs_item_size(leaf, 0);
			wret = push_leaf_left(trans, root, path, 0,
					      min_push_space, 1, (u32)-1);
			if (wret < 0 && wret != -ENOSPC)
				ret = wret;

			if (path->nodes[0] == leaf &&
			    btrfs_header_nritems(leaf)) {
				/*
				 * If we were not able to push all items from our
				 * leaf to its left neighbour, then attempt to
				 * either push all the remaining items to the
				 * right neighbour or none. There's no advantage
				 * in pushing only some items, instead of all, as
				 * it's pointless to end up with a leaf having
				 * too few items while the neighbours can be full
				 * or nearly full.
				 */
				nritems = btrfs_header_nritems(leaf);
				min_push_space = leaf_space_used(leaf, 0, nritems);
				wret = push_leaf_right(trans, root, path, 0,
						       min_push_space, 1, 0);
				if (wret < 0 && wret != -ENOSPC)
					ret = wret;
			}

			if (btrfs_header_nritems(leaf) == 0) {
				path->slots[1] = slot;
				ret = btrfs_del_leaf(trans, root, path, leaf);
				if (ret < 0)
					return ret;
				free_extent_buffer(leaf);
				ret = 0;
			} else {
				/* if we're still in the path, make sure
				 * we're dirty.  Otherwise, one of the
				 * push_leaf functions must have already
				 * dirtied this buffer
				 */
				if (path->nodes[0] == leaf)
					btrfs_mark_buffer_dirty(trans, leaf);
				free_extent_buffer(leaf);
			}
		} else {
			btrfs_mark_buffer_dirty(trans, leaf);
		}
	}
	return ret;
}
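/*
 * A minimal usage sketch, not part of the original file: a caller normally
 * positions @path with btrfs_search_slot() (ins_len = -1 readies the path
 * for a deletion, cow = 1) and then removes one or more contiguous items.
 * The trans, root and key below are assumed to exist in the caller.
 *
 *	struct btrfs_path *path = btrfs_alloc_path();
 *	int ret;
 *
 *	if (!path)
 *		return -ENOMEM;
 *	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
 *	if (ret == 0)
 *		ret = btrfs_del_items(trans, root, path, path->slots[0], 1);
 *	btrfs_free_path(path);
 */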
/*
 * A helper function to walk down the tree starting at min_key, and looking
 * for nodes or leaves that have a minimum transaction id.
 * This is used by the btree defrag code, and tree logging.
 *
 * This does not cow, but it does stuff the starting key it finds back
 * into min_key, so you can call btrfs_search_slot with cow=1 on the
 * key and get a writable path.
 *
 * This honors path->lowest_level to prevent descent past a given level
 * of the tree.
 *
 * min_trans indicates the oldest transaction that you are interested
 * in walking through.  Any nodes or leaves older than min_trans are
 * skipped over (without reading them).
 *
 * returns zero if something useful was found, < 0 on error and 1 if there
 * was nothing in the tree that matched the search criteria.
 */
int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
			 struct btrfs_path *path,
			 u64 min_trans)
{
	struct extent_buffer *cur;
	struct btrfs_key found_key;
	int slot;
	int sret;
	u32 nritems;
	int level;
	int ret = 1;
	int keep_locks = path->keep_locks;

	ASSERT(!path->nowait);
	path->keep_locks = 1;
again:
	cur = btrfs_read_lock_root_node(root);
	level = btrfs_header_level(cur);
	WARN_ON(path->nodes[level]);
	path->nodes[level] = cur;
	path->locks[level] = BTRFS_READ_LOCK;

	if (btrfs_header_generation(cur) < min_trans) {
		ret = 1;
		goto out;
	}
	while (1) {
		nritems = btrfs_header_nritems(cur);
		level = btrfs_header_level(cur);
		sret = btrfs_bin_search(cur, 0, min_key, &slot);
		if (sret < 0) {
			ret = sret;
			goto out;
		}

		/* at the lowest level, we're done, setup the path and exit */
		if (level == path->lowest_level) {
			if (slot >= nritems)
				goto find_next_key;
			ret = 0;
			path->slots[level] = slot;
			btrfs_item_key_to_cpu(cur, &found_key, slot);
			goto out;
		}
		if (sret && slot > 0)
			slot--;
		/*
		 * check this node pointer against the min_trans parameters.
		 * If it is too old, skip to the next one.
		 */
		while (slot < nritems) {
			u64 gen;

			gen = btrfs_node_ptr_generation(cur, slot);
			if (gen < min_trans) {
				slot++;
				continue;
			}
			break;
		}
find_next_key:
		/*
		 * we didn't find a candidate key in this node, walk forward
		 * and find another one
		 */
		if (slot >= nritems) {
			path->slots[level] = slot;
			sret = btrfs_find_next_key(root, path, min_key, level,
						   min_trans);
			if (sret == 0) {
				btrfs_release_path(path);
				goto again;
			} else {
				goto out;
			}
		}
		/* save our key for returning back */
		btrfs_node_key_to_cpu(cur, &found_key, slot);
		path->slots[level] = slot;
		if (level == path->lowest_level) {
			ret = 0;
			goto out;
		}
		cur = btrfs_read_node_slot(cur, slot);
		if (IS_ERR(cur)) {
			ret = PTR_ERR(cur);
			goto out;
		}

		btrfs_tree_read_lock(cur);

		path->locks[level - 1] = BTRFS_READ_LOCK;
		path->nodes[level - 1] = cur;
		unlock_up(path, level, 1, 0, NULL);
	}
out:
	path->keep_locks = keep_locks;
	if (ret == 0) {
		btrfs_unlock_up_safe(path, path->lowest_level + 1);
		memcpy(min_key, &found_key, sizeof(found_key));
	}
	return ret;
}
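/*
 * Illustrative sketch of a forward scan, not part of the original file.
 * Assumptions: the caller owns @root and a freshly allocated @path, and
 * advances min_key itself so the same key is not returned twice; real
 * callers such as tree logging step the key in a type-aware way, and
 * process() stands in for whatever per-item work the caller does
 * (hypothetical helper).
 *
 *	min_key.objectid = 0;
 *	min_key.type = 0;
 *	min_key.offset = 0;
 *	while (btrfs_search_forward(root, &min_key, path, min_trans) == 0) {
 *		process(path);
 *		btrfs_release_path(path);
 *		if (min_key.offset == (u64)-1)
 *			break;
 *		min_key.offset++;
 *	}
 */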
/*
 * this is similar to btrfs_next_leaf, but does not try to preserve
 * and fixup the path.  It looks for and returns the next key in the
 * tree based on the current path and the min_trans parameters.
 *
 * 0 is returned if another key is found, < 0 if there are any errors
 * and 1 is returned if there are no higher keys in the tree
 *
 * path->keep_locks should be set to 1 on the search made before
 * calling this function.
 */
int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
			struct btrfs_key *key, int level, u64 min_trans)
{
	int slot;
	struct extent_buffer *c;

	WARN_ON(!path->keep_locks && !path->skip_locking);
	while (level < BTRFS_MAX_LEVEL) {
		if (!path->nodes[level])
			return 1;

		slot = path->slots[level] + 1;
		c = path->nodes[level];
next:
		if (slot >= btrfs_header_nritems(c)) {
			int ret;
			int orig_lowest;
			struct btrfs_key cur_key;

			if (level + 1 >= BTRFS_MAX_LEVEL ||
			    !path->nodes[level + 1])
				return 1;

			if (path->locks[level + 1] || path->skip_locking) {
				level++;
				continue;
			}

			slot = btrfs_header_nritems(c) - 1;
			if (level == 0)
				btrfs_item_key_to_cpu(c, &cur_key, slot);
			else
				btrfs_node_key_to_cpu(c, &cur_key, slot);

			orig_lowest = path->lowest_level;
			btrfs_release_path(path);
			path->lowest_level = level;
			ret = btrfs_search_slot(NULL, root, &cur_key, path,
						0, 0);
			path->lowest_level = orig_lowest;
			if (ret < 0)
				return ret;

			c = path->nodes[level];
			slot = path->slots[level];
			if (ret == 0)
				slot++;
			goto next;
		}

		if (level == 0)
			btrfs_item_key_to_cpu(c, key, slot);
		else {
			u64 gen = btrfs_node_ptr_generation(c, slot);

			if (gen < min_trans) {
				slot++;
				goto next;
			}
			btrfs_node_key_to_cpu(c, key, slot);
		}
		return 0;
	}
	return 1;
}
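/*
 * Hedged usage sketch, not part of the original file: the search made
 * before calling this helper must keep the upper-level locks, so the
 * caller sets path->keep_locks first.  next_key and min_trans below are
 * assumed caller-owned.
 *
 *	path->keep_locks = 1;
 *	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 *	if (ret >= 0)
 *		ret = btrfs_find_next_key(root, path, &next_key, 0,
 *					  min_trans);
 *	path->keep_locks = 0;
 *	btrfs_release_path(path);
 */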
int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
			u64 time_seq)
{
	int slot;
	int level;
	struct extent_buffer *c;
	struct extent_buffer *next;
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_key key;
	bool need_commit_sem = false;
	u32 nritems;
	int ret;
	int i;

	/*
	 * The nowait semantics are used only for write paths, where we don't
	 * use the tree mod log and sequence numbers.
	 */
	if (time_seq)
		ASSERT(!path->nowait);

	nritems = btrfs_header_nritems(path->nodes[0]);
	if (nritems == 0)
		return 1;

	btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1);
again:
	level = 1;
	next = NULL;
	btrfs_release_path(path);

	path->keep_locks = 1;

	if (time_seq) {
		ret = btrfs_search_old_slot(root, &key, path, time_seq);
	} else {
		if (path->need_commit_sem) {
			path->need_commit_sem = 0;
			need_commit_sem = true;
			if (path->nowait) {
				if (!down_read_trylock(&fs_info->commit_root_sem)) {
					ret = -EAGAIN;
					goto done;
				}
			} else {
				down_read(&fs_info->commit_root_sem);
			}
		}
		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	}
	path->keep_locks = 0;

	if (ret < 0)
		goto done;

	nritems = btrfs_header_nritems(path->nodes[0]);
	/*
	 * by releasing the path above we dropped all our locks.  A balance
	 * could have added more items next to the key that used to be
	 * at the very end of the block.  So, check again here and
	 * advance the path if there are now more items available.
	 */
	if (nritems > 0 && path->slots[0] < nritems - 1) {
		if (ret == 0)
			path->slots[0]++;
		ret = 0;
		goto done;
	}
	/*
	 * So the above check misses one case:
	 * - after releasing the path above, someone has removed the item that
	 *   used to be at the very end of the block, and balance between leafs
	 *   gets another one with bigger key.offset to replace it.
	 *
	 * This one should be returned as well, or we can get leaf corruption
	 * later (especially in __btrfs_drop_extents()).
	 *
	 * And a bit more explanation about this check:
	 * with ret > 0, the key isn't found, the path points to the slot
	 * where it should be inserted, so the path->slots[0] item must be the
	 * bigger one.
	 */
	if (nritems > 0 && ret > 0 && path->slots[0] == nritems - 1) {
		ret = 0;
		goto done;
	}

	while (level < BTRFS_MAX_LEVEL) {
		if (!path->nodes[level]) {
			ret = 1;
			goto done;
		}

		slot = path->slots[level] + 1;
		c = path->nodes[level];
		if (slot >= btrfs_header_nritems(c)) {
			level++;
			if (level == BTRFS_MAX_LEVEL) {
				ret = 1;
				goto done;
			}
			continue;
		}

		/*
		 * Our current level is where we're going to start from, and to
		 * make sure lockdep doesn't complain we need to drop our locks
		 * and nodes from 0 to our current level.
		 */
		for (i = 0; i < level; i++) {
			if (path->locks[level]) {
				btrfs_tree_read_unlock(path->nodes[i]);
				path->locks[i] = 0;
			}
			free_extent_buffer(path->nodes[i]);
			path->nodes[i] = NULL;
		}

		next = c;
		ret = read_block_for_search(root, path, &next, level,
					    slot, &key);
		if (ret == -EAGAIN && !path->nowait)
			goto again;

		if (ret < 0) {
			btrfs_release_path(path);
			goto done;
		}

		if (!path->skip_locking) {
			ret = btrfs_try_tree_read_lock(next);
			if (!ret && path->nowait) {
				ret = -EAGAIN;
				goto done;
			}
			if (!ret && time_seq) {
				/*
				 * If we don't get the lock, we may be racing
				 * with push_leaf_left, holding that lock while
				 * itself waiting for the leaf we've currently
				 * locked. To solve this situation, we give up
				 * on our lock and cycle.
				 */
				free_extent_buffer(next);
				btrfs_release_path(path);
				cond_resched();
				goto again;
			}
			if (!ret)
				btrfs_tree_read_lock(next);
		}
		break;
	}
	path->slots[level] = slot;
	while (1) {
		level--;
		path->nodes[level] = next;
		path->slots[level] = 0;
		if (!path->skip_locking)
			path->locks[level] = BTRFS_READ_LOCK;
		if (!level)
			break;

		ret = read_block_for_search(root, path, &next, level,
					    0, &key);
		if (ret == -EAGAIN && !path->nowait)
			goto again;

		if (ret < 0) {
			btrfs_release_path(path);
			goto done;
		}

		if (!path->skip_locking) {
			if (!btrfs_try_tree_read_lock(next)) {
				if (path->nowait) {
					ret = -EAGAIN;
					goto done;
				}
				btrfs_tree_read_lock(next);
			}
		}
	}
	ret = 0;
done:
	unlock_up(path, 0, 1, 0, NULL);
	if (need_commit_sem) {
		int ret2;

		path->need_commit_sem = 1;
		ret2 = finish_need_commit_sem_search(path);
		up_read(&fs_info->commit_root_sem);
		if (ret2)
			ret = ret2;
	}

	return ret;
}
int btrfs_next_old_item(struct btrfs_root *root, struct btrfs_path *path, u64 time_seq)
{
	path->slots[0]++;
	if (path->slots[0] >= btrfs_header_nritems(path->nodes[0]))
		return btrfs_next_old_leaf(root, path, time_seq);
	return 0;
}
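/*
 * Sketch of the common iteration pattern built on this helper, not part
 * of the original file.  time_seq == 0 gives plain "next item" semantics;
 * the ret > 0 case from btrfs_search_slot() and error handling are elided
 * for brevity, and process_item() is a hypothetical stand-in for the
 * caller's per-item work.
 *
 *	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 *	while (ret == 0) {
 *		btrfs_item_key_to_cpu(path->nodes[0], &found,
 *				      path->slots[0]);
 *		if (found.objectid != key.objectid)
 *			break;
 *		process_item(path);
 *		ret = btrfs_next_old_item(root, path, 0);
 *	}
 */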
/*
 * this uses btrfs_prev_leaf to walk backwards in the tree, and keeps
 * searching until it gets past min_objectid or finds an item of 'type'
 *
 * returns 0 if something is found, 1 if nothing was found and < 0 on error
 */
int btrfs_previous_item(struct btrfs_root *root,
			struct btrfs_path *path, u64 min_objectid,
			int type)
{
	struct btrfs_key found_key;
	struct extent_buffer *leaf;
	u32 nritems;
	int ret;

	while (1) {
		if (path->slots[0] == 0) {
			ret = btrfs_prev_leaf(root, path);
			if (ret != 0)
				return ret;
		} else {
			path->slots[0]--;
		}
		leaf = path->nodes[0];
		nritems = btrfs_header_nritems(leaf);
		if (nritems == 0)
			return 1;
		if (path->slots[0] == nritems)
			path->slots[0]--;

		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
		if (found_key.objectid < min_objectid)
			break;
		if (found_key.type == type)
			return 0;
		if (found_key.objectid == min_objectid &&
		    found_key.type < type)
			break;
	}
	return 1;
}
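/*
 * Illustrative caller, not part of the original file: with @path already
 * pointing at some item of @objectid, walk back to the nearest preceding
 * item of the wanted type (BTRFS_INODE_ITEM_KEY is just an example key
 * type; objectid and found_key are assumed caller-owned).
 *
 *	ret = btrfs_previous_item(root, path, objectid,
 *				  BTRFS_INODE_ITEM_KEY);
 *	if (ret == 0)
 *		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
 *				      path->slots[0]);
 */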
/*
 * search in extent tree to find a previous Metadata/Data extent item with
 * min objectid.
 *
 * returns 0 if something is found, 1 if nothing was found and < 0 on error
 */
int btrfs_previous_extent_item(struct btrfs_root *root,
			       struct btrfs_path *path, u64 min_objectid)
{
	struct btrfs_key found_key;
	struct extent_buffer *leaf;
	u32 nritems;
	int ret;

	while (1) {
		if (path->slots[0] == 0) {
			ret = btrfs_prev_leaf(root, path);
			if (ret != 0)
				return ret;
		} else {
			path->slots[0]--;
		}
		leaf = path->nodes[0];
		nritems = btrfs_header_nritems(leaf);
		if (nritems == 0)
			return 1;
		if (path->slots[0] == nritems)
			path->slots[0]--;

		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
		if (found_key.objectid < min_objectid)
			break;
		if (found_key.type == BTRFS_EXTENT_ITEM_KEY ||
		    found_key.type == BTRFS_METADATA_ITEM_KEY)
			return 0;
		if (found_key.objectid == min_objectid &&
		    found_key.type < BTRFS_EXTENT_ITEM_KEY)
			break;
	}
	return 1;
}
int __init btrfs_ctree_init(void)
{
	btrfs_path_cachep = kmem_cache_create("btrfs_path",
					      sizeof(struct btrfs_path), 0,
					      SLAB_MEM_SPREAD, NULL);
	if (!btrfs_path_cachep)
		return -ENOMEM;
	return 0;
}

void __cold btrfs_ctree_exit(void)
{
	kmem_cache_destroy(btrfs_path_cachep);
}