// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2011 Fujitsu.  All rights reserved.
 * Written by Miao Xie <miaox@cn.fujitsu.com>
 */

#include <linux/slab.h>
#include <linux/iversion.h>
#include "ctree.h"
#include "fs.h"
#include "messages.h"
#include "misc.h"
#include "delayed-inode.h"
#include "disk-io.h"
#include "transaction.h"
#include "qgroup.h"
#include "locking.h"
#include "inode-item.h"
#include "space-info.h"
#include "accessors.h"
#include "file-item.h"

#define BTRFS_DELAYED_WRITEBACK		512
#define BTRFS_DELAYED_BACKGROUND	128
#define BTRFS_DELAYED_BATCH		16

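/*
 * How these thresholds interact (summarizing the logic further below in this
 * file): once the global number of delayed items reaches
 * BTRFS_DELAYED_BACKGROUND, btrfs_balance_delayed_items() may kick off async
 * flushing work; once it reaches BTRFS_DELAYED_WRITEBACK the caller also
 * blocks until the backlog shrinks. BTRFS_DELAYED_BATCH is the granularity
 * used by the wait/wakeup logic (see finish_one_item() and could_end_wait()).
 */
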
static struct kmem_cache *delayed_node_cache;

int __init btrfs_delayed_inode_init(void)
{
	delayed_node_cache = kmem_cache_create("btrfs_delayed_node",
					sizeof(struct btrfs_delayed_node),
					0, SLAB_MEM_SPREAD, NULL);
	if (!delayed_node_cache)
		return -ENOMEM;
	return 0;
}

void __cold btrfs_delayed_inode_exit(void)
{
	kmem_cache_destroy(delayed_node_cache);
}

static inline void btrfs_init_delayed_node(
				struct btrfs_delayed_node *delayed_node,
				struct btrfs_root *root, u64 inode_id)
{
	delayed_node->root = root;
	delayed_node->inode_id = inode_id;
	refcount_set(&delayed_node->refs, 0);
	delayed_node->ins_root = RB_ROOT_CACHED;
	delayed_node->del_root = RB_ROOT_CACHED;
	mutex_init(&delayed_node->mutex);
	INIT_LIST_HEAD(&delayed_node->n_list);
	INIT_LIST_HEAD(&delayed_node->p_list);
}

static struct btrfs_delayed_node *btrfs_get_delayed_node(
		struct btrfs_inode *btrfs_inode)
{
	struct btrfs_root *root = btrfs_inode->root;
	u64 ino = btrfs_ino(btrfs_inode);
	struct btrfs_delayed_node *node;

	node = READ_ONCE(btrfs_inode->delayed_node);
	if (node) {
		refcount_inc(&node->refs);
		return node;
	}

	spin_lock(&root->inode_lock);
	node = radix_tree_lookup(&root->delayed_nodes_tree, ino);

	if (node) {
		if (btrfs_inode->delayed_node) {
			refcount_inc(&node->refs);	/* can be accessed */
			BUG_ON(btrfs_inode->delayed_node != node);
			spin_unlock(&root->inode_lock);
			return node;
		}

		/*
		 * It's possible that we're racing into the middle of removing
		 * this node from the radix tree. In this case, the refcount
		 * was zero and it should never go back to one. Just return
		 * NULL like it was never in the radix at all; our release
		 * function is in the process of removing it.
		 *
		 * Some implementations of refcount_inc refuse to bump the
		 * refcount once it has hit zero. If we don't do this dance
		 * here, refcount_inc() may decide to just WARN_ONCE() instead
		 * of actually bumping the refcount.
		 *
		 * If this node is properly in the radix, we want to bump the
		 * refcount twice, once for the inode and once for this get
		 * operation.
		 */
		if (refcount_inc_not_zero(&node->refs)) {
			refcount_inc(&node->refs);
			btrfs_inode->delayed_node = node;
		} else {
			node = NULL;
		}

		spin_unlock(&root->inode_lock);
		return node;
	}
	spin_unlock(&root->inode_lock);

	return NULL;
}

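/*
 * Note on the creation race handled below: two tasks may both miss the lookup
 * and allocate a node for the same inode. radix_tree_insert() returns -EEXIST
 * for the loser, which frees its copy and retries the lookup from the top.
 */
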
/* Will return either the node or PTR_ERR(-ENOMEM) */
static struct btrfs_delayed_node *btrfs_get_or_create_delayed_node(
		struct btrfs_inode *btrfs_inode)
{
	struct btrfs_delayed_node *node;
	struct btrfs_root *root = btrfs_inode->root;
	u64 ino = btrfs_ino(btrfs_inode);
	int ret;

again:
	node = btrfs_get_delayed_node(btrfs_inode);
	if (node)
		return node;

	node = kmem_cache_zalloc(delayed_node_cache, GFP_NOFS);
	if (!node)
		return ERR_PTR(-ENOMEM);
	btrfs_init_delayed_node(node, root, ino);

	/* cached in the btrfs inode and can be accessed */
	refcount_set(&node->refs, 2);

	ret = radix_tree_preload(GFP_NOFS);
	if (ret) {
		kmem_cache_free(delayed_node_cache, node);
		return ERR_PTR(ret);
	}

	spin_lock(&root->inode_lock);
	ret = radix_tree_insert(&root->delayed_nodes_tree, ino, node);
	if (ret == -EEXIST) {
		spin_unlock(&root->inode_lock);
		kmem_cache_free(delayed_node_cache, node);
		radix_tree_preload_end();
		goto again;
	}
	btrfs_inode->delayed_node = node;
	spin_unlock(&root->inode_lock);
	radix_tree_preload_end();

	return node;
}

/*
 * Call it when holding delayed_node->mutex
 *
 * If mod = 1, add this node into the prepared list.
 */
static void btrfs_queue_delayed_node(struct btrfs_delayed_root *root,
				     struct btrfs_delayed_node *node,
				     int mod)
{
	spin_lock(&root->lock);
	if (test_bit(BTRFS_DELAYED_NODE_IN_LIST, &node->flags)) {
		if (!list_empty(&node->p_list))
			list_move_tail(&node->p_list, &root->prepare_list);
		else if (mod)
			list_add_tail(&node->p_list, &root->prepare_list);
	} else {
		list_add_tail(&node->n_list, &root->node_list);
		list_add_tail(&node->p_list, &root->prepare_list);
		refcount_inc(&node->refs);	/* inserted into list */
		root->nodes++;
		set_bit(BTRFS_DELAYED_NODE_IN_LIST, &node->flags);
	}
	spin_unlock(&root->lock);
}

/* Call it when holding delayed_node->mutex */
static void btrfs_dequeue_delayed_node(struct btrfs_delayed_root *root,
				       struct btrfs_delayed_node *node)
{
	spin_lock(&root->lock);
	if (test_bit(BTRFS_DELAYED_NODE_IN_LIST, &node->flags)) {
		root->nodes--;
		refcount_dec(&node->refs);	/* not in the list */
		list_del_init(&node->n_list);
		if (!list_empty(&node->p_list))
			list_del_init(&node->p_list);
		clear_bit(BTRFS_DELAYED_NODE_IN_LIST, &node->flags);
	}
	spin_unlock(&root->lock);
}

static struct btrfs_delayed_node *btrfs_first_delayed_node(
			struct btrfs_delayed_root *delayed_root)
{
	struct list_head *p;
	struct btrfs_delayed_node *node = NULL;

	spin_lock(&delayed_root->lock);
	if (list_empty(&delayed_root->node_list))
		goto out;

	p = delayed_root->node_list.next;
	node = list_entry(p, struct btrfs_delayed_node, n_list);
	refcount_inc(&node->refs);
out:
	spin_unlock(&delayed_root->lock);

	return node;
}

static struct btrfs_delayed_node *btrfs_next_delayed_node(
						struct btrfs_delayed_node *node)
{
	struct btrfs_delayed_root *delayed_root;
	struct list_head *p;
	struct btrfs_delayed_node *next = NULL;

	delayed_root = node->root->fs_info->delayed_root;
	spin_lock(&delayed_root->lock);
	if (!test_bit(BTRFS_DELAYED_NODE_IN_LIST, &node->flags)) {
		/* not in the list */
		if (list_empty(&delayed_root->node_list))
			goto out;
		p = delayed_root->node_list.next;
	} else if (list_is_last(&node->n_list, &delayed_root->node_list))
		goto out;
	else
		p = node->n_list.next;

	next = list_entry(p, struct btrfs_delayed_node, n_list);
	refcount_inc(&next->refs);
out:
	spin_unlock(&delayed_root->lock);

	return next;
}

static void __btrfs_release_delayed_node(
				struct btrfs_delayed_node *delayed_node,
				int mod)
{
	struct btrfs_delayed_root *delayed_root;

	if (!delayed_node)
		return;

	delayed_root = delayed_node->root->fs_info->delayed_root;

	mutex_lock(&delayed_node->mutex);
	if (delayed_node->count)
		btrfs_queue_delayed_node(delayed_root, delayed_node, mod);
	else
		btrfs_dequeue_delayed_node(delayed_root, delayed_node);
	mutex_unlock(&delayed_node->mutex);

	if (refcount_dec_and_test(&delayed_node->refs)) {
		struct btrfs_root *root = delayed_node->root;

		spin_lock(&root->inode_lock);
		/*
		 * Once our refcount goes to zero, nobody is allowed to bump it
		 * back up. We can delete it now.
		 */
		ASSERT(refcount_read(&delayed_node->refs) == 0);
		radix_tree_delete(&root->delayed_nodes_tree,
				  delayed_node->inode_id);
		spin_unlock(&root->inode_lock);
		kmem_cache_free(delayed_node_cache, delayed_node);
	}
}

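/*
 * The final reference is dropped while holding root->inode_lock above, which
 * pairs with the refcount_inc_not_zero() check in btrfs_get_delayed_node():
 * once the refcount has hit zero the node may no longer be handed out, so it
 * can safely be deleted from the radix tree and freed.
 */
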
static inline void btrfs_release_delayed_node(struct btrfs_delayed_node *node)
{
	__btrfs_release_delayed_node(node, 0);
}

static struct btrfs_delayed_node *btrfs_first_prepared_delayed_node(
					struct btrfs_delayed_root *delayed_root)
{
	struct list_head *p;
	struct btrfs_delayed_node *node = NULL;

	spin_lock(&delayed_root->lock);
	if (list_empty(&delayed_root->prepare_list))
		goto out;

	p = delayed_root->prepare_list.next;
	list_del_init(p);
	node = list_entry(p, struct btrfs_delayed_node, p_list);
	refcount_inc(&node->refs);
out:
	spin_unlock(&delayed_root->lock);

	return node;
}

static inline void btrfs_release_prepared_delayed_node(
					struct btrfs_delayed_node *node)
{
	__btrfs_release_delayed_node(node, 1);
}

static struct btrfs_delayed_item *btrfs_alloc_delayed_item(u16 data_len,
					struct btrfs_delayed_node *node,
					enum btrfs_delayed_item_type type)
{
	struct btrfs_delayed_item *item;

	item = kmalloc(sizeof(*item) + data_len, GFP_NOFS);
	if (item) {
		item->data_len = data_len;
		item->type = type;
		item->bytes_reserved = 0;
		item->delayed_node = node;
		RB_CLEAR_NODE(&item->rb_node);
		INIT_LIST_HEAD(&item->log_list);
		item->logged = false;
		refcount_set(&item->refs, 1);
	}
	return item;
}

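/*
 * The item's payload (for dir index insertions, a struct btrfs_dir_item
 * followed by the file name) is stored inline right after the structure,
 * which is why the allocation above is sizeof(*item) + data_len.
 */
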
/*
 * __btrfs_lookup_delayed_item - look up the delayed item by key
 * @root:	the rbtree (ins_root or del_root) to search in
 * @index:	the dir index value to lookup (offset of a dir index key)
 *
 * Note: if no item with a matching index is found, NULL is returned.
 */
static struct btrfs_delayed_item *__btrfs_lookup_delayed_item(
				struct rb_root *root,
				u64 index)
{
	struct rb_node *node = root->rb_node;
	struct btrfs_delayed_item *delayed_item = NULL;

	while (node) {
		delayed_item = rb_entry(node, struct btrfs_delayed_item,
					rb_node);
		if (delayed_item->index < index)
			node = node->rb_right;
		else if (delayed_item->index > index)
			node = node->rb_left;
		else
			return delayed_item;
	}

	return NULL;
}

static int __btrfs_add_delayed_item(struct btrfs_delayed_node *delayed_node,
				    struct btrfs_delayed_item *ins)
{
	struct rb_node **p, *node;
	struct rb_node *parent_node = NULL;
	struct rb_root_cached *root;
	struct btrfs_delayed_item *item;
	bool leftmost = true;

	if (ins->type == BTRFS_DELAYED_INSERTION_ITEM)
		root = &delayed_node->ins_root;
	else
		root = &delayed_node->del_root;

	p = &root->rb_root.rb_node;
	node = &ins->rb_node;

	while (*p) {
		parent_node = *p;
		item = rb_entry(parent_node, struct btrfs_delayed_item,
				rb_node);

		if (item->index < ins->index) {
			p = &(*p)->rb_right;
			leftmost = false;
		} else if (item->index > ins->index) {
			p = &(*p)->rb_left;
		} else {
			return -EEXIST;
		}
	}

	rb_link_node(node, parent_node, p);
	rb_insert_color_cached(node, root, leftmost);

	if (ins->type == BTRFS_DELAYED_INSERTION_ITEM &&
	    ins->index >= delayed_node->index_cnt)
		delayed_node->index_cnt = ins->index + 1;

	delayed_node->count++;
	atomic_inc(&delayed_node->root->fs_info->delayed_root->items);
	return 0;
}

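/*
 * Both trees are keyed by the dir index offset, so inserting an already
 * queued index fails with -EEXIST above. index_cnt tracks the next free
 * index of the directory and is only bumped by insertion items.
 */
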
static void finish_one_item(struct btrfs_delayed_root *delayed_root)
{
	int seq = atomic_inc_return(&delayed_root->items_seq);

	/* atomic_dec_return implies a barrier */
	if ((atomic_dec_return(&delayed_root->items) <
	    BTRFS_DELAYED_BACKGROUND || seq % BTRFS_DELAYED_BATCH == 0))
		cond_wake_up_nomb(&delayed_root->wait);
}

static void __btrfs_remove_delayed_item(struct btrfs_delayed_item *delayed_item)
{
	struct rb_root_cached *root;
	struct btrfs_delayed_root *delayed_root;

	/* Not inserted, ignore it. */
	if (RB_EMPTY_NODE(&delayed_item->rb_node))
		return;

	delayed_root = delayed_item->delayed_node->root->fs_info->delayed_root;

	BUG_ON(!delayed_root);

	if (delayed_item->type == BTRFS_DELAYED_INSERTION_ITEM)
		root = &delayed_item->delayed_node->ins_root;
	else
		root = &delayed_item->delayed_node->del_root;

	rb_erase_cached(&delayed_item->rb_node, root);
	RB_CLEAR_NODE(&delayed_item->rb_node);
	delayed_item->delayed_node->count--;

	finish_one_item(delayed_root);
}

static void btrfs_release_delayed_item(struct btrfs_delayed_item *item)
{
	if (item) {
		__btrfs_remove_delayed_item(item);
		if (refcount_dec_and_test(&item->refs))
			kfree(item);
	}
}

static struct btrfs_delayed_item *__btrfs_first_delayed_insertion_item(
					struct btrfs_delayed_node *delayed_node)
{
	struct rb_node *p;
	struct btrfs_delayed_item *item = NULL;

	p = rb_first_cached(&delayed_node->ins_root);
	if (p)
		item = rb_entry(p, struct btrfs_delayed_item, rb_node);

	return item;
}

static struct btrfs_delayed_item *__btrfs_first_delayed_deletion_item(
					struct btrfs_delayed_node *delayed_node)
{
	struct rb_node *p;
	struct btrfs_delayed_item *item = NULL;

	p = rb_first_cached(&delayed_node->del_root);
	if (p)
		item = rb_entry(p, struct btrfs_delayed_item, rb_node);

	return item;
}

static struct btrfs_delayed_item *__btrfs_next_delayed_item(
						struct btrfs_delayed_item *item)
{
	struct rb_node *p;
	struct btrfs_delayed_item *next = NULL;

	p = rb_next(&item->rb_node);
	if (p)
		next = rb_entry(p, struct btrfs_delayed_item, rb_node);

	return next;
}

static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans,
					       struct btrfs_delayed_item *item)
{
	struct btrfs_block_rsv *src_rsv;
	struct btrfs_block_rsv *dst_rsv;
	struct btrfs_fs_info *fs_info = trans->fs_info;
	u64 num_bytes;
	int ret;

	if (!trans->bytes_reserved)
		return 0;

	src_rsv = trans->block_rsv;
	dst_rsv = &fs_info->delayed_block_rsv;

	num_bytes = btrfs_calc_insert_metadata_size(fs_info, 1);

	/*
	 * Here we migrate space rsv from the transaction rsv, since we have
	 * already reserved space when starting a transaction. So no need to
	 * reserve qgroup space here.
	 */
	ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes, true);
	if (!ret) {
		trace_btrfs_space_reservation(fs_info, "delayed_item",
					      item->delayed_node->inode_id,
					      num_bytes, 1);
		/*
		 * For insertions we track reserved metadata space by accounting
		 * for the number of leaves that will be used, based on the delayed
		 * node's index_items_size field.
		 */
		if (item->type == BTRFS_DELAYED_DELETION_ITEM)
			item->bytes_reserved = num_bytes;
	}

	return ret;
}

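/*
 * Note the asymmetry above: only deletion items record bytes_reserved.
 * Reservations for insertion items are tracked per leaf through the delayed
 * node's index_item_leaves counter (see btrfs_insert_delayed_dir_index())
 * and released with btrfs_delayed_item_release_leaves() below.
 */
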
static void btrfs_delayed_item_release_metadata(struct btrfs_root *root,
						struct btrfs_delayed_item *item)
{
	struct btrfs_block_rsv *rsv;
	struct btrfs_fs_info *fs_info = root->fs_info;

	if (!item->bytes_reserved)
		return;

	rsv = &fs_info->delayed_block_rsv;
	/*
	 * Check btrfs_delayed_item_reserve_metadata() to see why we don't need
	 * to release/reserve qgroup space.
	 */
	trace_btrfs_space_reservation(fs_info, "delayed_item",
				      item->delayed_node->inode_id,
				      item->bytes_reserved, 0);
	btrfs_block_rsv_release(fs_info, rsv, item->bytes_reserved, NULL);
}

static void btrfs_delayed_item_release_leaves(struct btrfs_delayed_node *node,
					      unsigned int num_leaves)
{
	struct btrfs_fs_info *fs_info = node->root->fs_info;
	const u64 bytes = btrfs_calc_insert_metadata_size(fs_info, num_leaves);

	/* There are no space reservations during log replay, bail out. */
	if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags))
		return;

	trace_btrfs_space_reservation(fs_info, "delayed_item", node->inode_id,
				      bytes, 0);
	btrfs_block_rsv_release(fs_info, &fs_info->delayed_block_rsv, bytes, NULL);
}

static int btrfs_delayed_inode_reserve_metadata(
					struct btrfs_trans_handle *trans,
					struct btrfs_root *root,
					struct btrfs_delayed_node *node)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_block_rsv *src_rsv;
	struct btrfs_block_rsv *dst_rsv;
	u64 num_bytes;
	int ret;

	src_rsv = trans->block_rsv;
	dst_rsv = &fs_info->delayed_block_rsv;

	num_bytes = btrfs_calc_metadata_size(fs_info, 1);

	/*
	 * btrfs_dirty_inode will update the inode under btrfs_join_transaction
	 * which doesn't reserve space, for speed. This is a problem since we
	 * still need to reserve space for this update, so try to reserve the
	 * space.
	 *
	 * Now if src_rsv == delalloc_block_rsv we'll let it just steal since
	 * we always reserve enough to update the inode item.
	 */
	if (!src_rsv || (!trans->bytes_reserved &&
			 src_rsv->type != BTRFS_BLOCK_RSV_DELALLOC)) {
		ret = btrfs_qgroup_reserve_meta(root, num_bytes,
					BTRFS_QGROUP_RSV_META_PREALLOC, true);
		if (ret < 0)
			return ret;
		ret = btrfs_block_rsv_add(fs_info, dst_rsv, num_bytes,
					  BTRFS_RESERVE_NO_FLUSH);
		/* NO_FLUSH could only fail with -ENOSPC */
		ASSERT(ret == 0 || ret == -ENOSPC);
		if (ret)
			btrfs_qgroup_free_meta_prealloc(root, num_bytes);
	} else {
		ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes, true);
	}

	if (!ret) {
		trace_btrfs_space_reservation(fs_info, "delayed_inode",
					      node->inode_id, num_bytes, 1);
		node->bytes_reserved = num_bytes;
	}

	return ret;
}

static void btrfs_delayed_inode_release_metadata(struct btrfs_fs_info *fs_info,
						struct btrfs_delayed_node *node,
						bool qgroup_free)
{
	struct btrfs_block_rsv *rsv;

	if (!node->bytes_reserved)
		return;

	rsv = &fs_info->delayed_block_rsv;
	trace_btrfs_space_reservation(fs_info, "delayed_inode",
				      node->inode_id, node->bytes_reserved, 0);
	btrfs_block_rsv_release(fs_info, rsv, node->bytes_reserved, NULL);
	if (qgroup_free)
		btrfs_qgroup_free_meta_prealloc(node->root,
				node->bytes_reserved);
	else
		btrfs_qgroup_convert_reserved_meta(node->root,
				node->bytes_reserved);
	node->bytes_reserved = 0;
}

/*
 * Insert a single delayed item or a batch of delayed items, as many as possible
 * that fit in a leaf. The delayed items (dir index keys) are sorted by their key
 * in the rbtree, and if there's a gap between two consecutive dir index items,
 * then it means at some point we had delayed dir indexes to add but they got
 * removed (by btrfs_delete_delayed_dir_index()) before we attempted to flush them
 * into the subvolume tree. Dir index keys also have their offsets coming from a
 * monotonically increasing counter, so we can't get new keys with an offset that
 * fits within a gap between delayed dir index items.
 */
static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans,
				     struct btrfs_root *root,
				     struct btrfs_path *path,
				     struct btrfs_delayed_item *first_item)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_delayed_node *node = first_item->delayed_node;
	LIST_HEAD(item_list);
	struct btrfs_delayed_item *curr;
	struct btrfs_delayed_item *next;
	const int max_size = BTRFS_LEAF_DATA_SIZE(fs_info);
	struct btrfs_item_batch batch;
	struct btrfs_key first_key;
	const u32 first_data_size = first_item->data_len;
	int total_size;
	char *ins_data = NULL;
	int ret;
	bool continuous_keys_only = false;

	lockdep_assert_held(&node->mutex);

	/*
	 * During normal operation the delayed index offset is continuously
	 * increasing, so we can batch insert all items as there will not be any
	 * overlapping keys in the tree.
	 *
	 * The exception to this is log replay, where we may have interleaved
	 * offsets in the tree, so our batch needs to be continuous keys only in
	 * order to ensure we do not end up with out of order items in our leaf.
	 */
	if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags))
		continuous_keys_only = true;

	/*
	 * For delayed items to insert, we track reserved metadata bytes based
	 * on the number of leaves that we will use.
	 * See btrfs_insert_delayed_dir_index() and
	 * btrfs_delayed_item_reserve_metadata()).
	 */
	ASSERT(first_item->bytes_reserved == 0);

	list_add_tail(&first_item->tree_list, &item_list);
	batch.total_data_size = first_data_size;
	batch.nr = 1;
	total_size = first_data_size + sizeof(struct btrfs_item);
	curr = first_item;

	while (true) {
		int next_size;

		next = __btrfs_next_delayed_item(curr);
		if (!next)
			break;

		/*
		 * We cannot allow gaps in the key space if we're doing log
		 * replay.
		 */
		if (continuous_keys_only && (next->index != curr->index + 1))
			break;

		ASSERT(next->bytes_reserved == 0);

		next_size = next->data_len + sizeof(struct btrfs_item);
		if (total_size + next_size > max_size)
			break;

		list_add_tail(&next->tree_list, &item_list);
		batch.nr++;
		total_size += next_size;
		batch.total_data_size += next->data_len;
		curr = next;
	}

	if (batch.nr == 1) {
		first_key.objectid = node->inode_id;
		first_key.type = BTRFS_DIR_INDEX_KEY;
		first_key.offset = first_item->index;
		batch.keys = &first_key;
		batch.data_sizes = &first_data_size;
	} else {
		struct btrfs_key *ins_keys;
		u32 *ins_sizes;
		int i = 0;

		ins_data = kmalloc(batch.nr * sizeof(u32) +
				   batch.nr * sizeof(struct btrfs_key), GFP_NOFS);
		if (!ins_data) {
			ret = -ENOMEM;
			goto out;
		}
		ins_sizes = (u32 *)ins_data;
		ins_keys = (struct btrfs_key *)(ins_data + batch.nr * sizeof(u32));
		batch.keys = ins_keys;
		batch.data_sizes = ins_sizes;
		list_for_each_entry(curr, &item_list, tree_list) {
			ins_keys[i].objectid = node->inode_id;
			ins_keys[i].type = BTRFS_DIR_INDEX_KEY;
			ins_keys[i].offset = curr->index;
			ins_sizes[i] = curr->data_len;
			i++;
		}
	}

	ret = btrfs_insert_empty_items(trans, root, path, &batch);
	if (ret)
		goto out;

	list_for_each_entry(curr, &item_list, tree_list) {
		char *data_ptr;

		data_ptr = btrfs_item_ptr(path->nodes[0], path->slots[0], char);
		write_extent_buffer(path->nodes[0], &curr->data,
				    (unsigned long)data_ptr, curr->data_len);
		path->slots[0]++;
	}

	/*
	 * Now release our path before releasing the delayed items and their
	 * metadata reservations, so that we don't block other tasks for more
	 * time than needed.
	 */
	btrfs_release_path(path);

	ASSERT(node->index_item_leaves > 0);

	/*
	 * For normal operations we will batch an entire leaf's worth of delayed
	 * items, so if there are more items to process we can decrement
	 * index_item_leaves by 1 as we inserted 1 leaf's worth of items.
	 *
	 * However for log replay we may not have inserted an entire leaf's
	 * worth of items, we may have not had continuous items, so decrementing
	 * here would mess up the index_item_leaves accounting. For this case
	 * only clean up the accounting when there are no items left.
	 */
	if (next && !continuous_keys_only) {
		/*
		 * We inserted one batch of items into a leaf and there are more
		 * items to flush in a future batch, so now release one unit of
		 * metadata space from the delayed block reserve, corresponding
		 * to the leaf we just flushed to.
		 */
		btrfs_delayed_item_release_leaves(node, 1);
		node->index_item_leaves--;
	} else if (!next) {
		/*
		 * There are no more items to insert. We can have a number of
		 * reserved leaves > 1 here - this happens when many dir index
		 * items are added and then removed before they are flushed (file
		 * names with a very short life, never span a transaction). So
		 * release all remaining leaves.
		 */
		btrfs_delayed_item_release_leaves(node, node->index_item_leaves);
		node->index_item_leaves = 0;
	}

	list_for_each_entry_safe(curr, next, &item_list, tree_list) {
		list_del(&curr->tree_list);
		btrfs_release_delayed_item(curr);
	}
out:
	kfree(ins_data);
	return ret;
}

static int btrfs_insert_delayed_items(struct btrfs_trans_handle *trans,
				      struct btrfs_path *path,
				      struct btrfs_root *root,
				      struct btrfs_delayed_node *node)
{
	int ret = 0;

	while (ret == 0) {
		struct btrfs_delayed_item *curr;

		mutex_lock(&node->mutex);
		curr = __btrfs_first_delayed_insertion_item(node);
		if (!curr) {
			mutex_unlock(&node->mutex);
			break;
		}
		ret = btrfs_insert_delayed_item(trans, root, path, curr);
		mutex_unlock(&node->mutex);
	}

	return ret;
}

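/*
 * As with btrfs_delete_delayed_items() further below, the node's mutex is
 * taken and dropped around each batch on purpose, so that tasks queueing new
 * delayed items on this node are not blocked for the whole flush.
 */
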
static int btrfs_batch_delete_items(struct btrfs_trans_handle *trans,
				    struct btrfs_root *root,
				    struct btrfs_path *path,
				    struct btrfs_delayed_item *item)
{
	const u64 ino = item->delayed_node->inode_id;
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_delayed_item *curr, *next;
	struct extent_buffer *leaf = path->nodes[0];
	LIST_HEAD(batch_list);
	int nitems, slot, last_slot;
	int ret;
	u64 total_reserved_size = item->bytes_reserved;

	ASSERT(leaf != NULL);

	slot = path->slots[0];
	last_slot = btrfs_header_nritems(leaf) - 1;
	/*
	 * Our caller always gives us a path pointing to an existing item, so
	 * this can not happen.
	 */
	ASSERT(slot <= last_slot);
	if (WARN_ON(slot > last_slot))
		return -ENOENT;

	nitems = 1;
	curr = item;
	list_add_tail(&curr->tree_list, &batch_list);

	/*
	 * Keep checking if the next delayed item matches the next item in the
	 * leaf - if so, we can add it to the batch of items to delete from the
	 * leaf.
	 */
	while (slot < last_slot) {
		struct btrfs_key key;

		next = __btrfs_next_delayed_item(curr);
		if (!next)
			break;

		slot++;
		btrfs_item_key_to_cpu(leaf, &key, slot);
		if (key.objectid != ino ||
		    key.type != BTRFS_DIR_INDEX_KEY ||
		    key.offset != next->index)
			break;
		nitems++;
		curr = next;
		list_add_tail(&curr->tree_list, &batch_list);
		total_reserved_size += curr->bytes_reserved;
	}

	ret = btrfs_del_items(trans, root, path, path->slots[0], nitems);
	if (ret)
		return ret;

	/* In case of BTRFS_FS_LOG_RECOVERING items won't have reserved space */
	if (total_reserved_size > 0) {
		/*
		 * Check btrfs_delayed_item_reserve_metadata() to see why we
		 * don't need to release/reserve qgroup space.
		 */
		trace_btrfs_space_reservation(fs_info, "delayed_item", ino,
					      total_reserved_size, 0);
		btrfs_block_rsv_release(fs_info, &fs_info->delayed_block_rsv,
					total_reserved_size, NULL);
	}

	list_for_each_entry_safe(curr, next, &batch_list, tree_list) {
		list_del(&curr->tree_list);
		btrfs_release_delayed_item(curr);
	}

	return 0;
}

static int btrfs_delete_delayed_items(struct btrfs_trans_handle *trans,
				      struct btrfs_path *path,
				      struct btrfs_root *root,
				      struct btrfs_delayed_node *node)
{
	struct btrfs_key key;
	int ret = 0;

	key.objectid = node->inode_id;
	key.type = BTRFS_DIR_INDEX_KEY;

	while (ret == 0) {
		struct btrfs_delayed_item *item;

		mutex_lock(&node->mutex);
		item = __btrfs_first_delayed_deletion_item(node);
		if (!item) {
			mutex_unlock(&node->mutex);
			break;
		}

		key.offset = item->index;
		ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
		if (ret > 0) {
			/*
			 * There's no matching item in the leaf. This means we
			 * have already deleted this item in a past run of the
			 * delayed items. We ignore errors when running delayed
			 * items from an async context, through a work queue job
			 * running btrfs_async_run_delayed_root(), and don't
			 * release delayed items that failed to complete. This
			 * is because we will retry later, and at transaction
			 * commit time we always run delayed items and will
			 * then deal with errors if they fail to run again.
			 *
			 * So just release delayed items for which we can't find
			 * an item in the tree, and move to the next item.
			 */
			btrfs_release_path(path);
			btrfs_release_delayed_item(item);
			ret = 0;
		} else if (ret == 0) {
			ret = btrfs_batch_delete_items(trans, root, path, item);
			btrfs_release_path(path);
		}

		/*
		 * We unlock and relock on each iteration, this is to prevent
		 * blocking other tasks for too long while we are being run from
		 * the async context (work queue job). Those tasks are typically
		 * running system calls like creat/mkdir/rename/unlink/etc which
		 * need to add delayed items to this delayed node.
		 */
		mutex_unlock(&node->mutex);
	}

	return ret;
}

static void btrfs_release_delayed_inode(struct btrfs_delayed_node *delayed_node)
{
	struct btrfs_delayed_root *delayed_root;

	if (delayed_node &&
	    test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) {
		BUG_ON(!delayed_node->root);
		clear_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags);
		delayed_node->count--;

		delayed_root = delayed_node->root->fs_info->delayed_root;
		finish_one_item(delayed_root);
	}
}

static void btrfs_release_delayed_iref(struct btrfs_delayed_node *delayed_node)
{
	if (test_and_clear_bit(BTRFS_DELAYED_NODE_DEL_IREF, &delayed_node->flags)) {
		struct btrfs_delayed_root *delayed_root;

		ASSERT(delayed_node->root);
		delayed_node->count--;

		delayed_root = delayed_node->root->fs_info->delayed_root;
		finish_one_item(delayed_root);
	}
}

static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
					struct btrfs_root *root,
					struct btrfs_path *path,
					struct btrfs_delayed_node *node)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_key key;
	struct btrfs_inode_item *inode_item;
	struct extent_buffer *leaf;
	int mod;
	int ret;

	key.objectid = node->inode_id;
	key.type = BTRFS_INODE_ITEM_KEY;
	key.offset = 0;

	if (test_bit(BTRFS_DELAYED_NODE_DEL_IREF, &node->flags))
		mod = -1;
	else
		mod = 1;

	ret = btrfs_lookup_inode(trans, root, path, &key, mod);
	if (ret > 0)
		ret = -ENOENT;
	if (ret < 0)
		goto out;

	leaf = path->nodes[0];
	inode_item = btrfs_item_ptr(leaf, path->slots[0],
				    struct btrfs_inode_item);
	write_extent_buffer(leaf, &node->inode_item, (unsigned long)inode_item,
			    sizeof(struct btrfs_inode_item));
	btrfs_mark_buffer_dirty(leaf);

	if (!test_bit(BTRFS_DELAYED_NODE_DEL_IREF, &node->flags))
		goto out;

	path->slots[0]++;
	if (path->slots[0] >= btrfs_header_nritems(leaf))
		goto search;
again:
	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
	if (key.objectid != node->inode_id)
		goto out;

	if (key.type != BTRFS_INODE_REF_KEY &&
	    key.type != BTRFS_INODE_EXTREF_KEY)
		goto out;

	/*
	 * Delayed iref deletion is for the inode who has only one link,
	 * so there is only one iref. The case that several irefs are
	 * in the same item doesn't exist.
	 */
	ret = btrfs_del_item(trans, root, path);
out:
	btrfs_release_delayed_iref(node);
	btrfs_release_path(path);
err_out:
	btrfs_delayed_inode_release_metadata(fs_info, node, (ret < 0));
	btrfs_release_delayed_inode(node);

	/*
	 * If we fail to update the delayed inode we need to abort the
	 * transaction, because we could leave the inode with the improper
	 * counts behind.
	 */
	if (ret && ret != -ENOENT)
		btrfs_abort_transaction(trans, ret);

	return ret;

search:
	btrfs_release_path(path);

	key.type = BTRFS_INODE_EXTREF_KEY;
	key.offset = -1;

	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	if (ret < 0)
		goto err_out;
	ASSERT(ret);

	ret = 0;
	leaf = path->nodes[0];
	path->slots[0]--;
	goto again;
}

static inline int btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
					     struct btrfs_root *root,
					     struct btrfs_path *path,
					     struct btrfs_delayed_node *node)
{
	int ret;

	mutex_lock(&node->mutex);
	if (!test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &node->flags)) {
		mutex_unlock(&node->mutex);
		return 0;
	}

	ret = __btrfs_update_delayed_inode(trans, root, path, node);
	mutex_unlock(&node->mutex);
	return ret;
}

static inline int
__btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans,
				   struct btrfs_path *path,
				   struct btrfs_delayed_node *node)
{
	int ret;

	ret = btrfs_insert_delayed_items(trans, path, node->root, node);
	if (ret)
		return ret;

	ret = btrfs_delete_delayed_items(trans, path, node->root, node);
	if (ret)
		return ret;

	ret = btrfs_update_delayed_inode(trans, node->root, path, node);
	return ret;
}

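/*
 * Note the fixed order above: pending insertions are flushed first, then
 * pending deletions, and the staged inode item update runs last.
 */
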
/*
 * Called when committing the transaction.
 * Returns 0 on success.
 * Returns < 0 on error and returns with an aborted transaction with any
 * outstanding delayed items cleaned up.
 */
static int __btrfs_run_delayed_items(struct btrfs_trans_handle *trans, int nr)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_delayed_root *delayed_root;
	struct btrfs_delayed_node *curr_node, *prev_node;
	struct btrfs_path *path;
	struct btrfs_block_rsv *block_rsv;
	int ret = 0;
	bool count = (nr > 0);

	if (TRANS_ABORTED(trans))
		return -EIO;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	block_rsv = trans->block_rsv;
	trans->block_rsv = &fs_info->delayed_block_rsv;

	delayed_root = fs_info->delayed_root;

	curr_node = btrfs_first_delayed_node(delayed_root);
	while (curr_node && (!count || nr--)) {
		ret = __btrfs_commit_inode_delayed_items(trans, path,
							 curr_node);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			break;
		}

		prev_node = curr_node;
		curr_node = btrfs_next_delayed_node(curr_node);
		/*
		 * See the comment below about releasing path before releasing
		 * node. If the commit of delayed items was successful the path
		 * should always be released, but in case of an error, it may
		 * point to locked extent buffers (a leaf at the very least).
		 */
		ASSERT(path->nodes[0] == NULL);
		btrfs_release_delayed_node(prev_node);
	}

	/*
	 * Release the path to avoid a potential deadlock and lockdep splat when
	 * releasing the delayed node, as that requires taking the delayed node's
	 * mutex. If another task starts running delayed items before we take
	 * the mutex, it will first lock the mutex and then it may try to lock
	 * the same btree path (leaf).
	 */
	btrfs_free_path(path);

	if (curr_node)
		btrfs_release_delayed_node(curr_node);
	trans->block_rsv = block_rsv;

	return ret;
}

int btrfs_run_delayed_items(struct btrfs_trans_handle *trans)
{
	return __btrfs_run_delayed_items(trans, -1);
}

int btrfs_run_delayed_items_nr(struct btrfs_trans_handle *trans, int nr)
{
	return __btrfs_run_delayed_items(trans, nr);
}

int btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans,
				     struct btrfs_inode *inode)
{
	struct btrfs_delayed_node *delayed_node = btrfs_get_delayed_node(inode);
	struct btrfs_path *path;
	struct btrfs_block_rsv *block_rsv;
	int ret;

	if (!delayed_node)
		return 0;

	mutex_lock(&delayed_node->mutex);
	if (!delayed_node->count) {
		mutex_unlock(&delayed_node->mutex);
		btrfs_release_delayed_node(delayed_node);
		return 0;
	}
	mutex_unlock(&delayed_node->mutex);

	path = btrfs_alloc_path();
	if (!path) {
		btrfs_release_delayed_node(delayed_node);
		return -ENOMEM;
	}

	block_rsv = trans->block_rsv;
	trans->block_rsv = &delayed_node->root->fs_info->delayed_block_rsv;

	ret = __btrfs_commit_inode_delayed_items(trans, path, delayed_node);

	btrfs_release_delayed_node(delayed_node);
	btrfs_free_path(path);
	trans->block_rsv = block_rsv;

	return ret;
}

int btrfs_commit_inode_delayed_inode(struct btrfs_inode *inode)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	struct btrfs_trans_handle *trans;
	struct btrfs_delayed_node *delayed_node = btrfs_get_delayed_node(inode);
	struct btrfs_path *path;
	struct btrfs_block_rsv *block_rsv;
	int ret;

	if (!delayed_node)
		return 0;

	mutex_lock(&delayed_node->mutex);
	if (!test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) {
		mutex_unlock(&delayed_node->mutex);
		btrfs_release_delayed_node(delayed_node);
		return 0;
	}
	mutex_unlock(&delayed_node->mutex);

	trans = btrfs_join_transaction(delayed_node->root);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		goto out;
	}

	path = btrfs_alloc_path();
	if (!path) {
		ret = -ENOMEM;
		goto trans_out;
	}

	block_rsv = trans->block_rsv;
	trans->block_rsv = &fs_info->delayed_block_rsv;

	mutex_lock(&delayed_node->mutex);
	if (test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags))
		ret = __btrfs_update_delayed_inode(trans, delayed_node->root,
						   path, delayed_node);
	else
		ret = 0;
	mutex_unlock(&delayed_node->mutex);

	btrfs_free_path(path);
	trans->block_rsv = block_rsv;
trans_out:
	btrfs_end_transaction(trans);
	btrfs_btree_balance_dirty(fs_info);
out:
	btrfs_release_delayed_node(delayed_node);

	return ret;
}

void btrfs_remove_delayed_node(struct btrfs_inode *inode)
{
	struct btrfs_delayed_node *delayed_node;

	delayed_node = READ_ONCE(inode->delayed_node);
	if (!delayed_node)
		return;

	inode->delayed_node = NULL;
	btrfs_release_delayed_node(delayed_node);
}

struct btrfs_async_delayed_work {
	struct btrfs_delayed_root *delayed_root;
	int nr;
	struct btrfs_work work;
};

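/*
 * Work item semantics: nr > 0 asks the worker below to process roughly nr
 * prepared delayed nodes; nr == 0 means keep going until
 * BTRFS_DELAYED_WRITEBACK nodes were done or the backlog drops below
 * BTRFS_DELAYED_BACKGROUND / 2 (see the loop condition at the end of
 * btrfs_async_run_delayed_root()).
 */
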
static void btrfs_async_run_delayed_root(struct btrfs_work *work)
{
	struct btrfs_async_delayed_work *async_work;
	struct btrfs_delayed_root *delayed_root;
	struct btrfs_trans_handle *trans;
	struct btrfs_path *path;
	struct btrfs_delayed_node *delayed_node = NULL;
	struct btrfs_root *root;
	struct btrfs_block_rsv *block_rsv;
	int total_done = 0;

	async_work = container_of(work, struct btrfs_async_delayed_work, work);
	delayed_root = async_work->delayed_root;

	path = btrfs_alloc_path();
	if (!path)
		goto out;

	do {
		if (atomic_read(&delayed_root->items) <
		    BTRFS_DELAYED_BACKGROUND / 2)
			break;

		delayed_node = btrfs_first_prepared_delayed_node(delayed_root);
		if (!delayed_node)
			break;

		root = delayed_node->root;

		trans = btrfs_join_transaction(root);
		if (IS_ERR(trans)) {
			btrfs_release_path(path);
			btrfs_release_prepared_delayed_node(delayed_node);
			total_done++;
			continue;
		}

		block_rsv = trans->block_rsv;
		trans->block_rsv = &root->fs_info->delayed_block_rsv;

		__btrfs_commit_inode_delayed_items(trans, path, delayed_node);

		trans->block_rsv = block_rsv;
		btrfs_end_transaction(trans);
		btrfs_btree_balance_dirty_nodelay(root->fs_info);

		btrfs_release_path(path);
		btrfs_release_prepared_delayed_node(delayed_node);
		total_done++;

	} while ((async_work->nr == 0 && total_done < BTRFS_DELAYED_WRITEBACK)
		 || total_done < async_work->nr);

	btrfs_free_path(path);
out:
	wake_up(&delayed_root->wait);
	kfree(async_work);
}

static int btrfs_wq_run_delayed_node(struct btrfs_delayed_root *delayed_root,
				     struct btrfs_fs_info *fs_info, int nr)
{
	struct btrfs_async_delayed_work *async_work;

	async_work = kmalloc(sizeof(*async_work), GFP_NOFS);
	if (!async_work)
		return -ENOMEM;

	async_work->delayed_root = delayed_root;
	btrfs_init_work(&async_work->work, btrfs_async_run_delayed_root, NULL,
			NULL);
	async_work->nr = nr;

	btrfs_queue_work(fs_info->delayed_workers, &async_work->work);
	return 0;
}

void btrfs_assert_delayed_root_empty(struct btrfs_fs_info *fs_info)
{
	WARN_ON(btrfs_first_delayed_node(fs_info->delayed_root));
}

static int could_end_wait(struct btrfs_delayed_root *delayed_root, int seq)
{
	int val = atomic_read(&delayed_root->items_seq);

	if (val < seq || val >= seq + BTRFS_DELAYED_BATCH)
		return 1;

	if (atomic_read(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND)
		return 1;

	return 0;
}

void btrfs_balance_delayed_items(struct btrfs_fs_info *fs_info)
{
	struct btrfs_delayed_root *delayed_root = fs_info->delayed_root;

	if ((atomic_read(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND) ||
	    btrfs_workqueue_normal_congested(fs_info->delayed_workers))
		return;

	if (atomic_read(&delayed_root->items) >= BTRFS_DELAYED_WRITEBACK) {
		int seq;
		int ret;

		seq = atomic_read(&delayed_root->items_seq);

		ret = btrfs_wq_run_delayed_node(delayed_root, fs_info, 0);
		if (ret)
			return;

		wait_event_interruptible(delayed_root->wait,
					 could_end_wait(delayed_root, seq));
		return;
	}

	btrfs_wq_run_delayed_node(delayed_root, fs_info, BTRFS_DELAYED_BATCH);
}

/* Will return 0 or -ENOMEM */
int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans,
				   const char *name, int name_len,
				   struct btrfs_inode *dir,
				   struct btrfs_disk_key *disk_key, u8 flags,
				   u64 index)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	const unsigned int leaf_data_size = BTRFS_LEAF_DATA_SIZE(fs_info);
	struct btrfs_delayed_node *delayed_node;
	struct btrfs_delayed_item *delayed_item;
	struct btrfs_dir_item *dir_item;
	bool reserve_leaf_space;
	u32 data_len;
	int ret;

	delayed_node = btrfs_get_or_create_delayed_node(dir);
	if (IS_ERR(delayed_node))
		return PTR_ERR(delayed_node);

	delayed_item = btrfs_alloc_delayed_item(sizeof(*dir_item) + name_len,
						delayed_node,
						BTRFS_DELAYED_INSERTION_ITEM);
	if (!delayed_item) {
		ret = -ENOMEM;
		goto release_node;
	}

	delayed_item->index = index;

	dir_item = (struct btrfs_dir_item *)delayed_item->data;
	dir_item->location = *disk_key;
	btrfs_set_stack_dir_transid(dir_item, trans->transid);
	btrfs_set_stack_dir_data_len(dir_item, 0);
	btrfs_set_stack_dir_name_len(dir_item, name_len);
	btrfs_set_stack_dir_flags(dir_item, flags);
	memcpy((char *)(dir_item + 1), name, name_len);

	data_len = delayed_item->data_len + sizeof(struct btrfs_item);

	mutex_lock(&delayed_node->mutex);

	if (delayed_node->index_item_leaves == 0 ||
	    delayed_node->curr_index_batch_size + data_len > leaf_data_size) {
		delayed_node->curr_index_batch_size = data_len;
		reserve_leaf_space = true;
	} else {
		delayed_node->curr_index_batch_size += data_len;
		reserve_leaf_space = false;
	}

	if (reserve_leaf_space) {
		ret = btrfs_delayed_item_reserve_metadata(trans, delayed_item);
		/*
		 * Space was reserved for a dir index item insertion when we
		 * started the transaction, so getting a failure here should be
		 * impossible.
		 */
		if (WARN_ON(ret)) {
			mutex_unlock(&delayed_node->mutex);
			btrfs_release_delayed_item(delayed_item);
			goto release_node;
		}

		delayed_node->index_item_leaves++;
	} else if (!test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) {
		const u64 bytes = btrfs_calc_insert_metadata_size(fs_info, 1);

		/*
		 * Adding the new dir index item does not require touching another
		 * leaf, so we can release 1 unit of metadata that was previously
		 * reserved when starting the transaction. This applies only to
		 * the case where we had a transaction start and excludes the
		 * transaction join case (when replaying log trees).
		 */
		trace_btrfs_space_reservation(fs_info, "transaction",
					      trans->transid, bytes, 0);
		btrfs_block_rsv_release(fs_info, trans->block_rsv, bytes, NULL);
		ASSERT(trans->bytes_reserved >= bytes);
		trans->bytes_reserved -= bytes;
	}

	ret = __btrfs_add_delayed_item(delayed_node, delayed_item);
	if (unlikely(ret)) {
		btrfs_err(trans->fs_info,
"err add delayed dir index item(name: %.*s) into the insertion tree of the delayed node(root id: %llu, inode id: %llu, errno: %d)",
			  name_len, name, delayed_node->root->root_key.objectid,
			  delayed_node->inode_id, ret);
		BUG();
	}
	mutex_unlock(&delayed_node->mutex);

release_node:
	btrfs_release_delayed_node(delayed_node);
	return ret;
}

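/*
 * Reservation recap for the function above: the first item of each new batch
 * reserves one unit of metadata (what btrfs_calc_insert_metadata_size(fs_info,
 * 1) computes) and bumps index_item_leaves; later items that still fit in the
 * same leaf instead give back the unit that was reserved for them at
 * transaction start, except during log replay where nothing was reserved.
 */
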
static int btrfs_delete_delayed_insertion_item(struct btrfs_fs_info *fs_info,
					       struct btrfs_delayed_node *node,
					       u64 index)
{
	struct btrfs_delayed_item *item;

	mutex_lock(&node->mutex);
	item = __btrfs_lookup_delayed_item(&node->ins_root.rb_root, index);
	if (!item) {
		mutex_unlock(&node->mutex);
		return 1;
	}

	/*
	 * For delayed items to insert, we track reserved metadata bytes based
	 * on the number of leaves that we will use.
	 * See btrfs_insert_delayed_dir_index() and
	 * btrfs_delayed_item_reserve_metadata()).
	 */
	ASSERT(item->bytes_reserved == 0);
	ASSERT(node->index_item_leaves > 0);

	/*
	 * If there's only one leaf reserved, we can decrement this item from the
	 * current batch, otherwise we can not because we don't know which leaf
	 * it belongs to. With the current limit on delayed items, we rarely
	 * accumulate enough dir index items to fill more than one leaf (even
	 * when using a leaf size of 4K).
	 */
	if (node->index_item_leaves == 1) {
		const u32 data_len = item->data_len + sizeof(struct btrfs_item);

		ASSERT(node->curr_index_batch_size >= data_len);
		node->curr_index_batch_size -= data_len;
	}

	btrfs_release_delayed_item(item);

	/* If we now have no more dir index items, we can release all leaves. */
	if (RB_EMPTY_ROOT(&node->ins_root.rb_root)) {
		btrfs_delayed_item_release_leaves(node, node->index_item_leaves);
		node->index_item_leaves = 0;
	}

	mutex_unlock(&node->mutex);
	return 0;
}

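/*
 * The helper above implements a cheap cancellation path: if the dir index
 * being deleted is still only a delayed insertion item, dropping that item is
 * enough and the subvolume tree is never touched. Only when it returns
 * nonzero does btrfs_delete_delayed_dir_index() below queue a real deletion
 * item.
 */
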
int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans,
				   struct btrfs_inode *dir, u64 index)
{
	struct btrfs_delayed_node *node;
	struct btrfs_delayed_item *item;
	int ret;

	node = btrfs_get_or_create_delayed_node(dir);
	if (IS_ERR(node))
		return PTR_ERR(node);

	ret = btrfs_delete_delayed_insertion_item(trans->fs_info, node, index);
	if (!ret)
		goto end;

	item = btrfs_alloc_delayed_item(0, node, BTRFS_DELAYED_DELETION_ITEM);
	if (!item) {
		ret = -ENOMEM;
		goto end;
	}

	item->index = index;

	ret = btrfs_delayed_item_reserve_metadata(trans, item);
	/*
	 * We have reserved enough space when we started a new transaction,
	 * so reserving metadata failure is impossible.
	 */
	if (ret < 0) {
		btrfs_err(trans->fs_info,
"metadata reservation failed for delayed dir item deletion, should have been reserved");
		btrfs_release_delayed_item(item);
		goto end;
	}

	mutex_lock(&node->mutex);
	ret = __btrfs_add_delayed_item(node, item);
	if (unlikely(ret)) {
		btrfs_err(trans->fs_info,
"err add delayed dir index item(index: %llu) into the deletion tree of the delayed node(root id: %llu, inode id: %llu, errno: %d)",
			  index, node->root->root_key.objectid,
			  node->inode_id, ret);
		btrfs_delayed_item_release_metadata(dir->root, item);
		btrfs_release_delayed_item(item);
	}
	mutex_unlock(&node->mutex);
end:
	btrfs_release_delayed_node(node);
	return ret;
}

int btrfs_inode_delayed_dir_index_count(struct btrfs_inode *inode)
{
	struct btrfs_delayed_node *delayed_node = btrfs_get_delayed_node(inode);

	if (!delayed_node)
		return -ENOENT;

	/*
	 * Since we have held i_mutex of this directory, it is impossible that
	 * a new directory index is added into the delayed node and index_cnt
	 * is updated now. So we needn't lock the delayed node.
	 */
	if (!delayed_node->index_cnt) {
		btrfs_release_delayed_node(delayed_node);
		return -EINVAL;
	}

	inode->index_cnt = delayed_node->index_cnt;
	btrfs_release_delayed_node(delayed_node);
	return 0;
}

bool btrfs_readdir_get_delayed_items(struct inode *inode,
				     u64 last_index,
				     struct list_head *ins_list,
				     struct list_head *del_list)
{
	struct btrfs_delayed_node *delayed_node;
	struct btrfs_delayed_item *item;

	delayed_node = btrfs_get_delayed_node(BTRFS_I(inode));
	if (!delayed_node)
		return false;

	/*
	 * We can only do one readdir with delayed items at a time because of
	 * item->readdir_list.
	 */
	btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_SHARED);
	btrfs_inode_lock(BTRFS_I(inode), 0);

	mutex_lock(&delayed_node->mutex);
	item = __btrfs_first_delayed_insertion_item(delayed_node);
	while (item && item->index <= last_index) {
		refcount_inc(&item->refs);
		list_add_tail(&item->readdir_list, ins_list);
		item = __btrfs_next_delayed_item(item);
	}

	item = __btrfs_first_delayed_deletion_item(delayed_node);
	while (item && item->index <= last_index) {
		refcount_inc(&item->refs);
		list_add_tail(&item->readdir_list, del_list);
		item = __btrfs_next_delayed_item(item);
	}
	mutex_unlock(&delayed_node->mutex);
	/*
	 * This delayed node is still cached in the btrfs inode, so refs
	 * must be > 1 now, and we needn't check it is going to be freed
	 * or not.
	 *
	 * Besides that, this function is used to read dir, we do not
	 * insert/delete delayed items in this period. So we also needn't
	 * requeue or dequeue this delayed node.
	 */
	refcount_dec(&delayed_node->refs);

	return true;
}

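/*
 * Note the lock dance above: readdir enters with the inode locked shared, but
 * item->readdir_list allows only one collector at a time, so the shared lock
 * is traded for an exclusive one there. btrfs_readdir_put_delayed_items()
 * below downgrades back to a read lock before returning to the VFS.
 */
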
void btrfs_readdir_put_delayed_items(struct inode *inode,
				     struct list_head *ins_list,
				     struct list_head *del_list)
{
	struct btrfs_delayed_item *curr, *next;

	list_for_each_entry_safe(curr, next, ins_list, readdir_list) {
		list_del(&curr->readdir_list);
		if (refcount_dec_and_test(&curr->refs))
			kfree(curr);
	}

	list_for_each_entry_safe(curr, next, del_list, readdir_list) {
		list_del(&curr->readdir_list);
		if (refcount_dec_and_test(&curr->refs))
			kfree(curr);
	}

	/*
	 * The VFS is going to do up_read(), so we need to downgrade back to a
	 * read lock.
	 */
	downgrade_write(&inode->i_rwsem);
}

int btrfs_should_delete_dir_index(struct list_head *del_list,
				  u64 index)
{
	struct btrfs_delayed_item *curr;
	int ret = 0;

	list_for_each_entry(curr, del_list, readdir_list) {
		if (curr->index > index)
			break;
		if (curr->index == index) {
			ret = 1;
			break;
		}
	}
	return ret;
}

/*
 * btrfs_readdir_delayed_dir_index - read dir info stored in the delayed tree
 */
int btrfs_readdir_delayed_dir_index(struct dir_context *ctx,
				    struct list_head *ins_list)
{
	struct btrfs_dir_item *di;
	struct btrfs_delayed_item *curr, *next;
	struct btrfs_key location;
	char *name;
	int name_len;
	int over = 0;
	unsigned char d_type;

	/*
	 * Changing the data of the delayed item is impossible. So
	 * we needn't lock them. And we have held i_mutex of the
	 * directory, nobody can delete any directory indexes now.
	 */
	list_for_each_entry_safe(curr, next, ins_list, readdir_list) {
		list_del(&curr->readdir_list);

		if (curr->index < ctx->pos) {
			if (refcount_dec_and_test(&curr->refs))
				kfree(curr);
			continue;
		}

		ctx->pos = curr->index;

		di = (struct btrfs_dir_item *)curr->data;
		name = (char *)(di + 1);
		name_len = btrfs_stack_dir_name_len(di);

		d_type = fs_ftype_to_dtype(btrfs_dir_flags_to_ftype(di->type));
		btrfs_disk_key_to_cpu(&location, &di->location);

		over = !dir_emit(ctx, name, name_len,
				 location.objectid, d_type);

		if (refcount_dec_and_test(&curr->refs))
			kfree(curr);

		if (over)
			return 1;
		ctx->pos++;
	}
	return 0;
}

static void fill_stack_inode_item(struct btrfs_trans_handle *trans,
				  struct btrfs_inode_item *inode_item,
				  struct inode *inode)
{
	u64 flags;

	btrfs_set_stack_inode_uid(inode_item, i_uid_read(inode));
	btrfs_set_stack_inode_gid(inode_item, i_gid_read(inode));
	btrfs_set_stack_inode_size(inode_item, BTRFS_I(inode)->disk_i_size);
	btrfs_set_stack_inode_mode(inode_item, inode->i_mode);
	btrfs_set_stack_inode_nlink(inode_item, inode->i_nlink);
	btrfs_set_stack_inode_nbytes(inode_item, inode_get_bytes(inode));
	btrfs_set_stack_inode_generation(inode_item,
					 BTRFS_I(inode)->generation);
	btrfs_set_stack_inode_sequence(inode_item,
				       inode_peek_iversion(inode));
	btrfs_set_stack_inode_transid(inode_item, trans->transid);
	btrfs_set_stack_inode_rdev(inode_item, inode->i_rdev);
	flags = btrfs_inode_combine_flags(BTRFS_I(inode)->flags,
					  BTRFS_I(inode)->ro_flags);
	btrfs_set_stack_inode_flags(inode_item, flags);
	btrfs_set_stack_inode_block_group(inode_item, 0);

	btrfs_set_stack_timespec_sec(&inode_item->atime,
				     inode->i_atime.tv_sec);
	btrfs_set_stack_timespec_nsec(&inode_item->atime,
				      inode->i_atime.tv_nsec);

	btrfs_set_stack_timespec_sec(&inode_item->mtime,
				     inode->i_mtime.tv_sec);
	btrfs_set_stack_timespec_nsec(&inode_item->mtime,
				      inode->i_mtime.tv_nsec);

	btrfs_set_stack_timespec_sec(&inode_item->ctime,
				     inode->i_ctime.tv_sec);
	btrfs_set_stack_timespec_nsec(&inode_item->ctime,
				      inode->i_ctime.tv_nsec);

	btrfs_set_stack_timespec_sec(&inode_item->otime,
				     BTRFS_I(inode)->i_otime.tv_sec);
	btrfs_set_stack_timespec_nsec(&inode_item->otime,
				      BTRFS_I(inode)->i_otime.tv_nsec);
}

int btrfs_fill_inode(struct inode *inode, u32 *rdev)
{
	struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
	struct btrfs_delayed_node *delayed_node;
	struct btrfs_inode_item *inode_item;

	delayed_node = btrfs_get_delayed_node(BTRFS_I(inode));
	if (!delayed_node)
		return -ENOENT;

	mutex_lock(&delayed_node->mutex);
	if (!test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) {
		mutex_unlock(&delayed_node->mutex);
		btrfs_release_delayed_node(delayed_node);
		return -ENOENT;
	}

	inode_item = &delayed_node->inode_item;

	i_uid_write(inode, btrfs_stack_inode_uid(inode_item));
	i_gid_write(inode, btrfs_stack_inode_gid(inode_item));
	btrfs_i_size_write(BTRFS_I(inode), btrfs_stack_inode_size(inode_item));
	btrfs_inode_set_file_extent_range(BTRFS_I(inode), 0,
			round_up(i_size_read(inode), fs_info->sectorsize));
	inode->i_mode = btrfs_stack_inode_mode(inode_item);
	set_nlink(inode, btrfs_stack_inode_nlink(inode_item));
	inode_set_bytes(inode, btrfs_stack_inode_nbytes(inode_item));
	BTRFS_I(inode)->generation = btrfs_stack_inode_generation(inode_item);
	BTRFS_I(inode)->last_trans = btrfs_stack_inode_transid(inode_item);

	inode_set_iversion_queried(inode,
				   btrfs_stack_inode_sequence(inode_item));
	inode->i_rdev = 0;
	*rdev = btrfs_stack_inode_rdev(inode_item);
	btrfs_inode_split_flags(btrfs_stack_inode_flags(inode_item),
				&BTRFS_I(inode)->flags, &BTRFS_I(inode)->ro_flags);

	inode->i_atime.tv_sec = btrfs_stack_timespec_sec(&inode_item->atime);
	inode->i_atime.tv_nsec = btrfs_stack_timespec_nsec(&inode_item->atime);

	inode->i_mtime.tv_sec = btrfs_stack_timespec_sec(&inode_item->mtime);
	inode->i_mtime.tv_nsec = btrfs_stack_timespec_nsec(&inode_item->mtime);

	inode->i_ctime.tv_sec = btrfs_stack_timespec_sec(&inode_item->ctime);
	inode->i_ctime.tv_nsec = btrfs_stack_timespec_nsec(&inode_item->ctime);

	BTRFS_I(inode)->i_otime.tv_sec =
		btrfs_stack_timespec_sec(&inode_item->otime);
	BTRFS_I(inode)->i_otime.tv_nsec =
		btrfs_stack_timespec_nsec(&inode_item->otime);

	inode->i_generation = BTRFS_I(inode)->generation;
	BTRFS_I(inode)->index_cnt = (u64)-1;

	mutex_unlock(&delayed_node->mutex);
	btrfs_release_delayed_node(delayed_node);
	return 0;
}

int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans,
			       struct btrfs_root *root,
			       struct btrfs_inode *inode)
{
	struct btrfs_delayed_node *delayed_node;
	int ret = 0;

	delayed_node = btrfs_get_or_create_delayed_node(inode);
	if (IS_ERR(delayed_node))
		return PTR_ERR(delayed_node);

	mutex_lock(&delayed_node->mutex);
	if (test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) {
		fill_stack_inode_item(trans, &delayed_node->inode_item,
				      &inode->vfs_inode);
		goto release_node;
	}

	ret = btrfs_delayed_inode_reserve_metadata(trans, root, delayed_node);
	if (ret)
		goto release_node;

	fill_stack_inode_item(trans, &delayed_node->inode_item, &inode->vfs_inode);
	set_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags);
	delayed_node->count++;
	atomic_inc(&root->fs_info->delayed_root->items);
release_node:
	mutex_unlock(&delayed_node->mutex);
	btrfs_release_delayed_node(delayed_node);
	return ret;
}

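/*
 * Only the first btrfs_delayed_update_inode() call for a node reserves
 * metadata and bumps the item counters; while the node is already flagged
 * INODE_DIRTY, later calls just refresh the staged copy of the inode item.
 */
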
int btrfs_delayed_delete_inode_ref(struct btrfs_inode *inode)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	struct btrfs_delayed_node *delayed_node;

	/*
	 * We don't do delayed inode updates during log recovery because it
	 * leads to enospc problems. This means we also can't do
	 * delayed inode refs.
	 */
	if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags))
		return -EAGAIN;

	delayed_node = btrfs_get_or_create_delayed_node(inode);
	if (IS_ERR(delayed_node))
		return PTR_ERR(delayed_node);

	/*
	 * We don't reserve space for inode ref deletion because:
	 * - We ONLY do async inode ref deletion for an inode that has only
	 *   one link (i_nlink == 1), which means there is only one inode ref.
	 *   And in most cases, the inode ref and the inode item are in the
	 *   same leaf, and we will deal with them at the same time.
	 *   Since we are sure we will reserve the space for the inode item,
	 *   it is unnecessary to reserve space for inode ref deletion.
	 * - If the inode ref and the inode item are not in the same leaf,
	 *   we also needn't worry about the enospc problem, because we reserve
	 *   much more space for the inode update than it needs.
	 * - At the worst, we can steal some space from the global reservation.
	 *   It is very rare.
	 */
	mutex_lock(&delayed_node->mutex);
	if (test_bit(BTRFS_DELAYED_NODE_DEL_IREF, &delayed_node->flags))
		goto release_node;

	set_bit(BTRFS_DELAYED_NODE_DEL_IREF, &delayed_node->flags);
	delayed_node->count++;
	atomic_inc(&fs_info->delayed_root->items);
release_node:
	mutex_unlock(&delayed_node->mutex);
	btrfs_release_delayed_node(delayed_node);
	return 0;
}

static void __btrfs_kill_delayed_node(struct btrfs_delayed_node *delayed_node)
{
	struct btrfs_root *root = delayed_node->root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_delayed_item *curr_item, *prev_item;

	mutex_lock(&delayed_node->mutex);
	curr_item = __btrfs_first_delayed_insertion_item(delayed_node);
	while (curr_item) {
		prev_item = curr_item;
		curr_item = __btrfs_next_delayed_item(prev_item);
		btrfs_release_delayed_item(prev_item);
	}

	if (delayed_node->index_item_leaves > 0) {
		btrfs_delayed_item_release_leaves(delayed_node,
					  delayed_node->index_item_leaves);
		delayed_node->index_item_leaves = 0;
	}

	curr_item = __btrfs_first_delayed_deletion_item(delayed_node);
	while (curr_item) {
		btrfs_delayed_item_release_metadata(root, curr_item);
		prev_item = curr_item;
		curr_item = __btrfs_next_delayed_item(prev_item);
		btrfs_release_delayed_item(prev_item);
	}

	btrfs_release_delayed_iref(delayed_node);

	if (test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) {
		btrfs_delayed_inode_release_metadata(fs_info, delayed_node, false);
		btrfs_release_delayed_inode(delayed_node);
	}
	mutex_unlock(&delayed_node->mutex);
}

void btrfs_kill_delayed_inode_items(struct btrfs_inode *inode)
{
	struct btrfs_delayed_node *delayed_node;

	delayed_node = btrfs_get_delayed_node(inode);
	if (!delayed_node)
		return;

	__btrfs_kill_delayed_node(delayed_node);
	btrfs_release_delayed_node(delayed_node);
}

void btrfs_kill_all_delayed_nodes(struct btrfs_root *root)
{
	u64 inode_id = 0;
	struct btrfs_delayed_node *delayed_nodes[8];
	int i, n;

	while (1) {
		spin_lock(&root->inode_lock);
		n = radix_tree_gang_lookup(&root->delayed_nodes_tree,
					   (void **)delayed_nodes, inode_id,
					   ARRAY_SIZE(delayed_nodes));
		if (!n) {
			spin_unlock(&root->inode_lock);
			break;
		}

		inode_id = delayed_nodes[n - 1]->inode_id + 1;
		for (i = 0; i < n; i++) {
			/*
			 * Don't increase refs in case the node is dead and
			 * about to be removed from the tree in the loop below
			 */
			if (!refcount_inc_not_zero(&delayed_nodes[i]->refs))
				delayed_nodes[i] = NULL;
		}
		spin_unlock(&root->inode_lock);

		for (i = 0; i < n; i++) {
			if (!delayed_nodes[i])
				continue;
			__btrfs_kill_delayed_node(delayed_nodes[i]);
			btrfs_release_delayed_node(delayed_nodes[i]);
		}
	}
}

*fs_info
)
2054 struct btrfs_delayed_node
*curr_node
, *prev_node
;
2056 curr_node
= btrfs_first_delayed_node(fs_info
->delayed_root
);
2058 __btrfs_kill_delayed_node(curr_node
);
2060 prev_node
= curr_node
;
2061 curr_node
= btrfs_next_delayed_node(curr_node
);
2062 btrfs_release_delayed_node(prev_node
);
void btrfs_log_get_delayed_items(struct btrfs_inode *inode,
				 struct list_head *ins_list,
				 struct list_head *del_list)
{
	struct btrfs_delayed_node *node;
	struct btrfs_delayed_item *item;

	node = btrfs_get_delayed_node(inode);
	if (!node)
		return;

	mutex_lock(&node->mutex);
	item = __btrfs_first_delayed_insertion_item(node);
	while (item) {
		/*
		 * It's possible that the item is already in a log list. This
		 * can happen in case two tasks are trying to log the same
		 * directory. For example if we have tasks A and B:
		 *
		 * Task A collected the delayed items into a log list while
		 * under the inode's log_mutex (at btrfs_log_inode()), but it
		 * only releases the items after logging the inodes they point
		 * to (if they are new inodes), which happens after unlocking
		 * the log mutex;
		 *
		 * Task B enters btrfs_log_inode() and acquires the log_mutex
		 * of the same directory inode, before task A releases the
		 * delayed items. This can happen for example when logging some
		 * inode we need to trigger logging of its parent directory, so
		 * logging two files that have the same parent directory can
		 * lead to this.
		 *
		 * If this happens, just ignore delayed items already in a log
		 * list. All the tasks logging the directory are under a log
		 * transaction and whichever finishes first can not sync the log
		 * before the other completes and leaves the log transaction.
		 */
		if (!item->logged && list_empty(&item->log_list)) {
			refcount_inc(&item->refs);
			list_add_tail(&item->log_list, ins_list);
		}
		item = __btrfs_next_delayed_item(item);
	}

	item = __btrfs_first_delayed_deletion_item(node);
	while (item) {
		/* It may be non-empty, for the same reason mentioned above. */
		if (!item->logged && list_empty(&item->log_list)) {
			refcount_inc(&item->refs);
			list_add_tail(&item->log_list, del_list);
		}
		item = __btrfs_next_delayed_item(item);
	}
	mutex_unlock(&node->mutex);

	/*
	 * We are called during inode logging, which means the inode is in use
	 * and can not be evicted before we finish logging the inode. So we never
	 * have the last reference on the delayed inode.
	 * Also, we don't use btrfs_release_delayed_node() because that would
	 * requeue the delayed inode (change its order in the list of prepared
	 * nodes) and we don't want to do such change because we don't create or
	 * delete delayed items.
	 */
	ASSERT(refcount_read(&node->refs) > 1);
	refcount_dec(&node->refs);
}

void btrfs_log_put_delayed_items(struct btrfs_inode *inode,
				 struct list_head *ins_list,
				 struct list_head *del_list)
{
	struct btrfs_delayed_node *node;
	struct btrfs_delayed_item *item;
	struct btrfs_delayed_item *next;

	node = btrfs_get_delayed_node(inode);
	if (!node)
		return;

	mutex_lock(&node->mutex);

	list_for_each_entry_safe(item, next, ins_list, log_list) {
		item->logged = true;
		list_del_init(&item->log_list);
		if (refcount_dec_and_test(&item->refs))
			kfree(item);
	}

	list_for_each_entry_safe(item, next, del_list, log_list) {
		item->logged = true;
		list_del_init(&item->log_list);
		if (refcount_dec_and_test(&item->refs))
			kfree(item);
	}

	mutex_unlock(&node->mutex);

	/*
	 * We are called during inode logging, which means the inode is in use
	 * and can not be evicted before we finish logging the inode. So we never
	 * have the last reference on the delayed inode.
	 * Also, we don't use btrfs_release_delayed_node() because that would
	 * requeue the delayed inode (change its order in the list of prepared
	 * nodes) and we don't want to do such change because we don't create or
	 * delete delayed items.
	 */
	ASSERT(refcount_read(&node->refs) > 1);
	refcount_dec(&node->refs);
}