// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2012 Alexander Block. All rights reserved.
 */

#include <linux/bsearch.h>
#include <linux/file.h>
#include <linux/sort.h>
#include <linux/mount.h>
#include <linux/xattr.h>
#include <linux/posix_acl_xattr.h>
#include <linux/radix-tree.h>
#include <linux/vmalloc.h>
#include <linux/string.h>
#include <linux/compat.h>
#include <linux/crc32c.h>
#include <linux/fsverity.h>

#include "btrfs_inode.h"
#include "transaction.h"
#include "compression.h"
#include "print-tree.h"
#include "accessors.h"
#include "file-item.h"
#include "lru_cache.h"
/*
 * Maximum number of references an extent can have in order for us to attempt to
 * issue clone operations instead of write operations. This currently exists to
 * avoid hitting limitations of the backreference walking code (taking a lot of
 * time and using too much memory for extents with large number of references).
 */
#define SEND_MAX_EXTENT_REFS	1024
/*
 * A fs_path is a helper to dynamically build path names with unknown size.
 * It reallocates the internal buffer on demand.
 * It allows fast adding of path elements on the right side (normal path) and
 * fast adding to the left side (reversed path). A reversed path can also be
 * unreversed if needed.
 */
struct fs_path {
	union {
		struct {
			char *start;
			char *end;

			char *buf;
			unsigned short buf_len:15;
			unsigned short reversed:1;
			char inline_buf[];
		};
		/*
		 * Average path length does not exceed 200 bytes, we'll have
		 * better packing in the slab and higher chance to satisfy
		 * an allocation later during send.
		 */
		char pad[256];
	};
};
#define FS_PATH_INLINE_SIZE \
	(sizeof(struct fs_path) - offsetof(struct fs_path, inline_buf))
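/*
 * Illustrative note: FS_PATH_INLINE_SIZE is the number of bytes left in the
 * fixed-size structure after the pointer and bitfield members, so short paths
 * are built entirely inside inline_buf and only longer ones fall back to a
 * heap allocated buffer (see fs_path_ensure_buf() below).
 */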
/* reused for each extent */
struct clone_root {
	struct btrfs_root *root;
	u64 ino;
	u64 offset;
	u64 num_bytes;
	bool found_ref;
};

#define SEND_MAX_NAME_CACHE_SIZE			256
/*
 * Limit the root_ids array of struct backref_cache_entry to 17 elements.
 * This makes the size of a cache entry to be exactly 192 bytes on x86_64, which
 * can be satisfied from the kmalloc-192 slab, without wasting any space.
 * The most common case is to have a single root for cloning, which corresponds
 * to the send root. Having the user specify more than 16 clone roots is not
 * common, and in such rare cases we simply don't use caching if the number of
 * cloning roots that lead down to a leaf is more than 17.
 */
#define SEND_MAX_BACKREF_CACHE_ROOTS			17

/*
 * Max number of entries in the cache.
 * With SEND_MAX_BACKREF_CACHE_ROOTS as 17, the size in bytes, excluding
 * maple tree's internal nodes, is 24K.
 */
#define SEND_MAX_BACKREF_CACHE_SIZE	128
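/*
 * Worked example for the numbers above: 128 entries * 192 bytes per
 * backref_cache_entry = 24576 bytes, i.e. the 24K mentioned in the comment,
 * plus whatever the maple tree needs for its internal nodes.
 */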
/*
 * A backref cache entry maps a leaf to a list of IDs of roots from which the
 * leaf is accessible and we can use for clone operations.
 * With SEND_MAX_BACKREF_CACHE_ROOTS as 17, each cache entry is 192 bytes (on
 * x86_64).
 */
struct backref_cache_entry {
	struct btrfs_lru_cache_entry entry;
	u64 root_ids[SEND_MAX_BACKREF_CACHE_ROOTS];
	/* Number of valid elements in the root_ids array. */
	int num_roots;
};

/* See the comment at lru_cache.h about struct btrfs_lru_cache_entry. */
static_assert(offsetof(struct backref_cache_entry, entry) == 0);
/*
 * Max number of entries in the cache that stores directories that were already
 * created. The cache uses raw struct btrfs_lru_cache_entry entries, so it uses
 * at most 4096 bytes - sizeof(struct btrfs_lru_cache_entry) is 48 bytes, but
 * the kmalloc-64 slab is used, so we get 4096 bytes (64 bytes * 64).
 */
#define SEND_MAX_DIR_CREATED_CACHE_SIZE			64

/*
 * Max number of entries in the cache that stores directories that need to
 * receive a utimes update. The cache uses raw struct btrfs_lru_cache_entry
 * entries, so it uses at most 4096 bytes - sizeof(struct btrfs_lru_cache_entry)
 * is 48 bytes, but the kmalloc-64 slab is used, so we get 4096 bytes
 * (64 bytes * 64).
 */
#define SEND_MAX_DIR_UTIMES_CACHE_SIZE			64
struct send_ctx {
	struct file *send_filp;
	loff_t send_off;
	char *send_buf;
	u32 send_size;
	u32 send_max_size;
	/*
	 * Whether BTRFS_SEND_A_DATA attribute was already added to current
	 * command (since protocol v2, data must be the last attribute).
	 */
	bool put_data;
	struct page **send_buf_pages;
	u64 flags;	/* 'flags' member of btrfs_ioctl_send_args is u64 */
	/* Protocol version compatibility requested */
	u32 proto;

	struct btrfs_root *send_root;
	struct btrfs_root *parent_root;
	struct clone_root *clone_roots;
	int clone_roots_cnt;

	/* current state of the compare_tree call */
	struct btrfs_path *left_path;
	struct btrfs_path *right_path;
	struct btrfs_key *cmp_key;

	/*
	 * Keep track of the generation of the last transaction that was used
	 * for relocating a block group. This is periodically checked in order
	 * to detect if a relocation happened since the last check, so that we
	 * don't operate on stale extent buffers for nodes (level >= 1) or on
	 * stale disk_bytenr values of file extent items.
	 */
	u64 last_reloc_trans;

	/*
	 * infos of the currently processed inode. In case of deleted inodes,
	 * these are the values from the deleted inode.
	 */
	u64 cur_ino;
	u64 cur_inode_gen;
	u64 cur_inode_mode;
	u64 cur_inode_last_extent;
	u64 cur_inode_next_write_offset;
	struct fs_path cur_inode_path;
	bool cur_inode_new;
	bool cur_inode_new_gen;
	bool cur_inode_deleted;
	bool ignore_cur_inode;
	bool cur_inode_needs_verity;
	void *verity_descriptor;

	u64 send_progress;

	struct list_head new_refs;
	struct list_head deleted_refs;

	struct btrfs_lru_cache name_cache;

	/*
	 * The inode we are currently processing. It's not NULL only when we
	 * need to issue write commands for data extents from this inode.
	 */
	struct inode *cur_inode;
	struct file_ra_state ra;
	u64 page_cache_clear_start;
	bool clean_page_cache;

	/*
	 * We process inodes by their increasing order, so if before an
	 * incremental send we reverse the parent/child relationship of
	 * directories such that a directory with a lower inode number was
	 * the parent of a directory with a higher inode number, and the one
	 * becoming the new parent got renamed too, we can't rename/move the
	 * directory with lower inode number when we finish processing it - we
	 * must process the directory with higher inode number first, then
	 * rename/move it and then rename/move the directory with lower inode
	 * number. Example follows.
	 *
	 * Tree state when the first send was performed:
	 *
	 * Tree state when the second (incremental) send is performed:
	 *
	 * The sequence of steps that lead to the second state was:
	 *
	 * mv /a/b/c/d /a/b/c2/d2
	 * mv /a/b/c /a/b/c2/d2/cc
	 *
	 * "c" has lower inode number, but we can't move it (2nd mv operation)
	 * before we move "d", which has higher inode number.
	 *
	 * So we just memorize which move/rename operations must be performed
	 * later when their respective parent is processed and moved/renamed.
	 */

	/* Indexed by parent directory inode number. */
	struct rb_root pending_dir_moves;

	/*
	 * Reverse index, indexed by the inode number of a directory that
	 * is waiting for the move/rename of its immediate parent before its
	 * own move/rename can be performed.
	 */
	struct rb_root waiting_dir_moves;

	/*
	 * A directory that is going to be rm'ed might have a child directory
	 * which is in the pending directory moves index above. In this case,
	 * the directory can only be removed after the move/rename of its child
	 * is performed. Example:
	 *
	 * Sequence of steps that lead to the send snapshot:
	 * rm -f /a/b/c/foo.txt
	 * mv /a/b/c/x /a/b/YY
	 *
	 * When the child is processed, its move/rename is delayed until its
	 * parent is processed (as explained above), but all other operations
	 * like update utimes, chown, chgrp, etc, are performed and the paths
	 * that it uses for those operations must use the orphanized name of
	 * its parent (the directory we're going to rm later), so we need to
	 * memorize that name.
	 *
	 * Indexed by the inode number of the directory to be deleted.
	 */
	struct rb_root orphan_dirs;

	struct rb_root rbtree_new_refs;
	struct rb_root rbtree_deleted_refs;

	struct btrfs_lru_cache backref_cache;
	u64 backref_cache_last_reloc_trans;

	struct btrfs_lru_cache dir_created_cache;
	struct btrfs_lru_cache dir_utimes_cache;
};
struct pending_dir_move {
	struct rb_node node;
	struct list_head list;
	u64 parent_ino;
	u64 ino;
	u64 gen;
	struct list_head update_refs;
};
struct waiting_dir_move {
	struct rb_node node;
	u64 ino;
	/*
	 * There might be some directory that could not be removed because it
	 * was waiting for this directory inode to be moved first. Therefore
	 * after this directory is moved, we can try to rmdir the ino rmdir_ino.
	 */
	u64 rmdir_ino;
	u64 rmdir_gen;
	bool orphanized;
};
struct orphan_dir_info {
	struct rb_node node;
	u64 ino;
	u64 gen;
	u64 last_dir_index_offset;
	u64 dir_high_seq_ino;
};
struct name_cache_entry {
	/*
	 * The key in the entry is an inode number, and the generation matches
	 * the inode's generation.
	 */
	struct btrfs_lru_cache_entry entry;
	u64 parent_ino;
	u64 parent_gen;
	int need_later_update;
	/* Name length without NUL terminator. */
	int name_len;
	/* Not NUL terminated. */
	char name[] __counted_by(name_len) __nonstring;
};

/* See the comment at lru_cache.h about struct btrfs_lru_cache_entry. */
static_assert(offsetof(struct name_cache_entry, entry) == 0);
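/*
 * Keeping 'entry' as the first member means a struct btrfs_lru_cache_entry
 * pointer handed back by the lru cache can be converted to the containing
 * name_cache_entry with container_of() (see name_cache_search() below).
 */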
#define ADVANCE_ONLY_NEXT	-1

enum btrfs_compare_tree_result {
	BTRFS_COMPARE_TREE_NEW,
	BTRFS_COMPARE_TREE_DELETED,
	BTRFS_COMPARE_TREE_CHANGED,
	BTRFS_COMPARE_TREE_SAME,
};
static void inconsistent_snapshot_error(struct send_ctx *sctx,
					enum btrfs_compare_tree_result result,
					const char *what)
{
	const char *result_string;

	switch (result) {
	case BTRFS_COMPARE_TREE_NEW:
		result_string = "new";
		break;
	case BTRFS_COMPARE_TREE_DELETED:
		result_string = "deleted";
		break;
	case BTRFS_COMPARE_TREE_CHANGED:
		result_string = "updated";
		break;
	case BTRFS_COMPARE_TREE_SAME:
		DEBUG_WARN("no change between trees");
		result_string = "unchanged";
		break;
	default:
		DEBUG_WARN("unexpected comparison result %d", result);
		result_string = "unexpected";
	}

	btrfs_err(sctx->send_root->fs_info,
		  "Send: inconsistent snapshot, found %s %s for inode %llu without updated inode item, send root is %llu, parent root is %llu",
		  result_string, what, sctx->cmp_key->objectid,
		  btrfs_root_id(sctx->send_root),
		  (sctx->parent_root ? btrfs_root_id(sctx->parent_root) : 0));
}
static bool proto_cmd_ok(const struct send_ctx *sctx, int cmd)
{
	switch (sctx->proto) {
	case 1: return cmd <= BTRFS_SEND_C_MAX_V1;
	case 2: return cmd <= BTRFS_SEND_C_MAX_V2;
	case 3: return cmd <= BTRFS_SEND_C_MAX_V3;
	default: return false;
	}
}
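/*
 * For example, a command that only exists since protocol v2 (such as an
 * encoded write) is not considered OK here when user space negotiated a
 * version 1 stream, so the caller has to fall back to the older equivalent.
 */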
static int is_waiting_for_move(struct send_ctx *sctx, u64 ino);

static struct waiting_dir_move *
get_waiting_dir_move(struct send_ctx *sctx, u64 ino);

static int is_waiting_for_rm(struct send_ctx *sctx, u64 dir_ino, u64 gen);

static int need_send_hole(struct send_ctx *sctx)
{
	return (sctx->parent_root && !sctx->cur_inode_new &&
		!sctx->cur_inode_new_gen && !sctx->cur_inode_deleted &&
		S_ISREG(sctx->cur_inode_mode));
}
static void fs_path_reset(struct fs_path *p)
{
	if (p->reversed)
		p->start = p->buf + p->buf_len - 1;
	else
		p->start = p->buf;
	p->end = p->start;
	*p->start = 0;
}

static void init_path(struct fs_path *p)
{
	p->reversed = 0;
	p->buf = p->inline_buf;
	p->buf_len = FS_PATH_INLINE_SIZE;
	fs_path_reset(p);
}
static struct fs_path *fs_path_alloc(void)
{
	struct fs_path *p;

	p = kmalloc(sizeof(*p), GFP_KERNEL);
	if (!p)
		return NULL;
	init_path(p);
	return p;
}

static struct fs_path *fs_path_alloc_reversed(void)
{
	struct fs_path *p;

	p = fs_path_alloc();
	if (!p)
		return NULL;
	p->reversed = 1;
	fs_path_reset(p);
	return p;
}

static void fs_path_free(struct fs_path *p)
{
	if (!p)
		return;
	if (p->buf != p->inline_buf)
		kfree(p->buf);
	kfree(p);
}

static inline int fs_path_len(const struct fs_path *p)
{
	return p->end - p->start;
}
static int fs_path_ensure_buf(struct fs_path *p, int len)
{
	char *tmp_buf;
	int path_len;
	int old_buf_len;

	len++;

	if (p->buf_len >= len)
		return 0;

	if (WARN_ON(len > PATH_MAX))
		return -ENAMETOOLONG;

	path_len = fs_path_len(p);
	old_buf_len = p->buf_len;

	/*
	 * Allocate to the next largest kmalloc bucket size, to let
	 * the fast path happen most of the time.
	 */
	len = kmalloc_size_roundup(len);
	/*
	 * First time the inline_buf does not suffice
	 */
	if (p->buf == p->inline_buf) {
		tmp_buf = kmalloc(len, GFP_KERNEL);
		if (tmp_buf)
			memcpy(tmp_buf, p->buf, old_buf_len);
	} else {
		tmp_buf = krealloc(p->buf, len, GFP_KERNEL);
	}
	if (!tmp_buf)
		return -ENOMEM;
	p->buf = tmp_buf;
	p->buf_len = len;

	if (p->reversed) {
		tmp_buf = p->buf + old_buf_len - path_len - 1;
		p->end = p->buf + p->buf_len - 1;
		p->start = p->end - path_len;
		memmove(p->start, tmp_buf, path_len + 1);
	} else {
		p->start = p->buf;
		p->end = p->start + path_len;
	}
	return 0;
}
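/*
 * Example of the fast path mentioned above: asking for 300 bytes is rounded
 * up by kmalloc_size_roundup() to the 512 byte kmalloc bucket, so any later
 * request up to 512 bytes is satisfied by the "buf_len >= len" check without
 * reallocating.
 */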
static int fs_path_prepare_for_add(struct fs_path *p, int name_len,
				   char **prepared)
{
	int ret;
	int new_len;

	new_len = fs_path_len(p) + name_len;
	if (p->start != p->end)
		new_len++;
	ret = fs_path_ensure_buf(p, new_len);
	if (ret < 0)
		return ret;

	if (p->reversed) {
		if (p->start != p->end)
			*--p->start = '/';
		p->start -= name_len;
		*prepared = p->start;
	} else {
		if (p->start != p->end)
			*p->end++ = '/';
		*prepared = p->end;
		p->end += name_len;
		*p->end = 0;
	}

	return 0;
}
static int fs_path_add(struct fs_path *p, const char *name, int name_len)
{
	int ret;
	char *prepared;

	ret = fs_path_prepare_for_add(p, name_len, &prepared);
	if (ret < 0)
		return ret;
	memcpy(prepared, name, name_len);

	return 0;
}

static inline int fs_path_add_path(struct fs_path *p, const struct fs_path *p2)
{
	return fs_path_add(p, p2->start, fs_path_len(p2));
}
static int fs_path_add_from_extent_buffer(struct fs_path *p,
					  struct extent_buffer *eb,
					  unsigned long off, int len)
{
	int ret;
	char *prepared;

	ret = fs_path_prepare_for_add(p, len, &prepared);
	if (ret < 0)
		return ret;
	read_extent_buffer(eb, prepared, off, len);

	return 0;
}
static int fs_path_copy(struct fs_path *p, struct fs_path *from)
{
	p->reversed = from->reversed;
	fs_path_reset(p);

	return fs_path_add_path(p, from);
}
static void fs_path_unreverse(struct fs_path *p)
{
	char *tmp;
	int len;

	if (!p->reversed)
		return;

	tmp = p->start;
	len = fs_path_len(p);
	p->start = p->buf;
	p->end = p->start + len;
	memmove(p->start, tmp, len + 1);
	p->reversed = 0;
}
static inline bool is_current_inode_path(const struct send_ctx *sctx,
					 const struct fs_path *path)
{
	const struct fs_path *cur = &sctx->cur_inode_path;

	return (strncmp(path->start, cur->start, fs_path_len(cur)) == 0);
}
static struct btrfs_path *alloc_path_for_send(void)
{
	struct btrfs_path *path;

	path = btrfs_alloc_path();
	if (!path)
		return NULL;
	path->search_commit_root = 1;
	path->skip_locking = 1;
	path->need_commit_sem = 1;
	return path;
}
static int write_buf(struct file *filp, const void *buf, u32 len, loff_t *off)
{
	int ret;
	u32 pos = 0;

	while (pos < len) {
		ret = kernel_write(filp, buf + pos, len - pos, off);
		if (ret < 0)
			return ret;
		if (ret == 0)
			return -EIO;
		pos += ret;
	}

	return 0;
}
static int tlv_put(struct send_ctx *sctx, u16 attr, const void *data, int len)
{
	struct btrfs_tlv_header *hdr;
	int total_len = sizeof(*hdr) + len;
	int left = sctx->send_max_size - sctx->send_size;

	if (WARN_ON_ONCE(sctx->put_data))
		return -EINVAL;

	if (unlikely(left < total_len))
		return -EOVERFLOW;

	hdr = (struct btrfs_tlv_header *) (sctx->send_buf + sctx->send_size);
	put_unaligned_le16(attr, &hdr->tlv_type);
	put_unaligned_le16(len, &hdr->tlv_len);
	memcpy(hdr + 1, data, len);
	sctx->send_size += total_len;

	return 0;
}
#define TLV_PUT_DEFINE_INT(bits) \
	static int tlv_put_u##bits(struct send_ctx *sctx,		\
				   u##bits attr, u##bits value)		\
	{								\
		__le##bits __tmp = cpu_to_le##bits(value);		\
		return tlv_put(sctx, attr, &__tmp, sizeof(__tmp));	\
	}

TLV_PUT_DEFINE_INT(8)
TLV_PUT_DEFINE_INT(32)
TLV_PUT_DEFINE_INT(64)
static int tlv_put_string(struct send_ctx *sctx, u16 attr,
			  const char *str, int len)
{
	if (len == -1)
		len = strlen(str);
	return tlv_put(sctx, attr, str, len);
}

static int tlv_put_uuid(struct send_ctx *sctx, u16 attr,
			const u8 *uuid)
{
	return tlv_put(sctx, attr, uuid, BTRFS_UUID_SIZE);
}

static int tlv_put_btrfs_timespec(struct send_ctx *sctx, u16 attr,
				  struct extent_buffer *eb,
				  struct btrfs_timespec *ts)
{
	struct btrfs_timespec bts;

	read_extent_buffer(eb, &bts, (unsigned long)ts, sizeof(bts));
	return tlv_put(sctx, attr, &bts, sizeof(bts));
}
#define TLV_PUT(sctx, attrtype, data, attrlen) \
	do { \
		ret = tlv_put(sctx, attrtype, data, attrlen); \
		if (ret < 0) \
			goto tlv_put_failure; \
	} while (0)

#define TLV_PUT_INT(sctx, attrtype, bits, value) \
	do { \
		ret = tlv_put_u##bits(sctx, attrtype, value); \
		if (ret < 0) \
			goto tlv_put_failure; \
	} while (0)

#define TLV_PUT_U8(sctx, attrtype, data) TLV_PUT_INT(sctx, attrtype, 8, data)
#define TLV_PUT_U16(sctx, attrtype, data) TLV_PUT_INT(sctx, attrtype, 16, data)
#define TLV_PUT_U32(sctx, attrtype, data) TLV_PUT_INT(sctx, attrtype, 32, data)
#define TLV_PUT_U64(sctx, attrtype, data) TLV_PUT_INT(sctx, attrtype, 64, data)
#define TLV_PUT_STRING(sctx, attrtype, str, len) \
	do { \
		ret = tlv_put_string(sctx, attrtype, str, len); \
		if (ret < 0) \
			goto tlv_put_failure; \
	} while (0)
#define TLV_PUT_PATH(sctx, attrtype, p) \
	do { \
		ret = tlv_put_string(sctx, attrtype, p->start, \
				     fs_path_len((p))); \
		if (ret < 0) \
			goto tlv_put_failure; \
	} while (0)
#define TLV_PUT_UUID(sctx, attrtype, uuid) \
	do { \
		ret = tlv_put_uuid(sctx, attrtype, uuid); \
		if (ret < 0) \
			goto tlv_put_failure; \
	} while (0)
#define TLV_PUT_BTRFS_TIMESPEC(sctx, attrtype, eb, ts) \
	do { \
		ret = tlv_put_btrfs_timespec(sctx, attrtype, eb, ts); \
		if (ret < 0) \
			goto tlv_put_failure; \
	} while (0)
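/*
 * Wire layout produced by tlv_put() and the TLV_PUT_* wrappers above: each
 * attribute is a struct btrfs_tlv_header (a le16 attribute type followed by a
 * le16 payload length) immediately followed by the payload bytes, appended at
 * sctx->send_buf + sctx->send_size.
 */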
static int send_header(struct send_ctx *sctx)
{
	struct btrfs_stream_header hdr;

	strcpy(hdr.magic, BTRFS_SEND_STREAM_MAGIC);
	hdr.version = cpu_to_le32(sctx->proto);
	return write_buf(sctx->send_filp, &hdr, sizeof(hdr),
			 &sctx->send_off);
}
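/*
 * The stream therefore starts with a fixed struct btrfs_stream_header: the
 * BTRFS_SEND_STREAM_MAGIC string plus a le32 version matching the protocol
 * negotiated in sctx->proto. Everything after it is a sequence of commands
 * built by begin_cmd()/send_cmd() below.
 */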
/*
 * For each command/item we want to send to userspace, we call this function.
 */
static int begin_cmd(struct send_ctx *sctx, int cmd)
{
	struct btrfs_cmd_header *hdr;

	if (WARN_ON(!sctx->send_buf))
		return -EINVAL;

	if (unlikely(sctx->send_size != 0)) {
		btrfs_err(sctx->send_root->fs_info,
			  "send: command header buffer not empty cmd %d offset %llu",
			  cmd, sctx->send_off);
		return -EINVAL;
	}

	sctx->send_size += sizeof(*hdr);
	hdr = (struct btrfs_cmd_header *)sctx->send_buf;
	put_unaligned_le16(cmd, &hdr->cmd);

	return 0;
}
static int send_cmd(struct send_ctx *sctx)
{
	int ret;
	struct btrfs_cmd_header *hdr;
	u32 crc;

	hdr = (struct btrfs_cmd_header *)sctx->send_buf;
	put_unaligned_le32(sctx->send_size - sizeof(*hdr), &hdr->len);
	put_unaligned_le32(0, &hdr->crc);

	crc = crc32c(0, (unsigned char *)sctx->send_buf, sctx->send_size);
	put_unaligned_le32(crc, &hdr->crc);

	ret = write_buf(sctx->send_filp, sctx->send_buf, sctx->send_size,
			&sctx->send_off);

	sctx->send_size = 0;
	sctx->put_data = false;

	return ret;
}
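/*
 * Note on the checksum above: the crc field is zeroed first and the crc32c is
 * then computed over the whole command (header plus attributes), so a receiver
 * can verify a command by redoing the same computation with the crc field
 * treated as zero.
 */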
/*
 * Sends a move instruction to user space
 */
static int send_rename(struct send_ctx *sctx,
		       struct fs_path *from, struct fs_path *to)
{
	int ret;

	ret = begin_cmd(sctx, BTRFS_SEND_C_RENAME);
	if (ret < 0)
		return ret;

	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, from);
	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH_TO, to);

	ret = send_cmd(sctx);

tlv_put_failure:
	return ret;
}

/*
 * Sends a link instruction to user space
 */
static int send_link(struct send_ctx *sctx,
		     struct fs_path *path, struct fs_path *lnk)
{
	int ret;

	ret = begin_cmd(sctx, BTRFS_SEND_C_LINK);
	if (ret < 0)
		return ret;

	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path);
	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH_LINK, lnk);

	ret = send_cmd(sctx);

tlv_put_failure:
	return ret;
}

/*
 * Sends an unlink instruction to user space
 */
static int send_unlink(struct send_ctx *sctx, struct fs_path *path)
{
	int ret;

	ret = begin_cmd(sctx, BTRFS_SEND_C_UNLINK);
	if (ret < 0)
		return ret;

	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path);

	ret = send_cmd(sctx);

tlv_put_failure:
	return ret;
}

/*
 * Sends a rmdir instruction to user space
 */
static int send_rmdir(struct send_ctx *sctx, struct fs_path *path)
{
	int ret;

	ret = begin_cmd(sctx, BTRFS_SEND_C_RMDIR);
	if (ret < 0)
		return ret;

	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path);

	ret = send_cmd(sctx);

tlv_put_failure:
	return ret;
}
struct btrfs_inode_info {
	u64 size;
	u64 gen;
	u64 mode;
	u64 uid;
	u64 gid;
	u64 rdev;
	u64 fileattr;
	u64 nlink;
};

/*
 * Helper function to retrieve some fields from an inode item.
 */
static int get_inode_info(struct btrfs_root *root, u64 ino,
			  struct btrfs_inode_info *info)
{
	int ret;
	struct btrfs_path *path;
	struct btrfs_inode_item *ii;
	struct btrfs_key key;

	path = alloc_path_for_send();
	if (!path)
		return -ENOMEM;

	key.objectid = ino;
	key.type = BTRFS_INODE_ITEM_KEY;
	key.offset = 0;
	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret) {
		if (ret > 0)
			ret = -ENOENT;
		goto out;
	}

	if (!info)
		goto out;

	ii = btrfs_item_ptr(path->nodes[0], path->slots[0],
			    struct btrfs_inode_item);
	info->size = btrfs_inode_size(path->nodes[0], ii);
	info->gen = btrfs_inode_generation(path->nodes[0], ii);
	info->mode = btrfs_inode_mode(path->nodes[0], ii);
	info->uid = btrfs_inode_uid(path->nodes[0], ii);
	info->gid = btrfs_inode_gid(path->nodes[0], ii);
	info->rdev = btrfs_inode_rdev(path->nodes[0], ii);
	info->nlink = btrfs_inode_nlink(path->nodes[0], ii);
	/*
	 * Transfer the unchanged u64 value of btrfs_inode_item::flags, that's
	 * otherwise logically split to 32/32 parts.
	 */
	info->fileattr = btrfs_inode_flags(path->nodes[0], ii);

out:
	btrfs_free_path(path);
	return ret;
}
static int get_inode_gen(struct btrfs_root *root, u64 ino, u64 *gen)
{
	int ret;
	struct btrfs_inode_info info = { 0 };

	ASSERT(gen);

	ret = get_inode_info(root, ino, &info);
	if (!ret)
		*gen = info.gen;
	return ret;
}
typedef int (*iterate_inode_ref_t)(u64 dir, struct fs_path *p, void *ctx);

/*
 * Helper function to iterate the entries in ONE btrfs_inode_ref or
 * btrfs_inode_extref.
 * The iterate callback may return a non-zero value to stop iteration. This can
 * be a negative value for error codes or 1 to simply stop it.
 *
 * path must point to the INODE_REF or INODE_EXTREF when called.
 */
974 static int iterate_inode_ref(struct btrfs_root
*root
, struct btrfs_path
*path
,
975 struct btrfs_key
*found_key
, int resolve
,
976 iterate_inode_ref_t iterate
, void *ctx
)
978 struct extent_buffer
*eb
= path
->nodes
[0];
979 struct btrfs_inode_ref
*iref
;
980 struct btrfs_inode_extref
*extref
;
981 struct btrfs_path
*tmp_path
;
985 int slot
= path
->slots
[0];
990 unsigned long name_off
;
991 unsigned long elem_size
;
994 p
= fs_path_alloc_reversed();
998 tmp_path
= alloc_path_for_send();
1005 if (found_key
->type
== BTRFS_INODE_REF_KEY
) {
1006 ptr
= (unsigned long)btrfs_item_ptr(eb
, slot
,
1007 struct btrfs_inode_ref
);
1008 total
= btrfs_item_size(eb
, slot
);
1009 elem_size
= sizeof(*iref
);
1011 ptr
= btrfs_item_ptr_offset(eb
, slot
);
1012 total
= btrfs_item_size(eb
, slot
);
1013 elem_size
= sizeof(*extref
);
1016 while (cur
< total
) {
1019 if (found_key
->type
== BTRFS_INODE_REF_KEY
) {
1020 iref
= (struct btrfs_inode_ref
*)(ptr
+ cur
);
1021 name_len
= btrfs_inode_ref_name_len(eb
, iref
);
1022 name_off
= (unsigned long)(iref
+ 1);
1023 dir
= found_key
->offset
;
1025 extref
= (struct btrfs_inode_extref
*)(ptr
+ cur
);
1026 name_len
= btrfs_inode_extref_name_len(eb
, extref
);
1027 name_off
= (unsigned long)&extref
->name
;
1028 dir
= btrfs_inode_extref_parent(eb
, extref
);
1032 start
= btrfs_ref_to_path(root
, tmp_path
, name_len
,
1034 p
->buf
, p
->buf_len
);
1035 if (IS_ERR(start
)) {
1036 ret
= PTR_ERR(start
);
1039 if (start
< p
->buf
) {
/* Overflow, try again with a larger buffer. */
1041 ret
= fs_path_ensure_buf(p
,
1042 p
->buf_len
+ p
->buf
- start
);
1045 start
= btrfs_ref_to_path(root
, tmp_path
,
1048 p
->buf
, p
->buf_len
);
1049 if (IS_ERR(start
)) {
1050 ret
= PTR_ERR(start
);
1053 if (unlikely(start
< p
->buf
)) {
1054 btrfs_err(root
->fs_info
,
1055 "send: path ref buffer underflow for key (%llu %u %llu)",
1056 found_key
->objectid
,
1065 ret
= fs_path_add_from_extent_buffer(p
, eb
, name_off
,
1071 cur
+= elem_size
+ name_len
;
1072 ret
= iterate(dir
, p
, ctx
);
1078 btrfs_free_path(tmp_path
);
1083 typedef int (*iterate_dir_item_t
)(int num
, struct btrfs_key
*di_key
,
1084 const char *name
, int name_len
,
1085 const char *data
, int data_len
,
1089 * Helper function to iterate the entries in ONE btrfs_dir_item.
1090 * The iterate callback may return a non zero value to stop iteration. This can
1091 * be a negative value for error codes or 1 to simply stop it.
1093 * path must point to the dir item when called.
1095 static int iterate_dir_item(struct btrfs_root
*root
, struct btrfs_path
*path
,
1096 iterate_dir_item_t iterate
, void *ctx
)
1099 struct extent_buffer
*eb
;
1100 struct btrfs_dir_item
*di
;
1101 struct btrfs_key di_key
;
1113 * Start with a small buffer (1 page). If later we end up needing more
1114 * space, which can happen for xattrs on a fs with a leaf size greater
1115 * than the page size, attempt to increase the buffer. Typically xattr
1119 buf
= kmalloc(buf_len
, GFP_KERNEL
);
1125 eb
= path
->nodes
[0];
1126 slot
= path
->slots
[0];
1127 di
= btrfs_item_ptr(eb
, slot
, struct btrfs_dir_item
);
1130 total
= btrfs_item_size(eb
, slot
);
1133 while (cur
< total
) {
1134 name_len
= btrfs_dir_name_len(eb
, di
);
1135 data_len
= btrfs_dir_data_len(eb
, di
);
1136 btrfs_dir_item_key_to_cpu(eb
, di
, &di_key
);
1138 if (btrfs_dir_ftype(eb
, di
) == BTRFS_FT_XATTR
) {
1139 if (name_len
> XATTR_NAME_MAX
) {
1140 ret
= -ENAMETOOLONG
;
1143 if (name_len
+ data_len
>
1144 BTRFS_MAX_XATTR_SIZE(root
->fs_info
)) {
1152 if (name_len
+ data_len
> PATH_MAX
) {
1153 ret
= -ENAMETOOLONG
;
1158 if (name_len
+ data_len
> buf_len
) {
1159 buf_len
= name_len
+ data_len
;
1160 if (is_vmalloc_addr(buf
)) {
1164 char *tmp
= krealloc(buf
, buf_len
,
1165 GFP_KERNEL
| __GFP_NOWARN
);
1172 buf
= kvmalloc(buf_len
, GFP_KERNEL
);
1180 read_extent_buffer(eb
, buf
, (unsigned long)(di
+ 1),
1181 name_len
+ data_len
);
1183 len
= sizeof(*di
) + name_len
+ data_len
;
1184 di
= (struct btrfs_dir_item
*)((char *)di
+ len
);
1187 ret
= iterate(num
, &di_key
, buf
, name_len
, buf
+ name_len
,
1204 static int __copy_first_ref(u64 dir
, struct fs_path
*p
, void *ctx
)
1207 struct fs_path
*pt
= ctx
;
1209 ret
= fs_path_copy(pt
, p
);
1213 /* we want the first only */
 * Retrieve the first path of an inode. If an inode has more than one
1219 * ref/hardlink, this is ignored.
1221 static int get_inode_path(struct btrfs_root
*root
,
1222 u64 ino
, struct fs_path
*path
)
1225 struct btrfs_key key
, found_key
;
1226 struct btrfs_path
*p
;
1228 p
= alloc_path_for_send();
1232 fs_path_reset(path
);
1235 key
.type
= BTRFS_INODE_REF_KEY
;
1238 ret
= btrfs_search_slot_for_read(root
, &key
, p
, 1, 0);
1245 btrfs_item_key_to_cpu(p
->nodes
[0], &found_key
, p
->slots
[0]);
1246 if (found_key
.objectid
!= ino
||
1247 (found_key
.type
!= BTRFS_INODE_REF_KEY
&&
1248 found_key
.type
!= BTRFS_INODE_EXTREF_KEY
)) {
1253 ret
= iterate_inode_ref(root
, p
, &found_key
, 1,
1254 __copy_first_ref
, path
);
1264 struct backref_ctx
{
1265 struct send_ctx
*sctx
;
1267 /* number of total found references */
1271 * used for clones found in send_root. clones found behind cur_objectid
1272 * and cur_offset are not considered as allowed clones.
1277 /* may be truncated in case it's the last extent in a file */
1280 /* The bytenr the file extent item we are processing refers to. */
1282 /* The owner (root id) of the data backref for the current extent. */
1284 /* The offset of the data backref for the current extent. */
1288 static int __clone_root_cmp_bsearch(const void *key
, const void *elt
)
1290 u64 root
= (u64
)(uintptr_t)key
;
1291 const struct clone_root
*cr
= elt
;
1293 if (root
< btrfs_root_id(cr
->root
))
1295 if (root
> btrfs_root_id(cr
->root
))
1300 static int __clone_root_cmp_sort(const void *e1
, const void *e2
)
1302 const struct clone_root
*cr1
= e1
;
1303 const struct clone_root
*cr2
= e2
;
1305 if (btrfs_root_id(cr1
->root
) < btrfs_root_id(cr2
->root
))
1307 if (btrfs_root_id(cr1
->root
) > btrfs_root_id(cr2
->root
))
1313 * Called for every backref that is found for the current extent.
1314 * Results are collected in sctx->clone_roots->ino/offset.
1316 static int iterate_backrefs(u64 ino
, u64 offset
, u64 num_bytes
, u64 root_id
,
1319 struct backref_ctx
*bctx
= ctx_
;
1320 struct clone_root
*clone_root
;
1322 /* First check if the root is in the list of accepted clone sources */
1323 clone_root
= bsearch((void *)(uintptr_t)root_id
, bctx
->sctx
->clone_roots
,
1324 bctx
->sctx
->clone_roots_cnt
,
1325 sizeof(struct clone_root
),
1326 __clone_root_cmp_bsearch
);
1330 /* This is our own reference, bail out as we can't clone from it. */
1331 if (clone_root
->root
== bctx
->sctx
->send_root
&&
1332 ino
== bctx
->cur_objectid
&&
1333 offset
== bctx
->cur_offset
)
1337 * Make sure we don't consider clones from send_root that are
1338 * behind the current inode/offset.
1340 if (clone_root
->root
== bctx
->sctx
->send_root
) {
1342 * If the source inode was not yet processed we can't issue a
1343 * clone operation, as the source extent does not exist yet at
1344 * the destination of the stream.
1346 if (ino
> bctx
->cur_objectid
)
1349 * We clone from the inode currently being sent as long as the
1350 * source extent is already processed, otherwise we could try
1351 * to clone from an extent that does not exist yet at the
1352 * destination of the stream.
1354 if (ino
== bctx
->cur_objectid
&&
1355 offset
+ bctx
->extent_len
>
1356 bctx
->sctx
->cur_inode_next_write_offset
)
1361 clone_root
->found_ref
= true;
1364 * If the given backref refers to a file extent item with a larger
1365 * number of bytes than what we found before, use the new one so that
1366 * we clone more optimally and end up doing less writes and getting
1367 * less exclusive, non-shared extents at the destination.
1369 if (num_bytes
> clone_root
->num_bytes
) {
1370 clone_root
->ino
= ino
;
1371 clone_root
->offset
= offset
;
1372 clone_root
->num_bytes
= num_bytes
;
1375 * Found a perfect candidate, so there's no need to continue
1378 if (num_bytes
>= bctx
->extent_len
)
1379 return BTRFS_ITERATE_EXTENT_INODES_STOP
;
1385 static bool lookup_backref_cache(u64 leaf_bytenr
, void *ctx
,
1386 const u64
**root_ids_ret
, int *root_count_ret
)
1388 struct backref_ctx
*bctx
= ctx
;
1389 struct send_ctx
*sctx
= bctx
->sctx
;
1390 struct btrfs_fs_info
*fs_info
= sctx
->send_root
->fs_info
;
1391 const u64 key
= leaf_bytenr
>> fs_info
->sectorsize_bits
;
1392 struct btrfs_lru_cache_entry
*raw_entry
;
1393 struct backref_cache_entry
*entry
;
1395 if (sctx
->backref_cache
.size
== 0)
1399 * If relocation happened since we first filled the cache, then we must
1400 * empty the cache and can not use it, because even though we operate on
1401 * read-only roots, their leaves and nodes may have been reallocated and
1402 * now be used for different nodes/leaves of the same tree or some other
1405 * We are called from iterate_extent_inodes() while either holding a
1406 * transaction handle or holding fs_info->commit_root_sem, so no need
1407 * to take any lock here.
1409 if (fs_info
->last_reloc_trans
> sctx
->backref_cache_last_reloc_trans
) {
1410 btrfs_lru_cache_clear(&sctx
->backref_cache
);
1414 raw_entry
= btrfs_lru_cache_lookup(&sctx
->backref_cache
, key
, 0);
1418 entry
= container_of(raw_entry
, struct backref_cache_entry
, entry
);
1419 *root_ids_ret
= entry
->root_ids
;
1420 *root_count_ret
= entry
->num_roots
;
1425 static void store_backref_cache(u64 leaf_bytenr
, const struct ulist
*root_ids
,
1428 struct backref_ctx
*bctx
= ctx
;
1429 struct send_ctx
*sctx
= bctx
->sctx
;
1430 struct btrfs_fs_info
*fs_info
= sctx
->send_root
->fs_info
;
1431 struct backref_cache_entry
*new_entry
;
1432 struct ulist_iterator uiter
;
1433 struct ulist_node
*node
;
1437 * We're called while holding a transaction handle or while holding
1438 * fs_info->commit_root_sem (at iterate_extent_inodes()), so must do a
1441 new_entry
= kmalloc(sizeof(struct backref_cache_entry
), GFP_NOFS
);
1442 /* No worries, cache is optional. */
1446 new_entry
->entry
.key
= leaf_bytenr
>> fs_info
->sectorsize_bits
;
1447 new_entry
->entry
.gen
= 0;
1448 new_entry
->num_roots
= 0;
1449 ULIST_ITER_INIT(&uiter
);
1450 while ((node
= ulist_next(root_ids
, &uiter
)) != NULL
) {
1451 const u64 root_id
= node
->val
;
1452 struct clone_root
*root
;
1454 root
= bsearch((void *)(uintptr_t)root_id
, sctx
->clone_roots
,
1455 sctx
->clone_roots_cnt
, sizeof(struct clone_root
),
1456 __clone_root_cmp_bsearch
);
1460 /* Too many roots, just exit, no worries as caching is optional. */
1461 if (new_entry
->num_roots
>= SEND_MAX_BACKREF_CACHE_ROOTS
) {
1466 new_entry
->root_ids
[new_entry
->num_roots
] = root_id
;
1467 new_entry
->num_roots
++;
1471 * We may have not added any roots to the new cache entry, which means
1472 * none of the roots is part of the list of roots from which we are
1473 * allowed to clone. Cache the new entry as it's still useful to avoid
1474 * backref walking to determine which roots have a path to the leaf.
1476 * Also use GFP_NOFS because we're called while holding a transaction
1477 * handle or while holding fs_info->commit_root_sem.
1479 ret
= btrfs_lru_cache_store(&sctx
->backref_cache
, &new_entry
->entry
,
1481 ASSERT(ret
== 0 || ret
== -ENOMEM
);
1483 /* Caching is optional, no worries. */
1489 * We are called from iterate_extent_inodes() while either holding a
1490 * transaction handle or holding fs_info->commit_root_sem, so no need
1491 * to take any lock here.
1493 if (sctx
->backref_cache
.size
== 1)
1494 sctx
->backref_cache_last_reloc_trans
= fs_info
->last_reloc_trans
;
1497 static int check_extent_item(u64 bytenr
, const struct btrfs_extent_item
*ei
,
1498 const struct extent_buffer
*leaf
, void *ctx
)
1500 const u64 refs
= btrfs_extent_refs(leaf
, ei
);
1501 const struct backref_ctx
*bctx
= ctx
;
1502 const struct send_ctx
*sctx
= bctx
->sctx
;
1504 if (bytenr
== bctx
->bytenr
) {
1505 const u64 flags
= btrfs_extent_flags(leaf
, ei
);
1507 if (WARN_ON(flags
& BTRFS_EXTENT_FLAG_TREE_BLOCK
))
1511 * If we have only one reference and only the send root as a
1512 * clone source - meaning no clone roots were given in the
1513 * struct btrfs_ioctl_send_args passed to the send ioctl - then
1514 * it's our reference and there's no point in doing backref
1515 * walking which is expensive, so exit early.
1517 if (refs
== 1 && sctx
->clone_roots_cnt
== 1)
1522 * Backreference walking (iterate_extent_inodes() below) is currently
1523 * too expensive when an extent has a large number of references, both
1524 * in time spent and used memory. So for now just fallback to write
1525 * operations instead of clone operations when an extent has more than
1526 * a certain amount of references.
1528 if (refs
> SEND_MAX_EXTENT_REFS
)
1534 static bool skip_self_data_ref(u64 root
, u64 ino
, u64 offset
, void *ctx
)
1536 const struct backref_ctx
*bctx
= ctx
;
1538 if (ino
== bctx
->cur_objectid
&&
1539 root
== bctx
->backref_owner
&&
1540 offset
== bctx
->backref_offset
)
1547 * Given an inode, offset and extent item, it finds a good clone for a clone
1548 * instruction. Returns -ENOENT when none could be found. The function makes
1549 * sure that the returned clone is usable at the point where sending is at the
1550 * moment. This means, that no clones are accepted which lie behind the current
1553 * path must point to the extent item when called.
1555 static int find_extent_clone(struct send_ctx
*sctx
,
1556 struct btrfs_path
*path
,
1557 u64 ino
, u64 data_offset
,
1559 struct clone_root
**found
)
1561 struct btrfs_fs_info
*fs_info
= sctx
->send_root
->fs_info
;
1566 struct btrfs_file_extent_item
*fi
;
1567 struct extent_buffer
*eb
= path
->nodes
[0];
1568 struct backref_ctx backref_ctx
= { 0 };
1569 struct btrfs_backref_walk_ctx backref_walk_ctx
= { 0 };
1570 struct clone_root
*cur_clone_root
;
1575 * With fallocate we can get prealloc extents beyond the inode's i_size,
1576 * so we don't do anything here because clone operations can not clone
1577 * to a range beyond i_size without increasing the i_size of the
1578 * destination inode.
1580 if (data_offset
>= ino_size
)
1583 fi
= btrfs_item_ptr(eb
, path
->slots
[0], struct btrfs_file_extent_item
);
1584 extent_type
= btrfs_file_extent_type(eb
, fi
);
1585 if (extent_type
== BTRFS_FILE_EXTENT_INLINE
)
1588 disk_byte
= btrfs_file_extent_disk_bytenr(eb
, fi
);
1592 compressed
= btrfs_file_extent_compression(eb
, fi
);
1593 num_bytes
= btrfs_file_extent_num_bytes(eb
, fi
);
1596 * Setup the clone roots.
1598 for (i
= 0; i
< sctx
->clone_roots_cnt
; i
++) {
1599 cur_clone_root
= sctx
->clone_roots
+ i
;
1600 cur_clone_root
->ino
= (u64
)-1;
1601 cur_clone_root
->offset
= 0;
1602 cur_clone_root
->num_bytes
= 0;
1603 cur_clone_root
->found_ref
= false;
1606 backref_ctx
.sctx
= sctx
;
1607 backref_ctx
.cur_objectid
= ino
;
1608 backref_ctx
.cur_offset
= data_offset
;
1609 backref_ctx
.bytenr
= disk_byte
;
1611 * Use the header owner and not the send root's id, because in case of a
1612 * snapshot we can have shared subtrees.
1614 backref_ctx
.backref_owner
= btrfs_header_owner(eb
);
1615 backref_ctx
.backref_offset
= data_offset
- btrfs_file_extent_offset(eb
, fi
);
1618 * The last extent of a file may be too large due to page alignment.
1619 * We need to adjust extent_len in this case so that the checks in
1620 * iterate_backrefs() work.
1622 if (data_offset
+ num_bytes
>= ino_size
)
1623 backref_ctx
.extent_len
= ino_size
- data_offset
;
1625 backref_ctx
.extent_len
= num_bytes
;
1628 * Now collect all backrefs.
1630 backref_walk_ctx
.bytenr
= disk_byte
;
1631 if (compressed
== BTRFS_COMPRESS_NONE
)
1632 backref_walk_ctx
.extent_item_pos
= btrfs_file_extent_offset(eb
, fi
);
1633 backref_walk_ctx
.fs_info
= fs_info
;
1634 backref_walk_ctx
.cache_lookup
= lookup_backref_cache
;
1635 backref_walk_ctx
.cache_store
= store_backref_cache
;
1636 backref_walk_ctx
.indirect_ref_iterator
= iterate_backrefs
;
1637 backref_walk_ctx
.check_extent_item
= check_extent_item
;
1638 backref_walk_ctx
.user_ctx
= &backref_ctx
;
 * If we have a single clone root, then it's the send root and we can tell
1642 * the backref walking code to skip our own backref and not resolve it,
1643 * since we can not use it for cloning - the source and destination
1644 * ranges can't overlap and in case the leaf is shared through a subtree
1645 * due to snapshots, we can't use those other roots since they are not
1646 * in the list of clone roots.
1648 if (sctx
->clone_roots_cnt
== 1)
1649 backref_walk_ctx
.skip_data_ref
= skip_self_data_ref
;
1651 ret
= iterate_extent_inodes(&backref_walk_ctx
, true, iterate_backrefs
,
1656 down_read(&fs_info
->commit_root_sem
);
1657 if (fs_info
->last_reloc_trans
> sctx
->last_reloc_trans
) {
1659 * A transaction commit for a transaction in which block group
1660 * relocation was done just happened.
1661 * The disk_bytenr of the file extent item we processed is
1662 * possibly stale, referring to the extent's location before
1663 * relocation. So act as if we haven't found any clone sources
1664 * and fallback to write commands, which will read the correct
1665 * data from the new extent location. Otherwise we will fail
1666 * below because we haven't found our own back reference or we
1667 * could be getting incorrect sources in case the old extent
1668 * was already reallocated after the relocation.
1670 up_read(&fs_info
->commit_root_sem
);
1673 up_read(&fs_info
->commit_root_sem
);
1675 if (!backref_ctx
.found
)
1678 cur_clone_root
= NULL
;
1679 for (i
= 0; i
< sctx
->clone_roots_cnt
; i
++) {
1680 struct clone_root
*clone_root
= &sctx
->clone_roots
[i
];
1682 if (!clone_root
->found_ref
)
1686 * Choose the root from which we can clone more bytes, to
1687 * minimize write operations and therefore have more extent
1688 * sharing at the destination (the same as in the source).
1690 if (!cur_clone_root
||
1691 clone_root
->num_bytes
> cur_clone_root
->num_bytes
) {
1692 cur_clone_root
= clone_root
;
1695 * We found an optimal clone candidate (any inode from
1696 * any root is fine), so we're done.
1698 if (clone_root
->num_bytes
>= backref_ctx
.extent_len
)
1703 if (cur_clone_root
) {
1704 *found
= cur_clone_root
;
1713 static int read_symlink(struct btrfs_root
*root
,
1715 struct fs_path
*dest
)
1718 struct btrfs_path
*path
;
1719 struct btrfs_key key
;
1720 struct btrfs_file_extent_item
*ei
;
1726 path
= alloc_path_for_send();
1731 key
.type
= BTRFS_EXTENT_DATA_KEY
;
1733 ret
= btrfs_search_slot(NULL
, root
, &key
, path
, 0, 0);
1738 * An empty symlink inode. Can happen in rare error paths when
1739 * creating a symlink (transaction committed before the inode
1740 * eviction handler removed the symlink inode items and a crash
 * happened in between or the subvol was snapshotted in between).
1742 * Print an informative message to dmesg/syslog so that the user
1743 * can delete the symlink.
1745 btrfs_err(root
->fs_info
,
1746 "Found empty symlink inode %llu at root %llu",
1747 ino
, btrfs_root_id(root
));
1752 ei
= btrfs_item_ptr(path
->nodes
[0], path
->slots
[0],
1753 struct btrfs_file_extent_item
);
1754 type
= btrfs_file_extent_type(path
->nodes
[0], ei
);
1755 if (unlikely(type
!= BTRFS_FILE_EXTENT_INLINE
)) {
1757 btrfs_crit(root
->fs_info
,
1758 "send: found symlink extent that is not inline, ino %llu root %llu extent type %d",
1759 ino
, btrfs_root_id(root
), type
);
1762 compression
= btrfs_file_extent_compression(path
->nodes
[0], ei
);
1763 if (unlikely(compression
!= BTRFS_COMPRESS_NONE
)) {
1765 btrfs_crit(root
->fs_info
,
1766 "send: found symlink extent with compression, ino %llu root %llu compression type %d",
1767 ino
, btrfs_root_id(root
), compression
);
1771 off
= btrfs_file_extent_inline_start(ei
);
1772 len
= btrfs_file_extent_ram_bytes(path
->nodes
[0], ei
);
1774 ret
= fs_path_add_from_extent_buffer(dest
, path
->nodes
[0], off
, len
);
1777 btrfs_free_path(path
);
1782 * Helper function to generate a file name that is unique in the root of
1783 * send_root and parent_root. This is used to generate names for orphan inodes.
1785 static int gen_unique_name(struct send_ctx
*sctx
,
1787 struct fs_path
*dest
)
1790 struct btrfs_path
*path
;
1791 struct btrfs_dir_item
*di
;
1796 path
= alloc_path_for_send();
1801 struct fscrypt_str tmp_name
;
1803 len
= snprintf(tmp
, sizeof(tmp
), "o%llu-%llu-%llu",
1805 ASSERT(len
< sizeof(tmp
));
1806 tmp_name
.name
= tmp
;
1807 tmp_name
.len
= strlen(tmp
);
1809 di
= btrfs_lookup_dir_item(NULL
, sctx
->send_root
,
1810 path
, BTRFS_FIRST_FREE_OBJECTID
,
1812 btrfs_release_path(path
);
1818 /* not unique, try again */
1823 if (!sctx
->parent_root
) {
1829 di
= btrfs_lookup_dir_item(NULL
, sctx
->parent_root
,
1830 path
, BTRFS_FIRST_FREE_OBJECTID
,
1832 btrfs_release_path(path
);
1838 /* not unique, try again */
1846 ret
= fs_path_add(dest
, tmp
, strlen(tmp
));
1849 btrfs_free_path(path
);
1854 inode_state_no_change
,
1855 inode_state_will_create
,
1856 inode_state_did_create
,
1857 inode_state_will_delete
,
1858 inode_state_did_delete
,
1861 static int get_cur_inode_state(struct send_ctx
*sctx
, u64 ino
, u64 gen
,
1862 u64
*send_gen
, u64
*parent_gen
)
1869 struct btrfs_inode_info info
;
1871 ret
= get_inode_info(sctx
->send_root
, ino
, &info
);
1872 if (ret
< 0 && ret
!= -ENOENT
)
1874 left_ret
= (info
.nlink
== 0) ? -ENOENT
: ret
;
1875 left_gen
= info
.gen
;
1877 *send_gen
= ((left_ret
== -ENOENT
) ? 0 : info
.gen
);
1879 if (!sctx
->parent_root
) {
1880 right_ret
= -ENOENT
;
1882 ret
= get_inode_info(sctx
->parent_root
, ino
, &info
);
1883 if (ret
< 0 && ret
!= -ENOENT
)
1885 right_ret
= (info
.nlink
== 0) ? -ENOENT
: ret
;
1886 right_gen
= info
.gen
;
1888 *parent_gen
= ((right_ret
== -ENOENT
) ? 0 : info
.gen
);
1891 if (!left_ret
&& !right_ret
) {
1892 if (left_gen
== gen
&& right_gen
== gen
) {
1893 ret
= inode_state_no_change
;
1894 } else if (left_gen
== gen
) {
1895 if (ino
< sctx
->send_progress
)
1896 ret
= inode_state_did_create
;
1898 ret
= inode_state_will_create
;
1899 } else if (right_gen
== gen
) {
1900 if (ino
< sctx
->send_progress
)
1901 ret
= inode_state_did_delete
;
1903 ret
= inode_state_will_delete
;
1907 } else if (!left_ret
) {
1908 if (left_gen
== gen
) {
1909 if (ino
< sctx
->send_progress
)
1910 ret
= inode_state_did_create
;
1912 ret
= inode_state_will_create
;
1916 } else if (!right_ret
) {
1917 if (right_gen
== gen
) {
1918 if (ino
< sctx
->send_progress
)
1919 ret
= inode_state_did_delete
;
1921 ret
= inode_state_will_delete
;
1932 static int is_inode_existent(struct send_ctx
*sctx
, u64 ino
, u64 gen
,
1933 u64
*send_gen
, u64
*parent_gen
)
1937 if (ino
== BTRFS_FIRST_FREE_OBJECTID
)
1940 ret
= get_cur_inode_state(sctx
, ino
, gen
, send_gen
, parent_gen
);
1944 if (ret
== inode_state_no_change
||
1945 ret
== inode_state_did_create
||
1946 ret
== inode_state_will_delete
)
1953 * Helper function to lookup a dir item in a dir.
1955 static int lookup_dir_item_inode(struct btrfs_root
*root
,
1956 u64 dir
, const char *name
, int name_len
,
1960 struct btrfs_dir_item
*di
;
1961 struct btrfs_key key
;
1962 struct btrfs_path
*path
;
1963 struct fscrypt_str name_str
= FSTR_INIT((char *)name
, name_len
);
1965 path
= alloc_path_for_send();
1969 di
= btrfs_lookup_dir_item(NULL
, root
, path
, dir
, &name_str
, 0);
1970 if (IS_ERR_OR_NULL(di
)) {
1971 ret
= di
? PTR_ERR(di
) : -ENOENT
;
1974 btrfs_dir_item_key_to_cpu(path
->nodes
[0], di
, &key
);
1975 if (key
.type
== BTRFS_ROOT_ITEM_KEY
) {
1979 *found_inode
= key
.objectid
;
1982 btrfs_free_path(path
);
1987 * Looks up the first btrfs_inode_ref of a given ino. It returns the parent dir,
1988 * generation of the parent dir and the name of the dir entry.
1990 static int get_first_ref(struct btrfs_root
*root
, u64 ino
,
1991 u64
*dir
, u64
*dir_gen
, struct fs_path
*name
)
1994 struct btrfs_key key
;
1995 struct btrfs_key found_key
;
1996 struct btrfs_path
*path
;
2000 path
= alloc_path_for_send();
2005 key
.type
= BTRFS_INODE_REF_KEY
;
2008 ret
= btrfs_search_slot_for_read(root
, &key
, path
, 1, 0);
2012 btrfs_item_key_to_cpu(path
->nodes
[0], &found_key
,
2014 if (ret
|| found_key
.objectid
!= ino
||
2015 (found_key
.type
!= BTRFS_INODE_REF_KEY
&&
2016 found_key
.type
!= BTRFS_INODE_EXTREF_KEY
)) {
2021 if (found_key
.type
== BTRFS_INODE_REF_KEY
) {
2022 struct btrfs_inode_ref
*iref
;
2023 iref
= btrfs_item_ptr(path
->nodes
[0], path
->slots
[0],
2024 struct btrfs_inode_ref
);
2025 len
= btrfs_inode_ref_name_len(path
->nodes
[0], iref
);
2026 ret
= fs_path_add_from_extent_buffer(name
, path
->nodes
[0],
2027 (unsigned long)(iref
+ 1),
2029 parent_dir
= found_key
.offset
;
2031 struct btrfs_inode_extref
*extref
;
2032 extref
= btrfs_item_ptr(path
->nodes
[0], path
->slots
[0],
2033 struct btrfs_inode_extref
);
2034 len
= btrfs_inode_extref_name_len(path
->nodes
[0], extref
);
2035 ret
= fs_path_add_from_extent_buffer(name
, path
->nodes
[0],
2036 (unsigned long)&extref
->name
, len
);
2037 parent_dir
= btrfs_inode_extref_parent(path
->nodes
[0], extref
);
2041 btrfs_release_path(path
);
2044 ret
= get_inode_gen(root
, parent_dir
, dir_gen
);
2052 btrfs_free_path(path
);
2056 static int is_first_ref(struct btrfs_root
*root
,
2058 const char *name
, int name_len
)
2061 struct fs_path
*tmp_name
;
2064 tmp_name
= fs_path_alloc();
2068 ret
= get_first_ref(root
, ino
, &tmp_dir
, NULL
, tmp_name
);
2072 if (dir
!= tmp_dir
|| name_len
!= fs_path_len(tmp_name
)) {
2077 ret
= !memcmp(tmp_name
->start
, name
, name_len
);
2080 fs_path_free(tmp_name
);
2085 * Used by process_recorded_refs to determine if a new ref would overwrite an
2086 * already existing ref. In case it detects an overwrite, it returns the
2087 * inode/gen in who_ino/who_gen.
2088 * When an overwrite is detected, process_recorded_refs does proper orphanizing
2089 * to make sure later references to the overwritten inode are possible.
2090 * Orphanizing is however only required for the first ref of an inode.
2091 * process_recorded_refs does an additional is_first_ref check to see if
2092 * orphanizing is really required.
2094 static int will_overwrite_ref(struct send_ctx
*sctx
, u64 dir
, u64 dir_gen
,
2095 const char *name
, int name_len
,
2096 u64
*who_ino
, u64
*who_gen
, u64
*who_mode
)
2099 u64 parent_root_dir_gen
;
2100 u64 other_inode
= 0;
2101 struct btrfs_inode_info info
;
2103 if (!sctx
->parent_root
)
2106 ret
= is_inode_existent(sctx
, dir
, dir_gen
, NULL
, &parent_root_dir_gen
);
2111 * If we have a parent root we need to verify that the parent dir was
2112 * not deleted and then re-created, if it was then we have no overwrite
2113 * and we can just unlink this entry.
2115 * @parent_root_dir_gen was set to 0 if the inode does not exist in the
2118 if (sctx
->parent_root
&& dir
!= BTRFS_FIRST_FREE_OBJECTID
&&
2119 parent_root_dir_gen
!= dir_gen
)
2122 ret
= lookup_dir_item_inode(sctx
->parent_root
, dir
, name
, name_len
,
2130 * Check if the overwritten ref was already processed. If yes, the ref
2131 * was already unlinked/moved, so we can safely assume that we will not
2132 * overwrite anything at this point in time.
2134 if (other_inode
> sctx
->send_progress
||
2135 is_waiting_for_move(sctx
, other_inode
)) {
2136 ret
= get_inode_info(sctx
->parent_root
, other_inode
, &info
);
2140 *who_ino
= other_inode
;
2141 *who_gen
= info
.gen
;
2142 *who_mode
= info
.mode
;
2150 * Checks if the ref was overwritten by an already processed inode. This is
2151 * used by __get_cur_name_and_parent to find out if the ref was orphanized and
 * thus the orphan name needs to be used.
2153 * process_recorded_refs also uses it to avoid unlinking of refs that were
2156 static int did_overwrite_ref(struct send_ctx
*sctx
,
2157 u64 dir
, u64 dir_gen
,
2158 u64 ino
, u64 ino_gen
,
2159 const char *name
, int name_len
)
2164 u64 send_root_dir_gen
;
2166 if (!sctx
->parent_root
)
2169 ret
= is_inode_existent(sctx
, dir
, dir_gen
, &send_root_dir_gen
, NULL
);
2174 * @send_root_dir_gen was set to 0 if the inode does not exist in the
2177 if (dir
!= BTRFS_FIRST_FREE_OBJECTID
&& send_root_dir_gen
!= dir_gen
)
2180 /* check if the ref was overwritten by another ref */
2181 ret
= lookup_dir_item_inode(sctx
->send_root
, dir
, name
, name_len
,
2183 if (ret
== -ENOENT
) {
2184 /* was never and will never be overwritten */
2186 } else if (ret
< 0) {
2190 if (ow_inode
== ino
) {
2191 ret
= get_inode_gen(sctx
->send_root
, ow_inode
, &ow_gen
);
2195 /* It's the same inode, so no overwrite happened. */
2196 if (ow_gen
== ino_gen
)
2201 * We know that it is or will be overwritten. Check this now.
2202 * The current inode being processed might have been the one that caused
2203 * inode 'ino' to be orphanized, therefore check if ow_inode matches
2204 * the current inode being processed.
2206 if (ow_inode
< sctx
->send_progress
)
2209 if (ino
!= sctx
->cur_ino
&& ow_inode
== sctx
->cur_ino
) {
2211 ret
= get_inode_gen(sctx
->send_root
, ow_inode
, &ow_gen
);
2215 if (ow_gen
== sctx
->cur_inode_gen
)
2223 * Same as did_overwrite_ref, but also checks if it is the first ref of an inode
2224 * that got overwritten. This is used by process_recorded_refs to determine
2225 * if it has to use the path as returned by get_cur_path or the orphan name.
2227 static int did_overwrite_first_ref(struct send_ctx
*sctx
, u64 ino
, u64 gen
)
2230 struct fs_path
*name
= NULL
;
2234 if (!sctx
->parent_root
)
2237 name
= fs_path_alloc();
2241 ret
= get_first_ref(sctx
->parent_root
, ino
, &dir
, &dir_gen
, name
);
2245 ret
= did_overwrite_ref(sctx
, dir
, dir_gen
, ino
, gen
,
2246 name
->start
, fs_path_len(name
));
2253 static inline struct name_cache_entry
*name_cache_search(struct send_ctx
*sctx
,
2256 struct btrfs_lru_cache_entry
*entry
;
2258 entry
= btrfs_lru_cache_lookup(&sctx
->name_cache
, ino
, gen
);
2262 return container_of(entry
, struct name_cache_entry
, entry
);
2266 * Used by get_cur_path for each ref up to the root.
2267 * Returns 0 if it succeeded.
2268 * Returns 1 if the inode is not existent or got overwritten. In that case, the
2269 * name is an orphan name. This instructs get_cur_path to stop iterating. If 1
2270 * is returned, parent_ino/parent_gen are not guaranteed to be valid.
2271 * Returns <0 in case of error.
2273 static int __get_cur_name_and_parent(struct send_ctx
*sctx
,
2277 struct fs_path
*dest
)
2281 struct name_cache_entry
*nce
;
2284 * First check if we already did a call to this function with the same
2285 * ino/gen. If yes, check if the cache entry is still up-to-date. If yes
2286 * return the cached result.
2288 nce
= name_cache_search(sctx
, ino
, gen
);
2290 if (ino
< sctx
->send_progress
&& nce
->need_later_update
) {
2291 btrfs_lru_cache_remove(&sctx
->name_cache
, &nce
->entry
);
2294 *parent_ino
= nce
->parent_ino
;
2295 *parent_gen
= nce
->parent_gen
;
2296 ret
= fs_path_add(dest
, nce
->name
, nce
->name_len
);
2304 * If the inode is not existent yet, add the orphan name and return 1.
2305 * This should only happen for the parent dir that we determine in
2306 * record_new_ref_if_needed().
2308 ret
= is_inode_existent(sctx
, ino
, gen
, NULL
, NULL
);
2313 ret
= gen_unique_name(sctx
, ino
, gen
, dest
);
2321 * Depending on whether the inode was already processed or not, use
2322 * send_root or parent_root for ref lookup.
2324 if (ino
< sctx
->send_progress
)
2325 ret
= get_first_ref(sctx
->send_root
, ino
,
2326 parent_ino
, parent_gen
, dest
);
2328 ret
= get_first_ref(sctx
->parent_root
, ino
,
2329 parent_ino
, parent_gen
, dest
);
2334 * Check if the ref was overwritten by an inode's ref that was processed
2335 * earlier. If yes, treat as orphan and return 1.
2337 ret
= did_overwrite_ref(sctx
, *parent_ino
, *parent_gen
, ino
, gen
,
2338 dest
->start
, fs_path_len(dest
));
2342 fs_path_reset(dest
);
2343 ret
= gen_unique_name(sctx
, ino
, gen
, dest
);
2351 * Store the result of the lookup in the name cache.
2353 nce
= kmalloc(sizeof(*nce
) + fs_path_len(dest
), GFP_KERNEL
);
2357 nce
->entry
.key
= ino
;
2358 nce
->entry
.gen
= gen
;
2359 nce
->parent_ino
= *parent_ino
;
2360 nce
->parent_gen
= *parent_gen
;
2361 nce
->name_len
= fs_path_len(dest
);
2363 memcpy(nce
->name
, dest
->start
, nce
->name_len
);
2365 if (ino
< sctx
->send_progress
)
2366 nce
->need_later_update
= 0;
2368 nce
->need_later_update
= 1;
2370 nce_ret
= btrfs_lru_cache_store(&sctx
->name_cache
, &nce
->entry
, GFP_KERNEL
);
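/*
 * In other words, the name cache above memoizes the result of resolving one
 * path component: for a given (inode, generation) it stores the parent
 * directory, the parent's generation and the component name, so building the
 * same path again avoids repeating the tree searches done by get_first_ref().
 */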
2380 * Magic happens here. This function returns the first ref to an inode as it
2381 * would look like while receiving the stream at this point in time.
2382 * We walk the path up to the root. For every inode in between, we check if it
2383 * was already processed/sent. If yes, we continue with the parent as found
2384 * in send_root. If not, we continue with the parent as found in parent_root.
2385 * If we encounter an inode that was deleted at this point in time, we use the
 * inode's "orphan" name instead of the real name and stop. Same with new inodes
2387 * that were not created yet and overwritten inodes/refs.
2389 * When do we have orphan inodes:
2390 * 1. When an inode is freshly created and thus no valid refs are available yet
 * 2. When a directory lost all its refs (deleted) but still has dir items
2392 * inside which were not processed yet (pending for move/delete). If anyone
2393 * tried to get the path to the dir items, it would get a path inside that
2395 * 3. When an inode is moved around or gets new links, it may overwrite the ref
2396 * of an unprocessed inode. If in that case the first ref would be
2397 * overwritten, the overwritten inode gets "orphanized". Later when we
2398 * process this overwritten inode, it is restored at a new place by moving
2401 * sctx->send_progress tells this function at which point in time receiving
2404 static int get_cur_path(struct send_ctx
*sctx
, u64 ino
, u64 gen
,
2405 struct fs_path
*dest
)
2408 struct fs_path
*name
= NULL
;
2409 u64 parent_inode
= 0;
2412 const bool is_cur_inode
= (ino
== sctx
->cur_ino
&& gen
== sctx
->cur_inode_gen
);
2414 if (is_cur_inode
&& fs_path_len(&sctx
->cur_inode_path
) > 0) {
2415 if (dest
!= &sctx
->cur_inode_path
)
2416 return fs_path_copy(dest
, &sctx
->cur_inode_path
);
2421 name
= fs_path_alloc();
2428 fs_path_reset(dest
);
2430 while (!stop
&& ino
!= BTRFS_FIRST_FREE_OBJECTID
) {
2431 struct waiting_dir_move
*wdm
;
2433 fs_path_reset(name
);
2435 if (is_waiting_for_rm(sctx
, ino
, gen
)) {
2436 ret
= gen_unique_name(sctx
, ino
, gen
, name
);
2439 ret
= fs_path_add_path(dest
, name
);
2443 wdm
= get_waiting_dir_move(sctx
, ino
);
2444 if (wdm
&& wdm
->orphanized
) {
2445 ret
= gen_unique_name(sctx
, ino
, gen
, name
);
2448 ret
= get_first_ref(sctx
->parent_root
, ino
,
2449 &parent_inode
, &parent_gen
, name
);
2451 ret
= __get_cur_name_and_parent(sctx
, ino
, gen
,
2461 ret
= fs_path_add_path(dest
, name
);
2472 fs_path_unreverse(dest
);
2473 if (is_cur_inode
&& dest
!= &sctx
->cur_inode_path
)
2474 ret
= fs_path_copy(&sctx
->cur_inode_path
, dest
);
2481 * Sends a BTRFS_SEND_C_SUBVOL command/item to userspace
2483 static int send_subvol_begin(struct send_ctx
*sctx
)
2486 struct btrfs_root
*send_root
= sctx
->send_root
;
2487 struct btrfs_root
*parent_root
= sctx
->parent_root
;
2488 struct btrfs_path
*path
;
2489 struct btrfs_key key
;
2490 struct btrfs_root_ref
*ref
;
2491 struct extent_buffer
*leaf
;
2495 path
= btrfs_alloc_path();
2499 name
= kmalloc(BTRFS_PATH_NAME_MAX
, GFP_KERNEL
);
2501 btrfs_free_path(path
);
2505 key
.objectid
= btrfs_root_id(send_root
);
2506 key
.type
= BTRFS_ROOT_BACKREF_KEY
;
2509 ret
= btrfs_search_slot_for_read(send_root
->fs_info
->tree_root
,
2518 leaf
= path
->nodes
[0];
2519 btrfs_item_key_to_cpu(leaf
, &key
, path
->slots
[0]);
2520 if (key
.type
!= BTRFS_ROOT_BACKREF_KEY
||
2521 key
.objectid
!= btrfs_root_id(send_root
)) {
2525 ref
= btrfs_item_ptr(leaf
, path
->slots
[0], struct btrfs_root_ref
);
2526 namelen
= btrfs_root_ref_name_len(leaf
, ref
);
2527 read_extent_buffer(leaf
, name
, (unsigned long)(ref
+ 1), namelen
);
2528 btrfs_release_path(path
);
2531 ret
= begin_cmd(sctx
, BTRFS_SEND_C_SNAPSHOT
);
2535 ret
= begin_cmd(sctx
, BTRFS_SEND_C_SUBVOL
);
2540 TLV_PUT_STRING(sctx
, BTRFS_SEND_A_PATH
, name
, namelen
);
2542 if (!btrfs_is_empty_uuid(sctx
->send_root
->root_item
.received_uuid
))
2543 TLV_PUT_UUID(sctx
, BTRFS_SEND_A_UUID
,
2544 sctx
->send_root
->root_item
.received_uuid
);
2546 TLV_PUT_UUID(sctx
, BTRFS_SEND_A_UUID
,
2547 sctx
->send_root
->root_item
.uuid
);
2549 TLV_PUT_U64(sctx
, BTRFS_SEND_A_CTRANSID
,
2550 btrfs_root_ctransid(&sctx
->send_root
->root_item
));
2552 if (!btrfs_is_empty_uuid(parent_root
->root_item
.received_uuid
))
2553 TLV_PUT_UUID(sctx
, BTRFS_SEND_A_CLONE_UUID
,
2554 parent_root
->root_item
.received_uuid
);
2556 TLV_PUT_UUID(sctx
, BTRFS_SEND_A_CLONE_UUID
,
2557 parent_root
->root_item
.uuid
);
2558 TLV_PUT_U64(sctx
, BTRFS_SEND_A_CLONE_CTRANSID
,
2559 btrfs_root_ctransid(&sctx
->parent_root
->root_item
));
2562 ret
= send_cmd(sctx
);
2566 btrfs_free_path(path
);
2571 static struct fs_path
*get_cur_inode_path(struct send_ctx
*sctx
)
2573 if (fs_path_len(&sctx
->cur_inode_path
) == 0) {
2576 ret
= get_cur_path(sctx
, sctx
->cur_ino
, sctx
->cur_inode_gen
,
2577 &sctx
->cur_inode_path
);
2579 return ERR_PTR(ret
);
2582 return &sctx
->cur_inode_path
;
2585 static struct fs_path
*get_path_for_command(struct send_ctx
*sctx
, u64 ino
, u64 gen
)
2587 struct fs_path
*path
;
2590 if (ino
== sctx
->cur_ino
&& gen
== sctx
->cur_inode_gen
)
2591 return get_cur_inode_path(sctx
);
2593 path
= fs_path_alloc();
2595 return ERR_PTR(-ENOMEM
);
2597 ret
= get_cur_path(sctx
, ino
, gen
, path
);
2600 return ERR_PTR(ret
);
2606 static void free_path_for_command(const struct send_ctx
*sctx
, struct fs_path
*path
)
2608 if (path
!= &sctx
->cur_inode_path
)
2612 static int send_truncate(struct send_ctx
*sctx
, u64 ino
, u64 gen
, u64 size
)
2617 p
= get_path_for_command(sctx
, ino
, gen
);
2621 ret
= begin_cmd(sctx
, BTRFS_SEND_C_TRUNCATE
);
2625 TLV_PUT_PATH(sctx
, BTRFS_SEND_A_PATH
, p
);
2626 TLV_PUT_U64(sctx
, BTRFS_SEND_A_SIZE
, size
);
2628 ret
= send_cmd(sctx
);
2632 free_path_for_command(sctx
, p
);
2636 static int send_chmod(struct send_ctx
*sctx
, u64 ino
, u64 gen
, u64 mode
)
2641 p
= get_path_for_command(sctx
, ino
, gen
);
2645 ret
= begin_cmd(sctx
, BTRFS_SEND_C_CHMOD
);
2649 TLV_PUT_PATH(sctx
, BTRFS_SEND_A_PATH
, p
);
2650 TLV_PUT_U64(sctx
, BTRFS_SEND_A_MODE
, mode
& 07777);
2652 ret
= send_cmd(sctx
);
2656 free_path_for_command(sctx
, p
);
2660 static int send_fileattr(struct send_ctx
*sctx
, u64 ino
, u64 gen
, u64 fileattr
)
2665 if (sctx
->proto
< 2)
2668 p
= get_path_for_command(sctx
, ino
, gen
);
2672 ret
= begin_cmd(sctx
, BTRFS_SEND_C_FILEATTR
);
2676 TLV_PUT_PATH(sctx
, BTRFS_SEND_A_PATH
, p
);
2677 TLV_PUT_U64(sctx
, BTRFS_SEND_A_FILEATTR
, fileattr
);
2679 ret
= send_cmd(sctx
);
2683 free_path_for_command(sctx
, p
);
2687 static int send_chown(struct send_ctx
*sctx
, u64 ino
, u64 gen
, u64 uid
, u64 gid
)
2692 p
= get_path_for_command(sctx
, ino
, gen
);
2696 ret
= begin_cmd(sctx
, BTRFS_SEND_C_CHOWN
);
2700 TLV_PUT_PATH(sctx
, BTRFS_SEND_A_PATH
, p
);
2701 TLV_PUT_U64(sctx
, BTRFS_SEND_A_UID
, uid
);
2702 TLV_PUT_U64(sctx
, BTRFS_SEND_A_GID
, gid
);
2704 ret
= send_cmd(sctx
);
2708 free_path_for_command(sctx
, p
);
2712 static int send_utimes(struct send_ctx
*sctx
, u64 ino
, u64 gen
)
2715 struct fs_path
*p
= NULL
;
2716 struct btrfs_inode_item
*ii
;
2717 struct btrfs_path
*path
= NULL
;
2718 struct extent_buffer
*eb
;
2719 struct btrfs_key key
;
2722 p
= get_path_for_command(sctx
, ino
, gen
);
2726 path
= alloc_path_for_send();
2733 key
.type
= BTRFS_INODE_ITEM_KEY
;
2735 ret
= btrfs_search_slot(NULL
, sctx
->send_root
, &key
, path
, 0, 0);
2741 eb
= path
->nodes
[0];
2742 slot
= path
->slots
[0];
2743 ii
= btrfs_item_ptr(eb
, slot
, struct btrfs_inode_item
);
2745 ret
= begin_cmd(sctx
, BTRFS_SEND_C_UTIMES
);
2749 TLV_PUT_PATH(sctx
, BTRFS_SEND_A_PATH
, p
);
2750 TLV_PUT_BTRFS_TIMESPEC(sctx
, BTRFS_SEND_A_ATIME
, eb
, &ii
->atime
);
2751 TLV_PUT_BTRFS_TIMESPEC(sctx
, BTRFS_SEND_A_MTIME
, eb
, &ii
->mtime
);
2752 TLV_PUT_BTRFS_TIMESPEC(sctx
, BTRFS_SEND_A_CTIME
, eb
, &ii
->ctime
);
2753 if (sctx
->proto
>= 2)
2754 TLV_PUT_BTRFS_TIMESPEC(sctx
, BTRFS_SEND_A_OTIME
, eb
, &ii
->otime
);
2756 ret
= send_cmd(sctx
);
2760 free_path_for_command(sctx
, p
);
2761 btrfs_free_path(path
);
2766 * If the cache is full, we can't remove entries from it and do a call to
2767 * send_utimes() for each respective inode, because we might be finishing
2768 * processing an inode that is a directory and it just got renamed, and existing
2769 * entries in the cache may refer to inodes that have the directory in their
2770 * full path - in which case we would generate outdated paths (pre-rename)
2771 * for the inodes that the cache entries point to. Instead of prunning the
2772 * cache when inserting, do it after we finish processing each inode at
2773 * finish_inode_if_needed().
2775 static int cache_dir_utimes(struct send_ctx
*sctx
, u64 dir
, u64 gen
)
2777 struct btrfs_lru_cache_entry
*entry
;
2780 entry
= btrfs_lru_cache_lookup(&sctx
->dir_utimes_cache
, dir
, gen
);
2784 /* Caching is optional, don't fail if we can't allocate memory. */
2785 entry
= kmalloc(sizeof(*entry
), GFP_KERNEL
);
2787 return send_utimes(sctx
, dir
, gen
);
2792 ret
= btrfs_lru_cache_store(&sctx
->dir_utimes_cache
, entry
, GFP_KERNEL
);
2793 ASSERT(ret
!= -EEXIST
);
2796 return send_utimes(sctx
, dir
, gen
);
2802 static int trim_dir_utimes_cache(struct send_ctx
*sctx
)
2804 while (sctx
->dir_utimes_cache
.size
> SEND_MAX_DIR_UTIMES_CACHE_SIZE
) {
2805 struct btrfs_lru_cache_entry
*lru
;
2808 lru
= btrfs_lru_cache_lru_entry(&sctx
->dir_utimes_cache
);
2809 ASSERT(lru
!= NULL
);
2811 ret
= send_utimes(sctx
, lru
->key
, lru
->gen
);
2815 btrfs_lru_cache_remove(&sctx
->dir_utimes_cache
, lru
);
2822 * Sends a BTRFS_SEND_C_MKXXX or SYMLINK command to user space. We don't have
2823 * a valid path yet because we did not process the refs yet. So, the inode
2824 * is created as orphan.
2826 static int send_create_inode(struct send_ctx
*sctx
, u64 ino
)
2831 struct btrfs_inode_info info
;
2836 p
= fs_path_alloc();
2840 if (ino
!= sctx
->cur_ino
) {
2841 ret
= get_inode_info(sctx
->send_root
, ino
, &info
);
2848 gen
= sctx
->cur_inode_gen
;
2849 mode
= sctx
->cur_inode_mode
;
2850 rdev
= sctx
->cur_inode_rdev
;
2853 if (S_ISREG(mode
)) {
2854 cmd
= BTRFS_SEND_C_MKFILE
;
2855 } else if (S_ISDIR(mode
)) {
2856 cmd
= BTRFS_SEND_C_MKDIR
;
2857 } else if (S_ISLNK(mode
)) {
2858 cmd
= BTRFS_SEND_C_SYMLINK
;
2859 } else if (S_ISCHR(mode
) || S_ISBLK(mode
)) {
2860 cmd
= BTRFS_SEND_C_MKNOD
;
2861 } else if (S_ISFIFO(mode
)) {
2862 cmd
= BTRFS_SEND_C_MKFIFO
;
2863 } else if (S_ISSOCK(mode
)) {
2864 cmd
= BTRFS_SEND_C_MKSOCK
;
2866 btrfs_warn(sctx
->send_root
->fs_info
, "unexpected inode type %o",
2867 (int)(mode
& S_IFMT
));
2872 ret
= begin_cmd(sctx
, cmd
);
2876 ret
= gen_unique_name(sctx
, ino
, gen
, p
);
2880 TLV_PUT_PATH(sctx
, BTRFS_SEND_A_PATH
, p
);
2881 TLV_PUT_U64(sctx
, BTRFS_SEND_A_INO
, ino
);
2883 if (S_ISLNK(mode
)) {
2885 ret
= read_symlink(sctx
->send_root
, ino
, p
);
2888 TLV_PUT_PATH(sctx
, BTRFS_SEND_A_PATH_LINK
, p
);
2889 } else if (S_ISCHR(mode
) || S_ISBLK(mode
) ||
2890 S_ISFIFO(mode
) || S_ISSOCK(mode
)) {
2891 TLV_PUT_U64(sctx
, BTRFS_SEND_A_RDEV
, new_encode_dev(rdev
));
2892 TLV_PUT_U64(sctx
, BTRFS_SEND_A_MODE
, mode
);
2895 ret
= send_cmd(sctx
);
2906 static void cache_dir_created(struct send_ctx
*sctx
, u64 dir
)
2908 struct btrfs_lru_cache_entry
*entry
;
2911 /* Caching is optional, ignore any failures. */
2912 entry
= kmalloc(sizeof(*entry
), GFP_KERNEL
);
2918 ret
= btrfs_lru_cache_store(&sctx
->dir_created_cache
, entry
, GFP_KERNEL
);
2924 * We need some special handling for inodes that get processed before the parent
2925 * directory got created. See process_recorded_refs for details.
2926 * This function does the check if we already created the dir out of order.
2928 static int did_create_dir(struct send_ctx
*sctx
, u64 dir
)
2932 struct btrfs_path
*path
= NULL
;
2933 struct btrfs_key key
;
2934 struct btrfs_key found_key
;
2935 struct btrfs_key di_key
;
2936 struct btrfs_dir_item
*di
;
2938 if (btrfs_lru_cache_lookup(&sctx
->dir_created_cache
, dir
, 0))
2941 path
= alloc_path_for_send();
2946 key
.type
= BTRFS_DIR_INDEX_KEY
;
2949 btrfs_for_each_slot(sctx
->send_root
, &key
, &found_key
, path
, iter_ret
) {
2950 struct extent_buffer
*eb
= path
->nodes
[0];
2952 if (found_key
.objectid
!= key
.objectid
||
2953 found_key
.type
!= key
.type
) {
2958 di
= btrfs_item_ptr(eb
, path
->slots
[0], struct btrfs_dir_item
);
2959 btrfs_dir_item_key_to_cpu(eb
, di
, &di_key
);
2961 if (di_key
.type
!= BTRFS_ROOT_ITEM_KEY
&&
2962 di_key
.objectid
< sctx
->send_progress
) {
2964 cache_dir_created(sctx
, dir
);
2968 /* Catch error found during iteration */
2972 btrfs_free_path(path
);
2977 * Only creates the inode if it is:
2978 * 1. Not a directory
2979 * 2. Or a directory which was not created already due to out of order
2980 * directories. See did_create_dir and process_recorded_refs for details.
2982 static int send_create_inode_if_needed(struct send_ctx
*sctx
)
2986 if (S_ISDIR(sctx
->cur_inode_mode
)) {
2987 ret
= did_create_dir(sctx
, sctx
->cur_ino
);
2994 ret
= send_create_inode(sctx
, sctx
->cur_ino
);
2996 if (ret
== 0 && S_ISDIR(sctx
->cur_inode_mode
))
2997 cache_dir_created(sctx
, sctx
->cur_ino
);
3002 struct recorded_ref
{
3003 struct list_head list
;
3005 struct fs_path
*full_path
;
3009 struct rb_node node
;
3010 struct rb_root
*root
;
3013 static struct recorded_ref
*recorded_ref_alloc(void)
3015 struct recorded_ref
*ref
;
3017 ref
= kzalloc(sizeof(*ref
), GFP_KERNEL
);
3020 RB_CLEAR_NODE(&ref
->node
);
3021 INIT_LIST_HEAD(&ref
->list
);
3025 static void recorded_ref_free(struct recorded_ref
*ref
)
3029 if (!RB_EMPTY_NODE(&ref
->node
))
3030 rb_erase(&ref
->node
, ref
->root
);
3031 list_del(&ref
->list
);
3032 fs_path_free(ref
->full_path
);
3036 static void set_ref_path(struct recorded_ref
*ref
, struct fs_path
*path
)
3038 ref
->full_path
= path
;
3039 ref
->name
= (char *)kbasename(ref
->full_path
->start
);
3040 ref
->name_len
= ref
->full_path
->end
- ref
->name
;
3043 static int dup_ref(struct recorded_ref
*ref
, struct list_head
*list
)
3045 struct recorded_ref
*new;
3047 new = recorded_ref_alloc();
3051 new->dir
= ref
->dir
;
3052 new->dir_gen
= ref
->dir_gen
;
3053 list_add_tail(&new->list
, list
);
3057 static void __free_recorded_refs(struct list_head
*head
)
3059 struct recorded_ref
*cur
;
3061 while (!list_empty(head
)) {
3062 cur
= list_first_entry(head
, struct recorded_ref
, list
);
3063 recorded_ref_free(cur
);
3067 static void free_recorded_refs(struct send_ctx
*sctx
)
3069 __free_recorded_refs(&sctx
->new_refs
);
3070 __free_recorded_refs(&sctx
->deleted_refs
);
3074 * Renames/moves a file/dir to its orphan name. Used when the first
3075 * ref of an unprocessed inode gets overwritten and for all non empty
3078 static int orphanize_inode(struct send_ctx
*sctx
, u64 ino
, u64 gen
,
3079 struct fs_path
*path
)
3082 struct fs_path
*orphan
;
3084 orphan
= fs_path_alloc();
3088 ret
= gen_unique_name(sctx
, ino
, gen
, orphan
);
3092 ret
= send_rename(sctx
, path
, orphan
);
3096 if (ino
== sctx
->cur_ino
&& gen
== sctx
->cur_inode_gen
)
3097 ret
= fs_path_copy(&sctx
->cur_inode_path
, orphan
);
3100 fs_path_free(orphan
);
3104 static struct orphan_dir_info
*add_orphan_dir_info(struct send_ctx
*sctx
,
3105 u64 dir_ino
, u64 dir_gen
)
3107 struct rb_node
**p
= &sctx
->orphan_dirs
.rb_node
;
3108 struct rb_node
*parent
= NULL
;
3109 struct orphan_dir_info
*entry
, *odi
;
3113 entry
= rb_entry(parent
, struct orphan_dir_info
, node
);
3114 if (dir_ino
< entry
->ino
)
3116 else if (dir_ino
> entry
->ino
)
3117 p
= &(*p
)->rb_right
;
3118 else if (dir_gen
< entry
->gen
)
3120 else if (dir_gen
> entry
->gen
)
3121 p
= &(*p
)->rb_right
;
3126 odi
= kmalloc(sizeof(*odi
), GFP_KERNEL
);
3128 return ERR_PTR(-ENOMEM
);
3131 odi
->last_dir_index_offset
= 0;
3132 odi
->dir_high_seq_ino
= 0;
3134 rb_link_node(&odi
->node
, parent
, p
);
3135 rb_insert_color(&odi
->node
, &sctx
->orphan_dirs
);
3139 static struct orphan_dir_info
*get_orphan_dir_info(struct send_ctx
*sctx
,
3140 u64 dir_ino
, u64 gen
)
3142 struct rb_node
*n
= sctx
->orphan_dirs
.rb_node
;
3143 struct orphan_dir_info
*entry
;
3146 entry
= rb_entry(n
, struct orphan_dir_info
, node
);
3147 if (dir_ino
< entry
->ino
)
3149 else if (dir_ino
> entry
->ino
)
3151 else if (gen
< entry
->gen
)
3153 else if (gen
> entry
->gen
)
3161 static int is_waiting_for_rm(struct send_ctx
*sctx
, u64 dir_ino
, u64 gen
)
3163 struct orphan_dir_info
*odi
= get_orphan_dir_info(sctx
, dir_ino
, gen
);
3168 static void free_orphan_dir_info(struct send_ctx
*sctx
,
3169 struct orphan_dir_info
*odi
)
3173 rb_erase(&odi
->node
, &sctx
->orphan_dirs
);
3178 * Returns 1 if a directory can be removed at this point in time.
3179 * We check this by iterating all dir items and checking if the inode behind
3180 * the dir item was already processed.
3182 static int can_rmdir(struct send_ctx
*sctx
, u64 dir
, u64 dir_gen
)
3186 struct btrfs_root
*root
= sctx
->parent_root
;
3187 struct btrfs_path
*path
;
3188 struct btrfs_key key
;
3189 struct btrfs_key found_key
;
3190 struct btrfs_key loc
;
3191 struct btrfs_dir_item
*di
;
3192 struct orphan_dir_info
*odi
= NULL
;
3193 u64 dir_high_seq_ino
= 0;
3194 u64 last_dir_index_offset
= 0;
3197 * Don't try to rmdir the top/root subvolume dir.
3199 if (dir
== BTRFS_FIRST_FREE_OBJECTID
)
3202 odi
= get_orphan_dir_info(sctx
, dir
, dir_gen
);
3203 if (odi
&& sctx
->cur_ino
< odi
->dir_high_seq_ino
)
3206 path
= alloc_path_for_send();
3212 * Find the inode number associated with the last dir index
3213 * entry. This is very likely the inode with the highest number
3214 * of all inodes that have an entry in the directory. We can
3215 * then use it to avoid future calls to can_rmdir(), when
3216 * processing inodes with a lower number, from having to search
3217 * the parent root b+tree for dir index keys.
3220 key
.type
= BTRFS_DIR_INDEX_KEY
;
3221 key
.offset
= (u64
)-1;
3223 ret
= btrfs_search_slot(NULL
, root
, &key
, path
, 0, 0);
3226 } else if (ret
> 0) {
3227 /* Can't happen, the root is never empty. */
3228 ASSERT(path
->slots
[0] > 0);
3229 if (WARN_ON(path
->slots
[0] == 0)) {
3236 btrfs_item_key_to_cpu(path
->nodes
[0], &key
, path
->slots
[0]);
3237 if (key
.objectid
!= dir
|| key
.type
!= BTRFS_DIR_INDEX_KEY
) {
3238 /* No index keys, dir can be removed. */
3243 di
= btrfs_item_ptr(path
->nodes
[0], path
->slots
[0],
3244 struct btrfs_dir_item
);
3245 btrfs_dir_item_key_to_cpu(path
->nodes
[0], di
, &loc
);
3246 dir_high_seq_ino
= loc
.objectid
;
3247 if (sctx
->cur_ino
< dir_high_seq_ino
) {
3252 btrfs_release_path(path
);
3256 key
.type
= BTRFS_DIR_INDEX_KEY
;
3257 key
.offset
= (odi
? odi
->last_dir_index_offset
: 0);
3259 btrfs_for_each_slot(root
, &key
, &found_key
, path
, iter_ret
) {
3260 struct waiting_dir_move
*dm
;
3262 if (found_key
.objectid
!= key
.objectid
||
3263 found_key
.type
!= key
.type
)
3266 di
= btrfs_item_ptr(path
->nodes
[0], path
->slots
[0],
3267 struct btrfs_dir_item
);
3268 btrfs_dir_item_key_to_cpu(path
->nodes
[0], di
, &loc
);
3270 dir_high_seq_ino
= max(dir_high_seq_ino
, loc
.objectid
);
3271 last_dir_index_offset
= found_key
.offset
;
3273 dm
= get_waiting_dir_move(sctx
, loc
.objectid
);
3275 dm
->rmdir_ino
= dir
;
3276 dm
->rmdir_gen
= dir_gen
;
3281 if (loc
.objectid
> sctx
->cur_ino
) {
3290 free_orphan_dir_info(sctx
, odi
);
3295 btrfs_free_path(path
);
3301 odi
= add_orphan_dir_info(sctx
, dir
, dir_gen
);
3303 return PTR_ERR(odi
);
3308 odi
->last_dir_index_offset
= last_dir_index_offset
;
3309 odi
->dir_high_seq_ino
= max(odi
->dir_high_seq_ino
, dir_high_seq_ino
);
3314 static int is_waiting_for_move(struct send_ctx
*sctx
, u64 ino
)
3316 struct waiting_dir_move
*entry
= get_waiting_dir_move(sctx
, ino
);
3318 return entry
!= NULL
;
3321 static int add_waiting_dir_move(struct send_ctx
*sctx
, u64 ino
, bool orphanized
)
3323 struct rb_node
**p
= &sctx
->waiting_dir_moves
.rb_node
;
3324 struct rb_node
*parent
= NULL
;
3325 struct waiting_dir_move
*entry
, *dm
;
3327 dm
= kmalloc(sizeof(*dm
), GFP_KERNEL
);
3333 dm
->orphanized
= orphanized
;
3337 entry
= rb_entry(parent
, struct waiting_dir_move
, node
);
3338 if (ino
< entry
->ino
) {
3340 } else if (ino
> entry
->ino
) {
3341 p
= &(*p
)->rb_right
;
3348 rb_link_node(&dm
->node
, parent
, p
);
3349 rb_insert_color(&dm
->node
, &sctx
->waiting_dir_moves
);
3353 static struct waiting_dir_move
*
3354 get_waiting_dir_move(struct send_ctx
*sctx
, u64 ino
)
3356 struct rb_node
*n
= sctx
->waiting_dir_moves
.rb_node
;
3357 struct waiting_dir_move
*entry
;
3360 entry
= rb_entry(n
, struct waiting_dir_move
, node
);
3361 if (ino
< entry
->ino
)
3363 else if (ino
> entry
->ino
)
3371 static void free_waiting_dir_move(struct send_ctx
*sctx
,
3372 struct waiting_dir_move
*dm
)
3376 rb_erase(&dm
->node
, &sctx
->waiting_dir_moves
);
3380 static int add_pending_dir_move(struct send_ctx
*sctx
,
3384 struct list_head
*new_refs
,
3385 struct list_head
*deleted_refs
,
3386 const bool is_orphan
)
3388 struct rb_node
**p
= &sctx
->pending_dir_moves
.rb_node
;
3389 struct rb_node
*parent
= NULL
;
3390 struct pending_dir_move
*entry
= NULL
, *pm
;
3391 struct recorded_ref
*cur
;
3395 pm
= kmalloc(sizeof(*pm
), GFP_KERNEL
);
3398 pm
->parent_ino
= parent_ino
;
3401 INIT_LIST_HEAD(&pm
->list
);
3402 INIT_LIST_HEAD(&pm
->update_refs
);
3403 RB_CLEAR_NODE(&pm
->node
);
3407 entry
= rb_entry(parent
, struct pending_dir_move
, node
);
3408 if (parent_ino
< entry
->parent_ino
) {
3410 } else if (parent_ino
> entry
->parent_ino
) {
3411 p
= &(*p
)->rb_right
;
3418 list_for_each_entry(cur
, deleted_refs
, list
) {
3419 ret
= dup_ref(cur
, &pm
->update_refs
);
3423 list_for_each_entry(cur
, new_refs
, list
) {
3424 ret
= dup_ref(cur
, &pm
->update_refs
);
3429 ret
= add_waiting_dir_move(sctx
, pm
->ino
, is_orphan
);
3434 list_add_tail(&pm
->list
, &entry
->list
);
3436 rb_link_node(&pm
->node
, parent
, p
);
3437 rb_insert_color(&pm
->node
, &sctx
->pending_dir_moves
);
3442 __free_recorded_refs(&pm
->update_refs
);
3448 static struct pending_dir_move
*get_pending_dir_moves(struct send_ctx
*sctx
,
3451 struct rb_node
*n
= sctx
->pending_dir_moves
.rb_node
;
3452 struct pending_dir_move
*entry
;
3455 entry
= rb_entry(n
, struct pending_dir_move
, node
);
3456 if (parent_ino
< entry
->parent_ino
)
3458 else if (parent_ino
> entry
->parent_ino
)
3466 static int path_loop(struct send_ctx
*sctx
, struct fs_path
*name
,
3467 u64 ino
, u64 gen
, u64
*ancestor_ino
)
3470 u64 parent_inode
= 0;
3472 u64 start_ino
= ino
;
3475 while (ino
!= BTRFS_FIRST_FREE_OBJECTID
) {
3476 fs_path_reset(name
);
3478 if (is_waiting_for_rm(sctx
, ino
, gen
))
3480 if (is_waiting_for_move(sctx
, ino
)) {
3481 if (*ancestor_ino
== 0)
3482 *ancestor_ino
= ino
;
3483 ret
= get_first_ref(sctx
->parent_root
, ino
,
3484 &parent_inode
, &parent_gen
, name
);
3486 ret
= __get_cur_name_and_parent(sctx
, ino
, gen
,
3496 if (parent_inode
== start_ino
) {
3498 if (*ancestor_ino
== 0)
3499 *ancestor_ino
= ino
;
3508 static int apply_dir_move(struct send_ctx
*sctx
, struct pending_dir_move
*pm
)
3510 struct fs_path
*from_path
= NULL
;
3511 struct fs_path
*to_path
= NULL
;
3512 struct fs_path
*name
= NULL
;
3513 u64 orig_progress
= sctx
->send_progress
;
3514 struct recorded_ref
*cur
;
3515 u64 parent_ino
, parent_gen
;
3516 struct waiting_dir_move
*dm
= NULL
;
3523 name
= fs_path_alloc();
3524 from_path
= fs_path_alloc();
3525 if (!name
|| !from_path
) {
3530 dm
= get_waiting_dir_move(sctx
, pm
->ino
);
3532 rmdir_ino
= dm
->rmdir_ino
;
3533 rmdir_gen
= dm
->rmdir_gen
;
3534 is_orphan
= dm
->orphanized
;
3535 free_waiting_dir_move(sctx
, dm
);
3538 ret
= gen_unique_name(sctx
, pm
->ino
,
3539 pm
->gen
, from_path
);
3541 ret
= get_first_ref(sctx
->parent_root
, pm
->ino
,
3542 &parent_ino
, &parent_gen
, name
);
3545 ret
= get_cur_path(sctx
, parent_ino
, parent_gen
,
3549 ret
= fs_path_add_path(from_path
, name
);
3554 sctx
->send_progress
= sctx
->cur_ino
+ 1;
3555 ret
= path_loop(sctx
, name
, pm
->ino
, pm
->gen
, &ancestor
);
3559 LIST_HEAD(deleted_refs
);
3560 ASSERT(ancestor
> BTRFS_FIRST_FREE_OBJECTID
);
3561 ret
= add_pending_dir_move(sctx
, pm
->ino
, pm
->gen
, ancestor
,
3562 &pm
->update_refs
, &deleted_refs
,
3567 dm
= get_waiting_dir_move(sctx
, pm
->ino
);
3569 dm
->rmdir_ino
= rmdir_ino
;
3570 dm
->rmdir_gen
= rmdir_gen
;
3574 fs_path_reset(name
);
3577 ret
= get_cur_path(sctx
, pm
->ino
, pm
->gen
, to_path
);
3581 ret
= send_rename(sctx
, from_path
, to_path
);
3586 struct orphan_dir_info
*odi
;
3589 odi
= get_orphan_dir_info(sctx
, rmdir_ino
, rmdir_gen
);
3591 /* already deleted */
3596 ret
= can_rmdir(sctx
, rmdir_ino
, gen
);
3602 name
= fs_path_alloc();
3607 ret
= get_cur_path(sctx
, rmdir_ino
, gen
, name
);
3610 ret
= send_rmdir(sctx
, name
);
3616 ret
= cache_dir_utimes(sctx
, pm
->ino
, pm
->gen
);
3621 * After rename/move, need to update the utimes of both new parent(s)
3622 * and old parent(s).
3624 list_for_each_entry(cur
, &pm
->update_refs
, list
) {
3626 * The parent inode might have been deleted in the send snapshot
3628 ret
= get_inode_info(sctx
->send_root
, cur
->dir
, NULL
);
3629 if (ret
== -ENOENT
) {
3636 ret
= cache_dir_utimes(sctx
, cur
->dir
, cur
->dir_gen
);
3643 fs_path_free(from_path
);
3644 fs_path_free(to_path
);
3645 sctx
->send_progress
= orig_progress
;
3650 static void free_pending_move(struct send_ctx
*sctx
, struct pending_dir_move
*m
)
3652 if (!list_empty(&m
->list
))
3654 if (!RB_EMPTY_NODE(&m
->node
))
3655 rb_erase(&m
->node
, &sctx
->pending_dir_moves
);
3656 __free_recorded_refs(&m
->update_refs
);
3660 static void tail_append_pending_moves(struct send_ctx
*sctx
,
3661 struct pending_dir_move
*moves
,
3662 struct list_head
*stack
)
3664 if (list_empty(&moves
->list
)) {
3665 list_add_tail(&moves
->list
, stack
);
3668 list_splice_init(&moves
->list
, &list
);
3669 list_add_tail(&moves
->list
, stack
);
3670 list_splice_tail(&list
, stack
);
3672 if (!RB_EMPTY_NODE(&moves
->node
)) {
3673 rb_erase(&moves
->node
, &sctx
->pending_dir_moves
);
3674 RB_CLEAR_NODE(&moves
->node
);
3678 static int apply_children_dir_moves(struct send_ctx
*sctx
)
3680 struct pending_dir_move
*pm
;
3682 u64 parent_ino
= sctx
->cur_ino
;
3685 pm
= get_pending_dir_moves(sctx
, parent_ino
);
3689 tail_append_pending_moves(sctx
, pm
, &stack
);
3691 while (!list_empty(&stack
)) {
3692 pm
= list_first_entry(&stack
, struct pending_dir_move
, list
);
3693 parent_ino
= pm
->ino
;
3694 ret
= apply_dir_move(sctx
, pm
);
3695 free_pending_move(sctx
, pm
);
3698 pm
= get_pending_dir_moves(sctx
, parent_ino
);
3700 tail_append_pending_moves(sctx
, pm
, &stack
);
3705 while (!list_empty(&stack
)) {
3706 pm
= list_first_entry(&stack
, struct pending_dir_move
, list
);
3707 free_pending_move(sctx
, pm
);
3713 * We might need to delay a directory rename even when no ancestor directory
3714 * (in the send root) with a higher inode number than ours (sctx->cur_ino) was
3715 * renamed. This happens when we rename a directory to the old name (the name
3716 * in the parent root) of some other unrelated directory that got its rename
3717 * delayed due to some ancestor with higher number that got renamed.
3723 * |---- a/ (ino 257)
3724 * | |---- file (ino 260)
3726 * |---- b/ (ino 258)
3727 * |---- c/ (ino 259)
3731 * |---- a/ (ino 258)
3732 * |---- x/ (ino 259)
3733 * |---- y/ (ino 257)
3734 * |----- file (ino 260)
3736 * Here we can not rename 258 from 'b' to 'a' without the rename of inode 257
3737 * from 'a' to 'x/y' happening first, which in turn depends on the rename of
3738 * inode 259 from 'c' to 'x'. So the order of rename commands the send stream
3741 * 1 - rename 259 from 'c' to 'x'
3742 * 2 - rename 257 from 'a' to 'x/y'
3743 * 3 - rename 258 from 'b' to 'a'
3745 * Returns 1 if the rename of sctx->cur_ino needs to be delayed, 0 if it can
3746 * be done right away and < 0 on error.
3748 static int wait_for_dest_dir_move(struct send_ctx
*sctx
,
3749 struct recorded_ref
*parent_ref
,
3750 const bool is_orphan
)
3752 struct btrfs_path
*path
;
3753 struct btrfs_key key
;
3754 struct btrfs_key di_key
;
3755 struct btrfs_dir_item
*di
;
3759 struct waiting_dir_move
*wdm
;
3761 if (RB_EMPTY_ROOT(&sctx
->waiting_dir_moves
))
3764 path
= alloc_path_for_send();
3768 key
.objectid
= parent_ref
->dir
;
3769 key
.type
= BTRFS_DIR_ITEM_KEY
;
3770 key
.offset
= btrfs_name_hash(parent_ref
->name
, parent_ref
->name_len
);
3772 ret
= btrfs_search_slot(NULL
, sctx
->parent_root
, &key
, path
, 0, 0);
3775 } else if (ret
> 0) {
3780 di
= btrfs_match_dir_item_name(path
, parent_ref
->name
,
3781 parent_ref
->name_len
);
3787 * di_key.objectid has the number of the inode that has a dentry in the
3788 * parent directory with the same name that sctx->cur_ino is being
3789 * renamed to. We need to check if that inode is in the send root as
3790 * well and if it is currently marked as an inode with a pending rename,
3791 * if it is, we need to delay the rename of sctx->cur_ino as well, so
3792 * that it happens after that other inode is renamed.
3794 btrfs_dir_item_key_to_cpu(path
->nodes
[0], di
, &di_key
);
3795 if (di_key
.type
!= BTRFS_INODE_ITEM_KEY
) {
3800 ret
= get_inode_gen(sctx
->parent_root
, di_key
.objectid
, &left_gen
);
3803 ret
= get_inode_gen(sctx
->send_root
, di_key
.objectid
, &right_gen
);
3810 /* Different inode, no need to delay the rename of sctx->cur_ino */
3811 if (right_gen
!= left_gen
) {
3816 wdm
= get_waiting_dir_move(sctx
, di_key
.objectid
);
3817 if (wdm
&& !wdm
->orphanized
) {
3818 ret
= add_pending_dir_move(sctx
,
3820 sctx
->cur_inode_gen
,
3823 &sctx
->deleted_refs
,
3829 btrfs_free_path(path
);
3834 * Check if inode ino2, or any of its ancestors, is inode ino1.
3835 * Return 1 if true, 0 if false and < 0 on error.
3837 static int check_ino_in_path(struct btrfs_root
*root
,
3842 struct fs_path
*fs_path
)
3847 return ino1_gen
== ino2_gen
;
3849 while (ino
> BTRFS_FIRST_FREE_OBJECTID
) {
3854 fs_path_reset(fs_path
);
3855 ret
= get_first_ref(root
, ino
, &parent
, &parent_gen
, fs_path
);
3859 return parent_gen
== ino1_gen
;
3866 * Check if inode ino1 is an ancestor of inode ino2 in the given root for any
3867 * possible path (in case ino2 is not a directory and has multiple hard links).
3868 * Return 1 if true, 0 if false and < 0 on error.
3870 static int is_ancestor(struct btrfs_root
*root
,
3874 struct fs_path
*fs_path
)
3876 bool free_fs_path
= false;
3879 struct btrfs_path
*path
= NULL
;
3880 struct btrfs_key key
;
3883 fs_path
= fs_path_alloc();
3886 free_fs_path
= true;
3889 path
= alloc_path_for_send();
3895 key
.objectid
= ino2
;
3896 key
.type
= BTRFS_INODE_REF_KEY
;
3899 btrfs_for_each_slot(root
, &key
, &key
, path
, iter_ret
) {
3900 struct extent_buffer
*leaf
= path
->nodes
[0];
3901 int slot
= path
->slots
[0];
3905 if (key
.objectid
!= ino2
)
3907 if (key
.type
!= BTRFS_INODE_REF_KEY
&&
3908 key
.type
!= BTRFS_INODE_EXTREF_KEY
)
3911 item_size
= btrfs_item_size(leaf
, slot
);
3912 while (cur_offset
< item_size
) {
3916 if (key
.type
== BTRFS_INODE_EXTREF_KEY
) {
3918 struct btrfs_inode_extref
*extref
;
3920 ptr
= btrfs_item_ptr_offset(leaf
, slot
);
3921 extref
= (struct btrfs_inode_extref
*)
3923 parent
= btrfs_inode_extref_parent(leaf
,
3925 cur_offset
+= sizeof(*extref
);
3926 cur_offset
+= btrfs_inode_extref_name_len(leaf
,
3929 parent
= key
.offset
;
3930 cur_offset
= item_size
;
3933 ret
= get_inode_gen(root
, parent
, &parent_gen
);
3936 ret
= check_ino_in_path(root
, ino1
, ino1_gen
,
3937 parent
, parent_gen
, fs_path
);
3947 btrfs_free_path(path
);
3949 fs_path_free(fs_path
);
3953 static int wait_for_parent_move(struct send_ctx
*sctx
,
3954 struct recorded_ref
*parent_ref
,
3955 const bool is_orphan
)
3958 u64 ino
= parent_ref
->dir
;
3959 u64 ino_gen
= parent_ref
->dir_gen
;
3960 u64 parent_ino_before
, parent_ino_after
;
3961 struct fs_path
*path_before
= NULL
;
3962 struct fs_path
*path_after
= NULL
;
3965 path_after
= fs_path_alloc();
3966 path_before
= fs_path_alloc();
3967 if (!path_after
|| !path_before
) {
3973 * Our current directory inode may not yet be renamed/moved because some
3974 * ancestor (immediate or not) has to be renamed/moved first. So find if
3975 * such ancestor exists and make sure our own rename/move happens after
3976 * that ancestor is processed to avoid path build infinite loops (done
3977 * at get_cur_path()).
3979 while (ino
> BTRFS_FIRST_FREE_OBJECTID
) {
3980 u64 parent_ino_after_gen
;
3982 if (is_waiting_for_move(sctx
, ino
)) {
3984 * If the current inode is an ancestor of ino in the
3985 * parent root, we need to delay the rename of the
3986 * current inode, otherwise don't delayed the rename
3987 * because we can end up with a circular dependency
3988 * of renames, resulting in some directories never
3989 * getting the respective rename operations issued in
3990 * the send stream or getting into infinite path build
3993 ret
= is_ancestor(sctx
->parent_root
,
3994 sctx
->cur_ino
, sctx
->cur_inode_gen
,
4000 fs_path_reset(path_before
);
4001 fs_path_reset(path_after
);
4003 ret
= get_first_ref(sctx
->send_root
, ino
, &parent_ino_after
,
4004 &parent_ino_after_gen
, path_after
);
4007 ret
= get_first_ref(sctx
->parent_root
, ino
, &parent_ino_before
,
4009 if (ret
< 0 && ret
!= -ENOENT
) {
4011 } else if (ret
== -ENOENT
) {
4016 len1
= fs_path_len(path_before
);
4017 len2
= fs_path_len(path_after
);
4018 if (ino
> sctx
->cur_ino
&&
4019 (parent_ino_before
!= parent_ino_after
|| len1
!= len2
||
4020 memcmp(path_before
->start
, path_after
->start
, len1
))) {
4023 ret
= get_inode_gen(sctx
->parent_root
, ino
, &parent_ino_gen
);
4026 if (ino_gen
== parent_ino_gen
) {
4031 ino
= parent_ino_after
;
4032 ino_gen
= parent_ino_after_gen
;
4036 fs_path_free(path_before
);
4037 fs_path_free(path_after
);
4040 ret
= add_pending_dir_move(sctx
,
4042 sctx
->cur_inode_gen
,
4045 &sctx
->deleted_refs
,
4054 static int update_ref_path(struct send_ctx
*sctx
, struct recorded_ref
*ref
)
4057 struct fs_path
*new_path
;
4060 * Our reference's name member points to its full_path member string, so
4061 * we use here a new path.
4063 new_path
= fs_path_alloc();
4067 ret
= get_cur_path(sctx
, ref
->dir
, ref
->dir_gen
, new_path
);
4069 fs_path_free(new_path
);
4072 ret
= fs_path_add(new_path
, ref
->name
, ref
->name_len
);
4074 fs_path_free(new_path
);
4078 fs_path_free(ref
->full_path
);
4079 set_ref_path(ref
, new_path
);
4085 * When processing the new references for an inode we may orphanize an existing
4086 * directory inode because its old name conflicts with one of the new references
4087 * of the current inode. Later, when processing another new reference of our
4088 * inode, we might need to orphanize another inode, but the path we have in the
4089 * reference reflects the pre-orphanization name of the directory we previously
4090 * orphanized. For example:
4092 * parent snapshot looks like:
4095 * |----- f1 (ino 257)
4096 * |----- f2 (ino 258)
4097 * |----- d1/ (ino 259)
4098 * |----- d2/ (ino 260)
4100 * send snapshot looks like:
4103 * |----- d1 (ino 258)
4104 * |----- f2/ (ino 259)
4105 * |----- f2_link/ (ino 260)
4106 * | |----- f1 (ino 257)
4108 * |----- d2 (ino 258)
4110 * When processing inode 257 we compute the name for inode 259 as "d1", and we
4111 * cache it in the name cache. Later when we start processing inode 258, when
4112 * collecting all its new references we set a full path of "d1/d2" for its new
4113 * reference with name "d2". When we start processing the new references we
4114 * start by processing the new reference with name "d1", and this results in
4115 * orphanizing inode 259, since its old reference causes a conflict. Then we
4116 * move on the next new reference, with name "d2", and we find out we must
4117 * orphanize inode 260, as its old reference conflicts with ours - but for the
4118 * orphanization we use a source path corresponding to the path we stored in the
4119 * new reference, which is "d1/d2" and not "o259-6-0/d2" - this makes the
4120 * receiver fail since the path component "d1/" no longer exists, it was renamed
4121 * to "o259-6-0/" when processing the previous new reference. So in this case we
4122 * must recompute the path in the new reference and use it for the new
4123 * orphanization operation.
4125 static int refresh_ref_path(struct send_ctx
*sctx
, struct recorded_ref
*ref
)
4130 name
= kmemdup(ref
->name
, ref
->name_len
, GFP_KERNEL
);
4134 fs_path_reset(ref
->full_path
);
4135 ret
= get_cur_path(sctx
, ref
->dir
, ref
->dir_gen
, ref
->full_path
);
4139 ret
= fs_path_add(ref
->full_path
, name
, ref
->name_len
);
4143 /* Update the reference's base name pointer. */
4144 set_ref_path(ref
, ref
->full_path
);
4150 static int rename_current_inode(struct send_ctx
*sctx
,
4151 struct fs_path
*current_path
,
4152 struct fs_path
*new_path
)
4156 ret
= send_rename(sctx
, current_path
, new_path
);
4160 ret
= fs_path_copy(&sctx
->cur_inode_path
, new_path
);
4164 return fs_path_copy(current_path
, new_path
);
4168 * This does all the move/link/unlink/rmdir magic.
4170 static int process_recorded_refs(struct send_ctx
*sctx
, int *pending_move
)
4172 struct btrfs_fs_info
*fs_info
= sctx
->send_root
->fs_info
;
4174 struct recorded_ref
*cur
;
4175 struct recorded_ref
*cur2
;
4176 LIST_HEAD(check_dirs
);
4177 struct fs_path
*valid_path
= NULL
;
4181 u64 last_dir_ino_rm
= 0;
4182 bool did_overwrite
= false;
4183 bool is_orphan
= false;
4184 bool can_rename
= true;
4185 bool orphanized_dir
= false;
4186 bool orphanized_ancestor
= false;
4189 * This should never happen as the root dir always has the same ref
4190 * which is always '..'
4192 if (unlikely(sctx
->cur_ino
<= BTRFS_FIRST_FREE_OBJECTID
)) {
4194 "send: unexpected inode %llu in process_recorded_refs()",
4200 valid_path
= fs_path_alloc();
4207 * First, check if the first ref of the current inode was overwritten
4208 * before. If yes, we know that the current inode was already orphanized
4209 * and thus use the orphan name. If not, we can use get_cur_path to
4210 * get the path of the first ref as it would like while receiving at
4211 * this point in time.
4212 * New inodes are always orphan at the beginning, so force to use the
4213 * orphan name in this case.
4214 * The first ref is stored in valid_path and will be updated if it
4215 * gets moved around.
4217 if (!sctx
->cur_inode_new
) {
4218 ret
= did_overwrite_first_ref(sctx
, sctx
->cur_ino
,
4219 sctx
->cur_inode_gen
);
4223 did_overwrite
= true;
4225 if (sctx
->cur_inode_new
|| did_overwrite
) {
4226 ret
= gen_unique_name(sctx
, sctx
->cur_ino
,
4227 sctx
->cur_inode_gen
, valid_path
);
4232 ret
= get_cur_path(sctx
, sctx
->cur_ino
, sctx
->cur_inode_gen
,
4239 * Before doing any rename and link operations, do a first pass on the
4240 * new references to orphanize any unprocessed inodes that may have a
4241 * reference that conflicts with one of the new references of the current
4242 * inode. This needs to happen first because a new reference may conflict
4243 * with the old reference of a parent directory, so we must make sure
4244 * that the path used for link and rename commands don't use an
4245 * orphanized name when an ancestor was not yet orphanized.
4252 * |----- testdir/ (ino 259)
4253 * | |----- a (ino 257)
4255 * |----- b (ino 258)
4260 * |----- testdir_2/ (ino 259)
4261 * | |----- a (ino 260)
4263 * |----- testdir (ino 257)
4264 * |----- b (ino 257)
4265 * |----- b2 (ino 258)
4267 * Processing the new reference for inode 257 with name "b" may happen
4268 * before processing the new reference with name "testdir". If so, we
4269 * must make sure that by the time we send a link command to create the
4270 * hard link "b", inode 259 was already orphanized, since the generated
4271 * path in "valid_path" already contains the orphanized name for 259.
4272 * We are processing inode 257, so only later when processing 259 we do
4273 * the rename operation to change its temporary (orphanized) name to
4276 list_for_each_entry(cur
, &sctx
->new_refs
, list
) {
4277 ret
= get_cur_inode_state(sctx
, cur
->dir
, cur
->dir_gen
, NULL
, NULL
);
4280 if (ret
== inode_state_will_create
)
4284 * Check if this new ref would overwrite the first ref of another
4285 * unprocessed inode. If yes, orphanize the overwritten inode.
4286 * If we find an overwritten ref that is not the first ref,
4289 ret
= will_overwrite_ref(sctx
, cur
->dir
, cur
->dir_gen
,
4290 cur
->name
, cur
->name_len
,
4291 &ow_inode
, &ow_gen
, &ow_mode
);
4295 ret
= is_first_ref(sctx
->parent_root
,
4296 ow_inode
, cur
->dir
, cur
->name
,
4301 struct name_cache_entry
*nce
;
4302 struct waiting_dir_move
*wdm
;
4304 if (orphanized_dir
) {
4305 ret
= refresh_ref_path(sctx
, cur
);
4310 ret
= orphanize_inode(sctx
, ow_inode
, ow_gen
,
4314 if (S_ISDIR(ow_mode
))
4315 orphanized_dir
= true;
4318 * If ow_inode has its rename operation delayed
4319 * make sure that its orphanized name is used in
4320 * the source path when performing its rename
4323 wdm
= get_waiting_dir_move(sctx
, ow_inode
);
4325 wdm
->orphanized
= true;
4328 * Make sure we clear our orphanized inode's
4329 * name from the name cache. This is because the
4330 * inode ow_inode might be an ancestor of some
4331 * other inode that will be orphanized as well
4332 * later and has an inode number greater than
4333 * sctx->send_progress. We need to prevent
4334 * future name lookups from using the old name
4335 * and get instead the orphan name.
4337 nce
= name_cache_search(sctx
, ow_inode
, ow_gen
);
4339 btrfs_lru_cache_remove(&sctx
->name_cache
,
4343 * ow_inode might currently be an ancestor of
4344 * cur_ino, therefore compute valid_path (the
4345 * current path of cur_ino) again because it
4346 * might contain the pre-orphanization name of
4347 * ow_inode, which is no longer valid.
4349 ret
= is_ancestor(sctx
->parent_root
,
4351 sctx
->cur_ino
, NULL
);
4353 orphanized_ancestor
= true;
4354 fs_path_reset(valid_path
);
4355 fs_path_reset(&sctx
->cur_inode_path
);
4356 ret
= get_cur_path(sctx
, sctx
->cur_ino
,
4357 sctx
->cur_inode_gen
,
4364 * If we previously orphanized a directory that
4365 * collided with a new reference that we already
4366 * processed, recompute the current path because
4367 * that directory may be part of the path.
4369 if (orphanized_dir
) {
4370 ret
= refresh_ref_path(sctx
, cur
);
4374 ret
= send_unlink(sctx
, cur
->full_path
);
4382 list_for_each_entry(cur
, &sctx
->new_refs
, list
) {
4384 * We may have refs where the parent directory does not exist
4385 * yet. This happens if the parent directories inum is higher
4386 * than the current inum. To handle this case, we create the
4387 * parent directory out of order. But we need to check if this
4388 * did already happen before due to other refs in the same dir.
4390 ret
= get_cur_inode_state(sctx
, cur
->dir
, cur
->dir_gen
, NULL
, NULL
);
4393 if (ret
== inode_state_will_create
) {
4396 * First check if any of the current inodes refs did
4397 * already create the dir.
4399 list_for_each_entry(cur2
, &sctx
->new_refs
, list
) {
4402 if (cur2
->dir
== cur
->dir
) {
4409 * If that did not happen, check if a previous inode
4410 * did already create the dir.
4413 ret
= did_create_dir(sctx
, cur
->dir
);
4417 ret
= send_create_inode(sctx
, cur
->dir
);
4420 cache_dir_created(sctx
, cur
->dir
);
4424 if (S_ISDIR(sctx
->cur_inode_mode
) && sctx
->parent_root
) {
4425 ret
= wait_for_dest_dir_move(sctx
, cur
, is_orphan
);
4434 if (S_ISDIR(sctx
->cur_inode_mode
) && sctx
->parent_root
&&
4436 ret
= wait_for_parent_move(sctx
, cur
, is_orphan
);
4446 * link/move the ref to the new place. If we have an orphan
4447 * inode, move it and update valid_path. If not, link or move
4448 * it depending on the inode mode.
4450 if (is_orphan
&& can_rename
) {
4451 ret
= rename_current_inode(sctx
, valid_path
, cur
->full_path
);
4455 } else if (can_rename
) {
4456 if (S_ISDIR(sctx
->cur_inode_mode
)) {
4458 * Dirs can't be linked, so move it. For moved
4459 * dirs, we always have one new and one deleted
4460 * ref. The deleted ref is ignored later.
4462 ret
= rename_current_inode(sctx
, valid_path
,
4468 * We might have previously orphanized an inode
4469 * which is an ancestor of our current inode,
4470 * so our reference's full path, which was
4471 * computed before any such orphanizations, must
4474 if (orphanized_dir
) {
4475 ret
= update_ref_path(sctx
, cur
);
4479 ret
= send_link(sctx
, cur
->full_path
,
4485 ret
= dup_ref(cur
, &check_dirs
);
4490 if (S_ISDIR(sctx
->cur_inode_mode
) && sctx
->cur_inode_deleted
) {
4492 * Check if we can already rmdir the directory. If not,
4493 * orphanize it. For every dir item inside that gets deleted
4494 * later, we do this check again and rmdir it then if possible.
4495 * See the use of check_dirs for more details.
4497 ret
= can_rmdir(sctx
, sctx
->cur_ino
, sctx
->cur_inode_gen
);
4501 ret
= send_rmdir(sctx
, valid_path
);
4504 } else if (!is_orphan
) {
4505 ret
= orphanize_inode(sctx
, sctx
->cur_ino
,
4506 sctx
->cur_inode_gen
, valid_path
);
4512 list_for_each_entry(cur
, &sctx
->deleted_refs
, list
) {
4513 ret
= dup_ref(cur
, &check_dirs
);
4517 } else if (S_ISDIR(sctx
->cur_inode_mode
) &&
4518 !list_empty(&sctx
->deleted_refs
)) {
4520 * We have a moved dir. Add the old parent to check_dirs
4522 cur
= list_first_entry(&sctx
->deleted_refs
, struct recorded_ref
, list
);
4523 ret
= dup_ref(cur
, &check_dirs
);
4526 } else if (!S_ISDIR(sctx
->cur_inode_mode
)) {
4528 * We have a non dir inode. Go through all deleted refs and
4529 * unlink them if they were not already overwritten by other
4532 list_for_each_entry(cur
, &sctx
->deleted_refs
, list
) {
4533 ret
= did_overwrite_ref(sctx
, cur
->dir
, cur
->dir_gen
,
4534 sctx
->cur_ino
, sctx
->cur_inode_gen
,
4535 cur
->name
, cur
->name_len
);
4540 * If we orphanized any ancestor before, we need
4541 * to recompute the full path for deleted names,
4542 * since any such path was computed before we
4543 * processed any references and orphanized any
4546 if (orphanized_ancestor
) {
4547 ret
= update_ref_path(sctx
, cur
);
4551 ret
= send_unlink(sctx
, cur
->full_path
);
4554 if (is_current_inode_path(sctx
, cur
->full_path
))
4555 fs_path_reset(&sctx
->cur_inode_path
);
4557 ret
= dup_ref(cur
, &check_dirs
);
4562 * If the inode is still orphan, unlink the orphan. This may
4563 * happen when a previous inode did overwrite the first ref
4564 * of this inode and no new refs were added for the current
4565 * inode. Unlinking does not mean that the inode is deleted in
4566 * all cases. There may still be links to this inode in other
4570 ret
= send_unlink(sctx
, valid_path
);
4577 * We did collect all parent dirs where cur_inode was once located. We
4578 * now go through all these dirs and check if they are pending for
4579 * deletion and if it's finally possible to perform the rmdir now.
4580 * We also update the inode stats of the parent dirs here.
4582 list_for_each_entry(cur
, &check_dirs
, list
) {
4584 * In case we had refs into dirs that were not processed yet,
4585 * we don't need to do the utime and rmdir logic for these dirs.
4586 * The dir will be processed later.
4588 if (cur
->dir
> sctx
->cur_ino
)
4591 ret
= get_cur_inode_state(sctx
, cur
->dir
, cur
->dir_gen
, NULL
, NULL
);
4595 if (ret
== inode_state_did_create
||
4596 ret
== inode_state_no_change
) {
4597 ret
= cache_dir_utimes(sctx
, cur
->dir
, cur
->dir_gen
);
4600 } else if (ret
== inode_state_did_delete
&&
4601 cur
->dir
!= last_dir_ino_rm
) {
4602 ret
= can_rmdir(sctx
, cur
->dir
, cur
->dir_gen
);
4606 ret
= get_cur_path(sctx
, cur
->dir
,
4607 cur
->dir_gen
, valid_path
);
4610 ret
= send_rmdir(sctx
, valid_path
);
4613 last_dir_ino_rm
= cur
->dir
;
4621 __free_recorded_refs(&check_dirs
);
4622 free_recorded_refs(sctx
);
4623 fs_path_free(valid_path
);
4627 static int rbtree_ref_comp(const void *k
, const struct rb_node
*node
)
4629 const struct recorded_ref
*data
= k
;
4630 const struct recorded_ref
*ref
= rb_entry(node
, struct recorded_ref
, node
);
4633 if (data
->dir
> ref
->dir
)
4635 if (data
->dir
< ref
->dir
)
4637 if (data
->dir_gen
> ref
->dir_gen
)
4639 if (data
->dir_gen
< ref
->dir_gen
)
4641 if (data
->name_len
> ref
->name_len
)
4643 if (data
->name_len
< ref
->name_len
)
4645 result
= strcmp(data
->name
, ref
->name
);
4653 static bool rbtree_ref_less(struct rb_node
*node
, const struct rb_node
*parent
)
4655 const struct recorded_ref
*entry
= rb_entry(node
, struct recorded_ref
, node
);
4657 return rbtree_ref_comp(entry
, parent
) < 0;
4660 static int record_ref_in_tree(struct rb_root
*root
, struct list_head
*refs
,
4661 struct fs_path
*name
, u64 dir
, u64 dir_gen
,
4662 struct send_ctx
*sctx
)
4665 struct fs_path
*path
= NULL
;
4666 struct recorded_ref
*ref
= NULL
;
4668 path
= fs_path_alloc();
4674 ref
= recorded_ref_alloc();
4680 ret
= get_cur_path(sctx
, dir
, dir_gen
, path
);
4683 ret
= fs_path_add_path(path
, name
);
4688 ref
->dir_gen
= dir_gen
;
4689 set_ref_path(ref
, path
);
4690 list_add_tail(&ref
->list
, refs
);
4691 rb_add(&ref
->node
, root
, rbtree_ref_less
);
4695 if (path
&& (!ref
|| !ref
->full_path
))
4697 recorded_ref_free(ref
);
4702 static int record_new_ref_if_needed(u64 dir
, struct fs_path
*name
, void *ctx
)
4705 struct send_ctx
*sctx
= ctx
;
4706 struct rb_node
*node
= NULL
;
4707 struct recorded_ref data
;
4708 struct recorded_ref
*ref
;
4711 ret
= get_inode_gen(sctx
->send_root
, dir
, &dir_gen
);
4716 data
.dir_gen
= dir_gen
;
4717 set_ref_path(&data
, name
);
4718 node
= rb_find(&data
, &sctx
->rbtree_deleted_refs
, rbtree_ref_comp
);
4720 ref
= rb_entry(node
, struct recorded_ref
, node
);
4721 recorded_ref_free(ref
);
4723 ret
= record_ref_in_tree(&sctx
->rbtree_new_refs
,
4724 &sctx
->new_refs
, name
, dir
, dir_gen
,
4731 static int record_deleted_ref_if_needed(u64 dir
, struct fs_path
*name
, void *ctx
)
4734 struct send_ctx
*sctx
= ctx
;
4735 struct rb_node
*node
= NULL
;
4736 struct recorded_ref data
;
4737 struct recorded_ref
*ref
;
4740 ret
= get_inode_gen(sctx
->parent_root
, dir
, &dir_gen
);
4745 data
.dir_gen
= dir_gen
;
4746 set_ref_path(&data
, name
);
4747 node
= rb_find(&data
, &sctx
->rbtree_new_refs
, rbtree_ref_comp
);
4749 ref
= rb_entry(node
, struct recorded_ref
, node
);
4750 recorded_ref_free(ref
);
4752 ret
= record_ref_in_tree(&sctx
->rbtree_deleted_refs
,
4753 &sctx
->deleted_refs
, name
, dir
,
4760 static int record_new_ref(struct send_ctx
*sctx
)
4764 ret
= iterate_inode_ref(sctx
->send_root
, sctx
->left_path
,
4765 sctx
->cmp_key
, 0, record_new_ref_if_needed
, sctx
);
4772 static int record_deleted_ref(struct send_ctx
*sctx
)
4776 ret
= iterate_inode_ref(sctx
->parent_root
, sctx
->right_path
,
4777 sctx
->cmp_key
, 0, record_deleted_ref_if_needed
,
4785 static int record_changed_ref(struct send_ctx
*sctx
)
4789 ret
= iterate_inode_ref(sctx
->send_root
, sctx
->left_path
,
4790 sctx
->cmp_key
, 0, record_new_ref_if_needed
, sctx
);
4793 ret
= iterate_inode_ref(sctx
->parent_root
, sctx
->right_path
,
4794 sctx
->cmp_key
, 0, record_deleted_ref_if_needed
, sctx
);
4802 * Record and process all refs at once. Needed when an inode changes the
4803 * generation number, which means that it was deleted and recreated.
4805 static int process_all_refs(struct send_ctx
*sctx
,
4806 enum btrfs_compare_tree_result cmd
)
4810 struct btrfs_root
*root
;
4811 struct btrfs_path
*path
;
4812 struct btrfs_key key
;
4813 struct btrfs_key found_key
;
4814 iterate_inode_ref_t cb
;
4815 int pending_move
= 0;
4817 path
= alloc_path_for_send();
4821 if (cmd
== BTRFS_COMPARE_TREE_NEW
) {
4822 root
= sctx
->send_root
;
4823 cb
= record_new_ref_if_needed
;
4824 } else if (cmd
== BTRFS_COMPARE_TREE_DELETED
) {
4825 root
= sctx
->parent_root
;
4826 cb
= record_deleted_ref_if_needed
;
4828 btrfs_err(sctx
->send_root
->fs_info
,
4829 "Wrong command %d in process_all_refs", cmd
);
4834 key
.objectid
= sctx
->cmp_key
->objectid
;
4835 key
.type
= BTRFS_INODE_REF_KEY
;
4837 btrfs_for_each_slot(root
, &key
, &found_key
, path
, iter_ret
) {
4838 if (found_key
.objectid
!= key
.objectid
||
4839 (found_key
.type
!= BTRFS_INODE_REF_KEY
&&
4840 found_key
.type
!= BTRFS_INODE_EXTREF_KEY
))
4843 ret
= iterate_inode_ref(root
, path
, &found_key
, 0, cb
, sctx
);
4847 /* Catch error found during iteration */
4852 btrfs_release_path(path
);
4855 * We don't actually care about pending_move as we are simply
4856 * re-creating this inode and will be rename'ing it into place once we
4857 * rename the parent directory.
4859 ret
= process_recorded_refs(sctx
, &pending_move
);
4861 btrfs_free_path(path
);
4865 static int send_set_xattr(struct send_ctx
*sctx
,
4866 const char *name
, int name_len
,
4867 const char *data
, int data_len
)
4869 struct fs_path
*path
;
4872 path
= get_cur_inode_path(sctx
);
4874 return PTR_ERR(path
);
4876 ret
= begin_cmd(sctx
, BTRFS_SEND_C_SET_XATTR
);
4880 TLV_PUT_PATH(sctx
, BTRFS_SEND_A_PATH
, path
);
4881 TLV_PUT_STRING(sctx
, BTRFS_SEND_A_XATTR_NAME
, name
, name_len
);
4882 TLV_PUT(sctx
, BTRFS_SEND_A_XATTR_DATA
, data
, data_len
);
4884 ret
= send_cmd(sctx
);
4890 static int send_remove_xattr(struct send_ctx
*sctx
,
4891 struct fs_path
*path
,
4892 const char *name
, int name_len
)
4896 ret
= begin_cmd(sctx
, BTRFS_SEND_C_REMOVE_XATTR
);
4900 TLV_PUT_PATH(sctx
, BTRFS_SEND_A_PATH
, path
);
4901 TLV_PUT_STRING(sctx
, BTRFS_SEND_A_XATTR_NAME
, name
, name_len
);
4903 ret
= send_cmd(sctx
);
4909 static int __process_new_xattr(int num
, struct btrfs_key
*di_key
,
4910 const char *name
, int name_len
, const char *data
,
4911 int data_len
, void *ctx
)
4913 struct send_ctx
*sctx
= ctx
;
4914 struct posix_acl_xattr_header dummy_acl
;
4916 /* Capabilities are emitted by finish_inode_if_needed */
4917 if (!strncmp(name
, XATTR_NAME_CAPS
, name_len
))
4921 * This hack is needed because empty acls are stored as zero byte
4922 * data in xattrs. Problem with that is, that receiving these zero byte
4923 * acls will fail later. To fix this, we send a dummy acl list that
4924 * only contains the version number and no entries.
4926 if (!strncmp(name
, XATTR_NAME_POSIX_ACL_ACCESS
, name_len
) ||
4927 !strncmp(name
, XATTR_NAME_POSIX_ACL_DEFAULT
, name_len
)) {
4928 if (data_len
== 0) {
4929 dummy_acl
.a_version
=
4930 cpu_to_le32(POSIX_ACL_XATTR_VERSION
);
4931 data
= (char *)&dummy_acl
;
4932 data_len
= sizeof(dummy_acl
);
4936 return send_set_xattr(sctx
, name
, name_len
, data
, data_len
);
4939 static int __process_deleted_xattr(int num
, struct btrfs_key
*di_key
,
4940 const char *name
, int name_len
,
4941 const char *data
, int data_len
, void *ctx
)
4943 struct send_ctx
*sctx
= ctx
;
4946 p
= get_cur_inode_path(sctx
);
4950 return send_remove_xattr(sctx
, p
, name
, name_len
);
4953 static int process_new_xattr(struct send_ctx
*sctx
)
4955 return iterate_dir_item(sctx
->send_root
, sctx
->left_path
,
4956 __process_new_xattr
, sctx
);
4959 static int process_deleted_xattr(struct send_ctx
*sctx
)
4961 return iterate_dir_item(sctx
->parent_root
, sctx
->right_path
,
4962 __process_deleted_xattr
, sctx
);
4965 struct find_xattr_ctx
{
4973 static int __find_xattr(int num
, struct btrfs_key
*di_key
, const char *name
,
4974 int name_len
, const char *data
, int data_len
, void *vctx
)
4976 struct find_xattr_ctx
*ctx
= vctx
;
4978 if (name_len
== ctx
->name_len
&&
4979 strncmp(name
, ctx
->name
, name_len
) == 0) {
4980 ctx
->found_idx
= num
;
4981 ctx
->found_data_len
= data_len
;
4982 ctx
->found_data
= kmemdup(data
, data_len
, GFP_KERNEL
);
4983 if (!ctx
->found_data
)
4990 static int find_xattr(struct btrfs_root
*root
,
4991 struct btrfs_path
*path
,
4992 struct btrfs_key
*key
,
4993 const char *name
, int name_len
,
4994 char **data
, int *data_len
)
4997 struct find_xattr_ctx ctx
;
5000 ctx
.name_len
= name_len
;
5002 ctx
.found_data
= NULL
;
5003 ctx
.found_data_len
= 0;
5005 ret
= iterate_dir_item(root
, path
, __find_xattr
, &ctx
);
5009 if (ctx
.found_idx
== -1)
5012 *data
= ctx
.found_data
;
5013 *data_len
= ctx
.found_data_len
;
5015 kfree(ctx
.found_data
);
5017 return ctx
.found_idx
;
5021 static int __process_changed_new_xattr(int num
, struct btrfs_key
*di_key
,
5022 const char *name
, int name_len
,
5023 const char *data
, int data_len
,
5027 struct send_ctx
*sctx
= ctx
;
5028 char *found_data
= NULL
;
5029 int found_data_len
= 0;
5031 ret
= find_xattr(sctx
->parent_root
, sctx
->right_path
,
5032 sctx
->cmp_key
, name
, name_len
, &found_data
,
5034 if (ret
== -ENOENT
) {
5035 ret
= __process_new_xattr(num
, di_key
, name
, name_len
, data
,
5037 } else if (ret
>= 0) {
5038 if (data_len
!= found_data_len
||
5039 memcmp(data
, found_data
, data_len
)) {
5040 ret
= __process_new_xattr(num
, di_key
, name
, name_len
,
5041 data
, data_len
, ctx
);
5051 static int __process_changed_deleted_xattr(int num
, struct btrfs_key
*di_key
,
5052 const char *name
, int name_len
,
5053 const char *data
, int data_len
,
5057 struct send_ctx
*sctx
= ctx
;
5059 ret
= find_xattr(sctx
->send_root
, sctx
->left_path
, sctx
->cmp_key
,
5060 name
, name_len
, NULL
, NULL
);
5062 ret
= __process_deleted_xattr(num
, di_key
, name
, name_len
, data
,
5070 static int process_changed_xattr(struct send_ctx
*sctx
)
5074 ret
= iterate_dir_item(sctx
->send_root
, sctx
->left_path
,
5075 __process_changed_new_xattr
, sctx
);
5079 return iterate_dir_item(sctx
->parent_root
, sctx
->right_path
,
5080 __process_changed_deleted_xattr
, sctx
);
5083 static int process_all_new_xattrs(struct send_ctx
*sctx
)
5087 struct btrfs_root
*root
;
5088 struct btrfs_path
*path
;
5089 struct btrfs_key key
;
5090 struct btrfs_key found_key
;
5092 path
= alloc_path_for_send();
5096 root
= sctx
->send_root
;
5098 key
.objectid
= sctx
->cmp_key
->objectid
;
5099 key
.type
= BTRFS_XATTR_ITEM_KEY
;
5101 btrfs_for_each_slot(root
, &key
, &found_key
, path
, iter_ret
) {
5102 if (found_key
.objectid
!= key
.objectid
||
5103 found_key
.type
!= key
.type
) {
5108 ret
= iterate_dir_item(root
, path
, __process_new_xattr
, sctx
);
5112 /* Catch error found during iteration */
5116 btrfs_free_path(path
);
5120 static int send_verity(struct send_ctx
*sctx
, struct fs_path
*path
,
5121 struct fsverity_descriptor
*desc
)
5125 ret
= begin_cmd(sctx
, BTRFS_SEND_C_ENABLE_VERITY
);
5129 TLV_PUT_PATH(sctx
, BTRFS_SEND_A_PATH
, path
);
5130 TLV_PUT_U8(sctx
, BTRFS_SEND_A_VERITY_ALGORITHM
,
5131 le8_to_cpu(desc
->hash_algorithm
));
5132 TLV_PUT_U32(sctx
, BTRFS_SEND_A_VERITY_BLOCK_SIZE
,
5133 1U << le8_to_cpu(desc
->log_blocksize
));
5134 TLV_PUT(sctx
, BTRFS_SEND_A_VERITY_SALT_DATA
, desc
->salt
,
5135 le8_to_cpu(desc
->salt_size
));
5136 TLV_PUT(sctx
, BTRFS_SEND_A_VERITY_SIG_DATA
, desc
->signature
,
5137 le32_to_cpu(desc
->sig_size
));
5139 ret
= send_cmd(sctx
);
5145 static int process_verity(struct send_ctx
*sctx
)
5148 struct btrfs_inode
*inode
;
5151 inode
= btrfs_iget(sctx
->cur_ino
, sctx
->send_root
);
5153 return PTR_ERR(inode
);
5155 ret
= btrfs_get_verity_descriptor(&inode
->vfs_inode
, NULL
, 0);
5159 if (ret
> FS_VERITY_MAX_DESCRIPTOR_SIZE
) {
5163 if (!sctx
->verity_descriptor
) {
5164 sctx
->verity_descriptor
= kvmalloc(FS_VERITY_MAX_DESCRIPTOR_SIZE
,
5166 if (!sctx
->verity_descriptor
) {
5172 ret
= btrfs_get_verity_descriptor(&inode
->vfs_inode
, sctx
->verity_descriptor
, ret
);
5176 p
= get_cur_inode_path(sctx
);
5182 ret
= send_verity(sctx
, p
, sctx
->verity_descriptor
);
5184 iput(&inode
->vfs_inode
);
5188 static inline u64
max_send_read_size(const struct send_ctx
*sctx
)
5190 return sctx
->send_max_size
- SZ_16K
;
5193 static int put_data_header(struct send_ctx
*sctx
, u32 len
)
5195 if (WARN_ON_ONCE(sctx
->put_data
))
5197 sctx
->put_data
= true;
5198 if (sctx
->proto
>= 2) {
5200 * Since v2, the data attribute header doesn't include a length,
5201 * it is implicitly to the end of the command.
5203 if (sctx
->send_max_size
- sctx
->send_size
< sizeof(__le16
) + len
)
5205 put_unaligned_le16(BTRFS_SEND_A_DATA
, sctx
->send_buf
+ sctx
->send_size
);
5206 sctx
->send_size
+= sizeof(__le16
);
5208 struct btrfs_tlv_header
*hdr
;
5210 if (sctx
->send_max_size
- sctx
->send_size
< sizeof(*hdr
) + len
)
5212 hdr
= (struct btrfs_tlv_header
*)(sctx
->send_buf
+ sctx
->send_size
);
5213 put_unaligned_le16(BTRFS_SEND_A_DATA
, &hdr
->tlv_type
);
5214 put_unaligned_le16(len
, &hdr
->tlv_len
);
5215 sctx
->send_size
+= sizeof(*hdr
);
static int put_file_data(struct send_ctx *sctx, u64 offset, u32 len)
{
	struct btrfs_root *root = sctx->send_root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	const u64 end = offset + len;
	const pgoff_t last_index = ((end - 1) >> PAGE_SHIFT);
	struct address_space *mapping = sctx->cur_inode->i_mapping;

	ret = put_data_header(sctx, len);

		pgoff_t index = (cur >> PAGE_SHIFT);
		unsigned int cur_len;
		unsigned int pg_offset;
		struct folio *folio;

		folio = filemap_lock_folio(mapping, index);
		if (IS_ERR(folio)) {
			page_cache_sync_readahead(mapping,
						  &sctx->ra, NULL, index,
						  last_index + 1 - index);

			folio = filemap_grab_folio(mapping, index);
			if (IS_ERR(folio)) {
				ret = PTR_ERR(folio);

		pg_offset = offset_in_folio(folio, cur);
		cur_len = min_t(unsigned int, end - cur, folio_size(folio) - pg_offset);

		if (folio_test_readahead(folio))
			page_cache_async_readahead(mapping, &sctx->ra, NULL, folio,
						   last_index + 1 - index);

		if (!folio_test_uptodate(folio)) {
			btrfs_read_folio(NULL, folio);
			if (!folio_test_uptodate(folio)) {
				folio_unlock(folio);
					"send: IO error at offset %llu for inode %llu root %llu",
					folio_pos(folio), sctx->cur_ino,
					btrfs_root_id(sctx->send_root));

		if (folio->mapping != mapping) {
			folio_unlock(folio);

		memcpy_from_folio(sctx->send_buf + sctx->send_size, folio,
				  pg_offset, cur_len);
		folio_unlock(folio);
		sctx->send_size += cur_len;
/*
 * Read some bytes from the current inode/file and send a write command to
 * user space.
 */
static int send_write(struct send_ctx *sctx, u64 offset, u32 len)
{
	p = get_cur_inode_path(sctx);

	ret = begin_cmd(sctx, BTRFS_SEND_C_WRITE);

	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
	TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset);
	ret = put_file_data(sctx, offset, len);

	ret = send_cmd(sctx);
/*
 * Send a clone command to user space.
 */
static int send_clone(struct send_ctx *sctx,
		      u64 offset, u32 len,
		      struct clone_root *clone_root)
{
	struct fs_path *cur_inode_path;

	cur_inode_path = get_cur_inode_path(sctx);
	if (IS_ERR(cur_inode_path))
		return PTR_ERR(cur_inode_path);

	p = fs_path_alloc();

	ret = begin_cmd(sctx, BTRFS_SEND_C_CLONE);

	TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset);
	TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_LEN, len);
	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, cur_inode_path);

	if (clone_root->root == sctx->send_root) {
		ret = get_inode_gen(sctx->send_root, clone_root->ino, &gen);
		ret = get_cur_path(sctx, clone_root->ino, gen, p);
	} else {
		ret = get_inode_path(clone_root->root, clone_root->ino, p);
	}

	/*
	 * If the parent we're using has a received_uuid set then use that as
	 * our clone source as that is what we will look for when doing a
	 * receive.
	 *
	 * This covers the case that we create a snapshot off of a received
	 * subvolume and then use that as the parent and try to receive on a
	 * different host.
	 */
	if (!btrfs_is_empty_uuid(clone_root->root->root_item.received_uuid))
		TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID,
			     clone_root->root->root_item.received_uuid);
	else
		TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID,
			     clone_root->root->root_item.uuid);
	TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_CTRANSID,
		    btrfs_root_ctransid(&clone_root->root->root_item));
	TLV_PUT_PATH(sctx, BTRFS_SEND_A_CLONE_PATH, p);
	TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_OFFSET,
		    clone_root->offset);

	ret = send_cmd(sctx);
/*
 * Send an update extent command to user space.
 */
static int send_update_extent(struct send_ctx *sctx,
			      u64 offset, u32 len)
{
	p = get_cur_inode_path(sctx);

	ret = begin_cmd(sctx, BTRFS_SEND_C_UPDATE_EXTENT);

	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
	TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset);
	TLV_PUT_U64(sctx, BTRFS_SEND_A_SIZE, len);

	ret = send_cmd(sctx);
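/*
 * Send zero-filled write commands (or a single update extent command when
 * BTRFS_SEND_FLAG_NO_FILE_DATA is set) covering the hole from the end of the
 * last processed extent up to @end, clamped to the inode's size.
 */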
static int send_hole(struct send_ctx *sctx, u64 end)
{
	struct fs_path *p = NULL;
	u64 read_size = max_send_read_size(sctx);
	u64 offset = sctx->cur_inode_last_extent;

	/*
	 * A hole that starts at EOF or beyond it. Since we do not yet support
	 * fallocate (for extent preallocation and hole punching), sending a
	 * write of zeroes starting at EOF or beyond would later require issuing
	 * a truncate operation which would undo the write and achieve nothing.
	 */
	if (offset >= sctx->cur_inode_size)

	/*
	 * Don't go beyond the inode's i_size due to prealloc extents that start
	 * after its i_size.
	 */
	end = min_t(u64, end, sctx->cur_inode_size);

	if (sctx->flags & BTRFS_SEND_FLAG_NO_FILE_DATA)
		return send_update_extent(sctx, offset, end - offset);

	p = get_cur_inode_path(sctx);

	while (offset < end) {
		u64 len = min(end - offset, read_size);

		ret = begin_cmd(sctx, BTRFS_SEND_C_WRITE);
		TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
		TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset);
		ret = put_data_header(sctx, len);
		memset(sctx->send_buf + sctx->send_size, 0, len);
		sctx->send_size += len;
		ret = send_cmd(sctx);

	sctx->cur_inode_next_write_offset = offset;
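/*
 * Send a compressed inline extent as an encoded write: the raw inline data is
 * copied straight from the leaf into the send buffer, preserving compression.
 */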
static int send_encoded_inline_extent(struct send_ctx *sctx,
				      struct btrfs_path *path, u64 offset,
				      u64 len)
{
	struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
	struct fs_path *fspath;
	struct extent_buffer *leaf = path->nodes[0];
	struct btrfs_key key;
	struct btrfs_file_extent_item *ei;

	fspath = get_cur_inode_path(sctx);
		return PTR_ERR(fspath);

	ret = begin_cmd(sctx, BTRFS_SEND_C_ENCODED_WRITE);

	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item);
	ram_bytes = btrfs_file_extent_ram_bytes(leaf, ei);
	inline_size = btrfs_file_extent_inline_item_len(leaf, path->slots[0]);

	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, fspath);
	TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset);
	TLV_PUT_U64(sctx, BTRFS_SEND_A_UNENCODED_FILE_LEN,
		    min(key.offset + ram_bytes - offset, len));
	TLV_PUT_U64(sctx, BTRFS_SEND_A_UNENCODED_LEN, ram_bytes);
	TLV_PUT_U64(sctx, BTRFS_SEND_A_UNENCODED_OFFSET, offset - key.offset);
	ret = btrfs_encoded_io_compression_from_extent(fs_info,
						       btrfs_file_extent_compression(leaf, ei));
	TLV_PUT_U32(sctx, BTRFS_SEND_A_COMPRESSION, ret);
	ret = put_data_header(sctx, inline_size);

	read_extent_buffer(leaf, sctx->send_buf + sctx->send_size,
			   btrfs_file_extent_inline_start(ei), inline_size);
	sctx->send_size += inline_size;

	ret = send_cmd(sctx);
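/*
 * Like send_encoded_inline_extent(), but for regular compressed extents: the
 * compressed data is read from disk directly into the send buffer, and the
 * command header, checksum and buffered data are written out manually instead
 * of going through send_cmd().
 */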
static int send_encoded_extent(struct send_ctx *sctx, struct btrfs_path *path,
			       u64 offset, u64 len)
{
	struct btrfs_root *root = sctx->send_root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_inode *inode;
	struct fs_path *fspath;
	struct extent_buffer *leaf = path->nodes[0];
	struct btrfs_key key;
	struct btrfs_file_extent_item *ei;
	u64 disk_bytenr, disk_num_bytes;
	struct btrfs_cmd_header *hdr;

	inode = btrfs_iget(sctx->cur_ino, root);
		return PTR_ERR(inode);

	fspath = get_cur_inode_path(sctx);
	if (IS_ERR(fspath)) {
		ret = PTR_ERR(fspath);

	ret = begin_cmd(sctx, BTRFS_SEND_C_ENCODED_WRITE);

	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item);
	disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei);
	disk_num_bytes = btrfs_file_extent_disk_num_bytes(leaf, ei);

	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, fspath);
	TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset);
	TLV_PUT_U64(sctx, BTRFS_SEND_A_UNENCODED_FILE_LEN,
		    min(key.offset + btrfs_file_extent_num_bytes(leaf, ei) - offset,
	TLV_PUT_U64(sctx, BTRFS_SEND_A_UNENCODED_LEN,
		    btrfs_file_extent_ram_bytes(leaf, ei));
	TLV_PUT_U64(sctx, BTRFS_SEND_A_UNENCODED_OFFSET,
		    offset - key.offset + btrfs_file_extent_offset(leaf, ei));
	ret = btrfs_encoded_io_compression_from_extent(fs_info,
						       btrfs_file_extent_compression(leaf, ei));
	TLV_PUT_U32(sctx, BTRFS_SEND_A_COMPRESSION, ret);
	TLV_PUT_U32(sctx, BTRFS_SEND_A_ENCRYPTION, 0);

	ret = put_data_header(sctx, disk_num_bytes);

	/*
	 * We want to do I/O directly into the send buffer, so get the next page
	 * boundary in the send buffer. This means that there may be a gap
	 * between the beginning of the command and the file data.
	 */
	data_offset = PAGE_ALIGN(sctx->send_size);
	if (data_offset > sctx->send_max_size ||
	    sctx->send_max_size - data_offset < disk_num_bytes) {

	/*
	 * Note that send_buf is a mapping of send_buf_pages, so this is really
	 * reading into send_buf.
	 */
	ret = btrfs_encoded_read_regular_fill_pages(inode,
						    disk_bytenr, disk_num_bytes,
						    sctx->send_buf_pages +
						    (data_offset >> PAGE_SHIFT),

	hdr = (struct btrfs_cmd_header *)sctx->send_buf;
	hdr->len = cpu_to_le32(sctx->send_size + disk_num_bytes - sizeof(*hdr));
	crc = crc32c(0, sctx->send_buf, sctx->send_size);
	crc = crc32c(crc, sctx->send_buf + data_offset, disk_num_bytes);
	hdr->crc = cpu_to_le32(crc);

	ret = write_buf(sctx->send_filp, sctx->send_buf, sctx->send_size,
	ret = write_buf(sctx->send_filp, sctx->send_buf + data_offset,
			disk_num_bytes, &sctx->send_off);

	sctx->send_size = 0;
	sctx->put_data = false;

	iput(&inode->vfs_inode);
static int send_extent_data(struct send_ctx *sctx, struct btrfs_path *path,
			    const u64 offset, const u64 len)
{
	const u64 end = offset + len;
	struct extent_buffer *leaf = path->nodes[0];
	struct btrfs_file_extent_item *ei;
	u64 read_size = max_send_read_size(sctx);

	if (sctx->flags & BTRFS_SEND_FLAG_NO_FILE_DATA)
		return send_update_extent(sctx, offset, len);

	ei = btrfs_item_ptr(leaf, path->slots[0],
			    struct btrfs_file_extent_item);
	if ((sctx->flags & BTRFS_SEND_FLAG_COMPRESSED) &&
	    btrfs_file_extent_compression(leaf, ei) != BTRFS_COMPRESS_NONE) {
		bool is_inline = (btrfs_file_extent_type(leaf, ei) ==
				  BTRFS_FILE_EXTENT_INLINE);

		/*
		 * Send the compressed extent unless the compressed data is
		 * larger than the decompressed data. This can happen if we're
		 * not sending the entire extent, either because it has been
		 * partially overwritten/truncated or because this is a part of
		 * the extent that we couldn't clone in clone_range().
		 */
		if (is_inline &&
		    btrfs_file_extent_inline_item_len(leaf,
						      path->slots[0]) <= len) {
			return send_encoded_inline_extent(sctx, path, offset,
							  len);
		} else if (!is_inline &&
			   btrfs_file_extent_disk_num_bytes(leaf, ei) <= len) {
			return send_encoded_extent(sctx, path, offset, len);
		}
	}

	if (sctx->cur_inode == NULL) {
		struct btrfs_inode *btrfs_inode;
		struct btrfs_root *root = sctx->send_root;

		btrfs_inode = btrfs_iget(sctx->cur_ino, root);
		if (IS_ERR(btrfs_inode))
			return PTR_ERR(btrfs_inode);

		sctx->cur_inode = &btrfs_inode->vfs_inode;
		memset(&sctx->ra, 0, sizeof(struct file_ra_state));
		file_ra_state_init(&sctx->ra, sctx->cur_inode->i_mapping);

		/*
		 * It's very likely there are no pages from this inode in the page
		 * cache, so after reading extents and sending their data, we clean
		 * the page cache to avoid trashing the page cache (adding pressure
		 * to the page cache and forcing eviction of other data more useful
		 * for applications).
		 *
		 * We decide if we should clean the page cache simply by checking
		 * if the inode's mapping nrpages is 0 when we first open it, and
		 * not by using something like filemap_range_has_page() before
		 * reading an extent because when we ask the readahead code to
		 * read a given file range, it may (and almost always does) read
		 * pages from beyond that range (see the documentation for
		 * page_cache_sync_readahead()), so it would not be reliable,
		 * because after reading the first extent future calls to
		 * filemap_range_has_page() would return true because the readahead
		 * on the previous extent resulted in reading pages of the current
		 * extent.
		 */
		sctx->clean_page_cache = (sctx->cur_inode->i_mapping->nrpages == 0);
		sctx->page_cache_clear_start = round_down(offset, PAGE_SIZE);
	}

	while (sent < len) {
		u64 size = min(len - sent, read_size);

		ret = send_write(sctx, offset + sent, size);

	if (sctx->clean_page_cache && PAGE_ALIGNED(end)) {
		/*
		 * Always operate only on ranges that are a multiple of the page
		 * size. This is not only to prevent zeroing parts of a page in
		 * the case of subpage sector size, but also to guarantee we evict
		 * pages, as passing a range that is smaller than page size does
		 * not evict the respective page (only zeroes part of its content).
		 *
		 * Always start from the end offset of the last range cleared.
		 * This is because the readahead code may (and very often does)
		 * read pages beyond the range we request for readahead. So if
		 * we have an extent layout like this:
		 *
		 *            [ extent A ] [ extent B ] [ extent C ]
		 *
		 * When we ask page_cache_sync_readahead() to read extent A, it
		 * may also trigger reads for pages of extent B. If we are doing
		 * an incremental send and extent B has not changed between the
		 * parent and send snapshots, some or all of its pages may end
		 * up being read and placed in the page cache. So when truncating
		 * the page cache we always start from the end offset of the
		 * previously processed extent up to the end of the current
		 * extent.
		 */
		truncate_inode_pages_range(&sctx->cur_inode->i_data,
					   sctx->page_cache_clear_start,
					   end - 1);
		sctx->page_cache_clear_start = end;
	}
/*
 * Search for a capability xattr related to sctx->cur_ino. If the capability is
 * found, call send_set_xattr function to emit it.
 *
 * Return 0 if there isn't a capability, or when the capability was emitted
 * successfully, or < 0 if an error occurred.
 */
static int send_capabilities(struct send_ctx *sctx)
{
	struct btrfs_path *path;
	struct btrfs_dir_item *di;
	struct extent_buffer *leaf;
	unsigned long data_ptr;

	path = alloc_path_for_send();

	di = btrfs_lookup_xattr(NULL, sctx->send_root, path, sctx->cur_ino,
				XATTR_NAME_CAPS, strlen(XATTR_NAME_CAPS), 0);
	if (!di) {
		/* There is no xattr for this inode */
	} else if (IS_ERR(di)) {

	leaf = path->nodes[0];
	buf_len = btrfs_dir_data_len(leaf, di);

	buf = kmalloc(buf_len, GFP_KERNEL);

	data_ptr = (unsigned long)(di + 1) + btrfs_dir_name_len(leaf, di);
	read_extent_buffer(leaf, buf, data_ptr, buf_len);

	ret = send_set_xattr(sctx, XATTR_NAME_CAPS,
			     strlen(XATTR_NAME_CAPS), buf, buf_len);

	btrfs_free_path(path);
static int clone_range(struct send_ctx *sctx, struct btrfs_path *dst_path,
		       struct clone_root *clone_root, const u64 disk_byte,
		       u64 data_offset, u64 offset, u64 len)
{
	struct btrfs_path *path;
	struct btrfs_key key;
	struct btrfs_inode_info info;
	u64 clone_src_i_size = 0;

	/*
	 * Prevent cloning from a zero offset with a length matching the sector
	 * size because in some scenarios this will make the receiver fail.
	 *
	 * For example, if in the source filesystem the extent at offset 0
	 * has a length of sectorsize and it was written using direct IO, then
	 * it can never be an inline extent (even if compression is enabled).
	 * Then this extent can be cloned in the original filesystem to a non
	 * zero file offset, but it may not be possible to clone in the
	 * destination filesystem because it can be inlined due to compression
	 * on the destination filesystem (as the receiver's write operations are
	 * always done using buffered IO). The same happens when the original
	 * filesystem does not have compression enabled but the destination
	 * filesystem does.
	 */
	if (clone_root->offset == 0 &&
	    len == sctx->send_root->fs_info->sectorsize)
		return send_extent_data(sctx, dst_path, offset, len);

	path = alloc_path_for_send();

	/*
	 * There are inodes that have extents that lie behind its i_size. Don't
	 * accept clones from these extents.
	 */
	ret = get_inode_info(clone_root->root, clone_root->ino, &info);
	btrfs_release_path(path);

	clone_src_i_size = info.size;

	/*
	 * We can't send a clone operation for the entire range if we find
	 * extent items in the respective range in the source file that
	 * refer to different extents or if we find holes.
	 * So check for that and do a mix of clone and regular write/copy
	 * operations if needed.
	 *
	 * mkfs.btrfs -f /dev/sda
	 * mount /dev/sda /mnt
	 * xfs_io -f -c "pwrite -S 0xaa 0K 100K" /mnt/foo
	 * cp --reflink=always /mnt/foo /mnt/bar
	 * xfs_io -c "pwrite -S 0xbb 50K 50K" /mnt/foo
	 * btrfs subvolume snapshot -r /mnt /mnt/snap
	 *
	 * If when we send the snapshot and we are processing file bar (which
	 * has a higher inode number than foo) we blindly send a clone operation
	 * for the [0, 100K[ range from foo to bar, the receiver ends up getting
	 * a file bar that matches the content of file foo - iow, doesn't match
	 * the content from bar in the original filesystem.
	 */
	key.objectid = clone_root->ino;
	key.type = BTRFS_EXTENT_DATA_KEY;
	key.offset = clone_root->offset;
	ret = btrfs_search_slot(NULL, clone_root->root, &key, path, 0, 0);

	if (ret > 0 && path->slots[0] > 0) {
		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0] - 1);
		if (key.objectid == clone_root->ino &&
		    key.type == BTRFS_EXTENT_DATA_KEY)

		struct extent_buffer *leaf = path->nodes[0];
		int slot = path->slots[0];
		struct btrfs_file_extent_item *ei;
		u64 clone_data_offset;
		bool crossed_src_i_size = false;

		if (slot >= btrfs_header_nritems(leaf)) {
			ret = btrfs_next_leaf(clone_root->root, path);

		btrfs_item_key_to_cpu(leaf, &key, slot);

		/*
		 * We might have an implicit trailing hole (NO_HOLES feature
		 * enabled). We deal with it after leaving this loop.
		 */
		if (key.objectid != clone_root->ino ||
		    key.type != BTRFS_EXTENT_DATA_KEY)

		ei = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
		type = btrfs_file_extent_type(leaf, ei);
		if (type == BTRFS_FILE_EXTENT_INLINE) {
			ext_len = btrfs_file_extent_ram_bytes(leaf, ei);
			ext_len = PAGE_ALIGN(ext_len);
		} else {
			ext_len = btrfs_file_extent_num_bytes(leaf, ei);
		}

		if (key.offset + ext_len <= clone_root->offset)

		if (key.offset > clone_root->offset) {
			/* Implicit hole, NO_HOLES feature enabled. */
			u64 hole_len = key.offset - clone_root->offset;

			ret = send_extent_data(sctx, dst_path, offset,
					       hole_len);

			clone_root->offset += hole_len;
			data_offset += hole_len;
		}

		if (key.offset >= clone_root->offset + len)

		if (key.offset >= clone_src_i_size)

		if (key.offset + ext_len > clone_src_i_size) {
			ext_len = clone_src_i_size - key.offset;
			crossed_src_i_size = true;
		}

		clone_data_offset = btrfs_file_extent_offset(leaf, ei);
		if (btrfs_file_extent_disk_bytenr(leaf, ei) == disk_byte) {
			clone_root->offset = key.offset;
			if (clone_data_offset < data_offset &&
			    clone_data_offset + ext_len > data_offset) {
				extent_offset = data_offset - clone_data_offset;
				ext_len -= extent_offset;
				clone_data_offset += extent_offset;
				clone_root->offset += extent_offset;
			}
		}

		clone_len = min_t(u64, ext_len, len);

		if (btrfs_file_extent_disk_bytenr(leaf, ei) == disk_byte &&
		    clone_data_offset == data_offset) {
			const u64 src_end = clone_root->offset + clone_len;
			const u64 sectorsize = SZ_64K;

			/*
			 * We can't clone the last block, when its size is not
			 * sector size aligned, into the middle of a file. If we
			 * do so, the receiver will get a failure (-EINVAL) when
			 * trying to clone or will silently corrupt the data in
			 * the destination file if it's on a kernel without the
			 * fix introduced by commit ac765f83f1397646
			 * ("Btrfs: fix data corruption due to cloning of eof
			 * block").
			 *
			 * So issue a clone of the aligned down range plus a
			 * regular write for the eof block, if we hit that case.
			 *
			 * Also, we use the maximum possible sector size, 64K,
			 * because we don't know what's the sector size of the
			 * filesystem that receives the stream, so we have to
			 * assume the largest possible sector size.
			 */
			if (src_end == clone_src_i_size &&
			    !IS_ALIGNED(src_end, sectorsize) &&
			    offset + clone_len < sctx->cur_inode_size) {

				slen = ALIGN_DOWN(src_end - clone_root->offset,
						  sectorsize);

				ret = send_clone(sctx, offset, slen,
						 clone_root);

				ret = send_extent_data(sctx, dst_path,

			ret = send_clone(sctx, offset, clone_len,
					 clone_root);

		} else if (crossed_src_i_size && clone_len < len) {
			/*
			 * If we are at i_size of the clone source inode and we
			 * can not clone from it, terminate the loop. This is
			 * to avoid sending two write operations, one with a
			 * length matching clone_len and the final one after
			 * this loop with a length of len - clone_len.
			 *
			 * When using encoded writes (BTRFS_SEND_FLAG_COMPRESSED
			 * was passed to the send ioctl), this helps avoid
			 * sending an encoded write for an offset that is not
			 * sector size aligned, in case the i_size of the source
			 * inode is not sector size aligned. That will make the
			 * receiver fallback to decompression of the data and
			 * writing it using regular buffered IO, therefore while
			 * not incorrect, it's not optimal due to decompression
			 * and possible re-compression at the receiver.
			 */

			ret = send_extent_data(sctx, dst_path, offset,

		offset += clone_len;
		clone_root->offset += clone_len;

		/*
		 * If we are cloning from the file we are currently processing,
		 * and using the send root as the clone root, we must stop once
		 * the current clone offset reaches the current eof of the file
		 * at the receiver, otherwise we would issue an invalid clone
		 * operation (source range going beyond eof) and cause the
		 * receiver to fail. So if we reach the current eof, bail out
		 * and fallback to a regular write.
		 */
		if (clone_root->root == sctx->send_root &&
		    clone_root->ino == sctx->cur_ino &&
		    clone_root->offset >= sctx->cur_inode_next_write_offset)

		data_offset += clone_len;

	ret = send_extent_data(sctx, dst_path, offset, len);

	btrfs_free_path(path);
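/*
 * Decide whether the file extent item at @key can be sent as a clone
 * operation from @clone_root or must fall back to regular write commands,
 * taking care of the unaligned-end / EOF special cases.
 */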
static int send_write_or_clone(struct send_ctx *sctx,
			       struct btrfs_path *path,
			       struct btrfs_key *key,
			       struct clone_root *clone_root)
{
	u64 offset = key->offset;
	u64 bs = sctx->send_root->fs_info->sectorsize;
	struct btrfs_file_extent_item *ei;
	struct btrfs_inode_info info = { 0 };

	end = min_t(u64, btrfs_file_extent_end(path), sctx->cur_inode_size);

	num_bytes = end - offset;

	if (IS_ALIGNED(end, bs))

	/*
	 * If the extent end is not aligned, we can clone if the extent ends at
	 * the i_size of the inode and the clone range ends at the i_size of the
	 * source inode, otherwise the clone operation fails with -EINVAL.
	 */
	if (end != sctx->cur_inode_size)

	ret = get_inode_info(clone_root->root, clone_root->ino, &info);

	if (clone_root->offset + num_bytes == info.size) {
		/*
		 * The final size of our file matches the end offset, but it may
		 * be that its current size is larger, so we have to truncate it
		 * to any value between the start offset of the range and the
		 * final i_size, otherwise the clone operation is invalid
		 * because it's unaligned and it ends before the current EOF.
		 * We do this truncate to the final i_size when we finish
		 * processing the inode, but it's too late by then. And here we
		 * truncate to the start offset of the range because it's always
		 * sector size aligned while if it were the final i_size it
		 * would result in dirtying part of a page, filling part of a
		 * page with zeroes and then having the clone operation at the
		 * receiver trigger IO and wait for it due to the dirty page.
		 */
		if (sctx->parent_root != NULL) {
			ret = send_truncate(sctx, sctx->cur_ino,
					    sctx->cur_inode_gen, offset);

	ret = send_extent_data(sctx, path, offset, num_bytes);
	sctx->cur_inode_next_write_offset = end;

	ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
			    struct btrfs_file_extent_item);
	disk_byte = btrfs_file_extent_disk_bytenr(path->nodes[0], ei);
	data_offset = btrfs_file_extent_offset(path->nodes[0], ei);
	ret = clone_range(sctx, path, clone_root, disk_byte, data_offset, offset,
			  num_bytes);
	sctx->cur_inode_next_write_offset = end;
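/*
 * Check if the extent item in the send root at @left_path/@ekey refers to the
 * exact same data (same disk extents, offsets and generations) as the
 * overlapping extent items in the parent root. Returns 1 if unchanged, 0 if
 * changed, < 0 on error.
 */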
static int is_extent_unchanged(struct send_ctx *sctx,
			       struct btrfs_path *left_path,
			       struct btrfs_key *ekey)
{
	struct btrfs_key key;
	struct btrfs_path *path = NULL;
	struct extent_buffer *eb;
	struct btrfs_key found_key;
	struct btrfs_file_extent_item *ei;
	u64 left_offset_fixed;

	path = alloc_path_for_send();

	eb = left_path->nodes[0];
	slot = left_path->slots[0];
	ei = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
	left_type = btrfs_file_extent_type(eb, ei);

	if (left_type != BTRFS_FILE_EXTENT_REG) {

	left_disknr = btrfs_file_extent_disk_bytenr(eb, ei);
	left_len = btrfs_file_extent_num_bytes(eb, ei);
	left_offset = btrfs_file_extent_offset(eb, ei);
	left_gen = btrfs_file_extent_generation(eb, ei);

	/*
	 * Following comments will refer to these graphics. L is the left
	 * extents which we are checking at the moment. 1-8 are the right
	 * extents that we iterate.
	 *
	 * |-1-|-2a-|-3-|-4-|-5-|-6-|
	 *
	 * |--1--|-2b-|...(same as above)
	 *
	 * Alternative situation. Happens on files where extents got split.
	 * |-----------7-----------|-6-|
	 *
	 * Alternative situation. Happens on files which got larger.
	 * Nothing follows after 8.
	 */
	key.objectid = ekey->objectid;
	key.type = BTRFS_EXTENT_DATA_KEY;
	key.offset = ekey->offset;
	ret = btrfs_search_slot_for_read(sctx->parent_root, &key, path, 0, 0);

	/*
	 * Handle special case where the right side has no extents at all.
	 */
	eb = path->nodes[0];
	slot = path->slots[0];
	btrfs_item_key_to_cpu(eb, &found_key, slot);
	if (found_key.objectid != key.objectid ||
	    found_key.type != key.type) {
		/* If we're a hole then just pretend nothing changed */
		ret = (left_disknr) ? 0 : 1;
	}

	/*
	 * We're now on 2a, 2b or 7.
	 */
	while (key.offset < ekey->offset + left_len) {
		ei = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
		right_type = btrfs_file_extent_type(eb, ei);
		if (right_type != BTRFS_FILE_EXTENT_REG &&
		    right_type != BTRFS_FILE_EXTENT_INLINE) {

		if (right_type == BTRFS_FILE_EXTENT_INLINE) {
			right_len = btrfs_file_extent_ram_bytes(eb, ei);
			right_len = PAGE_ALIGN(right_len);
		} else {
			right_len = btrfs_file_extent_num_bytes(eb, ei);
		}

		/*
		 * Are we at extent 8? If yes, we know the extent is changed.
		 * This may only happen on the first iteration.
		 */
		if (found_key.offset + right_len <= ekey->offset) {
			/* If we're a hole just pretend nothing changed */
			ret = (left_disknr) ? 0 : 1;
		}

		/*
		 * We just wanted to see if when we have an inline extent, what
		 * follows it is a regular extent (wanted to check the above
		 * condition for inline extents too). This should normally not
		 * happen but it's possible for example when we have an inline
		 * compressed extent representing data with a size matching
		 * the page size (currently the same as sector size).
		 */
		if (right_type == BTRFS_FILE_EXTENT_INLINE) {

		right_disknr = btrfs_file_extent_disk_bytenr(eb, ei);
		right_offset = btrfs_file_extent_offset(eb, ei);
		right_gen = btrfs_file_extent_generation(eb, ei);

		left_offset_fixed = left_offset;
		if (key.offset < ekey->offset) {
			/* Fix the right offset for 2a and 7. */
			right_offset += ekey->offset - key.offset;
		} else {
			/* Fix the left offset for all behind 2a and 2b */
			left_offset_fixed += key.offset - ekey->offset;
		}

		/*
		 * Check if we have the same extent.
		 */
		if (left_disknr != right_disknr ||
		    left_offset_fixed != right_offset ||
		    left_gen != right_gen) {

		/*
		 * Go to the next extent.
		 */
		ret = btrfs_next_item(sctx->parent_root, path);

		eb = path->nodes[0];
		slot = path->slots[0];
		btrfs_item_key_to_cpu(eb, &found_key, slot);

		if (ret || found_key.objectid != key.objectid ||
		    found_key.type != key.type) {
			key.offset += right_len;

		if (found_key.offset != key.offset + right_len) {

	/*
	 * We're now behind the left extent (treat as unchanged) or at the end
	 * of the right side (treat as changed).
	 */
	if (key.offset >= ekey->offset + left_len)

	btrfs_free_path(path);
static int get_last_extent(struct send_ctx *sctx, u64 offset)
{
	struct btrfs_path *path;
	struct btrfs_root *root = sctx->send_root;
	struct btrfs_key key;

	path = alloc_path_for_send();

	sctx->cur_inode_last_extent = 0;

	key.objectid = sctx->cur_ino;
	key.type = BTRFS_EXTENT_DATA_KEY;
	key.offset = offset;
	ret = btrfs_search_slot_for_read(root, &key, path, 0, 1);

	btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
	if (key.objectid != sctx->cur_ino || key.type != BTRFS_EXTENT_DATA_KEY)

	sctx->cur_inode_last_extent = btrfs_file_extent_end(path);

	btrfs_free_path(path);
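/*
 * Check if the range [start, end) of the current inode maps to a hole (no
 * extents, or only extents with a zero disk_bytenr) in the parent snapshot.
 * Returns 1 if it is a hole, 0 if not, < 0 on error.
 */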
static int range_is_hole_in_parent(struct send_ctx *sctx,
				   const u64 start, const u64 end)
{
	struct btrfs_path *path;
	struct btrfs_key key;
	struct btrfs_root *root = sctx->parent_root;
	u64 search_start = start;

	path = alloc_path_for_send();

	key.objectid = sctx->cur_ino;
	key.type = BTRFS_EXTENT_DATA_KEY;
	key.offset = search_start;
	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);

	if (ret > 0 && path->slots[0] > 0)

	while (search_start < end) {
		struct extent_buffer *leaf = path->nodes[0];
		int slot = path->slots[0];
		struct btrfs_file_extent_item *fi;

		if (slot >= btrfs_header_nritems(leaf)) {
			ret = btrfs_next_leaf(root, path);
		}

		btrfs_item_key_to_cpu(leaf, &key, slot);
		if (key.objectid < sctx->cur_ino ||
		    key.type < BTRFS_EXTENT_DATA_KEY)

		if (key.objectid > sctx->cur_ino ||
		    key.type > BTRFS_EXTENT_DATA_KEY ||

		fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
		extent_end = btrfs_file_extent_end(path);
		if (extent_end <= start)

		if (btrfs_file_extent_disk_bytenr(leaf, fi) == 0) {
			search_start = extent_end;
		}
	}

	btrfs_free_path(path);
static int maybe_send_hole(struct send_ctx *sctx, struct btrfs_path *path,
			   struct btrfs_key *key)
{
	if (sctx->cur_ino != key->objectid || !need_send_hole(sctx))

	/*
	 * Get last extent's end offset (exclusive) if we haven't determined it
	 * yet (we're processing the first file extent item that is new), or if
	 * we're at the first slot of a leaf and the last extent's end is less
	 * than the current extent's offset, because we might have skipped
	 * entire leaves that contained only file extent items for our current
	 * inode. These leaves have a generation number smaller (older) than the
	 * one in the current leaf and the leaf our last extent came from, and
	 * are located between these 2 leaves.
	 */
	if ((sctx->cur_inode_last_extent == (u64)-1) ||
	    (path->slots[0] == 0 && sctx->cur_inode_last_extent < key->offset)) {
		ret = get_last_extent(sctx, key->offset - 1);
	}

	if (sctx->cur_inode_last_extent < key->offset) {
		ret = range_is_hole_in_parent(sctx,
					      sctx->cur_inode_last_extent,

		ret = send_hole(sctx, key->offset);
	}

	sctx->cur_inode_last_extent = btrfs_file_extent_end(path);
static int process_extent(struct send_ctx *sctx,
			  struct btrfs_path *path,
			  struct btrfs_key *key)
{
	struct clone_root *found_clone = NULL;

	if (S_ISLNK(sctx->cur_inode_mode))

	if (sctx->parent_root && !sctx->cur_inode_new) {
		ret = is_extent_unchanged(sctx, path, key);
	} else {
		struct btrfs_file_extent_item *ei;

		ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
				    struct btrfs_file_extent_item);
		type = btrfs_file_extent_type(path->nodes[0], ei);
		if (type == BTRFS_FILE_EXTENT_PREALLOC ||
		    type == BTRFS_FILE_EXTENT_REG) {
			/*
			 * The send spec does not have a prealloc command yet,
			 * so just leave a hole for prealloc'ed extents until
			 * we have enough commands queued up to justify rev'ing
			 * the send spec.
			 */
			if (type == BTRFS_FILE_EXTENT_PREALLOC) {

			/* Have a hole, just skip it. */
			if (btrfs_file_extent_disk_bytenr(path->nodes[0], ei) == 0) {
		}
	}

	ret = find_extent_clone(sctx, path, key->objectid, key->offset,
				sctx->cur_inode_size, &found_clone);
	if (ret != -ENOENT && ret < 0)

	ret = send_write_or_clone(sctx, path, key, found_clone);

	ret = maybe_send_hole(sctx, path, key);
static int process_all_extents(struct send_ctx *sctx)
{
	struct btrfs_root *root;
	struct btrfs_path *path;
	struct btrfs_key key;
	struct btrfs_key found_key;

	root = sctx->send_root;
	path = alloc_path_for_send();

	key.objectid = sctx->cmp_key->objectid;
	key.type = BTRFS_EXTENT_DATA_KEY;

	btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) {
		if (found_key.objectid != key.objectid ||
		    found_key.type != key.type) {

		ret = process_extent(sctx, path, &found_key);
	}
	/* Catch error found during iteration */

	btrfs_free_path(path);
static int process_recorded_refs_if_needed(struct send_ctx *sctx, int at_end,
					   int *pending_move,
					   int *refs_processed)
{
	if (sctx->cur_ino == 0)

	if (!at_end && sctx->cur_ino == sctx->cmp_key->objectid &&
	    sctx->cmp_key->type <= BTRFS_INODE_EXTREF_KEY)

	if (list_empty(&sctx->new_refs) && list_empty(&sctx->deleted_refs))

	ret = process_recorded_refs(sctx, pending_move);

	*refs_processed = 1;
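/*
 * Emit all the per-inode commands that were deferred until the whole inode was
 * processed: pending renames/moves and, when needed, truncate, chown, chmod,
 * fileattr, verity, capabilities and utimes updates for the current inode.
 */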
static int finish_inode_if_needed(struct send_ctx *sctx, int at_end)
{
	struct btrfs_inode_info info;
	bool need_fileattr = false;
	int need_truncate = 1;
	int pending_move = 0;
	int refs_processed = 0;

	if (sctx->ignore_cur_inode)

	ret = process_recorded_refs_if_needed(sctx, at_end, &pending_move,
					      &refs_processed);

	/*
	 * We have processed the refs and thus need to advance send_progress.
	 * Now, calls to get_cur_xxx will take the updated refs of the current
	 * inode into account.
	 *
	 * On the other hand, if our current inode is a directory and couldn't
	 * be moved/renamed because its parent was renamed/moved too and it has
	 * a higher inode number, we can only move/rename our current inode
	 * after we moved/renamed its parent. Therefore in this case operate on
	 * the old path (pre move/rename) of our current inode, and the
	 * move/rename will be performed later.
	 */
	if (refs_processed && !pending_move)
		sctx->send_progress = sctx->cur_ino + 1;

	if (sctx->cur_ino == 0 || sctx->cur_inode_deleted)

	if (!at_end && sctx->cmp_key->objectid == sctx->cur_ino)

	ret = get_inode_info(sctx->send_root, sctx->cur_ino, &info);

	left_mode = info.mode;
	left_uid = info.uid;
	left_gid = info.gid;
	left_fileattr = info.fileattr;

	if (!sctx->parent_root || sctx->cur_inode_new) {
		if (!S_ISLNK(sctx->cur_inode_mode))
		if (sctx->cur_inode_next_write_offset == sctx->cur_inode_size)
	} else {
		ret = get_inode_info(sctx->parent_root, sctx->cur_ino, &info);

		old_size = info.size;
		right_mode = info.mode;
		right_uid = info.uid;
		right_gid = info.gid;
		right_fileattr = info.fileattr;

		if (left_uid != right_uid || left_gid != right_gid)
		if (!S_ISLNK(sctx->cur_inode_mode) && left_mode != right_mode)
		if (!S_ISLNK(sctx->cur_inode_mode) && left_fileattr != right_fileattr)
			need_fileattr = true;
		if ((old_size == sctx->cur_inode_size) ||
		    (sctx->cur_inode_size > old_size &&
		     sctx->cur_inode_next_write_offset == sctx->cur_inode_size))
	}

	if (S_ISREG(sctx->cur_inode_mode)) {
		if (need_send_hole(sctx)) {
			if (sctx->cur_inode_last_extent == (u64)-1 ||
			    sctx->cur_inode_last_extent <
			    sctx->cur_inode_size) {
				ret = get_last_extent(sctx, (u64)-1);
			}
			if (sctx->cur_inode_last_extent < sctx->cur_inode_size) {
				ret = range_is_hole_in_parent(sctx,
							sctx->cur_inode_last_extent,
							sctx->cur_inode_size);
				} else if (ret == 0) {
					ret = send_hole(sctx, sctx->cur_inode_size);

					/* Range is already a hole, skip. */
			}
		}
		if (need_truncate) {
			ret = send_truncate(sctx, sctx->cur_ino,
					    sctx->cur_inode_gen,
					    sctx->cur_inode_size);
		}
	}

	ret = send_chown(sctx, sctx->cur_ino, sctx->cur_inode_gen,
			 left_uid, left_gid);

	ret = send_chmod(sctx, sctx->cur_ino, sctx->cur_inode_gen,

	if (need_fileattr) {
		ret = send_fileattr(sctx, sctx->cur_ino, sctx->cur_inode_gen,
	}

	if (proto_cmd_ok(sctx, BTRFS_SEND_C_ENABLE_VERITY)
	    && sctx->cur_inode_needs_verity) {
		ret = process_verity(sctx);
	}

	ret = send_capabilities(sctx);

	/*
	 * If other directory inodes depended on our current directory
	 * inode's move/rename, now do their move/rename operations.
	 */
	if (!is_waiting_for_move(sctx, sctx->cur_ino)) {
		ret = apply_children_dir_moves(sctx);

		/*
		 * Need to send that every time, no matter if it actually
		 * changed between the two trees as we have done changes to
		 * the inode before. If our inode is a directory and it's
		 * waiting to be moved/renamed, we will send its utimes when
		 * it's moved/renamed, therefore we don't need to do it here.
		 */
		sctx->send_progress = sctx->cur_ino + 1;

		/*
		 * If the current inode is a non-empty directory, delay issuing
		 * the utimes command for it, as it's very likely we have inodes
		 * with a higher number inside it. We want to issue the utimes
		 * command only after adding all dentries to it.
		 */
		if (S_ISDIR(sctx->cur_inode_mode) && sctx->cur_inode_size > 0)
			ret = cache_dir_utimes(sctx, sctx->cur_ino, sctx->cur_inode_gen);
		else
			ret = send_utimes(sctx, sctx->cur_ino, sctx->cur_inode_gen);
	}

	ret = trim_dir_utimes_cache(sctx);
static void close_current_inode(struct send_ctx *sctx)
{
	if (sctx->cur_inode == NULL)

	i_size = i_size_read(sctx->cur_inode);

	/*
	 * If we are doing an incremental send, we may have extents between the
	 * last processed extent and the i_size that have not been processed
	 * because they haven't changed but we may have read some of their pages
	 * through readahead, see the comments at send_extent_data().
	 */
	if (sctx->clean_page_cache && sctx->page_cache_clear_start < i_size)
		truncate_inode_pages_range(&sctx->cur_inode->i_data,
					   sctx->page_cache_clear_start,
					   round_up(i_size, PAGE_SIZE) - 1);

	iput(sctx->cur_inode);
	sctx->cur_inode = NULL;
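/*
 * Called for every changed inode item reported by the tree comparison. It
 * resets the per-inode state in sctx, detects deleted+recreated inodes (new
 * generation) and orphans (nlink == 0), and starts processing of the inode.
 */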
static int changed_inode(struct send_ctx *sctx,
			 enum btrfs_compare_tree_result result)
{
	struct btrfs_key *key = sctx->cmp_key;
	struct btrfs_inode_item *left_ii = NULL;
	struct btrfs_inode_item *right_ii = NULL;

	close_current_inode(sctx);

	sctx->cur_ino = key->objectid;
	sctx->cur_inode_new_gen = false;
	sctx->cur_inode_last_extent = (u64)-1;
	sctx->cur_inode_next_write_offset = 0;
	sctx->ignore_cur_inode = false;
	fs_path_reset(&sctx->cur_inode_path);

	/*
	 * Set send_progress to current inode. This will tell all get_cur_xxx
	 * functions that the current inode's refs are not updated yet. Later,
	 * when process_recorded_refs is finished, it is set to cur_ino + 1.
	 */
	sctx->send_progress = sctx->cur_ino;

	if (result == BTRFS_COMPARE_TREE_NEW ||
	    result == BTRFS_COMPARE_TREE_CHANGED) {
		left_ii = btrfs_item_ptr(sctx->left_path->nodes[0],
					 sctx->left_path->slots[0],
					 struct btrfs_inode_item);
		left_gen = btrfs_inode_generation(sctx->left_path->nodes[0],
	} else {
		right_ii = btrfs_item_ptr(sctx->right_path->nodes[0],
					  sctx->right_path->slots[0],
					  struct btrfs_inode_item);
		right_gen = btrfs_inode_generation(sctx->right_path->nodes[0],
	}
	if (result == BTRFS_COMPARE_TREE_CHANGED) {
		right_ii = btrfs_item_ptr(sctx->right_path->nodes[0],
					  sctx->right_path->slots[0],
					  struct btrfs_inode_item);

		right_gen = btrfs_inode_generation(sctx->right_path->nodes[0],

		/*
		 * The cur_ino = root dir case is special here. We can't treat
		 * the inode as deleted+reused because it would generate a
		 * stream that tries to delete/mkdir the root dir.
		 */
		if (left_gen != right_gen &&
		    sctx->cur_ino != BTRFS_FIRST_FREE_OBJECTID)
			sctx->cur_inode_new_gen = true;
	}

	/*
	 * Normally we do not find inodes with a link count of zero (orphans)
	 * because the most common case is to create a snapshot and use it
	 * for a send operation. However other less common use cases involve
	 * using a subvolume and sending it after turning it to RO mode just
	 * after deleting all hard links of a file while holding an open
	 * file descriptor against it, or turning a RO snapshot into RW mode,
	 * keeping an open file descriptor against a file, deleting it and then
	 * turning the snapshot back to RO mode before using it for a send
	 * operation. The former is what the receiver operation does.
	 * Therefore, if we want to send these snapshots soon after they're
	 * received, we need to handle orphan inodes as well. Moreover, orphans
	 * can appear not only in the send snapshot but also in the parent
	 * snapshot. Here are several cases:
	 *
	 * Case 1: BTRFS_COMPARE_TREE_NEW
	 *       |  send snapshot  | action
	 * --------------------------------
	 * nlink |        0        | ignore
	 *
	 * Case 2: BTRFS_COMPARE_TREE_DELETED
	 *       | parent snapshot | action
	 * ----------------------------------
	 * nlink |        0        | as usual
	 * Note: No unlinks will be sent because there're no paths for it.
	 *
	 * Case 3: BTRFS_COMPARE_TREE_CHANGED
	 *           |       | parent snapshot | send snapshot | action
	 * -----------------------------------------------------------------------
	 * subcase 1 | nlink |        0        |       0       | ignore
	 * subcase 2 | nlink |       >0        |       0       | new_gen(deletion)
	 * subcase 3 | nlink |        0        |      >0       | new_gen(creation)
	 */
	if (result == BTRFS_COMPARE_TREE_NEW) {
		if (btrfs_inode_nlink(sctx->left_path->nodes[0], left_ii) == 0) {
			sctx->ignore_cur_inode = true;
		}
		sctx->cur_inode_gen = left_gen;
		sctx->cur_inode_new = true;
		sctx->cur_inode_deleted = false;
		sctx->cur_inode_size = btrfs_inode_size(
				sctx->left_path->nodes[0], left_ii);
		sctx->cur_inode_mode = btrfs_inode_mode(
				sctx->left_path->nodes[0], left_ii);
		sctx->cur_inode_rdev = btrfs_inode_rdev(
				sctx->left_path->nodes[0], left_ii);
		if (sctx->cur_ino != BTRFS_FIRST_FREE_OBJECTID)
			ret = send_create_inode_if_needed(sctx);
	} else if (result == BTRFS_COMPARE_TREE_DELETED) {
		sctx->cur_inode_gen = right_gen;
		sctx->cur_inode_new = false;
		sctx->cur_inode_deleted = true;
		sctx->cur_inode_size = btrfs_inode_size(
				sctx->right_path->nodes[0], right_ii);
		sctx->cur_inode_mode = btrfs_inode_mode(
				sctx->right_path->nodes[0], right_ii);
	} else if (result == BTRFS_COMPARE_TREE_CHANGED) {
		u32 new_nlinks, old_nlinks;

		new_nlinks = btrfs_inode_nlink(sctx->left_path->nodes[0], left_ii);
		old_nlinks = btrfs_inode_nlink(sctx->right_path->nodes[0], right_ii);
		if (new_nlinks == 0 && old_nlinks == 0) {
			sctx->ignore_cur_inode = true;
		} else if (new_nlinks == 0 || old_nlinks == 0) {
			sctx->cur_inode_new_gen = 1;
		}

		/*
		 * We need to do some special handling in case the inode was
		 * reported as changed with a changed generation number. This
		 * means that the original inode was deleted and new inode
		 * reused the same inum. So we have to treat the old inode as
		 * deleted and the new one as new.
		 */
		if (sctx->cur_inode_new_gen) {
			/*
			 * First, process the inode as if it was deleted.
			 */
			if (old_nlinks > 0) {
				sctx->cur_inode_gen = right_gen;
				sctx->cur_inode_new = false;
				sctx->cur_inode_deleted = true;
				sctx->cur_inode_size = btrfs_inode_size(
						sctx->right_path->nodes[0], right_ii);
				sctx->cur_inode_mode = btrfs_inode_mode(
						sctx->right_path->nodes[0], right_ii);
				ret = process_all_refs(sctx,
						BTRFS_COMPARE_TREE_DELETED);
			}

			/*
			 * Now process the inode as if it was new.
			 */
			if (new_nlinks > 0) {
				sctx->cur_inode_gen = left_gen;
				sctx->cur_inode_new = true;
				sctx->cur_inode_deleted = false;
				sctx->cur_inode_size = btrfs_inode_size(
						sctx->left_path->nodes[0],
				sctx->cur_inode_mode = btrfs_inode_mode(
						sctx->left_path->nodes[0],
				sctx->cur_inode_rdev = btrfs_inode_rdev(
						sctx->left_path->nodes[0],
				ret = send_create_inode_if_needed(sctx);

				ret = process_all_refs(sctx, BTRFS_COMPARE_TREE_NEW);

				/*
				 * Advance send_progress now as we did not get
				 * into process_recorded_refs_if_needed in the
				 * new_gen case.
				 */
				sctx->send_progress = sctx->cur_ino + 1;

				/*
				 * Now process all extents and xattrs of the
				 * inode as if they were all new.
				 */
				ret = process_all_extents(sctx);

				ret = process_all_new_xattrs(sctx);
			}
		} else {
			sctx->cur_inode_gen = left_gen;
			sctx->cur_inode_new = false;
			sctx->cur_inode_new_gen = false;
			sctx->cur_inode_deleted = false;
			sctx->cur_inode_size = btrfs_inode_size(
					sctx->left_path->nodes[0], left_ii);
			sctx->cur_inode_mode = btrfs_inode_mode(
					sctx->left_path->nodes[0], left_ii);
		}
	}
/*
 * We have to process new refs before deleted refs, but compare_trees gives us
 * the new and deleted refs mixed. To fix this, we record the new/deleted refs
 * first and later process them in process_recorded_refs.
 * For the cur_inode_new_gen case, we skip recording completely because
 * changed_inode did already initiate processing of refs. The reason for this is
 * that in this case, compare_tree actually compares the refs of 2 different
 * inodes. To fix this, process_all_refs is used in changed_inode to handle all
 * refs of the right tree as deleted and all refs of the left tree as new.
 */
static int changed_ref(struct send_ctx *sctx,
		       enum btrfs_compare_tree_result result)
{
	if (sctx->cur_ino != sctx->cmp_key->objectid) {
		inconsistent_snapshot_error(sctx, result, "reference");
	}

	if (!sctx->cur_inode_new_gen &&
	    sctx->cur_ino != BTRFS_FIRST_FREE_OBJECTID) {
		if (result == BTRFS_COMPARE_TREE_NEW)
			ret = record_new_ref(sctx);
		else if (result == BTRFS_COMPARE_TREE_DELETED)
			ret = record_deleted_ref(sctx);
		else if (result == BTRFS_COMPARE_TREE_CHANGED)
			ret = record_changed_ref(sctx);
	}
/*
 * Process new/deleted/changed xattrs. We skip processing in the
 * cur_inode_new_gen case because changed_inode did already initiate processing
 * of xattrs. The reason is the same as in changed_ref.
 */
static int changed_xattr(struct send_ctx *sctx,
			 enum btrfs_compare_tree_result result)
{
	if (sctx->cur_ino != sctx->cmp_key->objectid) {
		inconsistent_snapshot_error(sctx, result, "xattr");
	}

	if (!sctx->cur_inode_new_gen && !sctx->cur_inode_deleted) {
		if (result == BTRFS_COMPARE_TREE_NEW)
			ret = process_new_xattr(sctx);
		else if (result == BTRFS_COMPARE_TREE_DELETED)
			ret = process_deleted_xattr(sctx);
		else if (result == BTRFS_COMPARE_TREE_CHANGED)
			ret = process_changed_xattr(sctx);
	}
/*
 * Process new/deleted/changed extents. We skip processing in the
 * cur_inode_new_gen case because changed_inode did already initiate processing
 * of extents. The reason is the same as in changed_ref.
 */
static int changed_extent(struct send_ctx *sctx,
			  enum btrfs_compare_tree_result result)
{
	/*
	 * We have found an extent item that changed without the inode item
	 * having changed. This can happen either after relocation (where the
	 * disk_bytenr of an extent item is replaced at
	 * relocation.c:replace_file_extents()) or after deduplication into a
	 * file in both the parent and send snapshots (where an extent item can
	 * get modified or replaced with a new one). Note that deduplication
	 * updates the inode item, but it only changes the iversion (sequence
	 * field in the inode item) of the inode, so if a file is deduplicated
	 * the same amount of times in both the parent and send snapshots, its
	 * iversion becomes the same in both snapshots, whence the inode item is
	 * the same on both snapshots.
	 */
	if (sctx->cur_ino != sctx->cmp_key->objectid)

	if (!sctx->cur_inode_new_gen && !sctx->cur_inode_deleted) {
		if (result != BTRFS_COMPARE_TREE_DELETED)
			ret = process_extent(sctx, sctx->left_path,
static int changed_verity(struct send_ctx *sctx, enum btrfs_compare_tree_result result)
{
	if (!sctx->cur_inode_new_gen && !sctx->cur_inode_deleted) {
		if (result == BTRFS_COMPARE_TREE_NEW)
			sctx->cur_inode_needs_verity = true;
	}
static int dir_changed(struct send_ctx *sctx, u64 dir)
{
	u64 orig_gen, new_gen;

	ret = get_inode_gen(sctx->send_root, dir, &new_gen);

	ret = get_inode_gen(sctx->parent_root, dir, &orig_gen);

	return (orig_gen != new_gen) ? 1 : 0;
}
static int compare_refs(struct send_ctx *sctx, struct btrfs_path *path,
			struct btrfs_key *key)
{
	struct btrfs_inode_extref *extref;
	struct extent_buffer *leaf;
	u64 dirid = 0, last_dirid = 0;

	/* Easy case, just check this one dirid */
	if (key->type == BTRFS_INODE_REF_KEY) {
		dirid = key->offset;

		ret = dir_changed(sctx, dirid);
	}

	leaf = path->nodes[0];
	item_size = btrfs_item_size(leaf, path->slots[0]);
	ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
	while (cur_offset < item_size) {
		extref = (struct btrfs_inode_extref *)(ptr +
						       cur_offset);
		dirid = btrfs_inode_extref_parent(leaf, extref);
		ref_name_len = btrfs_inode_extref_name_len(leaf, extref);
		cur_offset += ref_name_len + sizeof(*extref);
		if (dirid == last_dirid)

		ret = dir_changed(sctx, dirid);
	}
/*
 * Updates compare related fields in sctx and simply forwards to the actual
 * changed_xxx functions.
 */
static int changed_cb(struct btrfs_path *left_path,
		      struct btrfs_path *right_path,
		      struct btrfs_key *key,
		      enum btrfs_compare_tree_result result,
		      struct send_ctx *sctx)
{
	/*
	 * We can not hold the commit root semaphore here. This is because
	 * sending and receiving to the same filesystem, using a pipe, could
	 * result in a deadlock:
	 *
	 * 1) The task running send blocks on the pipe because it's full;
	 *
	 * 2) The task running receive, which is the only consumer of the pipe,
	 *    is waiting for a transaction commit (for example due to a space
	 *    reservation when doing a write or triggering a transaction commit
	 *    when creating a subvolume);
	 *
	 * 3) The transaction is waiting to write lock the commit root semaphore,
	 *    but can not acquire it since it's being held at 1).
	 *
	 * Down this call chain we write to the pipe through kernel_write().
	 * The same type of problem can also happen when sending to a file that
	 * is stored in the same filesystem - when reserving space for a write
	 * into the file, we can trigger a transaction commit.
	 *
	 * Our caller has supplied us with clones of leaves from the send and
	 * parent roots, so we're safe here from a concurrent relocation and
	 * further reallocation of metadata extents while we are here. Below we
	 * also assert that the leaves are clones.
	 */
	lockdep_assert_not_held(&sctx->send_root->fs_info->commit_root_sem);

	/*
	 * We always have a send root, so left_path is never NULL. We will not
	 * have a leaf when we have reached the end of the send root but have
	 * not yet reached the end of the parent root.
	 */
	if (left_path->nodes[0])
		ASSERT(test_bit(EXTENT_BUFFER_UNMAPPED,
				&left_path->nodes[0]->bflags));
	/*
	 * When doing a full send we don't have a parent root, so right_path is
	 * NULL. When doing an incremental send, we may have reached the end of
	 * the parent root already, so we don't have a leaf at right_path.
	 */
	if (right_path && right_path->nodes[0])
		ASSERT(test_bit(EXTENT_BUFFER_UNMAPPED,
				&right_path->nodes[0]->bflags));

	if (result == BTRFS_COMPARE_TREE_SAME) {
		if (key->type == BTRFS_INODE_REF_KEY ||
		    key->type == BTRFS_INODE_EXTREF_KEY) {
			ret = compare_refs(sctx, left_path, key);
		} else if (key->type == BTRFS_EXTENT_DATA_KEY) {
			return maybe_send_hole(sctx, left_path, key);
		}
		result = BTRFS_COMPARE_TREE_CHANGED;
	}

	sctx->left_path = left_path;
	sctx->right_path = right_path;
	sctx->cmp_key = key;

	ret = finish_inode_if_needed(sctx, 0);

	/* Ignore non-FS objects */
	if (key->objectid == BTRFS_FREE_INO_OBJECTID ||
	    key->objectid == BTRFS_FREE_SPACE_OBJECTID)

	if (key->type == BTRFS_INODE_ITEM_KEY) {
		ret = changed_inode(sctx, result);
	} else if (!sctx->ignore_cur_inode) {
		if (key->type == BTRFS_INODE_REF_KEY ||
		    key->type == BTRFS_INODE_EXTREF_KEY)
			ret = changed_ref(sctx, result);
		else if (key->type == BTRFS_XATTR_ITEM_KEY)
			ret = changed_xattr(sctx, result);
		else if (key->type == BTRFS_EXTENT_DATA_KEY)
			ret = changed_extent(sctx, result);
		else if (key->type == BTRFS_VERITY_DESC_ITEM_KEY &&
			ret = changed_verity(sctx, result);
	}
static int search_key_again(const struct send_ctx *sctx,
			    struct btrfs_root *root,
			    struct btrfs_path *path,
			    const struct btrfs_key *key)
{
	if (!path->need_commit_sem)
		lockdep_assert_held_read(&root->fs_info->commit_root_sem);

	/*
	 * Roots used for send operations are readonly and no one can add,
	 * update or remove keys from them, so we should be able to find our
	 * key again. The only exception is deduplication, which can operate on
	 * readonly roots and add, update or remove keys to/from them - but at
	 * the moment we don't allow it to run in parallel with send.
	 */
	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);

	btrfs_print_tree(path->nodes[path->lowest_level], false);
	btrfs_err(root->fs_info,
		  "send: key (%llu %u %llu) not found in %s root %llu, lowest_level %d, slot %d",
		  key->objectid, key->type, key->offset,
		  (root == sctx->parent_root ? "parent" : "send"),
		  btrfs_root_id(root), path->lowest_level,
		  path->slots[path->lowest_level]);
static int full_send_tree(struct send_ctx *sctx)
{
	int ret;
	struct btrfs_root *send_root = sctx->send_root;
	struct btrfs_key key;
	struct btrfs_fs_info *fs_info = send_root->fs_info;
	struct btrfs_path *path;

	path = alloc_path_for_send();
	if (!path)
		return -ENOMEM;
	path->reada = READA_FORWARD_ALWAYS;

	key.objectid = BTRFS_FIRST_FREE_OBJECTID;
	key.type = BTRFS_INODE_ITEM_KEY;
	key.offset = 0;

	down_read(&fs_info->commit_root_sem);
	sctx->last_reloc_trans = fs_info->last_reloc_trans;
	up_read(&fs_info->commit_root_sem);

	ret = btrfs_search_slot_for_read(send_root, &key, path, 1, 0);
	if (ret < 0)
		goto out;
	if (ret)
		goto out_finish;

	while (1) {
		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);

		ret = changed_cb(path, NULL, &key,
				 BTRFS_COMPARE_TREE_NEW, sctx);
		if (ret < 0)
			goto out;

		down_read(&fs_info->commit_root_sem);
		if (fs_info->last_reloc_trans > sctx->last_reloc_trans) {
			sctx->last_reloc_trans = fs_info->last_reloc_trans;
			up_read(&fs_info->commit_root_sem);
			/*
			 * A transaction used for relocating a block group was
			 * committed or is about to finish its commit. Release
			 * our path (leaf) and restart the search, so that we
			 * avoid operating on any file extent items that are
			 * stale, with a disk_bytenr that reflects a pre
			 * relocation value. This way we avoid as much as
			 * possible to fallback to regular writes when checking
			 * if we can clone file ranges.
			 */
			btrfs_release_path(path);
			ret = search_key_again(sctx, send_root, path, &key);
			if (ret < 0)
				goto out;
		} else {
			up_read(&fs_info->commit_root_sem);
		}

		ret = btrfs_next_item(send_root, path);
		if (ret < 0)
			goto out;
		if (ret) {
			ret = 0;
			break;
		}
	}

out_finish:
	ret = finish_inode_if_needed(sctx, 1);

out:
	btrfs_free_path(path);
	return ret;
}
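
/*
 * Replace the extent buffer at @level in @path with a private clone, so that
 * it can keep being used after the commit root semaphore is released, without
 * worrying about concurrent relocation reallocating the original buffer.
 */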
static int replace_node_with_clone(struct btrfs_path *path, int level)
{
	struct extent_buffer *clone;

	clone = btrfs_clone_extent_buffer(path->nodes[level]);
	if (!clone)
		return -ENOMEM;

	free_extent_buffer(path->nodes[level]);
	path->nodes[level] = clone;

	return 0;
}
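
/*
 * Descend one level in the tree: read the child pointed to by the current
 * slot, trigger readahead for the siblings that follow it, and when the child
 * is a leaf replace it with a clone (see replace_node_with_clone()).
 */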
static int tree_move_down(struct btrfs_path *path, int *level, u64 reada_min_gen)
{
	struct extent_buffer *eb;
	struct extent_buffer *parent = path->nodes[*level];
	int slot = path->slots[*level];
	const int nritems = btrfs_header_nritems(parent);
	u64 reada_max;
	u64 reada_done = 0;

	lockdep_assert_held_read(&parent->fs_info->commit_root_sem);
	ASSERT(*level != 0);

	eb = btrfs_read_node_slot(parent, slot);
	if (IS_ERR(eb))
		return PTR_ERR(eb);

	/*
	 * Trigger readahead for the next leaves we will process, so that it is
	 * very likely that when we need them they are already in memory and we
	 * will not block on disk IO. For nodes we only do readahead for one,
	 * since the time window between processing nodes is typically larger.
	 */
	reada_max = (*level == 1 ? SZ_128K : eb->fs_info->nodesize);

	for (slot++; slot < nritems && reada_done < reada_max; slot++) {
		if (btrfs_node_ptr_generation(parent, slot) > reada_min_gen) {
			btrfs_readahead_node_child(parent, slot);
			reada_done += eb->fs_info->nodesize;
		}
	}

	path->nodes[*level - 1] = eb;
	path->slots[*level - 1] = 0;
	(*level)--;

	if (*level == 0)
		return replace_node_with_clone(path, 0);

	return 0;
}
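
/*
 * Move to the next slot at the current level, walking up as many levels as
 * needed when the end of a node is reached. Returns 1 if it had to move up,
 * 0 if it only moved to the next slot, and -1 if the end of the root node was
 * reached.
 */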
static int tree_move_next_or_upnext(struct btrfs_path *path,
				    int *level, int root_level)
{
	int ret = 0;
	int nritems;

	nritems = btrfs_header_nritems(path->nodes[*level]);

	path->slots[*level]++;

	while (path->slots[*level] >= nritems) {
		if (*level == root_level) {
			path->slots[*level] = nritems - 1;
			return -1;
		}

		/* move upnext */
		path->slots[*level] = 0;
		free_extent_buffer(path->nodes[*level]);
		path->nodes[*level] = NULL;
		(*level)++;
		path->slots[*level]++;

		nritems = btrfs_header_nritems(path->nodes[*level]);
		ret = 1;
	}
	return ret;
}

/*
 * Returns 1 if it had to move up and next. 0 is returned if it moved only next
 * or down.
 */
static int tree_advance(struct btrfs_path *path,
			int *level, int root_level,
			int allow_down,
			struct btrfs_key *key,
			u64 reada_min_gen)
{
	int ret;

	if (*level == 0 || !allow_down) {
		ret = tree_move_next_or_upnext(path, level, root_level);
	} else {
		ret = tree_move_down(path, level, reada_min_gen);
	}

	/*
	 * Even if we have reached the end of a tree, ret is -1, update the key
	 * anyway, so that in case we need to restart due to a block group
	 * relocation, we can assert that the last key of the root node still
	 * exists in the tree.
	 */
	if (*level == 0)
		btrfs_item_key_to_cpu(path->nodes[*level], key,
				      path->slots[*level]);
	else
		btrfs_node_key_to_cpu(path->nodes[*level], key,
				      path->slots[*level]);

	return ret;
}
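
/*
 * Compare the items at the current leaf slots of two paths. Returns 0 if the
 * items have the same size and content, and 1 otherwise. @tmp_buf must be
 * large enough to hold a full tree block, as it is used to copy out the left
 * item before comparing it against the right one.
 */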
static int tree_compare_item(struct btrfs_path *left_path,
			     struct btrfs_path *right_path,
			     char *tmp_buf)
{
	int cmp;
	int len1, len2;
	unsigned long off1, off2;

	len1 = btrfs_item_size(left_path->nodes[0], left_path->slots[0]);
	len2 = btrfs_item_size(right_path->nodes[0], right_path->slots[0]);
	if (len1 != len2)
		return 1;

	off1 = btrfs_item_ptr_offset(left_path->nodes[0], left_path->slots[0]);
	off2 = btrfs_item_ptr_offset(right_path->nodes[0],
				     right_path->slots[0]);

	read_extent_buffer(left_path->nodes[0], tmp_buf, off1, len1);

	cmp = memcmp_extent_buffer(right_path->nodes[0], tmp_buf, off2, len1);
	if (cmp)
		return 1;
	return 0;
}

/*
 * A transaction used for relocating a block group was committed or is about to
 * finish its commit. Release our paths and restart the search, so that we are
 * not using stale extent buffers:
 *
 * 1) For levels > 0, we are only holding references of extent buffers, without
 *    any locks on them, which does not prevent them from having been relocated
 *    and reallocated after the last time we released the commit root semaphore.
 *    The exception are the root nodes, for which we always have a clone, see
 *    the comment at btrfs_compare_trees();
 *
 * 2) For leaves, level 0, we are holding copies (clones) of extent buffers, so
 *    we are safe from the concurrent relocation and reallocation. However they
 *    can have file extent items with a pre relocation disk_bytenr value, so we
 *    restart the search from the current commit roots and clone the new leaves
 *    so that we get the post relocation disk_bytenr values. Not doing so could
 *    make us clone the wrong data in case there are new extents using the old
 *    disk_bytenr that happen to be shared.
 */
static int restart_after_relocation(struct btrfs_path *left_path,
				    struct btrfs_path *right_path,
				    const struct btrfs_key *left_key,
				    const struct btrfs_key *right_key,
				    int left_level,
				    int right_level,
				    const struct send_ctx *sctx)
{
	int root_level;
	int ret;

	lockdep_assert_held_read(&sctx->send_root->fs_info->commit_root_sem);

	btrfs_release_path(left_path);
	btrfs_release_path(right_path);

	/*
	 * Since keys can not be added or removed to/from our roots because they
	 * are readonly and we do not allow deduplication to run in parallel
	 * (which can add, remove or change keys), the layout of the trees should
	 * not change.
	 */
	left_path->lowest_level = left_level;
	ret = search_key_again(sctx, sctx->send_root, left_path, left_key);
	if (ret < 0)
		return ret;

	right_path->lowest_level = right_level;
	ret = search_key_again(sctx, sctx->parent_root, right_path, right_key);
	if (ret < 0)
		return ret;

	/*
	 * If the lowest level nodes are leaves, clone them so that they can be
	 * safely used by changed_cb() while not under the protection of the
	 * commit root semaphore, even if relocation and reallocation happens in
	 * parallel.
	 */
	if (left_level == 0) {
		ret = replace_node_with_clone(left_path, 0);
		if (ret < 0)
			return ret;
	}

	if (right_level == 0) {
		ret = replace_node_with_clone(right_path, 0);
		if (ret < 0)
			return ret;
	}

	/*
	 * Now clone the root nodes (unless they happen to be the leaves we have
	 * already cloned). This is to protect against concurrent snapshotting of
	 * the send and parent roots (see the comment at btrfs_compare_trees()).
	 */
	root_level = btrfs_header_level(sctx->send_root->commit_root);
	if (root_level > 0) {
		ret = replace_node_with_clone(left_path, root_level);
		if (ret < 0)
			return ret;
	}

	root_level = btrfs_header_level(sctx->parent_root->commit_root);
	if (root_level > 0) {
		ret = replace_node_with_clone(right_path, root_level);
		if (ret < 0)
			return ret;
	}

	return 0;
}

/*
 * This function compares two trees and calls the provided callback for
 * every changed/new/deleted item it finds.
 * If shared tree blocks are encountered, whole subtrees are skipped, making
 * the compare pretty fast on snapshotted subvolumes.
 *
 * This currently works on commit roots only. As commit roots are read only,
 * we don't do any locking. The commit roots are protected with transactions.
 * Transactions are ended and rejoined when a commit is tried in between.
 *
 * This function checks for modifications done to the trees while comparing.
 * If it detects a change, it aborts immediately.
 */
static int btrfs_compare_trees(struct btrfs_root *left_root,
			struct btrfs_root *right_root, struct send_ctx *sctx)
{
	struct btrfs_fs_info *fs_info = left_root->fs_info;
	int ret;
	int cmp;
	struct btrfs_path *left_path = NULL;
	struct btrfs_path *right_path = NULL;
	struct btrfs_key left_key;
	struct btrfs_key right_key;
	char *tmp_buf = NULL;
	int left_root_level;
	int right_root_level;
	int left_level;
	int right_level;
	int left_end_reached = 0;
	int right_end_reached = 0;
	int advance_left = 0;
	int advance_right = 0;
	u64 left_blockptr;
	u64 right_blockptr;
	u64 left_gen;
	u64 right_gen;
	u64 reada_min_gen;

	left_path = btrfs_alloc_path();
	if (!left_path) {
		ret = -ENOMEM;
		goto out;
	}
	right_path = btrfs_alloc_path();
	if (!right_path) {
		ret = -ENOMEM;
		goto out;
	}

	tmp_buf = kvmalloc(fs_info->nodesize, GFP_KERNEL);
	if (!tmp_buf) {
		ret = -ENOMEM;
		goto out;
	}

	left_path->search_commit_root = 1;
	left_path->skip_locking = 1;
	right_path->search_commit_root = 1;
	right_path->skip_locking = 1;

	/*
	 * Strategy: Go to the first items of both trees. Then do
	 *
	 * If both trees are at level 0
	 *   Compare keys of current items
	 *     If left < right treat left item as new, advance left tree
	 *       and repeat
	 *     If left > right treat right item as deleted, advance right tree
	 *       and repeat
	 *     If left == right do deep compare of items, treat as changed if
	 *       needed, advance both trees and repeat
	 * If both trees are at the same level but not at level 0
	 *   Compare keys of current nodes/leafs
	 *     If left < right advance left tree and repeat
	 *     If left > right advance right tree and repeat
	 *     If left == right compare blockptrs of the next nodes/leafs
	 *       If they match advance both trees but stay at the same level
	 *         and repeat
	 *       If they don't match advance both trees while allowing to go
	 *         deeper and repeat
	 * If tree levels are different
	 *   Advance the tree that needs it and repeat
	 *
	 * Advancing a tree means:
	 *   If we are at level 0, try to go to the next slot. If that's not
	 *   possible, go one level up and repeat. Stop when we found a level
	 *   where we could go to the next slot. We may at this point be on a
	 *   node or a leaf.
	 *
	 *   If we are not at level 0 and not on shared tree blocks, go one
	 *   level deeper.
	 *
	 *   If we are not at level 0 and on shared tree blocks, go one slot to
	 *   the right if possible or go up and right.
	 */

	down_read(&fs_info->commit_root_sem);
	left_level = btrfs_header_level(left_root->commit_root);
	left_root_level = left_level;
	/*
	 * We clone the root node of the send and parent roots to prevent races
	 * with snapshot creation of these roots. Snapshot creation COWs the
	 * root node of a tree, so after the transaction is committed the old
	 * extent can be reallocated while this send operation is still ongoing.
	 * So we clone them, under the commit root semaphore, to be race free.
	 */
	left_path->nodes[left_level] =
			btrfs_clone_extent_buffer(left_root->commit_root);
	if (!left_path->nodes[left_level]) {
		ret = -ENOMEM;
		goto out_unlock;
	}

	right_level = btrfs_header_level(right_root->commit_root);
	right_root_level = right_level;
	right_path->nodes[right_level] =
			btrfs_clone_extent_buffer(right_root->commit_root);
	if (!right_path->nodes[right_level]) {
		ret = -ENOMEM;
		goto out_unlock;
	}
	/*
	 * Our right root is the parent root, while the left root is the "send"
	 * root. We know that all new nodes/leaves in the left root must have
	 * a generation greater than the right root's generation, so we trigger
	 * readahead for those nodes and leaves of the left root, as we know we
	 * will need to read them at some point.
	 */
	reada_min_gen = btrfs_header_generation(right_root->commit_root);

	if (left_level == 0)
		btrfs_item_key_to_cpu(left_path->nodes[left_level],
				&left_key, left_path->slots[left_level]);
	else
		btrfs_node_key_to_cpu(left_path->nodes[left_level],
				&left_key, left_path->slots[left_level]);
	if (right_level == 0)
		btrfs_item_key_to_cpu(right_path->nodes[right_level],
				&right_key, right_path->slots[right_level]);
	else
		btrfs_node_key_to_cpu(right_path->nodes[right_level],
				&right_key, right_path->slots[right_level]);

	sctx->last_reloc_trans = fs_info->last_reloc_trans;

	while (1) {
		if (need_resched() ||
		    rwsem_is_contended(&fs_info->commit_root_sem)) {
			up_read(&fs_info->commit_root_sem);
			cond_resched();
			down_read(&fs_info->commit_root_sem);
		}

		if (fs_info->last_reloc_trans > sctx->last_reloc_trans) {
			ret = restart_after_relocation(left_path, right_path,
						       &left_key, &right_key,
						       left_level, right_level,
						       sctx);
			if (ret < 0)
				goto out_unlock;
			sctx->last_reloc_trans = fs_info->last_reloc_trans;
		}

		if (advance_left && !left_end_reached) {
			ret = tree_advance(left_path, &left_level,
					left_root_level,
					advance_left != ADVANCE_ONLY_NEXT,
					&left_key, reada_min_gen);
			if (ret == -1)
				left_end_reached = ADVANCE;
			else if (ret < 0)
				goto out_unlock;
			advance_left = 0;
		}
		if (advance_right && !right_end_reached) {
			ret = tree_advance(right_path, &right_level,
					right_root_level,
					advance_right != ADVANCE_ONLY_NEXT,
					&right_key, reada_min_gen);
			if (ret == -1)
				right_end_reached = ADVANCE;
			else if (ret < 0)
				goto out_unlock;
			advance_right = 0;
		}

		if (left_end_reached && right_end_reached) {
			ret = 0;
			goto out_unlock;
		} else if (left_end_reached) {
			if (right_level == 0) {
				up_read(&fs_info->commit_root_sem);
				ret = changed_cb(left_path, right_path,
						&right_key,
						BTRFS_COMPARE_TREE_DELETED,
						sctx);
				if (ret < 0)
					goto out;
				down_read(&fs_info->commit_root_sem);
			}
			advance_right = ADVANCE;
			continue;
		} else if (right_end_reached) {
			if (left_level == 0) {
				up_read(&fs_info->commit_root_sem);
				ret = changed_cb(left_path, right_path,
						&left_key,
						BTRFS_COMPARE_TREE_NEW,
						sctx);
				if (ret < 0)
					goto out;
				down_read(&fs_info->commit_root_sem);
			}
			advance_left = ADVANCE;
			continue;
		}

		if (left_level == 0 && right_level == 0) {
			up_read(&fs_info->commit_root_sem);
			cmp = btrfs_comp_cpu_keys(&left_key, &right_key);
			if (cmp < 0) {
				ret = changed_cb(left_path, right_path,
						&left_key,
						BTRFS_COMPARE_TREE_NEW,
						sctx);
				advance_left = ADVANCE;
			} else if (cmp > 0) {
				ret = changed_cb(left_path, right_path,
						&right_key,
						BTRFS_COMPARE_TREE_DELETED,
						sctx);
				advance_right = ADVANCE;
			} else {
				enum btrfs_compare_tree_result result;

				WARN_ON(!extent_buffer_uptodate(left_path->nodes[0]));
				ret = tree_compare_item(left_path, right_path,
							tmp_buf);
				if (ret)
					result = BTRFS_COMPARE_TREE_CHANGED;
				else
					result = BTRFS_COMPARE_TREE_SAME;
				ret = changed_cb(left_path, right_path,
						 &left_key, result, sctx);
				advance_left = ADVANCE;
				advance_right = ADVANCE;
			}

			if (ret < 0)
				goto out;
			down_read(&fs_info->commit_root_sem);
		} else if (left_level == right_level) {
			cmp = btrfs_comp_cpu_keys(&left_key, &right_key);
			if (cmp < 0) {
				advance_left = ADVANCE;
			} else if (cmp > 0) {
				advance_right = ADVANCE;
			} else {
				left_blockptr = btrfs_node_blockptr(
						left_path->nodes[left_level],
						left_path->slots[left_level]);
				right_blockptr = btrfs_node_blockptr(
						right_path->nodes[right_level],
						right_path->slots[right_level]);
				left_gen = btrfs_node_ptr_generation(
						left_path->nodes[left_level],
						left_path->slots[left_level]);
				right_gen = btrfs_node_ptr_generation(
						right_path->nodes[right_level],
						right_path->slots[right_level]);
				if (left_blockptr == right_blockptr &&
				    left_gen == right_gen) {
					/*
					 * As we're on a shared block, don't
					 * allow to go deeper.
					 */
					advance_left = ADVANCE_ONLY_NEXT;
					advance_right = ADVANCE_ONLY_NEXT;
				} else {
					advance_left = ADVANCE;
					advance_right = ADVANCE;
				}
			}
		} else if (left_level < right_level) {
			advance_right = ADVANCE;
		} else {
			advance_left = ADVANCE;
		}
	}

out_unlock:
	up_read(&fs_info->commit_root_sem);
out:
	btrfs_free_path(left_path);
	btrfs_free_path(right_path);
	kvfree(tmp_buf);
	return ret;
}
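
/*
 * Top level of a single send operation: emit the stream header and the
 * subvolume start command, then either diff the send root against the parent
 * root (incremental send) or dump the whole send root (full send), finishing
 * the last processed inode at the end.
 */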
static int send_subvol(struct send_ctx *sctx)
{
	int ret;

	if (!(sctx->flags & BTRFS_SEND_FLAG_OMIT_STREAM_HEADER)) {
		ret = send_header(sctx);
		if (ret < 0)
			goto out;
	}

	ret = send_subvol_begin(sctx);
	if (ret < 0)
		goto out;

	if (sctx->parent_root) {
		ret = btrfs_compare_trees(sctx->send_root, sctx->parent_root, sctx);
		if (ret < 0)
			goto out;
		ret = finish_inode_if_needed(sctx, 1);
		if (ret < 0)
			goto out;
	} else {
		ret = full_send_tree(sctx);
		if (ret < 0)
			goto out;
	}

out:
	free_recorded_refs(sctx);
	return ret;
}

/*
 * If orphan cleanup did remove any orphans from a root, it means the tree
 * was modified and therefore the commit root is not the same as the current
 * root anymore. This is a problem, because send uses the commit root and
 * therefore can see inode items that don't exist in the current root anymore,
 * and for example make calls to btrfs_iget, which will do tree lookups based
 * on the current root and not on the commit root. Those lookups will fail,
 * returning a -ESTALE error, and making send fail with that error. So make
 * sure a send does not see any orphans we have just removed, and that it will
 * see the same inodes regardless of whether a transaction commit happened
 * before it started (meaning that the commit root will be the same as the
 * current root) or not.
 */
static int ensure_commit_roots_uptodate(struct send_ctx *sctx)
{
	struct btrfs_root *root = sctx->parent_root;

	if (root && root->node != root->commit_root)
		return btrfs_commit_current_transaction(root);

	for (int i = 0; i < sctx->clone_roots_cnt; i++) {
		root = sctx->clone_roots[i].root;
		if (root->node != root->commit_root)
			return btrfs_commit_current_transaction(root);
	}

	return 0;
}

/*
 * Make sure any existing delalloc is flushed for any root used by a send
 * operation so that we do not miss any data and we do not race with writeback
 * finishing and changing a tree while send is using the tree. This could
 * happen if a subvolume is in RW mode, has delalloc, is turned to RO mode and
 * a send operation then uses the subvolume.
 * After flushing delalloc ensure_commit_roots_uptodate() must be called.
 */
static int flush_delalloc_roots(struct send_ctx *sctx)
{
	struct btrfs_root *root = sctx->parent_root;
	int ret;
	int i;

	if (root) {
		ret = btrfs_start_delalloc_snapshot(root, false);
		if (ret)
			return ret;
		btrfs_wait_ordered_extents(root, U64_MAX, NULL);
	}

	for (i = 0; i < sctx->clone_roots_cnt; i++) {
		root = sctx->clone_roots[i].root;
		ret = btrfs_start_delalloc_snapshot(root, false);
		if (ret)
			return ret;
		btrfs_wait_ordered_extents(root, U64_MAX, NULL);
	}

	return 0;
}
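
/*
 * Drop the send_in_progress counter that was taken when the root was attached
 * to a send operation. A negative value indicates an accounting bug; it is
 * only reported because there is no safe way to recover the correct value.
 */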
static void btrfs_root_dec_send_in_progress(struct btrfs_root *root)
{
	spin_lock(&root->root_item_lock);
	root->send_in_progress--;
	/*
	 * Not much left to do, we don't know why it's unbalanced and
	 * can't blindly reset it to 0.
	 */
	if (root->send_in_progress < 0)
		btrfs_err(root->fs_info,
			  "send_in_progress unbalanced %d root %llu",
			  root->send_in_progress, btrfs_root_id(root));
	spin_unlock(&root->root_item_lock);
}

static void dedupe_in_progress_warn(const struct btrfs_root *root)
{
	btrfs_warn_rl(root->fs_info,
		      "cannot use root %llu for send while deduplications on it are in progress (%d in progress)",
		      btrfs_root_id(root), root->dedupe_in_progress);
}
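
/*
 * Entry point of the send ioctl. Validates the arguments, pins the send,
 * parent and clone roots as read-only participants of a send operation,
 * allocates the send context and buffers, runs send_subvol() and finally
 * tears everything down again.
 */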
long btrfs_ioctl_send(struct btrfs_root *send_root, const struct btrfs_ioctl_send_args *arg)
{
	int ret = 0;
	struct btrfs_fs_info *fs_info = send_root->fs_info;
	struct btrfs_root *clone_root;
	struct send_ctx *sctx = NULL;
	u32 i;
	u64 *clone_sources_tmp = NULL;
	int clone_sources_to_rollback = 0;
	size_t alloc_size;
	int sort_clone_roots = 0;
	struct btrfs_lru_cache_entry *entry;
	struct btrfs_lru_cache_entry *tmp;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	/*
	 * The subvolume must remain read-only during send, protect against
	 * making it RW. This also protects against deletion.
	 */
	spin_lock(&send_root->root_item_lock);
	/*
	 * Unlikely but possible, if the subvolume is marked for deletion but
	 * is slow to remove the directory entry, send can still be started.
	 */
	if (btrfs_root_dead(send_root)) {
		spin_unlock(&send_root->root_item_lock);
		return -EPERM;
	}
	/* Userspace tools do the checks and warn the user if it's not RO. */
	if (!btrfs_root_readonly(send_root)) {
		spin_unlock(&send_root->root_item_lock);
		return -EPERM;
	}
	if (send_root->dedupe_in_progress) {
		dedupe_in_progress_warn(send_root);
		spin_unlock(&send_root->root_item_lock);
		return -EAGAIN;
	}
	send_root->send_in_progress++;
	spin_unlock(&send_root->root_item_lock);

	/*
	 * Check that we don't overflow at later allocations, we request
	 * clone_sources_count + 1 items, and compare to unsigned long inside
	 * access_ok. Also set an upper limit for allocation size so this can't
	 * easily exhaust memory. Max number of clone sources is about 200K.
	 */
	if (arg->clone_sources_count > SZ_8M / sizeof(struct clone_root)) {
		ret = -EINVAL;
		goto out;
	}

	if (arg->flags & ~BTRFS_SEND_FLAG_MASK) {
		ret = -EOPNOTSUPP;
		goto out;
	}

	sctx = kzalloc(sizeof(struct send_ctx), GFP_KERNEL);
	if (!sctx) {
		ret = -ENOMEM;
		goto out;
	}

	init_path(&sctx->cur_inode_path);
	INIT_LIST_HEAD(&sctx->new_refs);
	INIT_LIST_HEAD(&sctx->deleted_refs);

	btrfs_lru_cache_init(&sctx->name_cache, SEND_MAX_NAME_CACHE_SIZE);
	btrfs_lru_cache_init(&sctx->backref_cache, SEND_MAX_BACKREF_CACHE_SIZE);
	btrfs_lru_cache_init(&sctx->dir_created_cache,
			     SEND_MAX_DIR_CREATED_CACHE_SIZE);
	/*
	 * This cache is periodically trimmed to a fixed size elsewhere, see
	 * cache_dir_utimes() and trim_dir_utimes_cache().
	 */
	btrfs_lru_cache_init(&sctx->dir_utimes_cache, 0);

	sctx->pending_dir_moves = RB_ROOT;
	sctx->waiting_dir_moves = RB_ROOT;
	sctx->orphan_dirs = RB_ROOT;
	sctx->rbtree_new_refs = RB_ROOT;
	sctx->rbtree_deleted_refs = RB_ROOT;

	sctx->flags = arg->flags;

	if (arg->flags & BTRFS_SEND_FLAG_VERSION) {
		if (arg->version > BTRFS_SEND_STREAM_VERSION) {
			ret = -EPROTO;
			goto out;
		}
		/* Zero means "use the highest version" */
		sctx->proto = arg->version ?: BTRFS_SEND_STREAM_VERSION;
	} else {
		sctx->proto = 1;
	}
	if ((arg->flags & BTRFS_SEND_FLAG_COMPRESSED) && sctx->proto < 2) {
		ret = -EINVAL;
		goto out;
	}

	sctx->send_filp = fget(arg->send_fd);
	if (!sctx->send_filp || !(sctx->send_filp->f_mode & FMODE_WRITE)) {
		ret = -EBADF;
		goto out;
	}

	sctx->send_root = send_root;
	sctx->clone_roots_cnt = arg->clone_sources_count;

	if (sctx->proto >= 2) {
		u32 send_buf_num_pages;

		sctx->send_max_size = BTRFS_SEND_BUF_SIZE_V2;
		sctx->send_buf = vmalloc(sctx->send_max_size);
		if (!sctx->send_buf) {
			ret = -ENOMEM;
			goto out;
		}
		send_buf_num_pages = sctx->send_max_size >> PAGE_SHIFT;
		sctx->send_buf_pages = kcalloc(send_buf_num_pages,
					       sizeof(*sctx->send_buf_pages),
					       GFP_KERNEL);
		if (!sctx->send_buf_pages) {
			ret = -ENOMEM;
			goto out;
		}
		for (i = 0; i < send_buf_num_pages; i++) {
			sctx->send_buf_pages[i] =
				vmalloc_to_page(sctx->send_buf + (i << PAGE_SHIFT));
		}
	} else {
		sctx->send_max_size = BTRFS_SEND_BUF_SIZE_V1;
		sctx->send_buf = kvmalloc(sctx->send_max_size, GFP_KERNEL);
	}
	if (!sctx->send_buf) {
		ret = -ENOMEM;
		goto out;
	}

	sctx->clone_roots = kvcalloc(arg->clone_sources_count + 1,
				     sizeof(*sctx->clone_roots),
				     GFP_KERNEL);
	if (!sctx->clone_roots) {
		ret = -ENOMEM;
		goto out;
	}

	alloc_size = array_size(sizeof(*arg->clone_sources),
				arg->clone_sources_count);

	if (arg->clone_sources_count) {
		clone_sources_tmp = kvmalloc(alloc_size, GFP_KERNEL);
		if (!clone_sources_tmp) {
			ret = -ENOMEM;
			goto out;
		}

		ret = copy_from_user(clone_sources_tmp, arg->clone_sources,
				     alloc_size);
		if (ret) {
			ret = -EFAULT;
			goto out;
		}

		for (i = 0; i < arg->clone_sources_count; i++) {
			clone_root = btrfs_get_fs_root(fs_info,
						       clone_sources_tmp[i], true);
			if (IS_ERR(clone_root)) {
				ret = PTR_ERR(clone_root);
				goto out;
			}
			spin_lock(&clone_root->root_item_lock);
			if (!btrfs_root_readonly(clone_root) ||
			    btrfs_root_dead(clone_root)) {
				spin_unlock(&clone_root->root_item_lock);
				btrfs_put_root(clone_root);
				ret = -EPERM;
				goto out;
			}
			if (clone_root->dedupe_in_progress) {
				dedupe_in_progress_warn(clone_root);
				spin_unlock(&clone_root->root_item_lock);
				btrfs_put_root(clone_root);
				ret = -EAGAIN;
				goto out;
			}
			clone_root->send_in_progress++;
			spin_unlock(&clone_root->root_item_lock);

			sctx->clone_roots[i].root = clone_root;
			clone_sources_to_rollback = i + 1;
		}
		kvfree(clone_sources_tmp);
		clone_sources_tmp = NULL;
	}

	if (arg->parent_root) {
		sctx->parent_root = btrfs_get_fs_root(fs_info, arg->parent_root,
						      true);
		if (IS_ERR(sctx->parent_root)) {
			ret = PTR_ERR(sctx->parent_root);
			goto out;
		}

		spin_lock(&sctx->parent_root->root_item_lock);
		sctx->parent_root->send_in_progress++;
		if (!btrfs_root_readonly(sctx->parent_root) ||
		    btrfs_root_dead(sctx->parent_root)) {
			spin_unlock(&sctx->parent_root->root_item_lock);
			ret = -EPERM;
			goto out;
		}
		if (sctx->parent_root->dedupe_in_progress) {
			dedupe_in_progress_warn(sctx->parent_root);
			spin_unlock(&sctx->parent_root->root_item_lock);
			ret = -EAGAIN;
			goto out;
		}
		spin_unlock(&sctx->parent_root->root_item_lock);
	}

	/*
	 * Clones from send_root are allowed, but only if the clone source
	 * is behind the current send position. This is checked while searching
	 * for possible clone sources.
	 */
	sctx->clone_roots[sctx->clone_roots_cnt++].root =
		btrfs_grab_root(sctx->send_root);

	/* We do a bsearch later */
	sort(sctx->clone_roots, sctx->clone_roots_cnt,
	     sizeof(*sctx->clone_roots), __clone_root_cmp_sort,
	     NULL);
	sort_clone_roots = 1;

	ret = flush_delalloc_roots(sctx);
	if (ret)
		goto out;

	ret = ensure_commit_roots_uptodate(sctx);
	if (ret)
		goto out;

	ret = send_subvol(sctx);
	if (ret < 0)
		goto out;

	btrfs_lru_cache_for_each_entry_safe(&sctx->dir_utimes_cache, entry, tmp) {
		ret = send_utimes(sctx, entry->key, entry->gen);
		if (ret < 0)
			goto out;
		btrfs_lru_cache_remove(&sctx->dir_utimes_cache, entry);
	}

	if (!(sctx->flags & BTRFS_SEND_FLAG_OMIT_END_CMD)) {
		ret = begin_cmd(sctx, BTRFS_SEND_C_END);
		if (ret < 0)
			goto out;
		ret = send_cmd(sctx);
		if (ret < 0)
			goto out;
	}

out:
	WARN_ON(sctx && !ret && !RB_EMPTY_ROOT(&sctx->pending_dir_moves));
	while (sctx && !RB_EMPTY_ROOT(&sctx->pending_dir_moves)) {
		struct rb_node *n;
		struct pending_dir_move *pm;

		n = rb_first(&sctx->pending_dir_moves);
		pm = rb_entry(n, struct pending_dir_move, node);
		while (!list_empty(&pm->list)) {
			struct pending_dir_move *pm2;

			pm2 = list_first_entry(&pm->list,
					       struct pending_dir_move, list);
			free_pending_move(sctx, pm2);
		}
		free_pending_move(sctx, pm);
	}

	WARN_ON(sctx && !ret && !RB_EMPTY_ROOT(&sctx->waiting_dir_moves));
	while (sctx && !RB_EMPTY_ROOT(&sctx->waiting_dir_moves)) {
		struct rb_node *n;
		struct waiting_dir_move *dm;

		n = rb_first(&sctx->waiting_dir_moves);
		dm = rb_entry(n, struct waiting_dir_move, node);
		rb_erase(&dm->node, &sctx->waiting_dir_moves);
		kfree(dm);
	}

	WARN_ON(sctx && !ret && !RB_EMPTY_ROOT(&sctx->orphan_dirs));
	while (sctx && !RB_EMPTY_ROOT(&sctx->orphan_dirs)) {
		struct rb_node *n;
		struct orphan_dir_info *odi;

		n = rb_first(&sctx->orphan_dirs);
		odi = rb_entry(n, struct orphan_dir_info, node);
		free_orphan_dir_info(sctx, odi);
	}

	if (sort_clone_roots) {
		for (i = 0; i < sctx->clone_roots_cnt; i++) {
			btrfs_root_dec_send_in_progress(
					sctx->clone_roots[i].root);
			btrfs_put_root(sctx->clone_roots[i].root);
		}
	} else {
		for (i = 0; sctx && i < clone_sources_to_rollback; i++) {
			btrfs_root_dec_send_in_progress(
					sctx->clone_roots[i].root);
			btrfs_put_root(sctx->clone_roots[i].root);
		}

		btrfs_root_dec_send_in_progress(send_root);
	}
	if (sctx && !IS_ERR_OR_NULL(sctx->parent_root)) {
		btrfs_root_dec_send_in_progress(sctx->parent_root);
		btrfs_put_root(sctx->parent_root);
	}

	kvfree(clone_sources_tmp);

	if (sctx) {
		if (sctx->send_filp)
			fput(sctx->send_filp);

		kvfree(sctx->clone_roots);
		kfree(sctx->send_buf_pages);
		kvfree(sctx->send_buf);
		kvfree(sctx->verity_descriptor);

		close_current_inode(sctx);

		btrfs_lru_cache_clear(&sctx->name_cache);
		btrfs_lru_cache_clear(&sctx->backref_cache);
		btrfs_lru_cache_clear(&sctx->dir_created_cache);
		btrfs_lru_cache_clear(&sctx->dir_utimes_cache);

		if (sctx->cur_inode_path.buf != sctx->cur_inode_path.inline_buf)
			kfree(sctx->cur_inode_path.buf);

		kfree(sctx);
	}

	return ret;
}