]> git.ipfire.org Git - thirdparty/kernel/stable-queue.git/blob - queue-5.0/btrfs-fix-race-between-send-and-deduplication-that-lead-to-failures-and-crashes.patch
Linux 4.19.45
[thirdparty/kernel/stable-queue.git] / queue-5.0 / btrfs-fix-race-between-send-and-deduplication-that-lead-to-failures-and-crashes.patch
1 From 62d54f3a7fa27ef6a74d6cdf643ce04beba3afa7 Mon Sep 17 00:00:00 2001
2 From: Filipe Manana <fdmanana@suse.com>
3 Date: Mon, 22 Apr 2019 16:43:42 +0100
4 Subject: Btrfs: fix race between send and deduplication that lead to failures and crashes
5
6 From: Filipe Manana <fdmanana@suse.com>
7
8 commit 62d54f3a7fa27ef6a74d6cdf643ce04beba3afa7 upstream.
9
10 Send operates on read only trees and expects them to never change while it
11 is using them. This is part of its initial design, and this expectation is
12 due to two different reasons:
13
14 1) When it was introduced, no operations were allowed to modify read-only
15 subvolumes/snapshots (including defrag for example).
16
17 2) It keeps send from having an impact on other filesystem operations.
18 Namely send does not need to keep locks on the trees nor needs to hold on
19 to transaction handles and delay transaction commits. This ends up being
20 a consequence of the former reason.
21
22 However the deduplication feature was introduced later (in September 2013,
23 while send was introduced in July 2012) and it allowed for deduplication
24 with destination files that belong to read-only trees (subvolumes and
25 snapshots).
26
27 That means that having a send operation (either full or incremental) running
28 in parallel with a deduplication that has the destination inode in one of
29 the trees used by the send operation, can result in tree nodes and leaves
30 getting freed and reused while send is using them. This problem is similar
31 to the problem solved for the root nodes getting freed and reused when a
32 snapshot is made against one tree that is currently being used by a send
33 operation, fixed in commits [1] and [2]. These commits explain in detail
34 how the problem happens and the explanation is valid for any node or leaf
35 that is not the root of a tree as well. This problem was also discussed
36 and explained recently in a thread [3].
37
38 The problem is very easy to reproduce when using send with large trees
39 (snapshots) and just a few concurrent deduplication operations that target
40 files in the trees used by send. A stress test case is being sent for
41 fstests that triggers the issue easily. The most common error to hit is
42 the send ioctl return -EIO with the following messages in dmesg/syslog:
43
44 [1631617.204075] BTRFS error (device sdc): did not find backref in send_root. inode=63292, offset=0, disk_byte=5228134400 found extent=5228134400
45 [1631633.251754] BTRFS error (device sdc): parent transid verify failed on 32243712 wanted 24 found 27
46
47 The first one is very easy to hit while the second one happens much less
48 frequently, except for very large trees (in that test case, snapshots
49 with 100000 files having large xattrs to get deep and wide trees).
50 Less frequently, at least one BUG_ON can be hit:
51
52 [1631742.130080] ------------[ cut here ]------------
53 [1631742.130625] kernel BUG at fs/btrfs/ctree.c:1806!
54 [1631742.131188] invalid opcode: 0000 [#6] SMP DEBUG_PAGEALLOC PTI
55 [1631742.131726] CPU: 1 PID: 13394 Comm: btrfs Tainted: G B D W 5.0.0-rc8-btrfs-next-45 #1
56 [1631742.132265] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.11.2-0-gf9626ccb91-prebuilt.qemu-project.org 04/01/2014
57 [1631742.133399] RIP: 0010:read_node_slot+0x122/0x130 [btrfs]
58 (...)
59 [1631742.135061] RSP: 0018:ffffb530021ebaa0 EFLAGS: 00010246
60 [1631742.135615] RAX: ffff93ac8912e000 RBX: 000000000000009d RCX: 0000000000000002
61 [1631742.136173] RDX: 000000000000009d RSI: ffff93ac564b0d08 RDI: ffff93ad5b48c000
62 [1631742.136759] RBP: ffffb530021ebb7d R08: 0000000000000001 R09: ffffb530021ebb7d
63 [1631742.137324] R10: ffffb530021eba70 R11: 0000000000000000 R12: ffff93ac87d0a708
64 [1631742.137900] R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000001
65 [1631742.138455] FS: 00007f4cdb1528c0(0000) GS:ffff93ad76a80000(0000) knlGS:0000000000000000
66 [1631742.139010] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
67 [1631742.139568] CR2: 00007f5acb3d0420 CR3: 000000012be3e006 CR4: 00000000003606e0
68 [1631742.140131] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
69 [1631742.140719] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
70 [1631742.141272] Call Trace:
71 [1631742.141826] ? do_raw_spin_unlock+0x49/0xc0
72 [1631742.142390] tree_advance+0x173/0x1d0 [btrfs]
73 [1631742.142948] btrfs_compare_trees+0x268/0x690 [btrfs]
74 [1631742.143533] ? process_extent+0x1070/0x1070 [btrfs]
75 [1631742.144088] btrfs_ioctl_send+0x1037/0x1270 [btrfs]
76 [1631742.144645] _btrfs_ioctl_send+0x80/0x110 [btrfs]
77 [1631742.145161] ? trace_sched_stick_numa+0xe0/0xe0
78 [1631742.145685] btrfs_ioctl+0x13fe/0x3120 [btrfs]
79 [1631742.146179] ? account_entity_enqueue+0xd3/0x100
80 [1631742.146662] ? reweight_entity+0x154/0x1a0
81 [1631742.147135] ? update_curr+0x20/0x2a0
82 [1631742.147593] ? check_preempt_wakeup+0x103/0x250
83 [1631742.148053] ? do_vfs_ioctl+0xa2/0x6f0
84 [1631742.148510] ? btrfs_ioctl_get_supported_features+0x30/0x30 [btrfs]
85 [1631742.148942] do_vfs_ioctl+0xa2/0x6f0
86 [1631742.149361] ? __fget+0x113/0x200
87 [1631742.149767] ksys_ioctl+0x70/0x80
88 [1631742.150159] __x64_sys_ioctl+0x16/0x20
89 [1631742.150543] do_syscall_64+0x60/0x1b0
90 [1631742.150931] entry_SYSCALL_64_after_hwframe+0x49/0xbe
91 [1631742.151326] RIP: 0033:0x7f4cd9f5add7
92 (...)
93 [1631742.152509] RSP: 002b:00007ffe91017708 EFLAGS: 00000202 ORIG_RAX: 0000000000000010
94 [1631742.152892] RAX: ffffffffffffffda RBX: 0000000000000105 RCX: 00007f4cd9f5add7
95 [1631742.153268] RDX: 00007ffe91017790 RSI: 0000000040489426 RDI: 0000000000000007
96 [1631742.153633] RBP: 0000000000000007 R08: 00007f4cd9e79700 R09: 00007f4cd9e79700
97 [1631742.153999] R10: 00007f4cd9e799d0 R11: 0000000000000202 R12: 0000000000000003
98 [1631742.154365] R13: 0000555dfae53020 R14: 0000000000000000 R15: 0000000000000001
99 (...)
100 [1631742.156696] ---[ end trace 5dac9f96dcc3fd6b ]---
101
102 That BUG_ON happens because while send is using a node, that node is COWed
103 by a concurrent deduplication, gets freed and gets reused as a leaf (because
104 a transaction commit happened in between), so when it attempts to read a
105 slot from the extent buffer, at ctree.c:read_node_slot(), the extent buffer
106 contents were wiped out and it now matches a leaf (which can even belong to
107 some other tree now), hitting the BUG_ON(level == 0).
108
109 Fix this concurrency issue by not allowing send and deduplication to run
110 in parallel if both operate on the same readonly trees, returning EAGAIN
111 to user space and logging an explicit warning in dmesg/syslog.
112
113 [1] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=be6821f82c3cc36e026f5afd10249988852b35ea
114 [2] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=6f2f0b394b54e2b159ef969a0b5274e9bbf82ff2
115 [3] https://lore.kernel.org/linux-btrfs/CAL3q7H7iqSEEyFaEtpRZw3cp613y+4k2Q8b4W7mweR3tZA05bQ@mail.gmail.com/
116
117 CC: stable@vger.kernel.org # 4.4+
118 Signed-off-by: Filipe Manana <fdmanana@suse.com>
119 Reviewed-by: David Sterba <dsterba@suse.com>
120 Signed-off-by: David Sterba <dsterba@suse.com>
121 Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
122
123 ---
124 fs/btrfs/ctree.h | 6 ++++++
125 fs/btrfs/ioctl.c | 19 ++++++++++++++++++-
126 fs/btrfs/send.c | 26 ++++++++++++++++++++++++++
127 3 files changed, 50 insertions(+), 1 deletion(-)
128
129 --- a/fs/btrfs/ctree.h
130 +++ b/fs/btrfs/ctree.h
131 @@ -1316,6 +1316,12 @@ struct btrfs_root {
132 * manipulation with the read-only status via SUBVOL_SETFLAGS
133 */
134 int send_in_progress;
135 + /*
136 + * Number of currently running deduplication operations that have a
137 + * destination inode belonging to this root. Protected by the lock
138 + * root_item_lock.
139 + */
140 + int dedupe_in_progress;
141 struct btrfs_subvolume_writers *subv_writers;
142 atomic_t will_be_snapshotted;
143 atomic_t snapshot_force_cow;
144 --- a/fs/btrfs/ioctl.c
145 +++ b/fs/btrfs/ioctl.c
146 @@ -3275,6 +3275,19 @@ static int btrfs_extent_same(struct inod
147 int ret;
148 int num_pages = PAGE_ALIGN(BTRFS_MAX_DEDUPE_LEN) >> PAGE_SHIFT;
149 u64 i, tail_len, chunk_count;
150 + struct btrfs_root *root_dst = BTRFS_I(dst)->root;
151 +
152 + spin_lock(&root_dst->root_item_lock);
153 + if (root_dst->send_in_progress) {
154 + btrfs_warn_rl(root_dst->fs_info,
155 +"cannot deduplicate to root %llu while send operations are using it (%d in progress)",
156 + root_dst->root_key.objectid,
157 + root_dst->send_in_progress);
158 + spin_unlock(&root_dst->root_item_lock);
159 + return -EAGAIN;
160 + }
161 + root_dst->dedupe_in_progress++;
162 + spin_unlock(&root_dst->root_item_lock);
163
164 /* don't make the dst file partly checksummed */
165 if ((BTRFS_I(src)->flags & BTRFS_INODE_NODATASUM) !=
166 @@ -3293,7 +3306,7 @@ static int btrfs_extent_same(struct inod
167 ret = btrfs_extent_same_range(src, loff, BTRFS_MAX_DEDUPE_LEN,
168 dst, dst_loff);
169 if (ret)
170 - return ret;
171 + goto out;
172
173 loff += BTRFS_MAX_DEDUPE_LEN;
174 dst_loff += BTRFS_MAX_DEDUPE_LEN;
175 @@ -3302,6 +3315,10 @@ static int btrfs_extent_same(struct inod
176 if (tail_len > 0)
177 ret = btrfs_extent_same_range(src, loff, tail_len, dst,
178 dst_loff);
179 +out:
180 + spin_lock(&root_dst->root_item_lock);
181 + root_dst->dedupe_in_progress--;
182 + spin_unlock(&root_dst->root_item_lock);
183
184 return ret;
185 }
186 --- a/fs/btrfs/send.c
187 +++ b/fs/btrfs/send.c
188 @@ -6626,6 +6626,13 @@ static void btrfs_root_dec_send_in_progr
189 spin_unlock(&root->root_item_lock);
190 }
191
192 +static void dedupe_in_progress_warn(const struct btrfs_root *root)
193 +{
194 + btrfs_warn_rl(root->fs_info,
195 +"cannot use root %llu for send while deduplications on it are in progress (%d in progress)",
196 + root->root_key.objectid, root->dedupe_in_progress);
197 +}
198 +
199 long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg)
200 {
201 int ret = 0;
202 @@ -6649,6 +6656,11 @@ long btrfs_ioctl_send(struct file *mnt_f
203 * making it RW. This also protects against deletion.
204 */
205 spin_lock(&send_root->root_item_lock);
206 + if (btrfs_root_readonly(send_root) && send_root->dedupe_in_progress) {
207 + dedupe_in_progress_warn(send_root);
208 + spin_unlock(&send_root->root_item_lock);
209 + return -EAGAIN;
210 + }
211 send_root->send_in_progress++;
212 spin_unlock(&send_root->root_item_lock);
213
214 @@ -6783,6 +6795,13 @@ long btrfs_ioctl_send(struct file *mnt_f
215 ret = -EPERM;
216 goto out;
217 }
218 + if (clone_root->dedupe_in_progress) {
219 + dedupe_in_progress_warn(clone_root);
220 + spin_unlock(&clone_root->root_item_lock);
221 + srcu_read_unlock(&fs_info->subvol_srcu, index);
222 + ret = -EAGAIN;
223 + goto out;
224 + }
225 clone_root->send_in_progress++;
226 spin_unlock(&clone_root->root_item_lock);
227 srcu_read_unlock(&fs_info->subvol_srcu, index);
228 @@ -6817,6 +6836,13 @@ long btrfs_ioctl_send(struct file *mnt_f
229 ret = -EPERM;
230 goto out;
231 }
232 + if (sctx->parent_root->dedupe_in_progress) {
233 + dedupe_in_progress_warn(sctx->parent_root);
234 + spin_unlock(&sctx->parent_root->root_item_lock);
235 + srcu_read_unlock(&fs_info->subvol_srcu, index);
236 + ret = -EAGAIN;
237 + goto out;
238 + }
239 spin_unlock(&sctx->parent_root->root_item_lock);
240
241 srcu_read_unlock(&fs_info->subvol_srcu, index);