]> git.ipfire.org Git - thirdparty/kernel/stable-queue.git/blob
b883ad85e0bd7e8327b33d08ff7bf2184b6f7625
[thirdparty/kernel/stable-queue.git] /
1 From foo@baz Thu Oct 4 12:32:08 PDT 2018
2 From: Robbie Ko <robbieko@synology.com>
3 Date: Mon, 6 Aug 2018 10:30:30 +0800
4 Subject: Btrfs: fix unexpected failure of nocow buffered writes after snapshotting when low on space
5
6 From: Robbie Ko <robbieko@synology.com>
7
8 [ Upstream commit 8ecebf4d767e2307a946c8905278d6358eda35c3 ]
9
10 Commit e9894fd3e3b3 ("Btrfs: fix snapshot vs nocow writting") forced
11 nocow writes to fallback to COW, during writeback, when a snapshot is
12 created. This resulted in writes made before creating the snapshot to
13 unexpectedly fail with ENOSPC during writeback when success (0) was
14 returned to user space through the write system call.
15
16 The steps leading to this problem are:
17
18 1. When it's not possible to allocate data space for a write, the
19 buffered write path checks if a NOCOW write is possible. If it is,
20 it will not reserve space and success (0) is returned to user space.
21
22 2. Then when a snapshot is created, the root's will_be_snapshotted
23 atomic is incremented and writeback is triggered for all inodes that
24 belong to the root being snapshotted. Incrementing that atomic forces
25 all previous writes to fallback to COW during writeback (running
26 delalloc).
27
28 3. This causes writeback for the inodes to fail, which sets the
29 ENOSPC error in their mappings, so that a subsequent
30 fsync on them will report the error to user space. So it's not a
31 completely silent data loss (since fsync will report ENOSPC) but it's
32 a very unexpected and undesirable behaviour, because if a clean
33 shutdown/unmount of the filesystem happens without previous calls to
34 fsync, it is expected to have the data present in the files after
35 mounting the filesystem again.
36
37 So fix this by adding a new atomic named snapshot_force_cow to the
38 root structure which prevents this behaviour and works the following way:
39
40 1. It is incremented when we start to create a snapshot after triggering
41 writeback and before waiting for writeback to finish.
42
43 2. This new atomic is now what is used by writeback (running delalloc)
44 to decide whether we need to fallback to COW or not. Because we
45 incremented this new atomic after triggering writeback in the
46 snapshot creation ioctl, we ensure that all buffered writes that
47 happened before snapshot creation will succeed and not fallback to
48 COW (which would make them fail with ENOSPC).
49
50 3. The existing atomic, will_be_snapshotted, is kept because it is used
51 to force new buffered writes, that start after we started
52 snapshotting, to reserve data space even when NOCOW is possible.
53 This makes these writes fail early with ENOSPC when there's no
54 available space to allocate, preventing the unexpected behaviour of
55 writeback later failing with ENOSPC due to a fallback to COW mode.
56
57 Fixes: e9894fd3e3b3 ("Btrfs: fix snapshot vs nocow writting")
58 Signed-off-by: Robbie Ko <robbieko@synology.com>
59 Reviewed-by: Filipe Manana <fdmanana@suse.com>
60 Signed-off-by: David Sterba <dsterba@suse.com>
61 Signed-off-by: Sasha Levin <alexander.levin@microsoft.com>
62 Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
63 ---
64 fs/btrfs/ctree.h | 1 +
65 fs/btrfs/disk-io.c | 1 +
66 fs/btrfs/inode.c | 25 ++++---------------------
67 fs/btrfs/ioctl.c | 16 ++++++++++++++++
68 4 files changed, 22 insertions(+), 21 deletions(-)
69
70 --- a/fs/btrfs/ctree.h
71 +++ b/fs/btrfs/ctree.h
72 @@ -1277,6 +1277,7 @@ struct btrfs_root {
73 int send_in_progress;
74 struct btrfs_subvolume_writers *subv_writers;
75 atomic_t will_be_snapshotted;
76 + atomic_t snapshot_force_cow;
77
78 /* For qgroup metadata reserved space */
79 spinlock_t qgroup_meta_rsv_lock;
80 --- a/fs/btrfs/disk-io.c
81 +++ b/fs/btrfs/disk-io.c
82 @@ -1217,6 +1217,7 @@ static void __setup_root(struct btrfs_ro
83 atomic_set(&root->log_batch, 0);
84 refcount_set(&root->refs, 1);
85 atomic_set(&root->will_be_snapshotted, 0);
86 + atomic_set(&root->snapshot_force_cow, 0);
87 root->log_transid = 0;
88 root->log_transid_committed = -1;
89 root->last_log_commit = 0;
90 --- a/fs/btrfs/inode.c
91 +++ b/fs/btrfs/inode.c
92 @@ -1275,7 +1275,7 @@ static noinline int run_delalloc_nocow(s
93 u64 disk_num_bytes;
94 u64 ram_bytes;
95 int extent_type;
96 - int ret, err;
97 + int ret;
98 int type;
99 int nocow;
100 int check_prev = 1;
101 @@ -1407,11 +1407,8 @@ next_slot:
102 * if there are pending snapshots for this root,
103 * we fall into common COW way.
104 */
105 - if (!nolock) {
106 - err = btrfs_start_write_no_snapshotting(root);
107 - if (!err)
108 - goto out_check;
109 - }
110 + if (!nolock && atomic_read(&root->snapshot_force_cow))
111 + goto out_check;
112 /*
113 * force cow if csum exists in the range.
114 * this ensure that csum for a given extent are
115 @@ -1420,9 +1417,6 @@ next_slot:
116 ret = csum_exist_in_range(fs_info, disk_bytenr,
117 num_bytes);
118 if (ret) {
119 - if (!nolock)
120 - btrfs_end_write_no_snapshotting(root);
121 -
122 /*
123 * ret could be -EIO if the above fails to read
124 * metadata.
125 @@ -1435,11 +1429,8 @@ next_slot:
126 WARN_ON_ONCE(nolock);
127 goto out_check;
128 }
129 - if (!btrfs_inc_nocow_writers(fs_info, disk_bytenr)) {
130 - if (!nolock)
131 - btrfs_end_write_no_snapshotting(root);
132 + if (!btrfs_inc_nocow_writers(fs_info, disk_bytenr))
133 goto out_check;
134 - }
135 nocow = 1;
136 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
137 extent_end = found_key.offset +
138 @@ -1453,8 +1444,6 @@ next_slot:
139 out_check:
140 if (extent_end <= start) {
141 path->slots[0]++;
142 - if (!nolock && nocow)
143 - btrfs_end_write_no_snapshotting(root);
144 if (nocow)
145 btrfs_dec_nocow_writers(fs_info, disk_bytenr);
146 goto next_slot;
147 @@ -1476,8 +1465,6 @@ out_check:
148 end, page_started, nr_written, 1,
149 NULL);
150 if (ret) {
151 - if (!nolock && nocow)
152 - btrfs_end_write_no_snapshotting(root);
153 if (nocow)
154 btrfs_dec_nocow_writers(fs_info,
155 disk_bytenr);
156 @@ -1497,8 +1484,6 @@ out_check:
157 ram_bytes, BTRFS_COMPRESS_NONE,
158 BTRFS_ORDERED_PREALLOC);
159 if (IS_ERR(em)) {
160 - if (!nolock && nocow)
161 - btrfs_end_write_no_snapshotting(root);
162 if (nocow)
163 btrfs_dec_nocow_writers(fs_info,
164 disk_bytenr);
165 @@ -1537,8 +1522,6 @@ out_check:
166 EXTENT_CLEAR_DATA_RESV,
167 PAGE_UNLOCK | PAGE_SET_PRIVATE2);
168
169 - if (!nolock && nocow)
170 - btrfs_end_write_no_snapshotting(root);
171 cur_offset = extent_end;
172
173 /*
174 --- a/fs/btrfs/ioctl.c
175 +++ b/fs/btrfs/ioctl.c
176 @@ -761,6 +761,7 @@ static int create_snapshot(struct btrfs_
177 struct btrfs_pending_snapshot *pending_snapshot;
178 struct btrfs_trans_handle *trans;
179 int ret;
180 + bool snapshot_force_cow = false;
181
182 if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state))
183 return -EINVAL;
184 @@ -777,6 +778,11 @@ static int create_snapshot(struct btrfs_
185 goto free_pending;
186 }
187
188 + /*
189 + * Force new buffered writes to reserve space even when NOCOW is
190 + * possible. This prevents later writeback (running delalloc) from
191 + * falling back to COW mode and unexpectedly failing with ENOSPC.
192 + */
193 atomic_inc(&root->will_be_snapshotted);
194 smp_mb__after_atomic();
195 /* wait for no snapshot writes */
196 @@ -787,6 +793,14 @@ static int create_snapshot(struct btrfs_
197 if (ret)
198 goto dec_and_free;
199
200 + /*
201 + * All previous writes have started writeback in NOCOW mode, so now
202 + * we force future writes to fallback to COW mode during snapshot
203 + * creation.
204 + */
205 + atomic_inc(&root->snapshot_force_cow);
206 + snapshot_force_cow = true;
207 +
208 btrfs_wait_ordered_extents(root, U64_MAX, 0, (u64)-1);
209
210 btrfs_init_block_rsv(&pending_snapshot->block_rsv,
211 @@ -851,6 +865,8 @@ static int create_snapshot(struct btrfs_
212 fail:
213 btrfs_subvolume_release_metadata(fs_info, &pending_snapshot->block_rsv);
214 dec_and_free:
215 + if (snapshot_force_cow)
216 + atomic_dec(&root->snapshot_force_cow);
217 if (atomic_dec_and_test(&root->will_be_snapshotted))
218 wake_up_var(&root->will_be_snapshotted);
219 free_pending: