// SPDX-License-Identifier: GPL-2.0

#include <linux/err.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include "messages.h"
#include "ctree.h"
#include "volumes.h"
#include "extent_map.h"
#include "compression.h"
#include "btrfs_inode.h"


static struct kmem_cache *extent_map_cache;

int __init extent_map_init(void)
{
	extent_map_cache = kmem_cache_create("btrfs_extent_map",
			sizeof(struct extent_map), 0,
			SLAB_MEM_SPREAD, NULL);
	if (!extent_map_cache)
		return -ENOMEM;
	return 0;
}

void __cold extent_map_exit(void)
{
	kmem_cache_destroy(extent_map_cache);
}

/*
 * Initialize the extent tree @tree.  Should be called for each new inode or
 * other user of the extent_map interface.
 */
void extent_map_tree_init(struct extent_map_tree *tree)
{
	tree->map = RB_ROOT_CACHED;
	INIT_LIST_HEAD(&tree->modified_extents);
	rwlock_init(&tree->lock);
}

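/*
 * A hypothetical, illustrative embedder of the interface (the struct and
 * function names below are made up and not used anywhere): anything that owns
 * an extent_map_tree, as btrfs_inode does with its extent_tree member, must
 * initialize it once before the first lookup or insertion.
 */
struct example_extent_map_user {
	struct extent_map_tree em_tree;
};

static void __maybe_unused example_extent_map_user_init(struct example_extent_map_user *user)
{
	extent_map_tree_init(&user->em_tree);
}
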
/*
 * Allocate a new extent_map structure.  The new structure is returned with a
 * reference count of one and needs to be freed using free_extent_map().
 */
struct extent_map *alloc_extent_map(void)
{
	struct extent_map *em;
	em = kmem_cache_zalloc(extent_map_cache, GFP_NOFS);
	if (!em)
		return NULL;
	RB_CLEAR_NODE(&em->rb_node);
	em->compress_type = BTRFS_COMPRESS_NONE;
	refcount_set(&em->refs, 1);
	INIT_LIST_HEAD(&em->list);
	return em;
}

/*
 * Drop the reference count on @em by one and free the structure if the
 * reference count hits zero.
 */
void free_extent_map(struct extent_map *em)
{
	if (!em)
		return;
	if (refcount_dec_and_test(&em->refs)) {
		WARN_ON(extent_map_in_tree(em));
		WARN_ON(!list_empty(&em->list));
		kmem_cache_free(extent_map_cache, em);
	}
}

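/*
 * A hypothetical, illustrative allocation/release pairing (this function is
 * made up and not used anywhere): every reference handed out, whether by
 * alloc_extent_map() or by a later refcount_inc(), must be balanced by a
 * free_extent_map() call, and the memory is only released on the last drop.
 */
static int __maybe_unused extent_map_refcount_example(void)
{
	struct extent_map *em;

	em = alloc_extent_map();
	if (!em)
		return -ENOMEM;

	refcount_inc(&em->refs);	/* Hand a second reference to someone. */
	free_extent_map(em);		/* Back to one reference, no free yet. */
	free_extent_map(em);		/* Last reference, the map is freed. */
	return 0;
}
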
/* Do the math around the end of an extent, handling wrapping. */
static u64 range_end(u64 start, u64 len)
{
	if (start + len < start)
		return (u64)-1;
	return start + len;
}

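/*
 * A hypothetical, illustrative check of the clamping behaviour (this function
 * is made up and not used anywhere): a range whose end would wrap past the
 * largest u64 offset is clamped to (u64)-1 instead of wrapping around.
 */
static void __maybe_unused range_end_example(void)
{
	/* Normal case: the exclusive end is simply start + len. */
	ASSERT(range_end(4096, 4096) == 8192);

	/* Overflow case: start + len would wrap, so the end is clamped. */
	ASSERT(range_end((u64)-1 - 4096, 8192) == (u64)-1);
}
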
static int tree_insert(struct rb_root_cached *root, struct extent_map *em)
{
	struct rb_node **p = &root->rb_root.rb_node;
	struct rb_node *parent = NULL;
	struct extent_map *entry = NULL;
	struct rb_node *orig_parent = NULL;
	u64 end = range_end(em->start, em->len);
	bool leftmost = true;

	while (*p) {
		parent = *p;
		entry = rb_entry(parent, struct extent_map, rb_node);

		if (em->start < entry->start) {
			p = &(*p)->rb_left;
		} else if (em->start >= extent_map_end(entry)) {
			p = &(*p)->rb_right;
			leftmost = false;
		} else {
			return -EEXIST;
		}
	}

	orig_parent = parent;
	while (parent && em->start >= extent_map_end(entry)) {
		parent = rb_next(parent);
		entry = rb_entry(parent, struct extent_map, rb_node);
	}
	if (parent)
		if (end > entry->start && em->start < extent_map_end(entry))
			return -EEXIST;

	parent = orig_parent;
	entry = rb_entry(parent, struct extent_map, rb_node);
	while (parent && em->start < entry->start) {
		parent = rb_prev(parent);
		entry = rb_entry(parent, struct extent_map, rb_node);
	}
	if (parent)
		if (end > entry->start && em->start < extent_map_end(entry))
			return -EEXIST;

	rb_link_node(&em->rb_node, orig_parent, p);
	rb_insert_color_cached(&em->rb_node, root, leftmost);
	return 0;
}

/*
 * Search through the tree for an extent_map with a given offset.  If it can't
 * be found, try to find some neighboring extents.
 */
static struct rb_node *__tree_search(struct rb_root *root, u64 offset,
				     struct rb_node **prev_or_next_ret)
{
	struct rb_node *n = root->rb_node;
	struct rb_node *prev = NULL;
	struct rb_node *orig_prev = NULL;
	struct extent_map *entry;
	struct extent_map *prev_entry = NULL;

	ASSERT(prev_or_next_ret);

	while (n) {
		entry = rb_entry(n, struct extent_map, rb_node);
		prev = n;
		prev_entry = entry;

		if (offset < entry->start)
			n = n->rb_left;
		else if (offset >= extent_map_end(entry))
			n = n->rb_right;
		else
			return n;
	}

	orig_prev = prev;
	while (prev && offset >= extent_map_end(prev_entry)) {
		prev = rb_next(prev);
		prev_entry = rb_entry(prev, struct extent_map, rb_node);
	}

	/*
	 * Previous extent map found, return as in this case the caller does not
	 * care about the next one.
	 */
	if (prev) {
		*prev_or_next_ret = prev;
		return NULL;
	}

	prev = orig_prev;
	prev_entry = rb_entry(prev, struct extent_map, rb_node);
	while (prev && offset < prev_entry->start) {
		prev = rb_prev(prev);
		prev_entry = rb_entry(prev, struct extent_map, rb_node);
	}
	*prev_or_next_ret = prev;

	return NULL;
}

static inline u64 extent_map_block_end(const struct extent_map *em)
{
	if (em->block_start + em->block_len < em->block_start)
		return (u64)-1;
	return em->block_start + em->block_len;
}

static bool can_merge_extent_map(const struct extent_map *em)
{
	if (test_bit(EXTENT_FLAG_PINNED, &em->flags))
		return false;

	/* Don't merge compressed extents, we need to know their actual size. */
	if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
		return false;

	if (test_bit(EXTENT_FLAG_LOGGING, &em->flags))
		return false;

	/*
	 * We don't want to merge stuff that hasn't been written to the log yet
	 * since it may not reflect exactly what is on disk, and that would be
	 * bad.
	 */
	if (!list_empty(&em->list))
		return false;

	return true;
}

/* Check to see if two extent_map structs are adjacent and safe to merge. */
static int mergable_maps(struct extent_map *prev, struct extent_map *next)
{
	if (extent_map_end(prev) == next->start &&
	    prev->flags == next->flags &&
	    ((next->block_start == EXTENT_MAP_HOLE &&
	      prev->block_start == EXTENT_MAP_HOLE) ||
	     (next->block_start == EXTENT_MAP_INLINE &&
	      prev->block_start == EXTENT_MAP_INLINE) ||
	     (next->block_start < EXTENT_MAP_LAST_BYTE - 1 &&
	      next->block_start == extent_map_block_end(prev)))) {
		return 1;
	}
	return 0;
}

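/*
 * A hypothetical, illustrative pair of mappings (this function and its values
 * are made up and not used anywhere): two unpinned, uncompressed extents are
 * merge candidates only when they are contiguous both in the file (prev ends
 * where next starts) and on disk (next->block_start equals
 * extent_map_block_end(prev)), with equal flags.
 */
static void __maybe_unused mergable_maps_example(void)
{
	struct extent_map prev = { .start = 0, .len = 4096,
				   .block_start = 1024 * 1024, .block_len = 4096 };
	struct extent_map next = { .start = 4096, .len = 4096,
				   .block_start = 1024 * 1024 + 4096, .block_len = 4096 };

	/* File-contiguous and disk-contiguous, same flags: mergable. */
	ASSERT(mergable_maps(&prev, &next) == 1);

	/* A hole following a real extent is never merged into it. */
	next.block_start = EXTENT_MAP_HOLE;
	ASSERT(mergable_maps(&prev, &next) == 0);
}
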
static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)
{
	struct extent_map *merge = NULL;
	struct rb_node *rb;

	/*
	 * We can't modify an extent map that is in the tree and that is being
	 * used by another task, as it can cause that other task to see it in
	 * inconsistent state during the merging. We always have 1 reference for
	 * the tree and 1 for this task (which is unpinning the extent map or
	 * clearing the logging flag), so anything > 2 means it's being used by
	 * other tasks too.
	 */
	if (refcount_read(&em->refs) > 2)
		return;

	if (!can_merge_extent_map(em))
		return;

	if (em->start != 0) {
		rb = rb_prev(&em->rb_node);
		if (rb)
			merge = rb_entry(rb, struct extent_map, rb_node);
		if (rb && can_merge_extent_map(merge) && mergable_maps(merge, em)) {
			em->start = merge->start;
			em->orig_start = merge->orig_start;
			em->len += merge->len;
			em->block_len += merge->block_len;
			em->block_start = merge->block_start;
			em->mod_len = (em->mod_len + em->mod_start) - merge->mod_start;
			em->mod_start = merge->mod_start;
			em->generation = max(em->generation, merge->generation);
			set_bit(EXTENT_FLAG_MERGED, &em->flags);

			rb_erase_cached(&merge->rb_node, &tree->map);
			RB_CLEAR_NODE(&merge->rb_node);
			free_extent_map(merge);
		}
	}

	rb = rb_next(&em->rb_node);
	if (rb)
		merge = rb_entry(rb, struct extent_map, rb_node);
	if (rb && can_merge_extent_map(merge) && mergable_maps(em, merge)) {
		em->len += merge->len;
		em->block_len += merge->block_len;
		rb_erase_cached(&merge->rb_node, &tree->map);
		RB_CLEAR_NODE(&merge->rb_node);
		em->mod_len = (merge->mod_start + merge->mod_len) - em->mod_start;
		em->generation = max(em->generation, merge->generation);
		set_bit(EXTENT_FLAG_MERGED, &em->flags);
		free_extent_map(merge);
	}
}

/*
 * Unpin an extent from the cache.
 *
 * @inode:	the inode from which we are unpinning an extent range
 * @start:	logical offset in the file
 * @len:	length of the extent
 * @gen:	generation that this extent has been modified in
 *
 * Called after an extent has been written to disk properly. Set the generation
 * to the generation that actually added the file item to the inode so we know
 * we need to sync this extent when we call fsync().
 */
int unpin_extent_cache(struct btrfs_inode *inode, u64 start, u64 len, u64 gen)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	struct extent_map_tree *tree = &inode->extent_tree;
	int ret = 0;
	struct extent_map *em;
	bool prealloc = false;

	write_lock(&tree->lock);
	em = lookup_extent_mapping(tree, start, len);

	if (WARN_ON(!em)) {
		btrfs_warn(fs_info,
	"no extent map found for inode %llu (root %lld) when unpinning extent range [%llu, %llu), generation %llu",
			   btrfs_ino(inode), btrfs_root_id(inode->root),
			   start, len, gen);
		goto out;
	}

	if (WARN_ON(em->start != start))
		btrfs_warn(fs_info,
	"found extent map for inode %llu (root %lld) with unexpected start offset %llu when unpinning extent range [%llu, %llu), generation %llu",
			   btrfs_ino(inode), btrfs_root_id(inode->root),
			   em->start, start, len, gen);

	em->generation = gen;
	clear_bit(EXTENT_FLAG_PINNED, &em->flags);
	em->mod_start = em->start;
	em->mod_len = em->len;

	if (test_bit(EXTENT_FLAG_FILLING, &em->flags)) {
		prealloc = true;
		clear_bit(EXTENT_FLAG_FILLING, &em->flags);
	}

	try_merge_map(tree, em);

	if (prealloc) {
		em->mod_start = em->start;
		em->mod_len = em->len;
	}

	free_extent_map(em);
out:
	write_unlock(&tree->lock);
	return ret;
}

void clear_em_logging(struct extent_map_tree *tree, struct extent_map *em)
{
	lockdep_assert_held_write(&tree->lock);

	clear_bit(EXTENT_FLAG_LOGGING, &em->flags);
	if (extent_map_in_tree(em))
		try_merge_map(tree, em);
}

static inline void setup_extent_mapping(struct extent_map_tree *tree,
					struct extent_map *em,
					int modified)
{
	refcount_inc(&em->refs);
	em->mod_start = em->start;
	em->mod_len = em->len;

	ASSERT(list_empty(&em->list));

	if (modified)
		list_add(&em->list, &tree->modified_extents);
	else
		try_merge_map(tree, em);
}

/*
 * Add new extent map to the extent tree
 *
 * @tree:	tree to insert new map in
 * @em:		map to insert
 * @modified:	indicate whether the given @em should be added to the
 *		modified list, which indicates the extent needs to be logged
 *
 * Insert @em into @tree or perform a simple forward/backward merge with
 * existing mappings.  The extent_map struct passed in will be inserted
 * into the tree directly, with an additional reference taken, or a
 * reference dropped if the merge attempt was successful.
 */
static int add_extent_mapping(struct extent_map_tree *tree,
			      struct extent_map *em, int modified)
{
	int ret = 0;

	lockdep_assert_held_write(&tree->lock);

	ret = tree_insert(&tree->map, em);
	if (ret)
		goto out;

	setup_extent_mapping(tree, em, modified);
out:
	return ret;
}

static struct extent_map *
__lookup_extent_mapping(struct extent_map_tree *tree,
			u64 start, u64 len, int strict)
{
	struct extent_map *em;
	struct rb_node *rb_node;
	struct rb_node *prev_or_next = NULL;
	u64 end = range_end(start, len);

	rb_node = __tree_search(&tree->map.rb_root, start, &prev_or_next);
	if (!rb_node) {
		if (prev_or_next)
			rb_node = prev_or_next;
		else
			return NULL;
	}

	em = rb_entry(rb_node, struct extent_map, rb_node);

	if (strict && !(end > em->start && start < extent_map_end(em)))
		return NULL;

	refcount_inc(&em->refs);
	return em;
}

/*
 * Lookup extent_map that intersects @start + @len range.
 *
 * @tree:	tree to lookup in
 * @start:	byte offset to start the search
 * @len:	length of the lookup range
 *
 * Find and return the first extent_map struct in @tree that intersects the
 * [start, len] range.  There may be additional objects in the tree that
 * intersect, so check the object returned carefully to make sure that no
 * additional lookups are needed.
 */
struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
					 u64 start, u64 len)
{
	return __lookup_extent_mapping(tree, start, len, 1);
}

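/*
 * A hypothetical, illustrative lookup (this function is made up and not used
 * anywhere): a reader takes the tree lock in shared mode around the search
 * and must drop the extra reference that a successful lookup takes on the
 * returned extent map.
 */
static u64 __maybe_unused lookup_example(struct extent_map_tree *tree,
					 u64 start, u64 len)
{
	struct extent_map *em;
	u64 block_start = EXTENT_MAP_HOLE;

	read_lock(&tree->lock);
	em = lookup_extent_mapping(tree, start, len);
	if (em)
		block_start = em->block_start;
	read_unlock(&tree->lock);

	/* Drop the reference the lookup took (a NULL map is a no-op). */
	free_extent_map(em);

	return block_start;
}
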
/*
 * Find a nearby extent map intersecting @start + @len (not an exact search).
 *
 * @tree:	tree to lookup in
 * @start:	byte offset to start the search
 * @len:	length of the lookup range
 *
 * Find and return the first extent_map struct in @tree that intersects the
 * [start, len] range.
 *
 * If one can't be found, any nearby extent may be returned.
 */
struct extent_map *search_extent_mapping(struct extent_map_tree *tree,
					 u64 start, u64 len)
{
	return __lookup_extent_mapping(tree, start, len, 0);
}

/*
 * Remove an extent_map from the extent tree.
 *
 * @tree:	extent tree to remove from
 * @em:		extent map being removed
 *
 * Remove @em from @tree.  No reference counts are dropped, and no checks
 * are done to see if the range is in use.
 */
void remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em)
{
	lockdep_assert_held_write(&tree->lock);

	WARN_ON(test_bit(EXTENT_FLAG_PINNED, &em->flags));
	rb_erase_cached(&em->rb_node, &tree->map);
	if (!test_bit(EXTENT_FLAG_LOGGING, &em->flags))
		list_del_init(&em->list);
	RB_CLEAR_NODE(&em->rb_node);
}

static void replace_extent_mapping(struct extent_map_tree *tree,
				   struct extent_map *cur,
				   struct extent_map *new,
				   int modified)
{
	lockdep_assert_held_write(&tree->lock);

	WARN_ON(test_bit(EXTENT_FLAG_PINNED, &cur->flags));
	ASSERT(extent_map_in_tree(cur));
	if (!test_bit(EXTENT_FLAG_LOGGING, &cur->flags))
		list_del_init(&cur->list);
	rb_replace_node_cached(&cur->rb_node, &new->rb_node, &tree->map);
	RB_CLEAR_NODE(&cur->rb_node);

	setup_extent_mapping(tree, new, modified);
}

static struct extent_map *next_extent_map(const struct extent_map *em)
{
	struct rb_node *next;

	next = rb_next(&em->rb_node);
	if (!next)
		return NULL;
	return container_of(next, struct extent_map, rb_node);
}

static struct extent_map *prev_extent_map(struct extent_map *em)
{
	struct rb_node *prev;

	prev = rb_prev(&em->rb_node);
	if (!prev)
		return NULL;
	return container_of(prev, struct extent_map, rb_node);
}

/*
 * Helper for btrfs_get_extent.  Given an existing extent in the tree (the
 * extent nearest to map_start) and an extent that we want to insert, deal
 * with the overlap and insert the best fitting new extent into the tree.
 */
static noinline int merge_extent_mapping(struct extent_map_tree *em_tree,
					 struct extent_map *existing,
					 struct extent_map *em,
					 u64 map_start)
{
	struct extent_map *prev;
	struct extent_map *next;
	u64 start;
	u64 end;
	u64 start_diff;

	BUG_ON(map_start < em->start || map_start >= extent_map_end(em));

	if (existing->start > map_start) {
		next = existing;
		prev = prev_extent_map(next);
	} else {
		prev = existing;
		next = next_extent_map(prev);
	}

	start = prev ? extent_map_end(prev) : em->start;
	start = max_t(u64, start, em->start);
	end = next ? next->start : extent_map_end(em);
	end = min_t(u64, end, extent_map_end(em));
	start_diff = start - em->start;
	em->start = start;
	em->len = end - start;
	if (em->block_start < EXTENT_MAP_LAST_BYTE &&
	    !test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
		em->block_start += start_diff;
		em->block_len = em->len;
	}
	return add_extent_mapping(em_tree, em, 0);
}

/*
 * Add extent mapping into em_tree.
 *
 * @fs_info:	the filesystem
 * @em_tree:	extent tree into which we want to insert the extent mapping
 * @em_in:	extent we are inserting
 * @start:	start of the logical range btrfs_get_extent() is requesting
 * @len:	length of the logical range btrfs_get_extent() is requesting
 *
 * Note that @em_in's range may be different from [start, start+len),
 * but they must overlap.
 *
 * Insert @em_in into @em_tree.  In case there is an overlapping range, handle
 * the -EEXIST by either:
 * a) Returning the existing extent in @em_in if @start is within the
 *    existing em.
 * b) Merging the existing extent with @em_in passed in.
 *
 * Return 0 on success, otherwise -EEXIST.
 */
int btrfs_add_extent_mapping(struct btrfs_fs_info *fs_info,
			     struct extent_map_tree *em_tree,
			     struct extent_map **em_in, u64 start, u64 len)
{
	int ret;
	struct extent_map *em = *em_in;

	/*
	 * Tree-checker should have rejected any inline extent with non-zero
	 * file offset.  Here just do a sanity check.
	 */
	if (em->block_start == EXTENT_MAP_INLINE)
		ASSERT(em->start == 0);

	ret = add_extent_mapping(em_tree, em, 0);
	/*
	 * It is possible that someone inserted the extent into the tree while
	 * we had the lock dropped.  It is also possible that an overlapping
	 * map exists in the tree.
	 */
	if (ret == -EEXIST) {
		struct extent_map *existing;

		existing = search_extent_mapping(em_tree, start, len);

		trace_btrfs_handle_em_exist(fs_info, existing, em, start, len);

		/*
		 * existing will always be non-NULL, since there must be an
		 * extent causing the -EEXIST.
		 */
		if (start >= existing->start &&
		    start < extent_map_end(existing)) {
			free_extent_map(em);
			*em_in = existing;
			ret = 0;
		} else {
			u64 orig_start = em->start;
			u64 orig_len = em->len;

			/*
			 * The existing extent map is the one nearest to
			 * the [start, start + len) range which overlaps.
			 */
			ret = merge_extent_mapping(em_tree, existing,
						   em, start);
			if (ret) {
				free_extent_map(em);
				*em_in = NULL;
				WARN_ONCE(ret,
	"unexpected error %d: merge existing(start %llu len %llu) with em(start %llu len %llu)\n",
					  ret, existing->start, existing->len,
					  orig_start, orig_len);
			}
			free_extent_map(existing);
		}
	}

	ASSERT(ret == 0 || ret == -EEXIST);
	return ret;
}

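/*
 * A hypothetical, illustrative caller (this function is made up and not used
 * anywhere), loosely modelled on what btrfs_get_extent() does: the tree lock
 * must be held for writing across the call, and on return *em_in may have
 * been freed and replaced with the extent map that already covers the range.
 */
static int __maybe_unused add_mapping_example(struct btrfs_fs_info *fs_info,
					      struct extent_map_tree *em_tree,
					      struct extent_map **em_in)
{
	int ret;

	write_lock(&em_tree->lock);
	ret = btrfs_add_extent_mapping(fs_info, em_tree, em_in,
				       (*em_in)->start, (*em_in)->len);
	write_unlock(&em_tree->lock);

	/* On success, *em_in is the map that ended up representing the range. */
	return ret;
}
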
/*
 * Drop all extent maps from a tree in the fastest possible way, rescheduling
 * if needed. This avoids searching the tree, from the root down to the first
 * extent map, before each deletion.
 */
static void drop_all_extent_maps_fast(struct extent_map_tree *tree)
{
	write_lock(&tree->lock);
	while (!RB_EMPTY_ROOT(&tree->map.rb_root)) {
		struct extent_map *em;
		struct rb_node *node;

		node = rb_first_cached(&tree->map);
		em = rb_entry(node, struct extent_map, rb_node);
		clear_bit(EXTENT_FLAG_PINNED, &em->flags);
		clear_bit(EXTENT_FLAG_LOGGING, &em->flags);
		remove_extent_mapping(tree, em);
		free_extent_map(em);
		cond_resched_rwlock_write(&tree->lock);
	}
	write_unlock(&tree->lock);
}

/*
 * Drop all extent maps in a given range.
 *
 * @inode:	The target inode.
 * @start:	Start offset of the range.
 * @end:	End offset of the range (inclusive value).
 * @skip_pinned: Indicate if pinned extent maps should be ignored or not.
 *
 * This drops all the extent maps that intersect the given range [@start, @end].
 * Extent maps that partially overlap the range and extend behind or beyond it
 * are split.
 * The caller should have locked an appropriate file range in the inode's io
 * tree before calling this function.
 */
void btrfs_drop_extent_map_range(struct btrfs_inode *inode, u64 start, u64 end,
				 bool skip_pinned)
{
	struct extent_map *split;
	struct extent_map *split2;
	struct extent_map *em;
	struct extent_map_tree *em_tree = &inode->extent_tree;
	u64 len = end - start + 1;

	WARN_ON(end < start);
	if (end == (u64)-1) {
		if (start == 0 && !skip_pinned) {
			drop_all_extent_maps_fast(em_tree);
			return;
		}
		len = (u64)-1;
	} else {
		/* Make end offset exclusive for use in the loop below. */
		end++;
	}

	/*
	 * It's ok if we fail to allocate the extent maps, see the comment near
	 * the bottom of the loop below. We only need two spare extent maps in
	 * the worst case, where the first extent map that intersects our range
	 * starts before the range and the last extent map that intersects our
	 * range ends after our range (and they might be the same extent map),
	 * because we need to split those two extent maps at the boundaries.
	 */
	split = alloc_extent_map();
	split2 = alloc_extent_map();

	write_lock(&em_tree->lock);
	em = lookup_extent_mapping(em_tree, start, len);

	while (em) {
		/* extent_map_end() returns exclusive value (last byte + 1). */
		const u64 em_end = extent_map_end(em);
		struct extent_map *next_em = NULL;
		u64 gen;
		unsigned long flags;
		bool modified;
		bool compressed;

		if (em_end < end) {
			next_em = next_extent_map(em);
			if (next_em) {
				if (next_em->start < end)
					refcount_inc(&next_em->refs);
				else
					next_em = NULL;
			}
		}

		if (skip_pinned && test_bit(EXTENT_FLAG_PINNED, &em->flags)) {
			start = em_end;
			goto next;
		}

		flags = em->flags;
		clear_bit(EXTENT_FLAG_PINNED, &em->flags);
		/*
		 * In case we split the extent map, we want to preserve the
		 * EXTENT_FLAG_LOGGING flag on our extent map, but we don't want
		 * it on the new extent maps.
		 */
		clear_bit(EXTENT_FLAG_LOGGING, &flags);
		modified = !list_empty(&em->list);

		/*
		 * The extent map does not cross our target range, so no need to
		 * split it, we can remove it directly.
		 */
		if (em->start >= start && em_end <= end)
			goto remove_em;

		gen = em->generation;
		compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);

		if (em->start < start) {
			if (!split) {
				split = split2;
				split2 = NULL;
				if (!split)
					goto remove_em;
			}
			split->start = em->start;
			split->len = start - em->start;

			if (em->block_start < EXTENT_MAP_LAST_BYTE) {
				split->orig_start = em->orig_start;
				split->block_start = em->block_start;

				if (compressed)
					split->block_len = em->block_len;
				else
					split->block_len = split->len;
				split->orig_block_len = max(split->block_len,
							    em->orig_block_len);
				split->ram_bytes = em->ram_bytes;
			} else {
				split->orig_start = split->start;
				split->block_len = 0;
				split->block_start = em->block_start;
				split->orig_block_len = 0;
				split->ram_bytes = split->len;
			}

			split->generation = gen;
			split->flags = flags;
			split->compress_type = em->compress_type;
			replace_extent_mapping(em_tree, em, split, modified);
			free_extent_map(split);
			split = split2;
			split2 = NULL;
		}
		if (em_end > end) {
			if (!split) {
				split = split2;
				split2 = NULL;
				if (!split)
					goto remove_em;
			}
			split->start = end;
			split->len = em_end - end;
			split->block_start = em->block_start;
			split->flags = flags;
			split->compress_type = em->compress_type;
			split->generation = gen;

			if (em->block_start < EXTENT_MAP_LAST_BYTE) {
				split->orig_block_len = max(em->block_len,
							    em->orig_block_len);

				split->ram_bytes = em->ram_bytes;
				if (compressed) {
					split->block_len = em->block_len;
					split->orig_start = em->orig_start;
				} else {
					const u64 diff = start + len - em->start;

					split->block_len = split->len;
					split->block_start += diff;
					split->orig_start = em->orig_start;
				}
			} else {
				split->ram_bytes = split->len;
				split->orig_start = split->start;
				split->block_len = 0;
				split->orig_block_len = 0;
			}

			if (extent_map_in_tree(em)) {
				replace_extent_mapping(em_tree, em, split,
						       modified);
			} else {
				int ret;

				ret = add_extent_mapping(em_tree, split,
							 modified);
				/* Logic error, shouldn't happen. */
				ASSERT(ret == 0);
				if (WARN_ON(ret != 0) && modified)
					btrfs_set_inode_full_sync(inode);
			}
			free_extent_map(split);
			split = NULL;
		}
remove_em:
		if (extent_map_in_tree(em)) {
			/*
			 * If the extent map is still in the tree it means that
			 * either of the following is true:
			 *
			 * 1) It fits entirely in our range (doesn't end beyond
			 *    it or starts before it);
			 *
			 * 2) It starts before our range and/or ends after our
			 *    range, and we were not able to allocate the extent
			 *    maps for split operations, @split and @split2.
			 *
			 * If we are at case 2) then we just remove the entire
			 * extent map - this is fine since anyone who needs to
			 * access the subranges outside our range will just
			 * load it again from the subvolume tree's file extent
			 * item. However if the extent map was in the list of
			 * modified extents, then we must mark the inode for a
			 * full fsync, otherwise a fast fsync will miss this
			 * extent if it's new and needs to be logged.
			 */
			if ((em->start < start || em_end > end) && modified) {
				ASSERT(!split);
				btrfs_set_inode_full_sync(inode);
			}
			remove_extent_mapping(em_tree, em);
		}

		/*
		 * Once for the tree reference (we replaced or removed the
		 * extent map from the tree).
		 */
		free_extent_map(em);
next:
		/* Once for us (for our lookup reference). */
		free_extent_map(em);

		em = next_em;
	}

	write_unlock(&em_tree->lock);

	free_extent_map(split);
	free_extent_map(split2);
}
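
/*
 * A hypothetical, illustrative caller (this function and its offsets are made
 * up and not used anywhere): the file range in the inode's io tree is
 * expected to be locked across the drop, mirroring the lock_extent() and
 * unlock_extent() calls used elsewhere in this file.
 */
static void __maybe_unused drop_range_example(struct btrfs_inode *inode)
{
	const u64 start = 0;
	const u64 end = (4096ULL * 16) - 1;	/* Inclusive end offset. */

	lock_extent(&inode->io_tree, start, end, NULL);
	/* Maps partially overlapping the edges of the range get split. */
	btrfs_drop_extent_map_range(inode, start, end, false);
	unlock_extent(&inode->io_tree, start, end, NULL);
}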

/*
 * Replace a range in the inode's extent map tree with a new extent map.
 *
 * @inode:	The target inode.
 * @new_em:	The new extent map to add to the inode's extent map tree.
 * @modified:	Indicate if the new extent map should be added to the list of
 *		modified extents (for fast fsync tracking).
 *
 * Drops all the extent maps in the inode's extent map tree that intersect the
 * range of the new extent map and adds the new extent map to the tree.
 * The caller should have locked an appropriate file range in the inode's io
 * tree before calling this function.
 */
int btrfs_replace_extent_map_range(struct btrfs_inode *inode,
				   struct extent_map *new_em,
				   bool modified)
{
	const u64 end = new_em->start + new_em->len - 1;
	struct extent_map_tree *tree = &inode->extent_tree;
	int ret;

	ASSERT(!extent_map_in_tree(new_em));

	/*
	 * The caller has locked an appropriate file range in the inode's io
	 * tree, but getting -EEXIST when adding the new extent map can still
	 * happen in case there are extents that partially cover the range, and
	 * this is due to two tasks operating on different parts of the extent.
	 * See commit 18e83ac75bfe67 ("Btrfs: fix unexpected EEXIST from
	 * btrfs_get_extent") for an example and details.
	 */
	do {
		btrfs_drop_extent_map_range(inode, new_em->start, end, false);
		write_lock(&tree->lock);
		ret = add_extent_mapping(tree, new_em, modified);
		write_unlock(&tree->lock);
	} while (ret == -EEXIST);

	return ret;
}

/*
 * Split off the first pre bytes from the extent_map at [start, start + len],
 * and set the block_start for it to new_logical.
 *
 * This function is used when an ordered_extent needs to be split.
 */
int split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pre,
		     u64 new_logical)
{
	struct extent_map_tree *em_tree = &inode->extent_tree;
	struct extent_map *em;
	struct extent_map *split_pre = NULL;
	struct extent_map *split_mid = NULL;
	int ret = 0;
	unsigned long flags;

	ASSERT(pre != 0);
	ASSERT(pre < len);

	split_pre = alloc_extent_map();
	if (!split_pre)
		return -ENOMEM;
	split_mid = alloc_extent_map();
	if (!split_mid) {
		ret = -ENOMEM;
		goto out_free_pre;
	}

	lock_extent(&inode->io_tree, start, start + len - 1, NULL);
	write_lock(&em_tree->lock);
	em = lookup_extent_mapping(em_tree, start, len);
	if (!em) {
		ret = -EIO;
		goto out_unlock;
	}

	ASSERT(em->len == len);
	ASSERT(!test_bit(EXTENT_FLAG_COMPRESSED, &em->flags));
	ASSERT(em->block_start < EXTENT_MAP_LAST_BYTE);
	ASSERT(test_bit(EXTENT_FLAG_PINNED, &em->flags));
	ASSERT(!test_bit(EXTENT_FLAG_LOGGING, &em->flags));
	ASSERT(!list_empty(&em->list));

	flags = em->flags;
	clear_bit(EXTENT_FLAG_PINNED, &em->flags);

	/* First, replace the em with a new extent_map starting from em->start. */
	split_pre->start = em->start;
	split_pre->len = pre;
	split_pre->orig_start = split_pre->start;
	split_pre->block_start = new_logical;
	split_pre->block_len = split_pre->len;
	split_pre->orig_block_len = split_pre->block_len;
	split_pre->ram_bytes = split_pre->len;
	split_pre->flags = flags;
	split_pre->compress_type = em->compress_type;
	split_pre->generation = em->generation;

	replace_extent_mapping(em_tree, em, split_pre, 1);

	/*
	 * Now we only have an extent_map at:
	 * [em->start, em->start + pre]
	 */

	/* Insert the middle extent_map. */
	split_mid->start = em->start + pre;
	split_mid->len = em->len - pre;
	split_mid->orig_start = split_mid->start;
	split_mid->block_start = em->block_start + pre;
	split_mid->block_len = split_mid->len;
	split_mid->orig_block_len = split_mid->block_len;
	split_mid->ram_bytes = split_mid->len;
	split_mid->flags = flags;
	split_mid->compress_type = em->compress_type;
	split_mid->generation = em->generation;
	add_extent_mapping(em_tree, split_mid, 1);

	/* Once for us. */
	free_extent_map(em);
	/* Once for the tree. */
	free_extent_map(em);

out_unlock:
	write_unlock(&em_tree->lock);
	unlock_extent(&inode->io_tree, start, start + len - 1, NULL);
	free_extent_map(split_mid);
out_free_pre:
	free_extent_map(split_pre);
	return ret;
}