]> git.ipfire.org Git - thirdparty/linux.git/blame - fs/btrfs/extent_map.c
btrfs: avoid useless rbtree iterations when attempting to merge extent map
[thirdparty/linux.git] / fs / btrfs / extent_map.c
CommitLineData
b2441318 1// SPDX-License-Identifier: GPL-2.0
c1d7c514 2
d1310b2e 3#include <linux/err.h>
d1310b2e 4#include <linux/slab.h>
a52d9a80 5#include <linux/spinlock.h>
9b569ea0 6#include "messages.h"
261507a0 7#include "ctree.h"
1c11b63e 8#include "volumes.h"
a52d9a80 9#include "extent_map.h"
ebb8765b 10#include "compression.h"
4c0c8cfc 11#include "btrfs_inode.h"
a52d9a80 12
86479a04 13
a52d9a80 14static struct kmem_cache *extent_map_cache;
ca664626 15
2f4cbe64 16int __init extent_map_init(void)
a52d9a80 17{
837e1972 18 extent_map_cache = kmem_cache_create("btrfs_extent_map",
9601e3f6 19 sizeof(struct extent_map), 0,
fba4b697 20 SLAB_MEM_SPREAD, NULL);
2f4cbe64
WB
21 if (!extent_map_cache)
22 return -ENOMEM;
2f4cbe64 23 return 0;
a52d9a80
CM
24}
25
e67c718b 26void __cold extent_map_exit(void)
a52d9a80 27{
5598e900 28 kmem_cache_destroy(extent_map_cache);
a52d9a80
CM
29}
30
43dd529a
DS
31/*
32 * Initialize the extent tree @tree. Should be called for each new inode or
33 * other user of the extent_map interface.
9d2423c5 34 */
a8067e02 35void extent_map_tree_init(struct extent_map_tree *tree)
a52d9a80 36{
07e1ce09 37 tree->map = RB_ROOT_CACHED;
5dc562c5 38 INIT_LIST_HEAD(&tree->modified_extents);
890871be 39 rwlock_init(&tree->lock);
a52d9a80 40}
a52d9a80 41
43dd529a
DS
42/*
43 * Allocate a new extent_map structure. The new structure is returned with a
44 * reference count of one and needs to be freed using free_extent_map()
9d2423c5 45 */
172ddd60 46struct extent_map *alloc_extent_map(void)
a52d9a80
CM
47{
48 struct extent_map *em;
70c8a91c 49 em = kmem_cache_zalloc(extent_map_cache, GFP_NOFS);
c26a9203
TI
50 if (!em)
51 return NULL;
cbc0e928 52 RB_CLEAR_NODE(&em->rb_node);
261507a0 53 em->compress_type = BTRFS_COMPRESS_NONE;
490b54d6 54 refcount_set(&em->refs, 1);
5dc562c5 55 INIT_LIST_HEAD(&em->list);
a52d9a80
CM
56 return em;
57}
a52d9a80 58
43dd529a
DS
59/*
60 * Drop the reference out on @em by one and free the structure if the reference
61 * count hits zero.
9d2423c5 62 */
a52d9a80
CM
63void free_extent_map(struct extent_map *em)
64{
2bf5a725
CM
65 if (!em)
66 return;
490b54d6 67 if (refcount_dec_and_test(&em->refs)) {
cbc0e928 68 WARN_ON(extent_map_in_tree(em));
5dc562c5 69 WARN_ON(!list_empty(&em->list));
a52d9a80
CM
70 kmem_cache_free(extent_map_cache, em);
71 }
72}
a52d9a80 73
43dd529a 74/* Do the math around the end of an extent, handling wrapping. */
32193c14
FDBM
75static u64 range_end(u64 start, u64 len)
76{
77 if (start + len < start)
78 return (u64)-1;
79 return start + len;
80}
81
07e1ce09 82static int tree_insert(struct rb_root_cached *root, struct extent_map *em)
a52d9a80 83{
07e1ce09 84 struct rb_node **p = &root->rb_root.rb_node;
d397712b 85 struct rb_node *parent = NULL;
32193c14
FDBM
86 struct extent_map *entry = NULL;
87 struct rb_node *orig_parent = NULL;
88 u64 end = range_end(em->start, em->len);
07e1ce09 89 bool leftmost = true;
a52d9a80 90
d397712b 91 while (*p) {
a52d9a80 92 parent = *p;
d1310b2e
CM
93 entry = rb_entry(parent, struct extent_map, rb_node);
94
07e1ce09 95 if (em->start < entry->start) {
a52d9a80 96 p = &(*p)->rb_left;
07e1ce09 97 } else if (em->start >= extent_map_end(entry)) {
a52d9a80 98 p = &(*p)->rb_right;
07e1ce09
LB
99 leftmost = false;
100 } else {
32193c14 101 return -EEXIST;
07e1ce09 102 }
a52d9a80
CM
103 }
104
32193c14
FDBM
105 orig_parent = parent;
106 while (parent && em->start >= extent_map_end(entry)) {
107 parent = rb_next(parent);
108 entry = rb_entry(parent, struct extent_map, rb_node);
109 }
110 if (parent)
111 if (end > entry->start && em->start < extent_map_end(entry))
112 return -EEXIST;
113
114 parent = orig_parent;
115 entry = rb_entry(parent, struct extent_map, rb_node);
116 while (parent && em->start < entry->start) {
117 parent = rb_prev(parent);
118 entry = rb_entry(parent, struct extent_map, rb_node);
119 }
120 if (parent)
121 if (end > entry->start && em->start < extent_map_end(entry))
122 return -EEXIST;
123
32193c14 124 rb_link_node(&em->rb_node, orig_parent, p);
07e1ce09 125 rb_insert_color_cached(&em->rb_node, root, leftmost);
32193c14 126 return 0;
a52d9a80
CM
127}
128
d352ac68 129/*
43dd529a
DS
130 * Search through the tree for an extent_map with a given offset. If it can't
131 * be found, try to find some neighboring extents
d352ac68 132 */
a52d9a80 133static struct rb_node *__tree_search(struct rb_root *root, u64 offset,
6c05813e 134 struct rb_node **prev_or_next_ret)
a52d9a80 135{
d397712b 136 struct rb_node *n = root->rb_node;
a52d9a80 137 struct rb_node *prev = NULL;
5f56406a 138 struct rb_node *orig_prev = NULL;
d1310b2e
CM
139 struct extent_map *entry;
140 struct extent_map *prev_entry = NULL;
a52d9a80 141
6c05813e 142 ASSERT(prev_or_next_ret);
08f088dd 143
d397712b 144 while (n) {
d1310b2e 145 entry = rb_entry(n, struct extent_map, rb_node);
a52d9a80
CM
146 prev = n;
147 prev_entry = entry;
148
149 if (offset < entry->start)
150 n = n->rb_left;
d1310b2e 151 else if (offset >= extent_map_end(entry))
a52d9a80
CM
152 n = n->rb_right;
153 else
154 return n;
155 }
5f56406a 156
08f088dd
FM
157 orig_prev = prev;
158 while (prev && offset >= extent_map_end(prev_entry)) {
159 prev = rb_next(prev);
160 prev_entry = rb_entry(prev, struct extent_map, rb_node);
5f56406a
CM
161 }
162
6c05813e
FM
163 /*
164 * Previous extent map found, return as in this case the caller does not
165 * care about the next one.
166 */
167 if (prev) {
168 *prev_or_next_ret = prev;
169 return NULL;
170 }
171
172 prev = orig_prev;
08f088dd
FM
173 prev_entry = rb_entry(prev, struct extent_map, rb_node);
174 while (prev && offset < prev_entry->start) {
175 prev = rb_prev(prev);
d1310b2e 176 prev_entry = rb_entry(prev, struct extent_map, rb_node);
a52d9a80 177 }
6c05813e 178 *prev_or_next_ret = prev;
08f088dd 179
a52d9a80
CM
180 return NULL;
181}
182
2ecec0d6
FM
183static inline u64 extent_map_block_end(const struct extent_map *em)
184{
185 if (em->block_start + em->block_len < em->block_start)
186 return (u64)-1;
187 return em->block_start + em->block_len;
188}
189
1a9fb16c 190static bool can_merge_extent_map(const struct extent_map *em)
a52d9a80 191{
1a9fb16c
FM
192 if (test_bit(EXTENT_FLAG_PINNED, &em->flags))
193 return false;
7f3c74fb 194
1a9fb16c
FM
195 /* Don't merge compressed extents, we need to know their actual size. */
196 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
197 return false;
c8b97818 198
1a9fb16c
FM
199 if (test_bit(EXTENT_FLAG_LOGGING, &em->flags))
200 return false;
201a9038 201
09a2a8f9
JB
202 /*
203 * We don't want to merge stuff that hasn't been written to the log yet
204 * since it may not reflect exactly what is on disk, and that would be
205 * bad.
206 */
1a9fb16c
FM
207 if (!list_empty(&em->list))
208 return false;
209
210 return true;
211}
09a2a8f9 212
1a9fb16c
FM
213/* Check to see if two extent_map structs are adjacent and safe to merge. */
214static int mergable_maps(struct extent_map *prev, struct extent_map *next)
215{
d1310b2e
CM
216 if (extent_map_end(prev) == next->start &&
217 prev->flags == next->flags &&
d1310b2e
CM
218 ((next->block_start == EXTENT_MAP_HOLE &&
219 prev->block_start == EXTENT_MAP_HOLE) ||
220 (next->block_start == EXTENT_MAP_INLINE &&
221 prev->block_start == EXTENT_MAP_INLINE) ||
d1310b2e
CM
222 (next->block_start < EXTENT_MAP_LAST_BYTE - 1 &&
223 next->block_start == extent_map_block_end(prev)))) {
224 return 1;
225 }
a52d9a80
CM
226 return 0;
227}
228
4d2c8f62 229static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)
a1ed835e 230{
a1ed835e
CM
231 struct extent_map *merge = NULL;
232 struct rb_node *rb;
a1ed835e 233
ac05ca91
FM
234 /*
235 * We can't modify an extent map that is in the tree and that is being
236 * used by another task, as it can cause that other task to see it in
237 * inconsistent state during the merging. We always have 1 reference for
238 * the tree and 1 for this task (which is unpinning the extent map or
239 * clearing the logging flag), so anything > 2 means it's being used by
240 * other tasks too.
241 */
242 if (refcount_read(&em->refs) > 2)
243 return;
244
1a9fb16c
FM
245 if (!can_merge_extent_map(em))
246 return;
247
a1ed835e
CM
248 if (em->start != 0) {
249 rb = rb_prev(&em->rb_node);
250 if (rb)
251 merge = rb_entry(rb, struct extent_map, rb_node);
1a9fb16c 252 if (rb && can_merge_extent_map(merge) && mergable_maps(merge, em)) {
a1ed835e 253 em->start = merge->start;
70c8a91c 254 em->orig_start = merge->orig_start;
a1ed835e
CM
255 em->len += merge->len;
256 em->block_len += merge->block_len;
257 em->block_start = merge->block_start;
70c8a91c
JB
258 em->mod_len = (em->mod_len + em->mod_start) - merge->mod_start;
259 em->mod_start = merge->mod_start;
260 em->generation = max(em->generation, merge->generation);
199257a7 261 set_bit(EXTENT_FLAG_MERGED, &em->flags);
5dc562c5 262
07e1ce09 263 rb_erase_cached(&merge->rb_node, &tree->map);
cbc0e928 264 RB_CLEAR_NODE(&merge->rb_node);
a1ed835e
CM
265 free_extent_map(merge);
266 }
267 }
268
269 rb = rb_next(&em->rb_node);
270 if (rb)
271 merge = rb_entry(rb, struct extent_map, rb_node);
1a9fb16c 272 if (rb && can_merge_extent_map(merge) && mergable_maps(em, merge)) {
a1ed835e 273 em->len += merge->len;
d527afe1 274 em->block_len += merge->block_len;
07e1ce09 275 rb_erase_cached(&merge->rb_node, &tree->map);
cbc0e928 276 RB_CLEAR_NODE(&merge->rb_node);
70c8a91c
JB
277 em->mod_len = (merge->mod_start + merge->mod_len) - em->mod_start;
278 em->generation = max(em->generation, merge->generation);
199257a7 279 set_bit(EXTENT_FLAG_MERGED, &em->flags);
a1ed835e
CM
280 free_extent_map(merge);
281 }
4d2c8f62
LZ
282}
283
43dd529a
DS
284/*
285 * Unpin an extent from the cache.
286 *
00deaf04 287 * @inode: the inode from which we are unpinning an extent range
5dc562c5
JB
288 * @start: logical offset in the file
289 * @len: length of the extent
290 * @gen: generation that this extent has been modified in
5dc562c5
JB
291 *
292 * Called after an extent has been written to disk properly. Set the generation
293 * to the generation that actually added the file item to the inode so we know
294 * we need to sync this extent when we call fsync().
295 */
00deaf04 296int unpin_extent_cache(struct btrfs_inode *inode, u64 start, u64 len, u64 gen)
4d2c8f62 297{
00deaf04
FM
298 struct btrfs_fs_info *fs_info = inode->root->fs_info;
299 struct extent_map_tree *tree = &inode->extent_tree;
4d2c8f62
LZ
300 int ret = 0;
301 struct extent_map *em;
4e2f84e6 302 bool prealloc = false;
4d2c8f62
LZ
303
304 write_lock(&tree->lock);
305 em = lookup_extent_mapping(tree, start, len);
306
00deaf04
FM
307 if (WARN_ON(!em)) {
308 btrfs_warn(fs_info,
309"no extent map found for inode %llu (root %lld) when unpinning extent range [%llu, %llu), generation %llu",
310 btrfs_ino(inode), btrfs_root_id(inode->root),
311 start, len, gen);
4d2c8f62 312 goto out;
00deaf04
FM
313 }
314
315 if (WARN_ON(em->start != start))
316 btrfs_warn(fs_info,
317"found extent map for inode %llu (root %lld) with unexpected start offset %llu when unpinning extent range [%llu, %llu), generation %llu",
318 btrfs_ino(inode), btrfs_root_id(inode->root),
319 em->start, start, len, gen);
4d2c8f62 320
5dc562c5 321 em->generation = gen;
4d2c8f62 322 clear_bit(EXTENT_FLAG_PINNED, &em->flags);
4e2f84e6
LB
323 em->mod_start = em->start;
324 em->mod_len = em->len;
325
b11e234d 326 if (test_bit(EXTENT_FLAG_FILLING, &em->flags)) {
4e2f84e6 327 prealloc = true;
b11e234d 328 clear_bit(EXTENT_FLAG_FILLING, &em->flags);
4e2f84e6 329 }
4d2c8f62
LZ
330
331 try_merge_map(tree, em);
4e2f84e6
LB
332
333 if (prealloc) {
334 em->mod_start = em->start;
335 em->mod_len = em->len;
336 }
337
a1ed835e
CM
338 free_extent_map(em);
339out:
340 write_unlock(&tree->lock);
341 return ret;
342
343}
344
201a9038
JB
345void clear_em_logging(struct extent_map_tree *tree, struct extent_map *em)
346{
74333c7d
FM
347 lockdep_assert_held_write(&tree->lock);
348
201a9038 349 clear_bit(EXTENT_FLAG_LOGGING, &em->flags);
cbc0e928 350 if (extent_map_in_tree(em))
222c81dc 351 try_merge_map(tree, em);
201a9038
JB
352}
353
176840b3
FM
354static inline void setup_extent_mapping(struct extent_map_tree *tree,
355 struct extent_map *em,
356 int modified)
357{
490b54d6 358 refcount_inc(&em->refs);
176840b3
FM
359 em->mod_start = em->start;
360 em->mod_len = em->len;
361
32d53f6f
FM
362 ASSERT(list_empty(&em->list));
363
176840b3 364 if (modified)
32d53f6f 365 list_add(&em->list, &tree->modified_extents);
176840b3
FM
366 else
367 try_merge_map(tree, em);
368}
369
43dd529a 370/*
401bd2dd
NB
371 * Add new extent map to the extent tree
372 *
9d2423c5
CH
373 * @tree: tree to insert new map in
374 * @em: map to insert
401bd2dd
NB
375 * @modified: indicate whether the given @em should be added to the
376 * modified list, which indicates the extent needs to be logged
9d2423c5
CH
377 *
378 * Insert @em into @tree or perform a simple forward/backward merge with
379 * existing mappings. The extent_map struct passed in will be inserted
380 * into the tree directly, with an additional reference taken, or a
25985edc 381 * reference dropped if the merge attempt was successful.
a52d9a80 382 */
db9d9446
FM
383static int add_extent_mapping(struct extent_map_tree *tree,
384 struct extent_map *em, int modified)
a52d9a80
CM
385{
386 int ret = 0;
a52d9a80 387
d23ea3fa
DS
388 lockdep_assert_held_write(&tree->lock);
389
32193c14
FDBM
390 ret = tree_insert(&tree->map, em);
391 if (ret)
a52d9a80 392 goto out;
32193c14 393
176840b3 394 setup_extent_mapping(tree, em, modified);
a52d9a80 395out:
a52d9a80
CM
396 return ret;
397}
a52d9a80 398
48a3b636
ES
399static struct extent_map *
400__lookup_extent_mapping(struct extent_map_tree *tree,
401 u64 start, u64 len, int strict)
a52d9a80
CM
402{
403 struct extent_map *em;
404 struct rb_node *rb_node;
6c05813e 405 struct rb_node *prev_or_next = NULL;
306929f3
CH
406 u64 end = range_end(start, len);
407
6c05813e 408 rb_node = __tree_search(&tree->map.rb_root, start, &prev_or_next);
a52d9a80 409 if (!rb_node) {
6c05813e
FM
410 if (prev_or_next)
411 rb_node = prev_or_next;
ed64f066
LZ
412 else
413 return NULL;
a52d9a80 414 }
ed64f066 415
a52d9a80 416 em = rb_entry(rb_node, struct extent_map, rb_node);
d1310b2e 417
ed64f066
LZ
418 if (strict && !(end > em->start && start < extent_map_end(em)))
419 return NULL;
d1310b2e 420
490b54d6 421 refcount_inc(&em->refs);
a52d9a80
CM
422 return em;
423}
a52d9a80 424
43dd529a
DS
425/*
426 * Lookup extent_map that intersects @start + @len range.
427 *
ed64f066
LZ
428 * @tree: tree to lookup in
429 * @start: byte offset to start the search
430 * @len: length of the lookup range
431 *
432 * Find and return the first extent_map struct in @tree that intersects the
433 * [start, len] range. There may be additional objects in the tree that
434 * intersect, so check the object returned carefully to make sure that no
435 * additional lookups are needed.
436 */
437struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
438 u64 start, u64 len)
439{
440 return __lookup_extent_mapping(tree, start, len, 1);
441}
442
43dd529a
DS
443/*
444 * Find a nearby extent map intersecting @start + @len (not an exact search).
445 *
b917b7c3
CM
446 * @tree: tree to lookup in
447 * @start: byte offset to start the search
448 * @len: length of the lookup range
449 *
450 * Find and return the first extent_map struct in @tree that intersects the
451 * [start, len] range.
452 *
453 * If one can't be found, any nearby extent may be returned
454 */
455struct extent_map *search_extent_mapping(struct extent_map_tree *tree,
456 u64 start, u64 len)
457{
ed64f066 458 return __lookup_extent_mapping(tree, start, len, 0);
b917b7c3
CM
459}
460
43dd529a
DS
461/*
462 * Remove an extent_map from the extent tree.
463 *
9d2423c5 464 * @tree: extent tree to remove from
bb7ab3b9 465 * @em: extent map being removed
9d2423c5 466 *
43dd529a
DS
467 * Remove @em from @tree. No reference counts are dropped, and no checks
468 * are done to see if the range is in use.
a52d9a80 469 */
c1766dd7 470void remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em)
a52d9a80 471{
6d3b050e
FM
472 lockdep_assert_held_write(&tree->lock);
473
7f3c74fb 474 WARN_ON(test_bit(EXTENT_FLAG_PINNED, &em->flags));
07e1ce09 475 rb_erase_cached(&em->rb_node, &tree->map);
ff44c6e3
JB
476 if (!test_bit(EXTENT_FLAG_LOGGING, &em->flags))
477 list_del_init(&em->list);
cbc0e928 478 RB_CLEAR_NODE(&em->rb_node);
a52d9a80 479}
176840b3 480
a6f3e205
CH
481static void replace_extent_mapping(struct extent_map_tree *tree,
482 struct extent_map *cur,
483 struct extent_map *new,
484 int modified)
176840b3 485{
6d3b050e
FM
486 lockdep_assert_held_write(&tree->lock);
487
176840b3
FM
488 WARN_ON(test_bit(EXTENT_FLAG_PINNED, &cur->flags));
489 ASSERT(extent_map_in_tree(cur));
490 if (!test_bit(EXTENT_FLAG_LOGGING, &cur->flags))
491 list_del_init(&cur->list);
07e1ce09 492 rb_replace_node_cached(&cur->rb_node, &new->rb_node, &tree->map);
176840b3
FM
493 RB_CLEAR_NODE(&cur->rb_node);
494
495 setup_extent_mapping(tree, new, modified);
496}
c04e61b5 497
d47704bd 498static struct extent_map *next_extent_map(const struct extent_map *em)
c04e61b5
LB
499{
500 struct rb_node *next;
501
502 next = rb_next(&em->rb_node);
503 if (!next)
504 return NULL;
505 return container_of(next, struct extent_map, rb_node);
506}
507
508static struct extent_map *prev_extent_map(struct extent_map *em)
509{
510 struct rb_node *prev;
511
512 prev = rb_prev(&em->rb_node);
513 if (!prev)
514 return NULL;
515 return container_of(prev, struct extent_map, rb_node);
516}
517
52042d8e
AG
518/*
519 * Helper for btrfs_get_extent. Given an existing extent in the tree,
c04e61b5
LB
520 * the existing extent is the nearest extent to map_start,
521 * and an extent that you want to insert, deal with overlap and insert
522 * the best fitted new extent into the tree.
523 */
5f4791f4
LB
524static noinline int merge_extent_mapping(struct extent_map_tree *em_tree,
525 struct extent_map *existing,
526 struct extent_map *em,
527 u64 map_start)
c04e61b5
LB
528{
529 struct extent_map *prev;
530 struct extent_map *next;
531 u64 start;
532 u64 end;
533 u64 start_diff;
534
535 BUG_ON(map_start < em->start || map_start >= extent_map_end(em));
536
537 if (existing->start > map_start) {
538 next = existing;
539 prev = prev_extent_map(next);
540 } else {
541 prev = existing;
542 next = next_extent_map(prev);
543 }
544
545 start = prev ? extent_map_end(prev) : em->start;
546 start = max_t(u64, start, em->start);
547 end = next ? next->start : extent_map_end(em);
548 end = min_t(u64, end, extent_map_end(em));
549 start_diff = start - em->start;
550 em->start = start;
551 em->len = end - start;
552 if (em->block_start < EXTENT_MAP_LAST_BYTE &&
553 !test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
554 em->block_start += start_diff;
555 em->block_len = em->len;
556 }
557 return add_extent_mapping(em_tree, em, 0);
558}
559
43dd529a
DS
560/*
561 * Add extent mapping into em_tree.
9ad37bb3
NB
562 *
563 * @fs_info: the filesystem
564 * @em_tree: extent tree into which we want to insert the extent mapping
565 * @em_in: extent we are inserting
566 * @start: start of the logical range btrfs_get_extent() is requesting
567 * @len: length of the logical range btrfs_get_extent() is requesting
c04e61b5
LB
568 *
569 * Note that @em_in's range may be different from [start, start+len),
570 * but they must be overlapped.
571 *
572 * Insert @em_in into @em_tree. In case there is an overlapping range, handle
573 * the -EEXIST by either:
574 * a) Returning the existing extent in @em_in if @start is within the
575 * existing em.
576 * b) Merge the existing extent with @em_in passed in.
577 *
578 * Return 0 on success, otherwise -EEXIST.
579 *
580 */
f46b24c9
DS
581int btrfs_add_extent_mapping(struct btrfs_fs_info *fs_info,
582 struct extent_map_tree *em_tree,
c04e61b5
LB
583 struct extent_map **em_in, u64 start, u64 len)
584{
585 int ret;
586 struct extent_map *em = *em_in;
587
d52a1365
QW
588 /*
589 * Tree-checker should have rejected any inline extent with non-zero
590 * file offset. Here just do a sanity check.
591 */
592 if (em->block_start == EXTENT_MAP_INLINE)
593 ASSERT(em->start == 0);
594
c04e61b5
LB
595 ret = add_extent_mapping(em_tree, em, 0);
596 /* it is possible that someone inserted the extent into the tree
597 * while we had the lock dropped. It is also possible that
598 * an overlapping map exists in the tree
599 */
600 if (ret == -EEXIST) {
601 struct extent_map *existing;
602
c04e61b5 603 existing = search_extent_mapping(em_tree, start, len);
393da918 604
f46b24c9 605 trace_btrfs_handle_em_exist(fs_info, existing, em, start, len);
393da918 606
c04e61b5
LB
607 /*
608 * existing will always be non-NULL, since there must be
609 * extent causing the -EEXIST.
610 */
611 if (start >= existing->start &&
612 start < extent_map_end(existing)) {
613 free_extent_map(em);
614 *em_in = existing;
615 ret = 0;
616 } else {
9a7e10e7
LB
617 u64 orig_start = em->start;
618 u64 orig_len = em->len;
619
c04e61b5
LB
620 /*
621 * The existing extent map is the one nearest to
622 * the [start, start + len) range which overlaps
623 */
624 ret = merge_extent_mapping(em_tree, existing,
625 em, start);
c04e61b5
LB
626 if (ret) {
627 free_extent_map(em);
628 *em_in = NULL;
9a7e10e7
LB
629 WARN_ONCE(ret,
630"unexpected error %d: merge existing(start %llu len %llu) with em(start %llu len %llu)\n",
631 ret, existing->start, existing->len,
632 orig_start, orig_len);
c04e61b5 633 }
9a7e10e7 634 free_extent_map(existing);
c04e61b5
LB
635 }
636 }
637
638 ASSERT(ret == 0 || ret == -EEXIST);
639 return ret;
640}
4c0c8cfc 641
9c9d1b4f
FM
642/*
643 * Drop all extent maps from a tree in the fastest possible way, rescheduling
644 * if needed. This avoids searching the tree, from the root down to the first
645 * extent map, before each deletion.
646 */
647static void drop_all_extent_maps_fast(struct extent_map_tree *tree)
648{
649 write_lock(&tree->lock);
650 while (!RB_EMPTY_ROOT(&tree->map.rb_root)) {
651 struct extent_map *em;
652 struct rb_node *node;
653
654 node = rb_first_cached(&tree->map);
655 em = rb_entry(node, struct extent_map, rb_node);
656 clear_bit(EXTENT_FLAG_PINNED, &em->flags);
657 clear_bit(EXTENT_FLAG_LOGGING, &em->flags);
658 remove_extent_mapping(tree, em);
659 free_extent_map(em);
660 cond_resched_rwlock_write(&tree->lock);
661 }
662 write_unlock(&tree->lock);
663}
664
4c0c8cfc
FM
665/*
666 * Drop all extent maps in a given range.
667 *
668 * @inode: The target inode.
669 * @start: Start offset of the range.
670 * @end: End offset of the range (inclusive value).
671 * @skip_pinned: Indicate if pinned extent maps should be ignored or not.
672 *
673 * This drops all the extent maps that intersect the given range [@start, @end].
674 * Extent maps that partially overlap the range and extend behind or beyond it,
675 * are split.
676 * The caller should have locked an appropriate file range in the inode's io
677 * tree before calling this function.
678 */
679void btrfs_drop_extent_map_range(struct btrfs_inode *inode, u64 start, u64 end,
680 bool skip_pinned)
681{
db21370b
FM
682 struct extent_map *split;
683 struct extent_map *split2;
684 struct extent_map *em;
4c0c8cfc
FM
685 struct extent_map_tree *em_tree = &inode->extent_tree;
686 u64 len = end - start + 1;
4c0c8cfc
FM
687
688 WARN_ON(end < start);
689 if (end == (u64)-1) {
9c9d1b4f
FM
690 if (start == 0 && !skip_pinned) {
691 drop_all_extent_maps_fast(em_tree);
692 return;
693 }
4c0c8cfc 694 len = (u64)-1;
db21370b
FM
695 } else {
696 /* Make end offset exclusive for use in the loop below. */
697 end++;
4c0c8cfc 698 }
db21370b
FM
699
700 /*
701 * It's ok if we fail to allocate the extent maps, see the comment near
702 * the bottom of the loop below. We only need two spare extent maps in
703 * the worst case, where the first extent map that intersects our range
704 * starts before the range and the last extent map that intersects our
705 * range ends after our range (and they might be the same extent map),
706 * because we need to split those two extent maps at the boundaries.
707 */
708 split = alloc_extent_map();
709 split2 = alloc_extent_map();
710
711 write_lock(&em_tree->lock);
712 em = lookup_extent_mapping(em_tree, start, len);
713
714 while (em) {
715 /* extent_map_end() returns exclusive value (last byte + 1). */
716 const u64 em_end = extent_map_end(em);
717 struct extent_map *next_em = NULL;
4c0c8cfc
FM
718 u64 gen;
719 unsigned long flags;
4c0c8cfc
FM
720 bool modified;
721 bool compressed;
722
db21370b
FM
723 if (em_end < end) {
724 next_em = next_extent_map(em);
725 if (next_em) {
726 if (next_em->start < end)
727 refcount_inc(&next_em->refs);
728 else
729 next_em = NULL;
730 }
4c0c8cfc 731 }
db21370b 732
4c0c8cfc 733 if (skip_pinned && test_bit(EXTENT_FLAG_PINNED, &em->flags)) {
f3109e33 734 start = em_end;
db21370b 735 goto next;
4c0c8cfc 736 }
db21370b 737
e4cc1483 738 flags = em->flags;
4c0c8cfc 739 clear_bit(EXTENT_FLAG_PINNED, &em->flags);
e4cc1483
FM
740 /*
741 * In case we split the extent map, we want to preserve the
742 * EXTENT_FLAG_LOGGING flag on our extent map, but we don't want
743 * it on the new extent maps.
744 */
4c0c8cfc
FM
745 clear_bit(EXTENT_FLAG_LOGGING, &flags);
746 modified = !list_empty(&em->list);
db21370b
FM
747
748 /*
749 * The extent map does not cross our target range, so no need to
750 * split it, we can remove it directly.
751 */
752 if (em->start >= start && em_end <= end)
753 goto remove_em;
754
db21370b
FM
755 gen = em->generation;
756 compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
4c0c8cfc
FM
757
758 if (em->start < start) {
db21370b
FM
759 if (!split) {
760 split = split2;
761 split2 = NULL;
762 if (!split)
763 goto remove_em;
764 }
4c0c8cfc
FM
765 split->start = em->start;
766 split->len = start - em->start;
767
768 if (em->block_start < EXTENT_MAP_LAST_BYTE) {
769 split->orig_start = em->orig_start;
770 split->block_start = em->block_start;
771
772 if (compressed)
773 split->block_len = em->block_len;
774 else
775 split->block_len = split->len;
776 split->orig_block_len = max(split->block_len,
777 em->orig_block_len);
778 split->ram_bytes = em->ram_bytes;
779 } else {
780 split->orig_start = split->start;
781 split->block_len = 0;
782 split->block_start = em->block_start;
783 split->orig_block_len = 0;
784 split->ram_bytes = split->len;
785 }
786
787 split->generation = gen;
788 split->flags = flags;
789 split->compress_type = em->compress_type;
790 replace_extent_mapping(em_tree, em, split, modified);
791 free_extent_map(split);
792 split = split2;
793 split2 = NULL;
794 }
db21370b
FM
795 if (em_end > end) {
796 if (!split) {
797 split = split2;
798 split2 = NULL;
799 if (!split)
800 goto remove_em;
801 }
c962098c
JB
802 split->start = end;
803 split->len = em_end - end;
4c0c8cfc
FM
804 split->block_start = em->block_start;
805 split->flags = flags;
806 split->compress_type = em->compress_type;
807 split->generation = gen;
808
809 if (em->block_start < EXTENT_MAP_LAST_BYTE) {
810 split->orig_block_len = max(em->block_len,
811 em->orig_block_len);
812
813 split->ram_bytes = em->ram_bytes;
814 if (compressed) {
815 split->block_len = em->block_len;
816 split->orig_start = em->orig_start;
817 } else {
818 const u64 diff = start + len - em->start;
819
820 split->block_len = split->len;
821 split->block_start += diff;
822 split->orig_start = em->orig_start;
823 }
824 } else {
825 split->ram_bytes = split->len;
826 split->orig_start = split->start;
827 split->block_len = 0;
828 split->orig_block_len = 0;
829 }
830
831 if (extent_map_in_tree(em)) {
832 replace_extent_mapping(em_tree, em, split,
833 modified);
834 } else {
835 int ret;
836
837 ret = add_extent_mapping(em_tree, split,
838 modified);
839 /* Logic error, shouldn't happen. */
840 ASSERT(ret == 0);
841 if (WARN_ON(ret != 0) && modified)
842 btrfs_set_inode_full_sync(inode);
843 }
844 free_extent_map(split);
845 split = NULL;
846 }
db21370b 847remove_em:
4c0c8cfc
FM
848 if (extent_map_in_tree(em)) {
849 /*
850 * If the extent map is still in the tree it means that
851 * either of the following is true:
852 *
853 * 1) It fits entirely in our range (doesn't end beyond
854 * it or starts before it);
855 *
856 * 2) It starts before our range and/or ends after our
857 * range, and we were not able to allocate the extent
858 * maps for split operations, @split and @split2.
859 *
860 * If we are at case 2) then we just remove the entire
861 * extent map - this is fine since if anyone needs it to
862 * access the subranges outside our range, will just
863 * load it again from the subvolume tree's file extent
864 * item. However if the extent map was in the list of
865 * modified extents, then we must mark the inode for a
866 * full fsync, otherwise a fast fsync will miss this
867 * extent if it's new and needs to be logged.
868 */
db21370b
FM
869 if ((em->start < start || em_end > end) && modified) {
870 ASSERT(!split);
4c0c8cfc
FM
871 btrfs_set_inode_full_sync(inode);
872 }
873 remove_extent_mapping(em_tree, em);
874 }
4c0c8cfc 875
db21370b
FM
876 /*
877 * Once for the tree reference (we replaced or removed the
878 * extent map from the tree).
879 */
4c0c8cfc 880 free_extent_map(em);
db21370b
FM
881next:
882 /* Once for us (for our lookup reference). */
4c0c8cfc 883 free_extent_map(em);
db21370b
FM
884
885 em = next_em;
4c0c8cfc
FM
886 }
887
db21370b
FM
888 write_unlock(&em_tree->lock);
889
4c0c8cfc
FM
890 free_extent_map(split);
891 free_extent_map(split2);
892}
a1ba4c08
FM
893
894/*
895 * Replace a range in the inode's extent map tree with a new extent map.
896 *
897 * @inode: The target inode.
898 * @new_em: The new extent map to add to the inode's extent map tree.
899 * @modified: Indicate if the new extent map should be added to the list of
900 * modified extents (for fast fsync tracking).
901 *
902 * Drops all the extent maps in the inode's extent map tree that intersect the
903 * range of the new extent map and adds the new extent map to the tree.
904 * The caller should have locked an appropriate file range in the inode's io
905 * tree before calling this function.
906 */
907int btrfs_replace_extent_map_range(struct btrfs_inode *inode,
908 struct extent_map *new_em,
909 bool modified)
910{
911 const u64 end = new_em->start + new_em->len - 1;
912 struct extent_map_tree *tree = &inode->extent_tree;
913 int ret;
914
915 ASSERT(!extent_map_in_tree(new_em));
916
917 /*
918 * The caller has locked an appropriate file range in the inode's io
919 * tree, but getting -EEXIST when adding the new extent map can still
920 * happen in case there are extents that partially cover the range, and
921 * this is due to two tasks operating on different parts of the extent.
922 * See commit 18e83ac75bfe67 ("Btrfs: fix unexpected EEXIST from
923 * btrfs_get_extent") for an example and details.
924 */
925 do {
926 btrfs_drop_extent_map_range(inode, new_em->start, end, false);
927 write_lock(&tree->lock);
928 ret = add_extent_mapping(tree, new_em, modified);
929 write_unlock(&tree->lock);
930 } while (ret == -EEXIST);
931
932 return ret;
933}
a6f3e205
CH
934
935/*
f000bc6f
CH
936 * Split off the first pre bytes from the extent_map at [start, start + len],
937 * and set the block_start for it to new_logical.
a6f3e205
CH
938 *
939 * This function is used when an ordered_extent needs to be split.
940 */
f000bc6f
CH
941int split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pre,
942 u64 new_logical)
a6f3e205
CH
943{
944 struct extent_map_tree *em_tree = &inode->extent_tree;
945 struct extent_map *em;
946 struct extent_map *split_pre = NULL;
947 struct extent_map *split_mid = NULL;
948 int ret = 0;
949 unsigned long flags;
950
951 ASSERT(pre != 0);
952 ASSERT(pre < len);
953
954 split_pre = alloc_extent_map();
955 if (!split_pre)
956 return -ENOMEM;
957 split_mid = alloc_extent_map();
958 if (!split_mid) {
959 ret = -ENOMEM;
960 goto out_free_pre;
961 }
962
963 lock_extent(&inode->io_tree, start, start + len - 1, NULL);
964 write_lock(&em_tree->lock);
965 em = lookup_extent_mapping(em_tree, start, len);
966 if (!em) {
967 ret = -EIO;
968 goto out_unlock;
969 }
970
971 ASSERT(em->len == len);
972 ASSERT(!test_bit(EXTENT_FLAG_COMPRESSED, &em->flags));
973 ASSERT(em->block_start < EXTENT_MAP_LAST_BYTE);
974 ASSERT(test_bit(EXTENT_FLAG_PINNED, &em->flags));
975 ASSERT(!test_bit(EXTENT_FLAG_LOGGING, &em->flags));
976 ASSERT(!list_empty(&em->list));
977
978 flags = em->flags;
979 clear_bit(EXTENT_FLAG_PINNED, &em->flags);
980
981 /* First, replace the em with a new extent_map starting from * em->start */
982 split_pre->start = em->start;
983 split_pre->len = pre;
984 split_pre->orig_start = split_pre->start;
f000bc6f 985 split_pre->block_start = new_logical;
a6f3e205
CH
986 split_pre->block_len = split_pre->len;
987 split_pre->orig_block_len = split_pre->block_len;
988 split_pre->ram_bytes = split_pre->len;
989 split_pre->flags = flags;
990 split_pre->compress_type = em->compress_type;
991 split_pre->generation = em->generation;
992
993 replace_extent_mapping(em_tree, em, split_pre, 1);
994
995 /*
996 * Now we only have an extent_map at:
997 * [em->start, em->start + pre]
998 */
999
1000 /* Insert the middle extent_map. */
1001 split_mid->start = em->start + pre;
1002 split_mid->len = em->len - pre;
1003 split_mid->orig_start = split_mid->start;
1004 split_mid->block_start = em->block_start + pre;
1005 split_mid->block_len = split_mid->len;
1006 split_mid->orig_block_len = split_mid->block_len;
1007 split_mid->ram_bytes = split_mid->len;
1008 split_mid->flags = flags;
1009 split_mid->compress_type = em->compress_type;
1010 split_mid->generation = em->generation;
1011 add_extent_mapping(em_tree, split_mid, 1);
1012
1013 /* Once for us */
1014 free_extent_map(em);
1015 /* Once for the tree */
1016 free_extent_map(em);
1017
1018out_unlock:
1019 write_unlock(&em_tree->lock);
1020 unlock_extent(&inode->io_tree, start, start + len - 1, NULL);
1021 free_extent_map(split_mid);
1022out_free_pre:
1023 free_extent_map(split_pre);
1024 return ret;
1025}