]> git.ipfire.org Git - thirdparty/linux.git/blob - fs/btrfs/extent_map.c
0d1167f454d2a3bbeddfa51823d4039a04589a5a
[thirdparty/linux.git] / fs / btrfs / extent_map.c
1 // SPDX-License-Identifier: GPL-2.0
2
3 #include <linux/err.h>
4 #include <linux/slab.h>
5 #include <linux/spinlock.h>
6 #include "messages.h"
7 #include "ctree.h"
8 #include "volumes.h"
9 #include "extent_map.h"
10 #include "compression.h"
11 #include "btrfs_inode.h"
12
13
14 static struct kmem_cache *extent_map_cache;
15
16 int __init extent_map_init(void)
17 {
18 extent_map_cache = kmem_cache_create("btrfs_extent_map",
19 sizeof(struct extent_map), 0,
20 SLAB_MEM_SPREAD, NULL);
21 if (!extent_map_cache)
22 return -ENOMEM;
23 return 0;
24 }
25
26 void __cold extent_map_exit(void)
27 {
28 kmem_cache_destroy(extent_map_cache);
29 }
30
31 /*
32 * Initialize the extent tree @tree. Should be called for each new inode or
33 * other user of the extent_map interface.
34 */
35 void extent_map_tree_init(struct extent_map_tree *tree)
36 {
37 tree->map = RB_ROOT_CACHED;
38 INIT_LIST_HEAD(&tree->modified_extents);
39 rwlock_init(&tree->lock);
40 }
41
42 /*
43 * Allocate a new extent_map structure. The new structure is returned with a
44 * reference count of one and needs to be freed using free_extent_map()
45 */
46 struct extent_map *alloc_extent_map(void)
47 {
48 struct extent_map *em;
49 em = kmem_cache_zalloc(extent_map_cache, GFP_NOFS);
50 if (!em)
51 return NULL;
52 RB_CLEAR_NODE(&em->rb_node);
53 em->compress_type = BTRFS_COMPRESS_NONE;
54 refcount_set(&em->refs, 1);
55 INIT_LIST_HEAD(&em->list);
56 return em;
57 }
58
59 /*
60 * Drop the reference out on @em by one and free the structure if the reference
61 * count hits zero.
62 */
63 void free_extent_map(struct extent_map *em)
64 {
65 if (!em)
66 return;
67 if (refcount_dec_and_test(&em->refs)) {
68 WARN_ON(extent_map_in_tree(em));
69 WARN_ON(!list_empty(&em->list));
70 kmem_cache_free(extent_map_cache, em);
71 }
72 }
73
74 /* Do the math around the end of an extent, handling wrapping. */
75 static u64 range_end(u64 start, u64 len)
76 {
77 if (start + len < start)
78 return (u64)-1;
79 return start + len;
80 }
81
82 static int tree_insert(struct rb_root_cached *root, struct extent_map *em)
83 {
84 struct rb_node **p = &root->rb_root.rb_node;
85 struct rb_node *parent = NULL;
86 struct extent_map *entry = NULL;
87 struct rb_node *orig_parent = NULL;
88 u64 end = range_end(em->start, em->len);
89 bool leftmost = true;
90
91 while (*p) {
92 parent = *p;
93 entry = rb_entry(parent, struct extent_map, rb_node);
94
95 if (em->start < entry->start) {
96 p = &(*p)->rb_left;
97 } else if (em->start >= extent_map_end(entry)) {
98 p = &(*p)->rb_right;
99 leftmost = false;
100 } else {
101 return -EEXIST;
102 }
103 }
104
105 orig_parent = parent;
106 while (parent && em->start >= extent_map_end(entry)) {
107 parent = rb_next(parent);
108 entry = rb_entry(parent, struct extent_map, rb_node);
109 }
110 if (parent)
111 if (end > entry->start && em->start < extent_map_end(entry))
112 return -EEXIST;
113
114 parent = orig_parent;
115 entry = rb_entry(parent, struct extent_map, rb_node);
116 while (parent && em->start < entry->start) {
117 parent = rb_prev(parent);
118 entry = rb_entry(parent, struct extent_map, rb_node);
119 }
120 if (parent)
121 if (end > entry->start && em->start < extent_map_end(entry))
122 return -EEXIST;
123
124 rb_link_node(&em->rb_node, orig_parent, p);
125 rb_insert_color_cached(&em->rb_node, root, leftmost);
126 return 0;
127 }
128
129 /*
130 * Search through the tree for an extent_map with a given offset. If it can't
131 * be found, try to find some neighboring extents
132 */
133 static struct rb_node *__tree_search(struct rb_root *root, u64 offset,
134 struct rb_node **prev_or_next_ret)
135 {
136 struct rb_node *n = root->rb_node;
137 struct rb_node *prev = NULL;
138 struct rb_node *orig_prev = NULL;
139 struct extent_map *entry;
140 struct extent_map *prev_entry = NULL;
141
142 ASSERT(prev_or_next_ret);
143
144 while (n) {
145 entry = rb_entry(n, struct extent_map, rb_node);
146 prev = n;
147 prev_entry = entry;
148
149 if (offset < entry->start)
150 n = n->rb_left;
151 else if (offset >= extent_map_end(entry))
152 n = n->rb_right;
153 else
154 return n;
155 }
156
157 orig_prev = prev;
158 while (prev && offset >= extent_map_end(prev_entry)) {
159 prev = rb_next(prev);
160 prev_entry = rb_entry(prev, struct extent_map, rb_node);
161 }
162
163 /*
164 * Previous extent map found, return as in this case the caller does not
165 * care about the next one.
166 */
167 if (prev) {
168 *prev_or_next_ret = prev;
169 return NULL;
170 }
171
172 prev = orig_prev;
173 prev_entry = rb_entry(prev, struct extent_map, rb_node);
174 while (prev && offset < prev_entry->start) {
175 prev = rb_prev(prev);
176 prev_entry = rb_entry(prev, struct extent_map, rb_node);
177 }
178 *prev_or_next_ret = prev;
179
180 return NULL;
181 }
182
183 static inline u64 extent_map_block_end(const struct extent_map *em)
184 {
185 if (em->block_start + em->block_len < em->block_start)
186 return (u64)-1;
187 return em->block_start + em->block_len;
188 }
189
190 static bool can_merge_extent_map(const struct extent_map *em)
191 {
192 if (test_bit(EXTENT_FLAG_PINNED, &em->flags))
193 return false;
194
195 /* Don't merge compressed extents, we need to know their actual size. */
196 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
197 return false;
198
199 if (test_bit(EXTENT_FLAG_LOGGING, &em->flags))
200 return false;
201
202 /*
203 * We don't want to merge stuff that hasn't been written to the log yet
204 * since it may not reflect exactly what is on disk, and that would be
205 * bad.
206 */
207 if (!list_empty(&em->list))
208 return false;
209
210 return true;
211 }
212
213 /* Check to see if two extent_map structs are adjacent and safe to merge. */
214 static bool mergeable_maps(const struct extent_map *prev, const struct extent_map *next)
215 {
216 if (extent_map_end(prev) != next->start)
217 return false;
218
219 if (prev->flags != next->flags)
220 return false;
221
222 if (next->block_start < EXTENT_MAP_LAST_BYTE - 1)
223 return next->block_start == extent_map_block_end(prev);
224
225 /* HOLES and INLINE extents. */
226 return next->block_start == prev->block_start;
227 }
228
229 static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)
230 {
231 struct extent_map *merge = NULL;
232 struct rb_node *rb;
233
234 /*
235 * We can't modify an extent map that is in the tree and that is being
236 * used by another task, as it can cause that other task to see it in
237 * inconsistent state during the merging. We always have 1 reference for
238 * the tree and 1 for this task (which is unpinning the extent map or
239 * clearing the logging flag), so anything > 2 means it's being used by
240 * other tasks too.
241 */
242 if (refcount_read(&em->refs) > 2)
243 return;
244
245 if (!can_merge_extent_map(em))
246 return;
247
248 if (em->start != 0) {
249 rb = rb_prev(&em->rb_node);
250 if (rb)
251 merge = rb_entry(rb, struct extent_map, rb_node);
252 if (rb && can_merge_extent_map(merge) && mergeable_maps(merge, em)) {
253 em->start = merge->start;
254 em->orig_start = merge->orig_start;
255 em->len += merge->len;
256 em->block_len += merge->block_len;
257 em->block_start = merge->block_start;
258 em->mod_len = (em->mod_len + em->mod_start) - merge->mod_start;
259 em->mod_start = merge->mod_start;
260 em->generation = max(em->generation, merge->generation);
261 set_bit(EXTENT_FLAG_MERGED, &em->flags);
262
263 rb_erase_cached(&merge->rb_node, &tree->map);
264 RB_CLEAR_NODE(&merge->rb_node);
265 free_extent_map(merge);
266 }
267 }
268
269 rb = rb_next(&em->rb_node);
270 if (rb)
271 merge = rb_entry(rb, struct extent_map, rb_node);
272 if (rb && can_merge_extent_map(merge) && mergeable_maps(em, merge)) {
273 em->len += merge->len;
274 em->block_len += merge->block_len;
275 rb_erase_cached(&merge->rb_node, &tree->map);
276 RB_CLEAR_NODE(&merge->rb_node);
277 em->mod_len = (merge->mod_start + merge->mod_len) - em->mod_start;
278 em->generation = max(em->generation, merge->generation);
279 set_bit(EXTENT_FLAG_MERGED, &em->flags);
280 free_extent_map(merge);
281 }
282 }
283
284 /*
285 * Unpin an extent from the cache.
286 *
287 * @inode: the inode from which we are unpinning an extent range
288 * @start: logical offset in the file
289 * @len: length of the extent
290 * @gen: generation that this extent has been modified in
291 *
292 * Called after an extent has been written to disk properly. Set the generation
293 * to the generation that actually added the file item to the inode so we know
294 * we need to sync this extent when we call fsync().
295 */
296 int unpin_extent_cache(struct btrfs_inode *inode, u64 start, u64 len, u64 gen)
297 {
298 struct btrfs_fs_info *fs_info = inode->root->fs_info;
299 struct extent_map_tree *tree = &inode->extent_tree;
300 int ret = 0;
301 struct extent_map *em;
302 bool prealloc = false;
303
304 write_lock(&tree->lock);
305 em = lookup_extent_mapping(tree, start, len);
306
307 if (WARN_ON(!em)) {
308 btrfs_warn(fs_info,
309 "no extent map found for inode %llu (root %lld) when unpinning extent range [%llu, %llu), generation %llu",
310 btrfs_ino(inode), btrfs_root_id(inode->root),
311 start, len, gen);
312 goto out;
313 }
314
315 if (WARN_ON(em->start != start))
316 btrfs_warn(fs_info,
317 "found extent map for inode %llu (root %lld) with unexpected start offset %llu when unpinning extent range [%llu, %llu), generation %llu",
318 btrfs_ino(inode), btrfs_root_id(inode->root),
319 em->start, start, len, gen);
320
321 em->generation = gen;
322 clear_bit(EXTENT_FLAG_PINNED, &em->flags);
323 em->mod_start = em->start;
324 em->mod_len = em->len;
325
326 if (test_bit(EXTENT_FLAG_FILLING, &em->flags)) {
327 prealloc = true;
328 clear_bit(EXTENT_FLAG_FILLING, &em->flags);
329 }
330
331 try_merge_map(tree, em);
332
333 if (prealloc) {
334 em->mod_start = em->start;
335 em->mod_len = em->len;
336 }
337
338 free_extent_map(em);
339 out:
340 write_unlock(&tree->lock);
341 return ret;
342
343 }
344
345 void clear_em_logging(struct extent_map_tree *tree, struct extent_map *em)
346 {
347 lockdep_assert_held_write(&tree->lock);
348
349 clear_bit(EXTENT_FLAG_LOGGING, &em->flags);
350 if (extent_map_in_tree(em))
351 try_merge_map(tree, em);
352 }
353
354 static inline void setup_extent_mapping(struct extent_map_tree *tree,
355 struct extent_map *em,
356 int modified)
357 {
358 refcount_inc(&em->refs);
359 em->mod_start = em->start;
360 em->mod_len = em->len;
361
362 ASSERT(list_empty(&em->list));
363
364 if (modified)
365 list_add(&em->list, &tree->modified_extents);
366 else
367 try_merge_map(tree, em);
368 }
369
370 /*
371 * Add new extent map to the extent tree
372 *
373 * @tree: tree to insert new map in
374 * @em: map to insert
375 * @modified: indicate whether the given @em should be added to the
376 * modified list, which indicates the extent needs to be logged
377 *
378 * Insert @em into @tree or perform a simple forward/backward merge with
379 * existing mappings. The extent_map struct passed in will be inserted
380 * into the tree directly, with an additional reference taken, or a
381 * reference dropped if the merge attempt was successful.
382 */
383 static int add_extent_mapping(struct extent_map_tree *tree,
384 struct extent_map *em, int modified)
385 {
386 int ret = 0;
387
388 lockdep_assert_held_write(&tree->lock);
389
390 ret = tree_insert(&tree->map, em);
391 if (ret)
392 goto out;
393
394 setup_extent_mapping(tree, em, modified);
395 out:
396 return ret;
397 }
398
399 static struct extent_map *
400 __lookup_extent_mapping(struct extent_map_tree *tree,
401 u64 start, u64 len, int strict)
402 {
403 struct extent_map *em;
404 struct rb_node *rb_node;
405 struct rb_node *prev_or_next = NULL;
406 u64 end = range_end(start, len);
407
408 rb_node = __tree_search(&tree->map.rb_root, start, &prev_or_next);
409 if (!rb_node) {
410 if (prev_or_next)
411 rb_node = prev_or_next;
412 else
413 return NULL;
414 }
415
416 em = rb_entry(rb_node, struct extent_map, rb_node);
417
418 if (strict && !(end > em->start && start < extent_map_end(em)))
419 return NULL;
420
421 refcount_inc(&em->refs);
422 return em;
423 }
424
425 /*
426 * Lookup extent_map that intersects @start + @len range.
427 *
428 * @tree: tree to lookup in
429 * @start: byte offset to start the search
430 * @len: length of the lookup range
431 *
432 * Find and return the first extent_map struct in @tree that intersects the
433 * [start, len] range. There may be additional objects in the tree that
434 * intersect, so check the object returned carefully to make sure that no
435 * additional lookups are needed.
436 */
437 struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
438 u64 start, u64 len)
439 {
440 return __lookup_extent_mapping(tree, start, len, 1);
441 }
442
443 /*
444 * Find a nearby extent map intersecting @start + @len (not an exact search).
445 *
446 * @tree: tree to lookup in
447 * @start: byte offset to start the search
448 * @len: length of the lookup range
449 *
450 * Find and return the first extent_map struct in @tree that intersects the
451 * [start, len] range.
452 *
453 * If one can't be found, any nearby extent may be returned
454 */
455 struct extent_map *search_extent_mapping(struct extent_map_tree *tree,
456 u64 start, u64 len)
457 {
458 return __lookup_extent_mapping(tree, start, len, 0);
459 }
460
461 /*
462 * Remove an extent_map from the extent tree.
463 *
464 * @tree: extent tree to remove from
465 * @em: extent map being removed
466 *
467 * Remove @em from @tree. No reference counts are dropped, and no checks
468 * are done to see if the range is in use.
469 */
470 void remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em)
471 {
472 lockdep_assert_held_write(&tree->lock);
473
474 WARN_ON(test_bit(EXTENT_FLAG_PINNED, &em->flags));
475 rb_erase_cached(&em->rb_node, &tree->map);
476 if (!test_bit(EXTENT_FLAG_LOGGING, &em->flags))
477 list_del_init(&em->list);
478 RB_CLEAR_NODE(&em->rb_node);
479 }
480
481 static void replace_extent_mapping(struct extent_map_tree *tree,
482 struct extent_map *cur,
483 struct extent_map *new,
484 int modified)
485 {
486 lockdep_assert_held_write(&tree->lock);
487
488 WARN_ON(test_bit(EXTENT_FLAG_PINNED, &cur->flags));
489 ASSERT(extent_map_in_tree(cur));
490 if (!test_bit(EXTENT_FLAG_LOGGING, &cur->flags))
491 list_del_init(&cur->list);
492 rb_replace_node_cached(&cur->rb_node, &new->rb_node, &tree->map);
493 RB_CLEAR_NODE(&cur->rb_node);
494
495 setup_extent_mapping(tree, new, modified);
496 }
497
498 static struct extent_map *next_extent_map(const struct extent_map *em)
499 {
500 struct rb_node *next;
501
502 next = rb_next(&em->rb_node);
503 if (!next)
504 return NULL;
505 return container_of(next, struct extent_map, rb_node);
506 }
507
508 static struct extent_map *prev_extent_map(struct extent_map *em)
509 {
510 struct rb_node *prev;
511
512 prev = rb_prev(&em->rb_node);
513 if (!prev)
514 return NULL;
515 return container_of(prev, struct extent_map, rb_node);
516 }
517
518 /*
519 * Helper for btrfs_get_extent. Given an existing extent in the tree,
520 * the existing extent is the nearest extent to map_start,
521 * and an extent that you want to insert, deal with overlap and insert
522 * the best fitted new extent into the tree.
523 */
524 static noinline int merge_extent_mapping(struct extent_map_tree *em_tree,
525 struct extent_map *existing,
526 struct extent_map *em,
527 u64 map_start)
528 {
529 struct extent_map *prev;
530 struct extent_map *next;
531 u64 start;
532 u64 end;
533 u64 start_diff;
534
535 BUG_ON(map_start < em->start || map_start >= extent_map_end(em));
536
537 if (existing->start > map_start) {
538 next = existing;
539 prev = prev_extent_map(next);
540 } else {
541 prev = existing;
542 next = next_extent_map(prev);
543 }
544
545 start = prev ? extent_map_end(prev) : em->start;
546 start = max_t(u64, start, em->start);
547 end = next ? next->start : extent_map_end(em);
548 end = min_t(u64, end, extent_map_end(em));
549 start_diff = start - em->start;
550 em->start = start;
551 em->len = end - start;
552 if (em->block_start < EXTENT_MAP_LAST_BYTE &&
553 !test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
554 em->block_start += start_diff;
555 em->block_len = em->len;
556 }
557 return add_extent_mapping(em_tree, em, 0);
558 }
559
560 /*
561 * Add extent mapping into em_tree.
562 *
563 * @fs_info: the filesystem
564 * @em_tree: extent tree into which we want to insert the extent mapping
565 * @em_in: extent we are inserting
566 * @start: start of the logical range btrfs_get_extent() is requesting
567 * @len: length of the logical range btrfs_get_extent() is requesting
568 *
569 * Note that @em_in's range may be different from [start, start+len),
570 * but they must be overlapped.
571 *
572 * Insert @em_in into @em_tree. In case there is an overlapping range, handle
573 * the -EEXIST by either:
574 * a) Returning the existing extent in @em_in if @start is within the
575 * existing em.
576 * b) Merge the existing extent with @em_in passed in.
577 *
578 * Return 0 on success, otherwise -EEXIST.
579 *
580 */
581 int btrfs_add_extent_mapping(struct btrfs_fs_info *fs_info,
582 struct extent_map_tree *em_tree,
583 struct extent_map **em_in, u64 start, u64 len)
584 {
585 int ret;
586 struct extent_map *em = *em_in;
587
588 /*
589 * Tree-checker should have rejected any inline extent with non-zero
590 * file offset. Here just do a sanity check.
591 */
592 if (em->block_start == EXTENT_MAP_INLINE)
593 ASSERT(em->start == 0);
594
595 ret = add_extent_mapping(em_tree, em, 0);
596 /* it is possible that someone inserted the extent into the tree
597 * while we had the lock dropped. It is also possible that
598 * an overlapping map exists in the tree
599 */
600 if (ret == -EEXIST) {
601 struct extent_map *existing;
602
603 existing = search_extent_mapping(em_tree, start, len);
604
605 trace_btrfs_handle_em_exist(fs_info, existing, em, start, len);
606
607 /*
608 * existing will always be non-NULL, since there must be
609 * extent causing the -EEXIST.
610 */
611 if (start >= existing->start &&
612 start < extent_map_end(existing)) {
613 free_extent_map(em);
614 *em_in = existing;
615 ret = 0;
616 } else {
617 u64 orig_start = em->start;
618 u64 orig_len = em->len;
619
620 /*
621 * The existing extent map is the one nearest to
622 * the [start, start + len) range which overlaps
623 */
624 ret = merge_extent_mapping(em_tree, existing,
625 em, start);
626 if (ret) {
627 free_extent_map(em);
628 *em_in = NULL;
629 WARN_ONCE(ret,
630 "unexpected error %d: merge existing(start %llu len %llu) with em(start %llu len %llu)\n",
631 ret, existing->start, existing->len,
632 orig_start, orig_len);
633 }
634 free_extent_map(existing);
635 }
636 }
637
638 ASSERT(ret == 0 || ret == -EEXIST);
639 return ret;
640 }
641
642 /*
643 * Drop all extent maps from a tree in the fastest possible way, rescheduling
644 * if needed. This avoids searching the tree, from the root down to the first
645 * extent map, before each deletion.
646 */
647 static void drop_all_extent_maps_fast(struct extent_map_tree *tree)
648 {
649 write_lock(&tree->lock);
650 while (!RB_EMPTY_ROOT(&tree->map.rb_root)) {
651 struct extent_map *em;
652 struct rb_node *node;
653
654 node = rb_first_cached(&tree->map);
655 em = rb_entry(node, struct extent_map, rb_node);
656 clear_bit(EXTENT_FLAG_PINNED, &em->flags);
657 clear_bit(EXTENT_FLAG_LOGGING, &em->flags);
658 remove_extent_mapping(tree, em);
659 free_extent_map(em);
660 cond_resched_rwlock_write(&tree->lock);
661 }
662 write_unlock(&tree->lock);
663 }
664
665 /*
666 * Drop all extent maps in a given range.
667 *
668 * @inode: The target inode.
669 * @start: Start offset of the range.
670 * @end: End offset of the range (inclusive value).
671 * @skip_pinned: Indicate if pinned extent maps should be ignored or not.
672 *
673 * This drops all the extent maps that intersect the given range [@start, @end].
674 * Extent maps that partially overlap the range and extend behind or beyond it,
675 * are split.
676 * The caller should have locked an appropriate file range in the inode's io
677 * tree before calling this function.
678 */
679 void btrfs_drop_extent_map_range(struct btrfs_inode *inode, u64 start, u64 end,
680 bool skip_pinned)
681 {
682 struct extent_map *split;
683 struct extent_map *split2;
684 struct extent_map *em;
685 struct extent_map_tree *em_tree = &inode->extent_tree;
686 u64 len = end - start + 1;
687
688 WARN_ON(end < start);
689 if (end == (u64)-1) {
690 if (start == 0 && !skip_pinned) {
691 drop_all_extent_maps_fast(em_tree);
692 return;
693 }
694 len = (u64)-1;
695 } else {
696 /* Make end offset exclusive for use in the loop below. */
697 end++;
698 }
699
700 /*
701 * It's ok if we fail to allocate the extent maps, see the comment near
702 * the bottom of the loop below. We only need two spare extent maps in
703 * the worst case, where the first extent map that intersects our range
704 * starts before the range and the last extent map that intersects our
705 * range ends after our range (and they might be the same extent map),
706 * because we need to split those two extent maps at the boundaries.
707 */
708 split = alloc_extent_map();
709 split2 = alloc_extent_map();
710
711 write_lock(&em_tree->lock);
712 em = lookup_extent_mapping(em_tree, start, len);
713
714 while (em) {
715 /* extent_map_end() returns exclusive value (last byte + 1). */
716 const u64 em_end = extent_map_end(em);
717 struct extent_map *next_em = NULL;
718 u64 gen;
719 unsigned long flags;
720 bool modified;
721 bool compressed;
722
723 if (em_end < end) {
724 next_em = next_extent_map(em);
725 if (next_em) {
726 if (next_em->start < end)
727 refcount_inc(&next_em->refs);
728 else
729 next_em = NULL;
730 }
731 }
732
733 if (skip_pinned && test_bit(EXTENT_FLAG_PINNED, &em->flags)) {
734 start = em_end;
735 goto next;
736 }
737
738 flags = em->flags;
739 clear_bit(EXTENT_FLAG_PINNED, &em->flags);
740 /*
741 * In case we split the extent map, we want to preserve the
742 * EXTENT_FLAG_LOGGING flag on our extent map, but we don't want
743 * it on the new extent maps.
744 */
745 clear_bit(EXTENT_FLAG_LOGGING, &flags);
746 modified = !list_empty(&em->list);
747
748 /*
749 * The extent map does not cross our target range, so no need to
750 * split it, we can remove it directly.
751 */
752 if (em->start >= start && em_end <= end)
753 goto remove_em;
754
755 gen = em->generation;
756 compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
757
758 if (em->start < start) {
759 if (!split) {
760 split = split2;
761 split2 = NULL;
762 if (!split)
763 goto remove_em;
764 }
765 split->start = em->start;
766 split->len = start - em->start;
767
768 if (em->block_start < EXTENT_MAP_LAST_BYTE) {
769 split->orig_start = em->orig_start;
770 split->block_start = em->block_start;
771
772 if (compressed)
773 split->block_len = em->block_len;
774 else
775 split->block_len = split->len;
776 split->orig_block_len = max(split->block_len,
777 em->orig_block_len);
778 split->ram_bytes = em->ram_bytes;
779 } else {
780 split->orig_start = split->start;
781 split->block_len = 0;
782 split->block_start = em->block_start;
783 split->orig_block_len = 0;
784 split->ram_bytes = split->len;
785 }
786
787 split->generation = gen;
788 split->flags = flags;
789 split->compress_type = em->compress_type;
790 replace_extent_mapping(em_tree, em, split, modified);
791 free_extent_map(split);
792 split = split2;
793 split2 = NULL;
794 }
795 if (em_end > end) {
796 if (!split) {
797 split = split2;
798 split2 = NULL;
799 if (!split)
800 goto remove_em;
801 }
802 split->start = end;
803 split->len = em_end - end;
804 split->block_start = em->block_start;
805 split->flags = flags;
806 split->compress_type = em->compress_type;
807 split->generation = gen;
808
809 if (em->block_start < EXTENT_MAP_LAST_BYTE) {
810 split->orig_block_len = max(em->block_len,
811 em->orig_block_len);
812
813 split->ram_bytes = em->ram_bytes;
814 if (compressed) {
815 split->block_len = em->block_len;
816 split->orig_start = em->orig_start;
817 } else {
818 const u64 diff = start + len - em->start;
819
820 split->block_len = split->len;
821 split->block_start += diff;
822 split->orig_start = em->orig_start;
823 }
824 } else {
825 split->ram_bytes = split->len;
826 split->orig_start = split->start;
827 split->block_len = 0;
828 split->orig_block_len = 0;
829 }
830
831 if (extent_map_in_tree(em)) {
832 replace_extent_mapping(em_tree, em, split,
833 modified);
834 } else {
835 int ret;
836
837 ret = add_extent_mapping(em_tree, split,
838 modified);
839 /* Logic error, shouldn't happen. */
840 ASSERT(ret == 0);
841 if (WARN_ON(ret != 0) && modified)
842 btrfs_set_inode_full_sync(inode);
843 }
844 free_extent_map(split);
845 split = NULL;
846 }
847 remove_em:
848 if (extent_map_in_tree(em)) {
849 /*
850 * If the extent map is still in the tree it means that
851 * either of the following is true:
852 *
853 * 1) It fits entirely in our range (doesn't end beyond
854 * it or starts before it);
855 *
856 * 2) It starts before our range and/or ends after our
857 * range, and we were not able to allocate the extent
858 * maps for split operations, @split and @split2.
859 *
860 * If we are at case 2) then we just remove the entire
861 * extent map - this is fine since if anyone needs it to
862 * access the subranges outside our range, will just
863 * load it again from the subvolume tree's file extent
864 * item. However if the extent map was in the list of
865 * modified extents, then we must mark the inode for a
866 * full fsync, otherwise a fast fsync will miss this
867 * extent if it's new and needs to be logged.
868 */
869 if ((em->start < start || em_end > end) && modified) {
870 ASSERT(!split);
871 btrfs_set_inode_full_sync(inode);
872 }
873 remove_extent_mapping(em_tree, em);
874 }
875
876 /*
877 * Once for the tree reference (we replaced or removed the
878 * extent map from the tree).
879 */
880 free_extent_map(em);
881 next:
882 /* Once for us (for our lookup reference). */
883 free_extent_map(em);
884
885 em = next_em;
886 }
887
888 write_unlock(&em_tree->lock);
889
890 free_extent_map(split);
891 free_extent_map(split2);
892 }
893
894 /*
895 * Replace a range in the inode's extent map tree with a new extent map.
896 *
897 * @inode: The target inode.
898 * @new_em: The new extent map to add to the inode's extent map tree.
899 * @modified: Indicate if the new extent map should be added to the list of
900 * modified extents (for fast fsync tracking).
901 *
902 * Drops all the extent maps in the inode's extent map tree that intersect the
903 * range of the new extent map and adds the new extent map to the tree.
904 * The caller should have locked an appropriate file range in the inode's io
905 * tree before calling this function.
906 */
907 int btrfs_replace_extent_map_range(struct btrfs_inode *inode,
908 struct extent_map *new_em,
909 bool modified)
910 {
911 const u64 end = new_em->start + new_em->len - 1;
912 struct extent_map_tree *tree = &inode->extent_tree;
913 int ret;
914
915 ASSERT(!extent_map_in_tree(new_em));
916
917 /*
918 * The caller has locked an appropriate file range in the inode's io
919 * tree, but getting -EEXIST when adding the new extent map can still
920 * happen in case there are extents that partially cover the range, and
921 * this is due to two tasks operating on different parts of the extent.
922 * See commit 18e83ac75bfe67 ("Btrfs: fix unexpected EEXIST from
923 * btrfs_get_extent") for an example and details.
924 */
925 do {
926 btrfs_drop_extent_map_range(inode, new_em->start, end, false);
927 write_lock(&tree->lock);
928 ret = add_extent_mapping(tree, new_em, modified);
929 write_unlock(&tree->lock);
930 } while (ret == -EEXIST);
931
932 return ret;
933 }
934
935 /*
936 * Split off the first pre bytes from the extent_map at [start, start + len],
937 * and set the block_start for it to new_logical.
938 *
939 * This function is used when an ordered_extent needs to be split.
940 */
941 int split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pre,
942 u64 new_logical)
943 {
944 struct extent_map_tree *em_tree = &inode->extent_tree;
945 struct extent_map *em;
946 struct extent_map *split_pre = NULL;
947 struct extent_map *split_mid = NULL;
948 int ret = 0;
949 unsigned long flags;
950
951 ASSERT(pre != 0);
952 ASSERT(pre < len);
953
954 split_pre = alloc_extent_map();
955 if (!split_pre)
956 return -ENOMEM;
957 split_mid = alloc_extent_map();
958 if (!split_mid) {
959 ret = -ENOMEM;
960 goto out_free_pre;
961 }
962
963 lock_extent(&inode->io_tree, start, start + len - 1, NULL);
964 write_lock(&em_tree->lock);
965 em = lookup_extent_mapping(em_tree, start, len);
966 if (!em) {
967 ret = -EIO;
968 goto out_unlock;
969 }
970
971 ASSERT(em->len == len);
972 ASSERT(!test_bit(EXTENT_FLAG_COMPRESSED, &em->flags));
973 ASSERT(em->block_start < EXTENT_MAP_LAST_BYTE);
974 ASSERT(test_bit(EXTENT_FLAG_PINNED, &em->flags));
975 ASSERT(!test_bit(EXTENT_FLAG_LOGGING, &em->flags));
976 ASSERT(!list_empty(&em->list));
977
978 flags = em->flags;
979 clear_bit(EXTENT_FLAG_PINNED, &em->flags);
980
981 /* First, replace the em with a new extent_map starting from * em->start */
982 split_pre->start = em->start;
983 split_pre->len = pre;
984 split_pre->orig_start = split_pre->start;
985 split_pre->block_start = new_logical;
986 split_pre->block_len = split_pre->len;
987 split_pre->orig_block_len = split_pre->block_len;
988 split_pre->ram_bytes = split_pre->len;
989 split_pre->flags = flags;
990 split_pre->compress_type = em->compress_type;
991 split_pre->generation = em->generation;
992
993 replace_extent_mapping(em_tree, em, split_pre, 1);
994
995 /*
996 * Now we only have an extent_map at:
997 * [em->start, em->start + pre]
998 */
999
1000 /* Insert the middle extent_map. */
1001 split_mid->start = em->start + pre;
1002 split_mid->len = em->len - pre;
1003 split_mid->orig_start = split_mid->start;
1004 split_mid->block_start = em->block_start + pre;
1005 split_mid->block_len = split_mid->len;
1006 split_mid->orig_block_len = split_mid->block_len;
1007 split_mid->ram_bytes = split_mid->len;
1008 split_mid->flags = flags;
1009 split_mid->compress_type = em->compress_type;
1010 split_mid->generation = em->generation;
1011 add_extent_mapping(em_tree, split_mid, 1);
1012
1013 /* Once for us */
1014 free_extent_map(em);
1015 /* Once for the tree */
1016 free_extent_map(em);
1017
1018 out_unlock:
1019 write_unlock(&em_tree->lock);
1020 unlock_extent(&inode->io_tree, start, start + len - 1, NULL);
1021 free_extent_map(split_mid);
1022 out_free_pre:
1023 free_extent_map(split_pre);
1024 return ret;
1025 }