diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 4157ecc27d4b6aeca1c290064b6c8baf0e69c580..6914cd8024ba040b21552be249ddfdf1da8a1d8e 100644 (file)
@@ -1269,7 +1269,7 @@ static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len,
        return ret;
 }
 
-static int do_discard_extent(struct btrfs_io_stripe *stripe, u64 *bytes)
+static int do_discard_extent(struct btrfs_discard_stripe *stripe, u64 *bytes)
 {
        struct btrfs_device *dev = stripe->dev;
        struct btrfs_fs_info *fs_info = dev->fs_info;
@@ -1316,76 +1316,60 @@ int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr,
        u64 discarded_bytes = 0;
        u64 end = bytenr + num_bytes;
        u64 cur = bytenr;
-       struct btrfs_io_context *bioc = NULL;
 
        /*
-        * Avoid races with device replace and make sure our bioc has devices
-        * associated to its stripes that don't go away while we are discarding.
+        * Avoid races with device replace and make sure the devices in the
+        * stripes don't go away while we are discarding.
         */
        btrfs_bio_counter_inc_blocked(fs_info);
        while (cur < end) {
-               struct btrfs_io_stripe *stripe;
+               struct btrfs_discard_stripe *stripes;
+               unsigned int num_stripes;
                int i;
 
                num_bytes = end - cur;
-               /* Tell the block device(s) that the sectors can be discarded */
-               ret = btrfs_map_block(fs_info, BTRFS_MAP_DISCARD, cur,
-                                     &num_bytes, &bioc, 0);
-               /*
-                * Error can be -ENOMEM, -ENOENT (no such chunk mapping) or
-                * -EOPNOTSUPP. For any such error, @num_bytes is not updated,
-                * thus we can't continue anyway.
-                */
-               if (ret < 0)
-                       goto out;
+               stripes = btrfs_map_discard(fs_info, cur, &num_bytes, &num_stripes);
+               if (IS_ERR(stripes)) {
+                       ret = PTR_ERR(stripes);
+                       if (ret == -EOPNOTSUPP)
+                               ret = 0;
+                       break;
+               }
 
-               stripe = bioc->stripes;
-               for (i = 0; i < bioc->num_stripes; i++, stripe++) {
+               for (i = 0; i < num_stripes; i++) {
+                       struct btrfs_discard_stripe *stripe = stripes + i;
                        u64 bytes;
-                       struct btrfs_device *device = stripe->dev;
 
-                       if (!device->bdev) {
+                       if (!stripe->dev->bdev) {
                                ASSERT(btrfs_test_opt(fs_info, DEGRADED));
                                continue;
                        }
 
-                       if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
+                       if (!test_bit(BTRFS_DEV_STATE_WRITEABLE,
+                                       &stripe->dev->dev_state))
                                continue;
 
                        ret = do_discard_extent(stripe, &bytes);
-                       if (!ret) {
-                               discarded_bytes += bytes;
-                       } else if (ret != -EOPNOTSUPP) {
+                       if (ret) {
                                /*
-                                * Logic errors or -ENOMEM, or -EIO, but
-                                * unlikely to happen.
-                                *
-                                * And since there are two loops, explicitly
-                                * go to out to avoid confusion.
+                                * Keep going if discard is not supported by the
+                                * device.
                                 */
-                               btrfs_put_bioc(bioc);
-                               goto out;
+                               if (ret != -EOPNOTSUPP)
+                                       break;
+                               ret = 0;
+                       } else {
+                               discarded_bytes += bytes;
                        }
-
-                       /*
-                        * Just in case we get back EOPNOTSUPP for some reason,
-                        * just ignore the return value so we don't screw up
-                        * people calling discard_extent.
-                        */
-                       ret = 0;
                }
-               btrfs_put_bioc(bioc);
+               kfree(stripes);
+               if (ret)
+                       break;
                cur += num_bytes;
        }
-out:
        btrfs_bio_counter_dec(fs_info);
-
        if (actual_bytes)
                *actual_bytes = discarded_bytes;
-
-
-       if (ret == -EOPNOTSUPP)
-               ret = 0;
        return ret;
 }
 
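A note on the new control flow: the rewrite drops the bioc-based "goto out"
ladder in favour of plain break/continue. A per-stripe -EOPNOTSUPP is reset
to 0 so the remaining stripes are still discarded, while any other error
breaks out of both loops after the stripes array is freed. A minimal
userspace sketch of that shape, with stub types and a hypothetical
discard_one() standing in for do_discard_extent():

	#include <errno.h>

	struct stripe { int supports_discard; };

	/* Stand-in for do_discard_extent(); returns 0 or a negative errno. */
	static int discard_one(const struct stripe *s, unsigned long long *bytes)
	{
		if (!s->supports_discard)
			return -EOPNOTSUPP;
		*bytes = 4096;
		return 0;
	}

	static int discard_all(const struct stripe *stripes,
			       unsigned int num_stripes,
			       unsigned long long *discarded)
	{
		int ret = 0;

		for (unsigned int i = 0; i < num_stripes; i++) {
			unsigned long long bytes = 0;

			ret = discard_one(&stripes[i], &bytes);
			if (ret) {
				/* Keep going if this device cannot discard. */
				if (ret != -EOPNOTSUPP)
					break;
				ret = 0;
			} else {
				*discarded += bytes;
			}
		}
		return ret;
	}
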
@@ -2567,17 +2551,10 @@ int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
                return -EINVAL;
 
        /*
-        * pull in the free space cache (if any) so that our pin
-        * removes the free space from the cache.  We have load_only set
-        * to one because the slow code to read in the free extents does check
-        * the pinned extents.
+        * Fully cache the free space first so that our pin removes the free space
+        * from the cache.
         */
-       btrfs_cache_block_group(cache, 1);
-       /*
-        * Make sure we wait until the cache is completely built in case it is
-        * missing or is invalid and therefore needs to be rebuilt.
-        */
-       ret = btrfs_wait_block_group_cache_done(cache);
+       ret = btrfs_cache_block_group(cache, true);
        if (ret)
                goto out;
 
@@ -2600,12 +2577,7 @@ static int __exclude_logged_extent(struct btrfs_fs_info *fs_info,
        if (!block_group)
                return -EINVAL;
 
-       btrfs_cache_block_group(block_group, 1);
-       /*
-        * Make sure we wait until the cache is completely built in case it is
-        * missing or is invalid and therefore needs to be rebuilt.
-        */
-       ret = btrfs_wait_block_group_cache_done(block_group);
+       ret = btrfs_cache_block_group(block_group, true);
        if (ret)
                goto out;
 
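Both hunks above apply the same API change (repeated once more in
btrfs_trim_fs() below): btrfs_cache_block_group() appears to take a bool
wait now instead of an int load_only, folding the separate
btrfs_wait_block_group_cache_done() call into the helper. Schematically, as
a sketch of the calling convention rather than of the helper itself:

	/* Before: start caching, then wait for it explicitly. */
	btrfs_cache_block_group(cache, 1);
	ret = btrfs_wait_block_group_cache_done(cache);

	/*
	 * After: wait == true makes the helper itself wait for caching to
	 * finish; wait == false (as in find_free_extent) only starts it.
	 */
	ret = btrfs_cache_block_group(cache, true);
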
@@ -3981,23 +3953,63 @@ static void found_extent(struct find_free_extent_ctl *ffe_ctl,
        }
 }
 
-static bool can_allocate_chunk(struct btrfs_fs_info *fs_info,
-                              struct find_free_extent_ctl *ffe_ctl)
+static int can_allocate_chunk_zoned(struct btrfs_fs_info *fs_info,
+                                   struct find_free_extent_ctl *ffe_ctl)
+{
+       /* If we can activate new zone, just allocate a chunk and use it */
+       if (btrfs_can_activate_zone(fs_info->fs_devices, ffe_ctl->flags))
+               return 0;
+
+       /*
+        * We already reached the max active zones. Try to finish one block
+        * group to make room for a new block group. This is only possible
+        * for a data block group because btrfs_zone_finish() may need to wait
+        * for a running transaction, which can cause a deadlock for metadata
+        * allocation.
+        */
+       if (ffe_ctl->flags & BTRFS_BLOCK_GROUP_DATA) {
+               int ret = btrfs_zone_finish_one_bg(fs_info);
+
+               if (ret == 1)
+                       return 0;
+               else if (ret < 0)
+                       return ret;
+       }
+
+       /*
+        * If we have enough free space left in an already active block group
+        * and we can't activate any other zone now, do not allow allocating a
+        * new chunk and let find_free_extent() retry with a smaller size.
+        */
+       if (ffe_ctl->max_extent_size >= ffe_ctl->min_alloc_size)
+               return -ENOSPC;
+
+       /*
+        * Not even min_alloc_size is left in any block group. Since we cannot
+        * activate a new block group, allocating one may not help. Tell the
+        * caller to try again and hope it makes progress by writing out some
+        * parts of the region. That is only possible for data block groups,
+        * where a part of the region can be written.
+        */
+       if (ffe_ctl->flags & BTRFS_BLOCK_GROUP_DATA)
+               return -EAGAIN;
+
+       /*
+        * We cannot activate a new block group and there is not enough space
+        * left in any block group, so allocating a new one may not help. But
+        * there is nothing else to do anyway, so let's go with it.
+        */
+       return 0;
+}
+
+static int can_allocate_chunk(struct btrfs_fs_info *fs_info,
+                             struct find_free_extent_ctl *ffe_ctl)
 {
        switch (ffe_ctl->policy) {
        case BTRFS_EXTENT_ALLOC_CLUSTERED:
-               return true;
+               return 0;
        case BTRFS_EXTENT_ALLOC_ZONED:
-               /*
-                * If we have enough free space left in an already
-                * active block group and we can't activate any other
-                * zone now, do not allow allocating a new chunk and
-                * let find_free_extent() retry with a smaller size.
-                */
-               if (ffe_ctl->max_extent_size >= ffe_ctl->min_alloc_size &&
-                   !btrfs_can_activate_zone(fs_info->fs_devices, ffe_ctl->flags))
-                       return false;
-               return true;
+               return can_allocate_chunk_zoned(fs_info, ffe_ctl);
        default:
                BUG();
        }
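
The zoned helper introduced above encodes a three-outcome decision ladder.
Below is a compilable userspace model of it, with stubbed inputs replacing
the fs_info/ffe_ctl queries (btrfs_can_activate_zone(),
btrfs_zone_finish_one_bg() and the BTRFS_BLOCK_GROUP_DATA flag test); the
struct and function names are invented for the sketch:

	#include <errno.h>
	#include <stdbool.h>

	/* Stubbed inputs for the conditions the helper queries. */
	struct zoned_state {
		bool can_activate_zone;	/* btrfs_can_activate_zone() */
		bool is_data;		/* flags & BTRFS_BLOCK_GROUP_DATA */
		int finished_one_bg;	/* btrfs_zone_finish_one_bg(): 1, 0 or <0 */
		unsigned long long max_extent_size;
		unsigned long long min_alloc_size;
	};

	/*
	 * Returns 0 to allocate a chunk, -ENOSPC to make find_free_extent()
	 * retry with a smaller size, -EAGAIN (data only) to retry hoping
	 * partial writes of the region make progress.
	 */
	static int can_allocate_chunk_zoned_model(const struct zoned_state *s)
	{
		if (s->can_activate_zone)
			return 0;
		if (s->is_data) {
			if (s->finished_one_bg == 1)
				return 0;
			if (s->finished_one_bg < 0)
				return s->finished_one_bg;
		}
		if (s->max_extent_size >= s->min_alloc_size)
			return -ENOSPC;
		if (s->is_data)
			return -EAGAIN;
		return 0;
	}
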
@@ -4079,8 +4091,9 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info,
                        int exist = 0;
 
                        /* Check if the allocation policy allows creating a new chunk */
-                       if (!can_allocate_chunk(fs_info, ffe_ctl))
-                               return -ENOSPC;
+                       ret = can_allocate_chunk(fs_info, ffe_ctl);
+                       if (ret)
+                               return ret;
 
                        trans = current->journal_info;
                        if (trans)
@@ -4374,7 +4387,7 @@ have_block_group:
                ffe_ctl->cached = btrfs_block_group_done(block_group);
                if (unlikely(!ffe_ctl->cached)) {
                        ffe_ctl->have_caching_bg = true;
-                       ret = btrfs_cache_block_group(block_group, 0);
+                       ret = btrfs_cache_block_group(block_group, false);
 
                        /*
                         * If we get ENOMEM here or something else we want to
@@ -4842,6 +4855,7 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 {
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct extent_buffer *buf;
+       u64 lockdep_owner = owner;
 
        buf = btrfs_find_create_tree_block(fs_info, bytenr, owner, level);
        if (IS_ERR(buf))
@@ -4860,12 +4874,27 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
                return ERR_PTR(-EUCLEAN);
        }
 
+       /*
+        * The reloc trees are just snapshots, so we need them to appear to be
+        * just like any other fs tree WRT lockdep.
+        *
+        * The exception however is in replace_path() in relocation, where we
+        * hold the lock on the original fs root and then search for the reloc
+        * root.  At that point we need to make sure any reloc root buffers are
+        * set to the BTRFS_TREE_RELOC_OBJECTID lockdep class in order to make
+        * lockdep happy.
+        */
+       if (lockdep_owner == BTRFS_TREE_RELOC_OBJECTID &&
+           !test_bit(BTRFS_ROOT_RESET_LOCKDEP_CLASS, &root->state))
+               lockdep_owner = BTRFS_FS_TREE_OBJECTID;
+
        /*
         * This needs to stay, because we could allocate a freed block from an
         * old tree into a new tree, so we need to make sure this new block is
         * set to the appropriate level and owner.
         */
-       btrfs_set_buffer_lockdep_class(owner, buf, level);
+       btrfs_set_buffer_lockdep_class(lockdep_owner, buf, level);
+
        __btrfs_tree_lock(buf, nest);
        btrfs_clean_tree_block(buf);
        clear_bit(EXTENT_BUFFER_STALE, &buf->bflags);
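
The lockdep_owner logic above is small enough to model in isolation. The
object-ID values below mirror the kernel's ctree.h definitions, the bool
parameter stands in for the BTRFS_ROOT_RESET_LOCKDEP_CLASS root-state bit,
and the function name is invented for the sketch:

	#include <stdbool.h>

	#define BTRFS_FS_TREE_OBJECTID    5ULL
	#define BTRFS_TREE_RELOC_OBJECTID ((unsigned long long)-8)

	/*
	 * Reloc trees are snapshots of fs trees, so their buffers normally
	 * get the fs-tree lockdep class. replace_path() is the exception:
	 * it sets the reset bit so reloc buffers keep the reloc-tree class
	 * while the original fs root's lock is held.
	 */
	static unsigned long long lockdep_owner_for(unsigned long long owner,
						    bool reset_lockdep_class)
	{
		if (owner == BTRFS_TREE_RELOC_OBJECTID && !reset_lockdep_class)
			return BTRFS_FS_TREE_OBJECTID;
		return owner;
	}
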
@@ -5829,7 +5858,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
        btrfs_qgroup_convert_reserved_meta(root, INT_MAX);
        btrfs_qgroup_free_meta_all_pertrans(root);
 
-       if (test_bit(BTRFS_ROOT_REGISTERED, &root->state))
+       if (test_bit(BTRFS_ROOT_IN_RADIX, &root->state))
                btrfs_add_dropped_root(trans, root);
        else
                btrfs_put_root(root);
@@ -5992,7 +6021,7 @@ int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info,
  */
 static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed)
 {
-       u64 start = SZ_1M, len = 0, end = 0;
+       u64 start = BTRFS_DEVICE_RANGE_RESERVED, len = 0, end = 0;
        int ret;
 
        *trimmed = 0;
@@ -6036,8 +6065,8 @@ static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed)
                        break;
                }
 
-               /* Ensure we skip the reserved area in the first 1M */
-               start = max_t(u64, start, SZ_1M);
+               /* Ensure we skip the reserved space on each device. */
+               start = max_t(u64, start, BTRFS_DEVICE_RANGE_RESERVED);
 
                /*
                 * If find_first_clear_extent_bit finds a range that spans the
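
BTRFS_DEVICE_RANGE_RESERVED replaces the hardcoded SZ_1M both here and in
the initial value of start above; presumably it is defined alongside the
device code (fs/btrfs/volumes.h in this series) as the same 1MiB
reservation:

	#define BTRFS_DEVICE_RANGE_RESERVED (SZ_1M)
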
@@ -6128,13 +6157,7 @@ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range)
 
                if (end - start >= range->minlen) {
                        if (!btrfs_block_group_done(cache)) {
-                               ret = btrfs_cache_block_group(cache, 0);
-                               if (ret) {
-                                       bg_failed++;
-                                       bg_ret = ret;
-                                       continue;
-                               }
-                               ret = btrfs_wait_block_group_cache_done(cache);
+                               ret = btrfs_cache_block_group(cache, true);
                                if (ret) {
                                        bg_failed++;
                                        bg_ret = ret;