diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 4157ecc27d4b6aeca1c290064b6c8baf0e69c580..6914cd8024ba040b21552be249ddfdf1da8a1d8e 100644 (file)
@@ -1269,7 +1269,7 @@ static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len,
        return ret;
 }
 
-static int do_discard_extent(struct btrfs_io_stripe *stripe, u64 *bytes)
+static int do_discard_extent(struct btrfs_discard_stripe *stripe, u64 *bytes)
 {
        struct btrfs_device *dev = stripe->dev;
        struct btrfs_fs_info *fs_info = dev->fs_info;
@@ -1316,76 +1316,60 @@ int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr,
        u64 discarded_bytes = 0;
        u64 end = bytenr + num_bytes;
        u64 cur = bytenr;
-       struct btrfs_io_context *bioc = NULL;
 
        /*
-        * Avoid races with device replace and make sure our bioc has devices
-        * associated to its stripes that don't go away while we are discarding.
+        * Avoid races with device replace and make sure the devices in the
+        * stripes don't go away while we are discarding.
         */
        btrfs_bio_counter_inc_blocked(fs_info);
        while (cur < end) {
-               struct btrfs_io_stripe *stripe;
+               struct btrfs_discard_stripe *stripes;
+               unsigned int num_stripes;
                int i;
 
                num_bytes = end - cur;
-               /* Tell the block device(s) that the sectors can be discarded */
-               ret = btrfs_map_block(fs_info, BTRFS_MAP_DISCARD, cur,
-                                     &num_bytes, &bioc, 0);
-               /*
-                * Error can be -ENOMEM, -ENOENT (no such chunk mapping) or
-                * -EOPNOTSUPP. For any such error, @num_bytes is not updated,
-                * thus we can't continue anyway.
-                */
-               if (ret < 0)
-                       goto out;
+               stripes = btrfs_map_discard(fs_info, cur, &num_bytes, &num_stripes);
+               if (IS_ERR(stripes)) {
+                       ret = PTR_ERR(stripes);
+                       if (ret == -EOPNOTSUPP)
+                               ret = 0;
+                       break;
+               }
 
-               stripe = bioc->stripes;
-               for (i = 0; i < bioc->num_stripes; i++, stripe++) {
+               for (i = 0; i < num_stripes; i++) {
+                       struct btrfs_discard_stripe *stripe = stripes + i;
                        u64 bytes;
-                       struct btrfs_device *device = stripe->dev;
 
-                       if (!device->bdev) {
+                       if (!stripe->dev->bdev) {
                                ASSERT(btrfs_test_opt(fs_info, DEGRADED));
                                continue;
                        }
 
-                       if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
+                       if (!test_bit(BTRFS_DEV_STATE_WRITEABLE,
+                                       &stripe->dev->dev_state))
                                continue;
 
                        ret = do_discard_extent(stripe, &bytes);
-                       if (!ret) {
-                               discarded_bytes += bytes;
-                       } else if (ret != -EOPNOTSUPP) {
+                       if (ret) {
                                /*
-                                * Logic errors or -ENOMEM, or -EIO, but
-                                * unlikely to happen.
-                                *
-                                * And since there are two loops, explicitly
-                                * go to out to avoid confusion.
+                                * Keep going if discard is not supported by the
+                                * device.
                                 */
-                               btrfs_put_bioc(bioc);
-                               goto out;
+                               if (ret != -EOPNOTSUPP)
+                                       break;
+                               ret = 0;
+                       } else {
+                               discarded_bytes += bytes;
                        }
-
-                       /*
-                        * Just in case we get back EOPNOTSUPP for some reason,
-                        * just ignore the return value so we don't screw up
-                        * people calling discard_extent.
-                        */
-                       ret = 0;
                }
-               btrfs_put_bioc(bioc);
+               kfree(stripes);
+               if (ret)
+                       break;
                cur += num_bytes;
        }
-out:
        btrfs_bio_counter_dec(fs_info);
-
        if (actual_bytes)
                *actual_bytes = discarded_bytes;
-
-
-       if (ret == -EOPNOTSUPP)
-               ret = 0;
        return ret;
 }
 
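A note on the new control flow: the rewrite drops the bioc-based "goto out"
ladder in favour of plain break/continue. A per-stripe -EOPNOTSUPP is reset
to 0 so the remaining stripes are still discarded, while any other error
breaks out of both loops after the stripes array is freed. A minimal
userspace sketch of that shape, with stub types and a hypothetical
discard_one() standing in for do_discard_extent():

	#include <errno.h>

	struct stripe { int supports_discard; };

	/* Stand-in for do_discard_extent(); returns 0 or a negative errno. */
	static int discard_one(const struct stripe *s, unsigned long long *bytes)
	{
		if (!s->supports_discard)
			return -EOPNOTSUPP;
		*bytes = 4096;
		return 0;
	}

	static int discard_all(const struct stripe *stripes,
			       unsigned int num_stripes,
			       unsigned long long *discarded)
	{
		int ret = 0;

		for (unsigned int i = 0; i < num_stripes; i++) {
			unsigned long long bytes = 0;

			ret = discard_one(&stripes[i], &bytes);
			if (ret) {
				/* Keep going if this device cannot discard. */
				if (ret != -EOPNOTSUPP)
					break;
				ret = 0;
			} else {
				*discarded += bytes;
			}
		}
		return ret;
	}
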
@@ -2567,17 +2551,10 @@ int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
                return -EINVAL;
 
        /*
-        * pull in the free space cache (if any) so that our pin
-        * removes the free space from the cache.  We have load_only set
-        * to one because the slow code to read in the free extents does check
-        * the pinned extents.
+        * Fully cache the free space first so that our pin removes the free space
+        * from the cache.
         */
-       btrfs_cache_block_group(cache, 1);
-       /*
-        * Make sure we wait until the cache is completely built in case it is
-        * missing or is invalid and therefore needs to be rebuilt.
-        */
-       ret = btrfs_wait_block_group_cache_done(cache);
+       ret = btrfs_cache_block_group(cache, true);
        if (ret)
                goto out;
 
@@ -2600,12 +2577,7 @@ static int __exclude_logged_extent(struct btrfs_fs_info *fs_info,
        if (!block_group)
                return -EINVAL;
 
-       btrfs_cache_block_group(block_group, 1);
-       /*
-        * Make sure we wait until the cache is completely built in case it is
-        * missing or is invalid and therefore needs to be rebuilt.
-        */
-       ret = btrfs_wait_block_group_cache_done(block_group);
+       ret = btrfs_cache_block_group(block_group, true);
        if (ret)
                goto out;
 
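Both hunks above apply the same API change (repeated once more in
btrfs_trim_fs() below): btrfs_cache_block_group() appears to take a bool
wait now instead of an int load_only, folding the separate
btrfs_wait_block_group_cache_done() call into the helper. Schematically, as
a sketch of the calling convention rather than of the helper itself:

	/* Before: start caching, then wait for it explicitly. */
	btrfs_cache_block_group(cache, 1);
	ret = btrfs_wait_block_group_cache_done(cache);

	/*
	 * After: wait == true makes the helper itself wait for caching to
	 * finish; wait == false (as in find_free_extent) only starts it.
	 */
	ret = btrfs_cache_block_group(cache, true);
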
@@ -3981,23 +3953,63 @@ static void found_extent(struct find_free_extent_ctl *ffe_ctl,
        }
 }
 
-static bool can_allocate_chunk(struct btrfs_fs_info *fs_info,
-                              struct find_free_extent_ctl *ffe_ctl)
+static int can_allocate_chunk_zoned(struct btrfs_fs_info *fs_info,
+                                   struct find_free_extent_ctl *ffe_ctl)
+{
+       /* If we can activate new zone, just allocate a chunk and use it */
+       if (btrfs_can_activate_zone(fs_info->fs_devices, ffe_ctl->flags))
+               return 0;
+
+       /*
+        * We already reached the max active zones. Try to finish one block
+        * group to make room for a new block group. This is only possible
+        * for a data block group because btrfs_zone_finish() may need to wait
+        * for a running transaction, which can cause a deadlock for metadata
+        * allocation.
+        */
+       if (ffe_ctl->flags & BTRFS_BLOCK_GROUP_DATA) {
+               int ret = btrfs_zone_finish_one_bg(fs_info);
+
+               if (ret == 1)
+                       return 0;
+               else if (ret < 0)
+                       return ret;
+       }
+
+       /*
+        * If we have enough free space left in an already active block group
+        * and we can't activate any other zone now, do not allow allocating a
+        * new chunk and let find_free_extent() retry with a smaller size.
+        */
+       if (ffe_ctl->max_extent_size >= ffe_ctl->min_alloc_size)
+               return -ENOSPC;
+
+       /*
+        * Not even min_alloc_size is left in any block group. Since we cannot
+        * activate a new block group, allocating one may not help. Tell the
+        * caller to try again and hope it makes progress by writing out some
+        * parts of the region. That is only possible for data block groups,
+        * where a part of the region can be written.
+        */
+       if (ffe_ctl->flags & BTRFS_BLOCK_GROUP_DATA)
+               return -EAGAIN;
+
+       /*
+        * We cannot activate a new block group and there is not enough space
+        * left in any block group, so allocating a new one may not help. But
+        * there is nothing else to do anyway, so let's go with it.
+        */
+       return 0;
+}
+
+static int can_allocate_chunk(struct btrfs_fs_info *fs_info,
+                             struct find_free_extent_ctl *ffe_ctl)
 {
        switch (ffe_ctl->policy) {
        case BTRFS_EXTENT_ALLOC_CLUSTERED:
-               return true;
+               return 0;
        case BTRFS_EXTENT_ALLOC_ZONED:
-               /*
-                * If we have enough free space left in an already
-                * active block group and we can't activate any other
-                * zone now, do not allow allocating a new chunk and
-                * let find_free_extent() retry with a smaller size.
-                */
-               if (ffe_ctl->max_extent_size >= ffe_ctl->min_alloc_size &&
-                   !btrfs_can_activate_zone(fs_info->fs_devices, ffe_ctl->flags))
-                       return false;
-               return true;
+               return can_allocate_chunk_zoned(fs_info, ffe_ctl);
        default:
                BUG();
        }
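
The zoned helper introduced above encodes a three-outcome decision ladder.
Below is a compilable userspace model of it, with stubbed inputs replacing
the fs_info/ffe_ctl queries (btrfs_can_activate_zone(),
btrfs_zone_finish_one_bg() and the BTRFS_BLOCK_GROUP_DATA flag test); the
struct and function names are invented for the sketch:

	#include <errno.h>
	#include <stdbool.h>

	/* Stubbed inputs for the conditions the helper queries. */
	struct zoned_state {
		bool can_activate_zone;	/* btrfs_can_activate_zone() */
		bool is_data;		/* flags & BTRFS_BLOCK_GROUP_DATA */
		int finished_one_bg;	/* btrfs_zone_finish_one_bg(): 1, 0 or <0 */
		unsigned long long max_extent_size;
		unsigned long long min_alloc_size;
	};

	/*
	 * Returns 0 to allocate a chunk, -ENOSPC to make find_free_extent()
	 * retry with a smaller size, -EAGAIN (data only) to retry hoping
	 * partial writes of the region make progress.
	 */
	static int can_allocate_chunk_zoned_model(const struct zoned_state *s)
	{
		if (s->can_activate_zone)
			return 0;
		if (s->is_data) {
			if (s->finished_one_bg == 1)
				return 0;
			if (s->finished_one_bg < 0)
				return s->finished_one_bg;
		}
		if (s->max_extent_size >= s->min_alloc_size)
			return -ENOSPC;
		if (s->is_data)
			return -EAGAIN;
		return 0;
	}
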
@@ -4079,8 +4091,9 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info,
                        int exist = 0;
 
                        /* Check if the allocation policy allows creating a new chunk */
-                       if (!can_allocate_chunk(fs_info, ffe_ctl))
-                               return -ENOSPC;
+                       ret = can_allocate_chunk(fs_info, ffe_ctl);
+                       if (ret)
+                               return ret;
 
                        trans = current->journal_info;
                        if (trans)
@@ -4374,7 +4387,7 @@ have_block_group:
                ffe_ctl->cached = btrfs_block_group_done(block_group);
                if (unlikely(!ffe_ctl->cached)) {
                        ffe_ctl->have_caching_bg = true;
-                       ret = btrfs_cache_block_group(block_group, 0);
+                       ret = btrfs_cache_block_group(block_group, false);
 
                        /*
                         * If we get ENOMEM here or something else we want to
@@ -4842,6 +4855,7 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 {
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct extent_buffer *buf;
+       u64 lockdep_owner = owner;
 
        buf = btrfs_find_create_tree_block(fs_info, bytenr, owner, level);
        if (IS_ERR(buf))
@@ -4860,12 +4874,27 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
                return ERR_PTR(-EUCLEAN);
        }
 
+       /*
+        * The reloc trees are just snapshots, so we need them to appear to be
+        * just like any other fs tree WRT lockdep.
+        *
+        * The exception however is in replace_path() in relocation, where we
+        * hold the lock on the original fs root and then search for the reloc
+        * root.  At that point we need to make sure any reloc root buffers are
+        * set to the BTRFS_TREE_RELOC_OBJECTID lockdep class in order to make
+        * lockdep happy.
+        */
+       if (lockdep_owner == BTRFS_TREE_RELOC_OBJECTID &&
+           !test_bit(BTRFS_ROOT_RESET_LOCKDEP_CLASS, &root->state))
+               lockdep_owner = BTRFS_FS_TREE_OBJECTID;
+
        /*
         * This needs to stay, because we could allocate a freed block from an
         * old tree into a new tree, so we need to make sure this new block is
         * set to the appropriate level and owner.
         */
-       btrfs_set_buffer_lockdep_class(owner, buf, level);
+       btrfs_set_buffer_lockdep_class(lockdep_owner, buf, level);
+
        __btrfs_tree_lock(buf, nest);
        btrfs_clean_tree_block(buf);
        clear_bit(EXTENT_BUFFER_STALE, &buf->bflags);
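
The lockdep_owner logic above is small enough to model in isolation. The
object-ID values below mirror the kernel's ctree.h definitions, the bool
parameter stands in for the BTRFS_ROOT_RESET_LOCKDEP_CLASS root-state bit,
and the function name is invented for the sketch:

	#include <stdbool.h>

	#define BTRFS_FS_TREE_OBJECTID    5ULL
	#define BTRFS_TREE_RELOC_OBJECTID ((unsigned long long)-8)

	/*
	 * Reloc trees are snapshots of fs trees, so their buffers normally
	 * get the fs-tree lockdep class. replace_path() is the exception:
	 * it sets the reset bit so reloc buffers keep the reloc-tree class
	 * while the original fs root's lock is held.
	 */
	static unsigned long long lockdep_owner_for(unsigned long long owner,
						    bool reset_lockdep_class)
	{
		if (owner == BTRFS_TREE_RELOC_OBJECTID && !reset_lockdep_class)
			return BTRFS_FS_TREE_OBJECTID;
		return owner;
	}
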
@@ -5829,7 +5858,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
        btrfs_qgroup_convert_reserved_meta(root, INT_MAX);
        btrfs_qgroup_free_meta_all_pertrans(root);
 
-       if (test_bit(BTRFS_ROOT_REGISTERED, &root->state))
+       if (test_bit(BTRFS_ROOT_IN_RADIX, &root->state))
                btrfs_add_dropped_root(trans, root);
        else
                btrfs_put_root(root);
@@ -5992,7 +6021,7 @@ int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info,
  */
 static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed)
 {
-       u64 start = SZ_1M, len = 0, end = 0;
+       u64 start = BTRFS_DEVICE_RANGE_RESERVED, len = 0, end = 0;
        int ret;
 
        *trimmed = 0;
@@ -6036,8 +6065,8 @@ static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed)
                        break;
                }
 
-               /* Ensure we skip the reserved area in the first 1M */
-               start = max_t(u64, start, SZ_1M);
+               /* Ensure we skip the reserved space on each device. */
+               start = max_t(u64, start, BTRFS_DEVICE_RANGE_RESERVED);
 
                /*
                 * If find_first_clear_extent_bit finds a range that spans the
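
BTRFS_DEVICE_RANGE_RESERVED replaces the hardcoded SZ_1M both here and in
the initial value of start above; presumably it is defined alongside the
device code (fs/btrfs/volumes.h in this series) as the same 1MiB
reservation:

	#define BTRFS_DEVICE_RANGE_RESERVED (SZ_1M)
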
@@ -6128,13 +6157,7 @@ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range)
 
                if (end - start >= range->minlen) {
                        if (!btrfs_block_group_done(cache)) {
-                               ret = btrfs_cache_block_group(cache, 0);
-                               if (ret) {
-                                       bg_failed++;
-                                       bg_ret = ret;
-                                       continue;
-                               }
-                               ret = btrfs_wait_block_group_cache_done(cache);
+                               ret = btrfs_cache_block_group(cache, true);
                                if (ret) {
                                        bg_failed++;
                                        bg_ret = ret;