Merge tag 'for-6.8-rc4-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux
Author:     Linus Torvalds <torvalds@linux-foundation.org>
AuthorDate: Wed, 14 Feb 2024 23:47:02 +0000 (15:47 -0800)
Commit:     Linus Torvalds <torvalds@linux-foundation.org>
CommitDate: Wed, 14 Feb 2024 23:47:02 +0000 (15:47 -0800)
Pull btrfs fixes from David Sterba:
 "A few regular fixes and one fix for space reservation regression since
  6.7 that users have been reporting:

   - fix over-reservation of metadata chunks due to not keeping proper
     balance between global block reserve and delayed refs reserve; in
     practice this leaves behind empty metadata block groups, the
     workaround is to reclaim them by using the '-musage=1' balance
     filter (see the ioctl sketch after this message)

   - other space reservation fixes:
      - do not delete unused block group if it may be used soon
      - do not reserve space for checksums for NOCOW files

   - fix extent map assertion failure when writing out free space inode

   - reject encoded write if inode has nodatasum flag set

   - fix chunk map leak when loading block group zone info"
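
The '-musage=1' workaround is normally run as "btrfs balance start -musage=1
<mount>". A minimal user-space sketch of the same request issued through the
BTRFS_IOC_BALANCE_V2 ioctl; struct and flag names are as in the uapi
<linux/btrfs.h>, error handling is kept minimal, and CAP_SYS_ADMIN is
required:

/* Equivalent of "btrfs balance start -musage=1 <mount>". */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/btrfs.h>

int main(int argc, char **argv)
{
	struct btrfs_ioctl_balance_args args;
	int fd;

	if (argc != 2) {
		fprintf(stderr, "usage: %s <btrfs-mount-point>\n", argv[0]);
		return 1;
	}
	fd = open(argv[1], O_RDONLY | O_DIRECTORY);
	if (fd < 0) {
		perror("open");
		return 1;
	}
	memset(&args, 0, sizeof(args));
	args.flags = BTRFS_BALANCE_METADATA;		/* metadata chunks only */
	args.meta.flags = BTRFS_BALANCE_ARGS_USAGE;	/* enable the usage filter */
	args.meta.usage = 1;				/* relocate chunks <= 1% used */
	if (ioctl(fd, BTRFS_IOC_BALANCE_V2, &args) < 0)
		perror("BTRFS_IOC_BALANCE_V2");
	close(fd);
	return 0;
}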

* tag 'for-6.8-rc4-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux:
  btrfs: don't refill whole delayed refs block reserve when starting transaction
  btrfs: zoned: fix chunk map leak when loading block group zone info
  btrfs: reject encoded write if inode has nodatasum flag set
  btrfs: don't reserve space for checksums when writing to nocow files
  btrfs: add new unused block groups to the list of unused block groups
  btrfs: do not delete unused block group if it may be used soon
  btrfs: add and use helper to check if block group is used
  btrfs: don't drop extent_map for free space inode on write error

fs/btrfs/block-group.c
fs/btrfs/block-group.h
fs/btrfs/delalloc-space.c
fs/btrfs/inode.c
fs/btrfs/transaction.c
fs/btrfs/zoned.c

diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index a9be9ac9922225bb32801aec5834c9e9d87ffc97..378d9103a2072b1628e66d850a42b9254be72b36 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -1455,6 +1455,7 @@ out:
  */
 void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
 {
+       LIST_HEAD(retry_list);
        struct btrfs_block_group *block_group;
        struct btrfs_space_info *space_info;
        struct btrfs_trans_handle *trans;
@@ -1476,6 +1477,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
 
        spin_lock(&fs_info->unused_bgs_lock);
        while (!list_empty(&fs_info->unused_bgs)) {
+               u64 used;
                int trimming;
 
                block_group = list_first_entry(&fs_info->unused_bgs,
@@ -1511,9 +1513,9 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
                        goto next;
                }
 
+               spin_lock(&space_info->lock);
                spin_lock(&block_group->lock);
-               if (block_group->reserved || block_group->pinned ||
-                   block_group->used || block_group->ro ||
+               if (btrfs_is_block_group_used(block_group) || block_group->ro ||
                    list_is_singular(&block_group->list)) {
                        /*
                         * We want to bail if we made new allocations or have
@@ -1523,10 +1525,49 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
                         */
                        trace_btrfs_skip_unused_block_group(block_group);
                        spin_unlock(&block_group->lock);
+                       spin_unlock(&space_info->lock);
                        up_write(&space_info->groups_sem);
                        goto next;
                }
+
+               /*
+                * The block group may be unused but there may be space reserved
+                * accounting with the existence of that block group, that is,
+                * space_info->bytes_may_use was incremented by a task but no
+                * space was yet allocated from the block group by the task.
+                * That space may or may not be allocated, as we are generally
+                * pessimistic about space reservation for metadata as well as
+                * for data when using compression (as we reserve space based on
+                * the worst case, when data can't be compressed, and before
+                * actually attempting compression, before starting writeback).
+                *
+                * So check if the total space of the space_info minus the size
+                * of this block group is less than the used space of the
+                * space_info - if that's the case, then it means we have tasks
+                * that might be relying on the block group in order to allocate
+                * extents, and add back the block group to the unused list when
+                * we finish, so that we retry later in case no tasks ended up
+                * needing to allocate extents from the block group.
+                */
+               used = btrfs_space_info_used(space_info, true);
+               if (space_info->total_bytes - block_group->length < used) {
+                       /*
+                        * Add a reference for the list, compensate for the ref
+                        * drop under the "next" label for the
+                        * fs_info->unused_bgs list.
+                        */
+                       btrfs_get_block_group(block_group);
+                       list_add_tail(&block_group->bg_list, &retry_list);
+
+                       trace_btrfs_skip_unused_block_group(block_group);
+                       spin_unlock(&block_group->lock);
+                       spin_unlock(&space_info->lock);
+                       up_write(&space_info->groups_sem);
+                       goto next;
+               }
+
                spin_unlock(&block_group->lock);
+               spin_unlock(&space_info->lock);
 
                /* We don't want to force the issue, only flip if it's ok. */
                ret = inc_block_group_ro(block_group, 0);
@@ -1650,12 +1691,16 @@ next:
                btrfs_put_block_group(block_group);
                spin_lock(&fs_info->unused_bgs_lock);
        }
+       list_splice_tail(&retry_list, &fs_info->unused_bgs);
        spin_unlock(&fs_info->unused_bgs_lock);
        mutex_unlock(&fs_info->reclaim_bgs_lock);
        return;
 
 flip_async:
        btrfs_end_transaction(trans);
+       spin_lock(&fs_info->unused_bgs_lock);
+       list_splice_tail(&retry_list, &fs_info->unused_bgs);
+       spin_unlock(&fs_info->unused_bgs_lock);
        mutex_unlock(&fs_info->reclaim_bgs_lock);
        btrfs_put_block_group(block_group);
        btrfs_discard_punt_unused_bgs_list(fs_info);
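
The retry_list change above parks block groups that pending reservations may
still need on a local list and splices them back onto fs_info->unused_bgs once
the scan finishes (under unused_bgs_lock, on both the normal and flip_async
exits), so a later scan retries them. A stand-alone sketch of that
defer-and-splice pattern, with a toy version of the kernel's intrusive list
and an invented 'busy' predicate standing in for the space_info check:

#include <stdio.h>
#include <stddef.h>

/* Toy version of the kernel's intrusive doubly linked list. */
struct list_head {
	struct list_head *prev, *next;
};

static void INIT_LIST_HEAD(struct list_head *h)
{
	h->prev = h->next = h;
}

static int list_empty(const struct list_head *h)
{
	return h->next == h;
}

static void list_add_tail(struct list_head *n, struct list_head *h)
{
	n->next = h;
	n->prev = h->prev;
	h->prev->next = n;
	h->prev = n;
}

static void list_del_init(struct list_head *e)
{
	e->prev->next = e->next;
	e->next->prev = e->prev;
	INIT_LIST_HEAD(e);
}

static void list_splice_tail(struct list_head *from, struct list_head *to)
{
	if (!list_empty(from)) {
		struct list_head *first = from->next, *last = from->prev;

		first->prev = to->prev;
		to->prev->next = first;
		last->next = to;
		to->prev = last;
	}
}

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct bg {
	int id;
	int busy;		/* stands in for the space_info check above */
	struct list_head link;
};

int main(void)
{
	struct bg bgs[] = { {1, 0}, {2, 1}, {3, 0}, {4, 1} };
	struct list_head unused, retry;
	size_t i;

	INIT_LIST_HEAD(&unused);
	INIT_LIST_HEAD(&retry);
	for (i = 0; i < sizeof(bgs) / sizeof(bgs[0]); i++)
		list_add_tail(&bgs[i].link, &unused);

	/* Drain the list, deferring groups that may still be needed. */
	while (!list_empty(&unused)) {
		struct bg *b = container_of(unused.next, struct bg, link);

		list_del_init(&b->link);
		if (b->busy) {
			list_add_tail(&b->link, &retry);
			continue;
		}
		printf("deleting unused bg %d\n", b->id);
	}
	/* Splice deferred groups back so a later pass retries them. */
	list_splice_tail(&retry, &unused);
	return 0;
}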
@@ -2684,6 +2729,37 @@ next:
                btrfs_dec_delayed_refs_rsv_bg_inserts(fs_info);
                list_del_init(&block_group->bg_list);
                clear_bit(BLOCK_GROUP_FLAG_NEW, &block_group->runtime_flags);
+
+               /*
+                * If the block group is still unused, add it to the list of
+                * unused block groups. The block group may have been created in
+                * order to satisfy a space reservation, in which case the
+                * extent allocation only happens later. But often we don't
+                * actually need to allocate space that we previously reserved,
+                * so the block group may become unused for a long time. For
+                * example for metadata we generally reserve space for a worst
+                * possible scenario, but then don't end up allocating all that
+                * space or none at all (due to no need to COW, extent buffers
+                * were already COWed in the current transaction and still
+                * unwritten, tree heights lower than the maximum possible
+                * height, etc). For data we generally reserve the exact amount
+                * of space we are going to allocate later, the exception is
+                * when using compression, as we must reserve space based on the
+                * uncompressed data size, because the compression is only done
+                * when writeback is triggered and we don't know how much space
+                * are actually going to need, so we reserve the uncompressed
+                * size because the data may be incompressible in the worst case.
+                */
+               if (ret == 0) {
+                       bool used;
+
+                       spin_lock(&block_group->lock);
+                       used = btrfs_is_block_group_used(block_group);
+                       spin_unlock(&block_group->lock);
+
+                       if (!used)
+                               btrfs_mark_bg_unused(block_group);
+               }
        }
        btrfs_trans_release_chunk_metadata(trans);
 }
diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h
index c4a1f01cc1c240d108702fc8899de9efe00da613..962b11983901a86ae16add7962c5ea5a26796b6f 100644
--- a/fs/btrfs/block-group.h
+++ b/fs/btrfs/block-group.h
@@ -257,6 +257,13 @@ static inline u64 btrfs_block_group_end(struct btrfs_block_group *block_group)
        return (block_group->start + block_group->length);
 }
 
+static inline bool btrfs_is_block_group_used(const struct btrfs_block_group *bg)
+{
+       lockdep_assert_held(&bg->lock);
+
+       return (bg->used > 0 || bg->reserved > 0 || bg->pinned > 0);
+}
+
 static inline bool btrfs_is_block_group_data_only(
                                        struct btrfs_block_group *block_group)
 {
diff --git a/fs/btrfs/delalloc-space.c b/fs/btrfs/delalloc-space.c
index 2833e8ef4c098f680a4883d41a1e925dc477bc2f..acf9f4b6c044025fe2ef288e99716d0373d01f31 100644
--- a/fs/btrfs/delalloc-space.c
+++ b/fs/btrfs/delalloc-space.c
@@ -245,7 +245,6 @@ static void btrfs_calculate_inode_block_rsv_size(struct btrfs_fs_info *fs_info,
        struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
        u64 reserve_size = 0;
        u64 qgroup_rsv_size = 0;
-       u64 csum_leaves;
        unsigned outstanding_extents;
 
        lockdep_assert_held(&inode->lock);
@@ -260,10 +259,12 @@ static void btrfs_calculate_inode_block_rsv_size(struct btrfs_fs_info *fs_info,
                                                outstanding_extents);
                reserve_size += btrfs_calc_metadata_size(fs_info, 1);
        }
-       csum_leaves = btrfs_csum_bytes_to_leaves(fs_info,
-                                                inode->csum_bytes);
-       reserve_size += btrfs_calc_insert_metadata_size(fs_info,
-                                                       csum_leaves);
+       if (!(inode->flags & BTRFS_INODE_NODATASUM)) {
+               u64 csum_leaves;
+
+               csum_leaves = btrfs_csum_bytes_to_leaves(fs_info, inode->csum_bytes);
+               reserve_size += btrfs_calc_insert_metadata_size(fs_info, csum_leaves);
+       }
        /*
         * For qgroup rsv, the calculation is very simple:
         * account one nodesize for each outstanding extent
@@ -278,14 +279,20 @@ static void btrfs_calculate_inode_block_rsv_size(struct btrfs_fs_info *fs_info,
        spin_unlock(&block_rsv->lock);
 }
 
-static void calc_inode_reservations(struct btrfs_fs_info *fs_info,
+static void calc_inode_reservations(struct btrfs_inode *inode,
                                    u64 num_bytes, u64 disk_num_bytes,
                                    u64 *meta_reserve, u64 *qgroup_reserve)
 {
+       struct btrfs_fs_info *fs_info = inode->root->fs_info;
        u64 nr_extents = count_max_extents(fs_info, num_bytes);
-       u64 csum_leaves = btrfs_csum_bytes_to_leaves(fs_info, disk_num_bytes);
+       u64 csum_leaves;
        u64 inode_update = btrfs_calc_metadata_size(fs_info, 1);
 
+       if (inode->flags & BTRFS_INODE_NODATASUM)
+               csum_leaves = 0;
+       else
+               csum_leaves = btrfs_csum_bytes_to_leaves(fs_info, disk_num_bytes);
+
        *meta_reserve = btrfs_calc_insert_metadata_size(fs_info,
                                                nr_extents + csum_leaves);
 
@@ -337,7 +344,7 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes,
         * everything out and try again, which is bad.  This way we just
         * over-reserve slightly, and clean up the mess when we are done.
         */
-       calc_inode_reservations(fs_info, num_bytes, disk_num_bytes,
+       calc_inode_reservations(inode, num_bytes, disk_num_bytes,
                                &meta_reserve, &qgroup_reserve);
        ret = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_reserve, true,
                                                 noflush);
@@ -359,7 +366,8 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes,
        nr_extents = count_max_extents(fs_info, num_bytes);
        spin_lock(&inode->lock);
        btrfs_mod_outstanding_extents(inode, nr_extents);
-       inode->csum_bytes += disk_num_bytes;
+       if (!(inode->flags & BTRFS_INODE_NODATASUM))
+               inode->csum_bytes += disk_num_bytes;
        btrfs_calculate_inode_block_rsv_size(fs_info, inode);
        spin_unlock(&inode->lock);
 
@@ -393,7 +401,8 @@ void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes,
 
        num_bytes = ALIGN(num_bytes, fs_info->sectorsize);
        spin_lock(&inode->lock);
-       inode->csum_bytes -= num_bytes;
+       if (!(inode->flags & BTRFS_INODE_NODATASUM))
+               inode->csum_bytes -= num_bytes;
        btrfs_calculate_inode_block_rsv_size(fs_info, inode);
        spin_unlock(&inode->lock);
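
For scale, the checksum reservation that the hunks above now skip for
NODATASUM inodes can be modeled stand-alone; the sector size, crc32c checksum
size and usable leaf bytes below are assumptions matching common mkfs
defaults, not values taken from this patch:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	/* Assumed geometry: 4 KiB sectors, 4-byte crc32c checksums and
	 * ~16 KiB of usable leaf space; real values come from the fs. */
	const uint64_t sectorsize = 4096;
	const uint64_t csum_size = 4;
	const uint64_t usable_leaf_bytes = 16283;
	const uint64_t csums_per_leaf = usable_leaf_bytes / csum_size;

	uint64_t disk_num_bytes = 128ULL << 20;	/* a 128 MiB delalloc write */
	uint64_t num_csums = disk_num_bytes / sectorsize;
	uint64_t csum_leaves = (num_csums + csums_per_leaf - 1) / csums_per_leaf;

	printf("%llu checksums -> %llu csum tree leaves reserved\n",
	       (unsigned long long)num_csums, (unsigned long long)csum_leaves);
	/* With BTRFS_INODE_NODATASUM set, the patch makes this contribution
	 * zero instead of reserving metadata space that is never used. */
	return 0;
}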
 
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 1eb93d3962aac4608cda0255ea31d7e53dbc8da2..f88e0ca8331d9b5448e8e07c89e5e66395c782b3 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -3184,8 +3184,23 @@ out:
                        unwritten_start += logical_len;
                clear_extent_uptodate(io_tree, unwritten_start, end, NULL);
 
-               /* Drop extent maps for the part of the extent we didn't write. */
-               btrfs_drop_extent_map_range(inode, unwritten_start, end, false);
+               /*
+                * Drop extent maps for the part of the extent we didn't write.
+                *
+                * We have an exception here for the free_space_inode, this is
+                * because when we do btrfs_get_extent() on the free space inode
+                * we will search the commit root.  If this is a new block group
+                * we won't find anything, and we will trip over the assert in
+                * writepage where we do ASSERT(em->block_start !=
+                * EXTENT_MAP_HOLE).
+                *
+                * Theoretically we could also skip this for any NOCOW extent as
+                * we don't mess with the extent map tree in the NOCOW case, but
+                * for now simply skip this if we are the free space inode.
+                */
+               if (!btrfs_is_free_space_inode(inode))
+                       btrfs_drop_extent_map_range(inode, unwritten_start,
+                                                   end, false);
 
                /*
                 * If the ordered extent had an IOERR or something else went
@@ -10273,6 +10288,13 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
        if (encoded->encryption != BTRFS_ENCODED_IO_ENCRYPTION_NONE)
                return -EINVAL;
 
+       /*
+        * Compressed extents should always have checksums, so error out if we
+        * have a NOCOW file or the inode was created while mounted with NODATASUM.
+        */
+       if (inode->flags & BTRFS_INODE_NODATASUM)
+               return -EINVAL;
+
        orig_count = iov_iter_count(from);
 
        /* The extent size must be sane. */
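
From user space the new check surfaces as EINVAL from the encoded write ioctl
when the target inode carries NODATASUM (for example a file created with
chattr +C). A sketch assuming the btrfs_ioctl_encoded_io_args layout from the
uapi <linux/btrfs.h>, with a placeholder payload where a real caller would
pass actual zstd-compressed bytes (the ioctl also requires CAP_SYS_ADMIN):

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/uio.h>
#include <linux/btrfs.h>

int main(int argc, char **argv)
{
	char payload[1024] = { 0 };	/* placeholder for compressed data */
	struct iovec iov = { .iov_base = payload, .iov_len = sizeof(payload) };
	struct btrfs_ioctl_encoded_io_args args = {
		.iov = &iov,
		.iovcnt = 1,
		.offset = 0,
		.len = 4096,		/* file range covered by the extent */
		.unencoded_len = 4096,	/* decompressed extent size */
		.compression = BTRFS_ENCODED_IO_COMPRESSION_ZSTD,
	};
	int fd;

	if (argc != 2) {
		fprintf(stderr, "usage: %s <file-on-btrfs>\n", argv[0]);
		return 1;
	}
	fd = open(argv[1], O_WRONLY);
	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (ioctl(fd, BTRFS_IOC_ENCODED_WRITE, &args) < 0)
		perror("BTRFS_IOC_ENCODED_WRITE");	/* EINVAL if NODATASUM */
	close(fd);
	return 0;
}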
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 5b3333ceef04818dbf98270da4bb84c99e5c70f8..c52807d97efa553b0b5e4765e11606a8ce644161 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -564,56 +564,22 @@ static int btrfs_reserve_trans_metadata(struct btrfs_fs_info *fs_info,
                                        u64 num_bytes,
                                        u64 *delayed_refs_bytes)
 {
-       struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv;
        struct btrfs_space_info *si = fs_info->trans_block_rsv.space_info;
-       u64 extra_delayed_refs_bytes = 0;
-       u64 bytes;
+       u64 bytes = num_bytes + *delayed_refs_bytes;
        int ret;
 
-       /*
-        * If there's a gap between the size of the delayed refs reserve and
-        * its reserved space, than some tasks have added delayed refs or bumped
-        * its size otherwise (due to block group creation or removal, or block
-        * group item update). Also try to allocate that gap in order to prevent
-        * using (and possibly abusing) the global reserve when committing the
-        * transaction.
-        */
-       if (flush == BTRFS_RESERVE_FLUSH_ALL &&
-           !btrfs_block_rsv_full(delayed_refs_rsv)) {
-               spin_lock(&delayed_refs_rsv->lock);
-               if (delayed_refs_rsv->size > delayed_refs_rsv->reserved)
-                       extra_delayed_refs_bytes = delayed_refs_rsv->size -
-                               delayed_refs_rsv->reserved;
-               spin_unlock(&delayed_refs_rsv->lock);
-       }
-
-       bytes = num_bytes + *delayed_refs_bytes + extra_delayed_refs_bytes;
-
        /*
         * We want to reserve all the bytes we may need all at once, so we only
         * do 1 enospc flushing cycle per transaction start.
         */
        ret = btrfs_reserve_metadata_bytes(fs_info, si, bytes, flush);
-       if (ret == 0) {
-               if (extra_delayed_refs_bytes > 0)
-                       btrfs_migrate_to_delayed_refs_rsv(fs_info,
-                                                         extra_delayed_refs_bytes);
-               return 0;
-       }
-
-       if (extra_delayed_refs_bytes > 0) {
-               bytes -= extra_delayed_refs_bytes;
-               ret = btrfs_reserve_metadata_bytes(fs_info, si, bytes, flush);
-               if (ret == 0)
-                       return 0;
-       }
 
        /*
         * If we are an emergency flush, which can steal from the global block
         * reserve, then attempt to not reserve space for the delayed refs, as
         * we will consume space for them from the global block reserve.
         */
-       if (flush == BTRFS_RESERVE_FLUSH_ALL_STEAL) {
+       if (ret && flush == BTRFS_RESERVE_FLUSH_ALL_STEAL) {
                bytes -= *delayed_refs_bytes;
                *delayed_refs_bytes = 0;
                ret = btrfs_reserve_metadata_bytes(fs_info, si, bytes, flush);
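
The rewrite above makes one reservation attempt for the transaction and
delayed refs bytes together, falling back only when the flush mode may steal
from the global reserve. A toy model of that control flow, with an invented
all-or-nothing reserve() standing in for btrfs_reserve_metadata_bytes():

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

static unsigned long long free_bytes = 6ULL << 20;	/* pretend space_info */

static int reserve(unsigned long long bytes)
{
	if (bytes > free_bytes)
		return -ENOSPC;
	free_bytes -= bytes;
	return 0;
}

static int reserve_trans_metadata(unsigned long long num_bytes,
				  unsigned long long *delayed_refs_bytes,
				  bool can_steal)
{
	/* One attempt for both parts: a single enospc flush cycle. */
	int ret = reserve(num_bytes + *delayed_refs_bytes);

	/*
	 * Emergency path: drop the delayed refs part and retry; that space
	 * is later consumed from the global reserve instead (mirrors
	 * BTRFS_RESERVE_FLUSH_ALL_STEAL).
	 */
	if (ret && can_steal) {
		*delayed_refs_bytes = 0;
		ret = reserve(num_bytes);
	}
	return ret;
}

int main(void)
{
	unsigned long long delayed_refs = 4ULL << 20;
	int ret = reserve_trans_metadata(4ULL << 20, &delayed_refs, true);

	printf("ret=%d delayed_refs_bytes=%llu free=%llu\n",
	       ret, delayed_refs, free_bytes);
	return 0;
}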
diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
index 168af9d000d168324fcc8355781517ddeedeefd1..3a5d69ff25fc221f20c1e37a9854021eff246bad 100644
--- a/fs/btrfs/zoned.c
+++ b/fs/btrfs/zoned.c
@@ -1670,6 +1670,7 @@ out:
        }
        bitmap_free(active);
        kfree(zone_info);
+       btrfs_free_chunk_map(map);
 
        return ret;
 }
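
The one-line fix above drops the chunk map reference that
btrfs_load_block_group_zone_info() takes but never released on its exit path.
A minimal sketch of the underlying pattern, with invented types and names,
where every exit after the lookup funnels through one cleanup label:

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

struct chunk_map {
	int refs;
};

/* Stand-in for a lookup that returns a counted reference. */
static struct chunk_map *map_lookup(void)
{
	struct chunk_map *m = calloc(1, sizeof(*m));

	if (m)
		m->refs = 1;
	return m;
}

/* Stand-in for btrfs_free_chunk_map(): drop a reference, free at zero. */
static void map_put(struct chunk_map *m)
{
	if (m && --m->refs == 0)
		free(m);
}

static int load_zone_info(int fail)
{
	struct chunk_map *map = map_lookup();
	int ret = 0;

	if (!map)
		return -ENOMEM;
	if (fail) {
		ret = -EIO;	/* any mid-function failure */
		goto out;
	}
	/* ... use the map ... */
out:
	map_put(map);	/* the fix: drop the reference on every exit */
	return ret;
}

int main(void)
{
	printf("ok=%d fail=%d\n", load_zone_info(0), load_zone_info(1));
	return 0;
}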