From 49fe6c46b88564ae73908ac44c93f5f4ef67e19c Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Sat, 5 Mar 2022 14:35:52 +0100 Subject: [PATCH] 5.16-stable patches added patches: btrfs-fix-enospc-failure-when-attempting-direct-io-write-into-nocow-range.patch --- ...ing-direct-io-write-into-nocow-range.patch | 310 ++++++++++++++++++ queue-5.16/series | 1 + 2 files changed, 311 insertions(+) create mode 100644 queue-5.16/btrfs-fix-enospc-failure-when-attempting-direct-io-write-into-nocow-range.patch diff --git a/queue-5.16/btrfs-fix-enospc-failure-when-attempting-direct-io-write-into-nocow-range.patch b/queue-5.16/btrfs-fix-enospc-failure-when-attempting-direct-io-write-into-nocow-range.patch new file mode 100644 index 00000000000..4a164a2e5ea --- /dev/null +++ b/queue-5.16/btrfs-fix-enospc-failure-when-attempting-direct-io-write-into-nocow-range.patch @@ -0,0 +1,310 @@ +From f0bfa76a11e93d0fe2c896fcb566568c5e8b5d3f Mon Sep 17 00:00:00 2001 +From: Filipe Manana +Date: Thu, 28 Oct 2021 16:03:41 +0100 +Subject: btrfs: fix ENOSPC failure when attempting direct IO write into NOCOW range + +From: Filipe Manana + +commit f0bfa76a11e93d0fe2c896fcb566568c5e8b5d3f upstream. + +When doing a direct IO write against a file range that either has +preallocated extents in that range or has regular extents and the file +has the NOCOW attribute set, the write fails with -ENOSPC when all of +the following conditions are met: + +1) There are no data block groups with enough free space matching + the size of the write; + +2) There's not enough unallocated space for allocating a new data block + group; + +3) The extents in the target file range are not shared, neither through + snapshots nor through reflinks. + +This is wrong because a NOCOW write can be done in such a case, and in fact +it's possible to do it using a buffered IO write, since when failing to +allocate data space, the buffered IO path checks if a NOCOW write is +possible. 
+ +The failure in direct IO write path comes from the fact that early on, +at btrfs_dio_iomap_begin(), we try to allocate data space for the write +and if that fails we return the error and stop - we never check if we +can do NOCOW. But later, at btrfs_get_blocks_direct_write(), we check +if we can do a NOCOW write into the range, or a subset of the range, and +then release the previously reserved data space. + +Fix this by doing the data reservation only if needed, when we must COW, +at btrfs_get_blocks_direct_write() instead of doing it at +btrfs_dio_iomap_begin(). This also simplifies the logic a bit and removes +the inefficiency of doing unnecessary data reservations. + +The following example test script reproduces the problem: + + $ cat dio-nocow-enospc.sh + #!/bin/bash + + DEV=/dev/sdj + MNT=/mnt/sdj + + # Use a small fixed size (1G) filesystem so that it's quick to fill + # it up. + # Make sure the mixed block groups feature is not enabled because we + # later want to not have more space available for allocating data + # extents but still have enough metadata space free for the file writes. + mkfs.btrfs -f -b $((1024 * 1024 * 1024)) -O ^mixed-bg $DEV + mount $DEV $MNT + + # Create our test file with the NOCOW attribute set. + touch $MNT/foobar + chattr +C $MNT/foobar + + # Now fill in all unallocated space with data for our test file. + # This will allocate a data block group that will be full and leave + # no (or a very small amount of) unallocated space in the device, so + # that it will not be possible to allocate a new block group later. + echo + echo "Creating test file with initial data..." + xfs_io -c "pwrite -S 0xab -b 1M 0 900M" $MNT/foobar + + # Now try a direct IO write against file range [0, 10M[. + # This should succeed since this is a NOCOW file and an extent for the + # range was previously allocated. + echo + echo "Trying direct IO write over allocated space..." 
+ xfs_io -d -c "pwrite -S 0xcd -b 10M 0 10M" $MNT/foobar + + umount $MNT + +When running the test: + + $ ./dio-nocow-enospc.sh + (...) + + Creating test file with initial data... + wrote 943718400/943718400 bytes at offset 0 + 900 MiB, 900 ops; 0:00:01.43 (625.526 MiB/sec and 625.5265 ops/sec) + + Trying direct IO write over allocated space... + pwrite: No space left on device + +A test case for fstests will follow, testing both this direct IO write +scenario as well as the buffered IO write scenario to make it less likely +to get future regressions on the buffered IO case. + +Reviewed-by: Josef Bacik +Signed-off-by: Filipe Manana +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman +--- + fs/btrfs/inode.c | 142 ++++++++++++++++++++++++++++++------------------------- + 1 file changed, 78 insertions(+), 64 deletions(-) + +--- a/fs/btrfs/inode.c ++++ b/fs/btrfs/inode.c +@@ -61,8 +61,6 @@ struct btrfs_iget_args { + }; + + struct btrfs_dio_data { +- u64 reserve; +- loff_t length; + ssize_t submitted; + struct extent_changeset *data_reserved; + }; +@@ -7773,6 +7771,10 @@ static int btrfs_get_blocks_direct_write + { + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct extent_map *em = *map; ++ int type; ++ u64 block_start, orig_start, orig_block_len, ram_bytes; ++ bool can_nocow = false; ++ bool space_reserved = false; + int ret = 0; + + /* +@@ -7787,9 +7789,6 @@ static int btrfs_get_blocks_direct_write + if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) || + ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) && + em->block_start != EXTENT_MAP_HOLE)) { +- int type; +- u64 block_start, orig_start, orig_block_len, ram_bytes; +- + if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) + type = BTRFS_ORDERED_PREALLOC; + else +@@ -7799,53 +7798,92 @@ static int btrfs_get_blocks_direct_write + + if (can_nocow_extent(inode, start, &len, &orig_start, + &orig_block_len, &ram_bytes, false) == 1 && +- btrfs_inc_nocow_writers(fs_info, block_start)) { +- struct extent_map 
*em2; ++ btrfs_inc_nocow_writers(fs_info, block_start)) ++ can_nocow = true; ++ } + +- em2 = btrfs_create_dio_extent(BTRFS_I(inode), start, len, +- orig_start, block_start, +- len, orig_block_len, +- ram_bytes, type); ++ if (can_nocow) { ++ struct extent_map *em2; ++ ++ /* We can NOCOW, so only need to reserve metadata space. */ ++ ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len); ++ if (ret < 0) { ++ /* Our caller expects us to free the input extent map. */ ++ free_extent_map(em); ++ *map = NULL; + btrfs_dec_nocow_writers(fs_info, block_start); +- if (type == BTRFS_ORDERED_PREALLOC) { +- free_extent_map(em); +- *map = em = em2; +- } ++ goto out; ++ } ++ space_reserved = true; + +- if (em2 && IS_ERR(em2)) { +- ret = PTR_ERR(em2); +- goto out; +- } +- /* +- * For inode marked NODATACOW or extent marked PREALLOC, +- * use the existing or preallocated extent, so does not +- * need to adjust btrfs_space_info's bytes_may_use. +- */ +- btrfs_free_reserved_data_space_noquota(fs_info, len); +- goto skip_cow; ++ em2 = btrfs_create_dio_extent(BTRFS_I(inode), start, len, ++ orig_start, block_start, ++ len, orig_block_len, ++ ram_bytes, type); ++ btrfs_dec_nocow_writers(fs_info, block_start); ++ if (type == BTRFS_ORDERED_PREALLOC) { ++ free_extent_map(em); ++ *map = em = em2; + } +- } + +- /* this will cow the extent */ +- free_extent_map(em); +- *map = em = btrfs_new_extent_direct(BTRFS_I(inode), start, len); +- if (IS_ERR(em)) { +- ret = PTR_ERR(em); +- goto out; ++ if (IS_ERR(em2)) { ++ ret = PTR_ERR(em2); ++ goto out; ++ } ++ } else { ++ const u64 prev_len = len; ++ ++ /* Our caller expects us to free the input extent map. */ ++ free_extent_map(em); ++ *map = NULL; ++ ++ /* We have to COW, so need to reserve metadata and data space. 
*/ ++ ret = btrfs_delalloc_reserve_space(BTRFS_I(inode), ++ &dio_data->data_reserved, ++ start, len); ++ if (ret < 0) ++ goto out; ++ space_reserved = true; ++ ++ em = btrfs_new_extent_direct(BTRFS_I(inode), start, len); ++ if (IS_ERR(em)) { ++ ret = PTR_ERR(em); ++ goto out; ++ } ++ *map = em; ++ len = min(len, em->len - (start - em->start)); ++ if (len < prev_len) ++ btrfs_delalloc_release_space(BTRFS_I(inode), ++ dio_data->data_reserved, ++ start + len, prev_len - len, ++ true); + } + +- len = min(len, em->len - (start - em->start)); ++ /* ++ * We have created our ordered extent, so we can now release our reservation ++ * for an outstanding extent. ++ */ ++ btrfs_delalloc_release_extents(BTRFS_I(inode), len); + +-skip_cow: + /* + * Need to update the i_size under the extent lock so buffered + * readers will get the updated i_size when we unlock. + */ + if (start + len > i_size_read(inode)) + i_size_write(inode, start + len); +- +- dio_data->reserve -= len; + out: ++ if (ret && space_reserved) { ++ btrfs_delalloc_release_extents(BTRFS_I(inode), len); ++ if (can_nocow) { ++ btrfs_delalloc_release_metadata(BTRFS_I(inode), len, true); ++ } else { ++ btrfs_delalloc_release_space(BTRFS_I(inode), ++ dio_data->data_reserved, ++ start, len, true); ++ extent_changeset_free(dio_data->data_reserved); ++ dio_data->data_reserved = NULL; ++ } ++ } + return ret; + } + +@@ -7887,18 +7925,6 @@ static int btrfs_dio_iomap_begin(struct + if (!dio_data) + return -ENOMEM; + +- dio_data->length = length; +- if (write) { +- dio_data->reserve = round_up(length, fs_info->sectorsize); +- ret = btrfs_delalloc_reserve_space(BTRFS_I(inode), +- &dio_data->data_reserved, +- start, dio_data->reserve); +- if (ret) { +- extent_changeset_free(dio_data->data_reserved); +- kfree(dio_data); +- return ret; +- } +- } + iomap->private = dio_data; + + +@@ -7991,14 +8017,8 @@ unlock_err: + unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend, + &cached_state); + err: +- if (dio_data) { +- 
btrfs_delalloc_release_space(BTRFS_I(inode), +- dio_data->data_reserved, start, +- dio_data->reserve, true); +- btrfs_delalloc_release_extents(BTRFS_I(inode), dio_data->reserve); +- extent_changeset_free(dio_data->data_reserved); +- kfree(dio_data); +- } ++ kfree(dio_data); ++ + return ret; + } + +@@ -8028,14 +8048,8 @@ static int btrfs_dio_iomap_end(struct in + ret = -ENOTBLK; + } + +- if (write) { +- if (dio_data->reserve) +- btrfs_delalloc_release_space(BTRFS_I(inode), +- dio_data->data_reserved, pos, +- dio_data->reserve, true); +- btrfs_delalloc_release_extents(BTRFS_I(inode), dio_data->length); ++ if (write) + extent_changeset_free(dio_data->data_reserved); +- } + out: + kfree(dio_data); + iomap->private = NULL; diff --git a/queue-5.16/series b/queue-5.16/series index d5e2aa04dbe..9c76671ad26 100644 --- a/queue-5.16/series +++ b/queue-5.16/series @@ -83,3 +83,4 @@ net-ipa-add-an-interconnect-dependency.patch net-smc-fix-connection-leak.patch net-smc-fix-unexpected-smc_clc_decl_err_regrmb-error-generated-by-client.patch net-smc-fix-unexpected-smc_clc_decl_err_regrmb-error-cause-by-server.patch +btrfs-fix-enospc-failure-when-attempting-direct-io-write-into-nocow-range.patch -- 2.47.2