From: Greg Kroah-Hartman Date: Wed, 19 Aug 2020 11:27:21 +0000 (+0200) Subject: 5.8-stable patches X-Git-Tag: v4.14.194~49 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=7e6044d36298e9a2306e14daafeb29f5be2ad1b2;p=thirdparty%2Fkernel%2Fstable-queue.git 5.8-stable patches added patches: btrfs-add-missing-check-for-nocow-and-compression-inode-flags.patch btrfs-allow-use-of-global-block-reserve-for-balance-item-deletion.patch btrfs-avoid-possible-signal-interruption-of-btrfs_drop_snapshot-on-relocation-tree.patch btrfs-don-t-allocate-anonymous-block-device-for-user-invisible-roots.patch btrfs-don-t-traverse-into-the-seed-devices-in-show_devname.patch btrfs-don-t-warn-if-we-abort-a-transaction-with-erofs.patch btrfs-fix-race-between-page-release-and-a-fast-fsync.patch btrfs-free-anon-block-device-right-after-subvolume-deletion.patch btrfs-move-the-chunk_mutex-in-btrfs_read_chunk_tree.patch btrfs-only-commit-delayed-items-at-fsync-if-we-are-logging-a-directory.patch btrfs-only-commit-the-delayed-inode-when-doing-a-full-fsync.patch btrfs-open-device-without-device_list_mutex.patch btrfs-pass-checksum-type-via-btrfs_ioc_fs_info-ioctl.patch btrfs-preallocate-anon-block-device-at-first-phase-of-snapshot-creation.patch btrfs-ref-verify-fix-memory-leak-in-add_block_entry.patch btrfs-relocation-review-the-call-sites-which-can-be-interrupted-by-signal.patch btrfs-remove-no-longer-needed-use-of-log_writers-for-the-log-root-tree.patch btrfs-return-erofs-for-btrfs_fs_state_error-cases.patch btrfs-stop-incremening-log_batch-for-the-log-root-tree-when-syncing-log.patch btrfs-sysfs-use-nofs-for-device-creation.patch --- diff --git a/queue-5.8/btrfs-add-missing-check-for-nocow-and-compression-inode-flags.patch b/queue-5.8/btrfs-add-missing-check-for-nocow-and-compression-inode-flags.patch new file mode 100644 index 00000000000..582a858e556 --- /dev/null +++ b/queue-5.8/btrfs-add-missing-check-for-nocow-and-compression-inode-flags.patch @@ -0,0 +1,111 @@ +From 
f37c563bab4297024c300b05c8f48430e323809d Mon Sep 17 00:00:00 2001 +From: David Sterba +Date: Fri, 10 Jul 2020 09:49:56 +0200 +Subject: btrfs: add missing check for nocow and compression inode flags + +From: David Sterba + +commit f37c563bab4297024c300b05c8f48430e323809d upstream. + +User Forza reported on IRC that some invalid combinations of file +attributes are accepted by chattr. + +The NODATACOW and compression file flags/attributes are mutually +exclusive, but they could be set by 'chattr +c +C' on an empty file. The +nodatacow will be in effect because it's checked first in +btrfs_run_delalloc_range. + +Extend the flag validation to catch the following cases: + + - input flags are conflicting + - old and new flags are conflicting + - initialize the local variable with inode flags after inode is locked + +Inode attributes take precedence over mount options and are an +independent setting. + +Nocompress would be a no-op with nodatacow, but we don't want to mix +any compression-related options with nodatacow. 
+ +CC: stable@vger.kernel.org # 4.4+ +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/ioctl.c | 30 ++++++++++++++++++++++-------- + 1 file changed, 22 insertions(+), 8 deletions(-) + +--- a/fs/btrfs/ioctl.c ++++ b/fs/btrfs/ioctl.c +@@ -164,8 +164,11 @@ static int btrfs_ioctl_getflags(struct f + return 0; + } + +-/* Check if @flags are a supported and valid set of FS_*_FL flags */ +-static int check_fsflags(unsigned int flags) ++/* ++ * Check if @flags are a supported and valid set of FS_*_FL flags and that ++ * the old and new flags are not conflicting ++ */ ++static int check_fsflags(unsigned int old_flags, unsigned int flags) + { + if (flags & ~(FS_IMMUTABLE_FL | FS_APPEND_FL | \ + FS_NOATIME_FL | FS_NODUMP_FL | \ +@@ -174,9 +177,19 @@ static int check_fsflags(unsigned int fl + FS_NOCOW_FL)) + return -EOPNOTSUPP; + ++ /* COMPR and NOCOMP on new/old are valid */ + if ((flags & FS_NOCOMP_FL) && (flags & FS_COMPR_FL)) + return -EINVAL; + ++ if ((flags & FS_COMPR_FL) && (flags & FS_NOCOW_FL)) ++ return -EINVAL; ++ ++ /* NOCOW and compression options are mutually exclusive */ ++ if ((old_flags & FS_NOCOW_FL) && (flags & (FS_COMPR_FL | FS_NOCOMP_FL))) ++ return -EINVAL; ++ if ((flags & FS_NOCOW_FL) && (old_flags & (FS_COMPR_FL | FS_NOCOMP_FL))) ++ return -EINVAL; ++ + return 0; + } + +@@ -190,7 +203,7 @@ static int btrfs_ioctl_setflags(struct f + unsigned int fsflags, old_fsflags; + int ret; + const char *comp = NULL; +- u32 binode_flags = binode->flags; ++ u32 binode_flags; + + if (!inode_owner_or_capable(inode)) + return -EPERM; +@@ -201,22 +214,23 @@ static int btrfs_ioctl_setflags(struct f + if (copy_from_user(&fsflags, arg, sizeof(fsflags))) + return -EFAULT; + +- ret = check_fsflags(fsflags); +- if (ret) +- return ret; +- + ret = mnt_want_write_file(file); + if (ret) + return ret; + + inode_lock(inode); +- + fsflags = btrfs_mask_fsflags_for_type(inode, fsflags); + old_fsflags = btrfs_inode_flags_to_fsflags(binode->flags); ++ + ret 
= vfs_ioc_setflags_prepare(inode, old_fsflags, fsflags); + if (ret) + goto out_unlock; + ++ ret = check_fsflags(old_fsflags, fsflags); ++ if (ret) ++ goto out_unlock; ++ ++ binode_flags = binode->flags; + if (fsflags & FS_SYNC_FL) + binode_flags |= BTRFS_INODE_SYNC; + else diff --git a/queue-5.8/btrfs-allow-use-of-global-block-reserve-for-balance-item-deletion.patch b/queue-5.8/btrfs-allow-use-of-global-block-reserve-for-balance-item-deletion.patch new file mode 100644 index 00000000000..4912d1ba87c --- /dev/null +++ b/queue-5.8/btrfs-allow-use-of-global-block-reserve-for-balance-item-deletion.patch @@ -0,0 +1,50 @@ +From 3502a8c0dc1bd4b4970b59b06e348f22a1c05581 Mon Sep 17 00:00:00 2001 +From: David Sterba +Date: Thu, 25 Jun 2020 12:35:28 +0200 +Subject: btrfs: allow use of global block reserve for balance item deletion + +From: David Sterba + +commit 3502a8c0dc1bd4b4970b59b06e348f22a1c05581 upstream. + +On a filesystem with exhausted metadata, but still enough to start +balance, it's possible to hit this error: + +[324402.053842] BTRFS info (device loop0): 1 enospc errors during balance +[324402.060769] BTRFS info (device loop0): balance: ended with status: -28 +[324402.172295] BTRFS: error (device loop0) in reset_balance_state:3321: errno=-28 No space left + +It fails inside reset_balance_state and turns the filesystem to +read-only, which is unnecessary and should be fixed too, but the problem +is caused by lack of space when the balance item is deleted. This is a +one-time operation and from the same rank as unlink that is allowed to +use the global block reserve. So do the same for the balance item. 
+ +Status of the filesystem (100GiB) just after the balance fails: + +$ btrfs fi df mnt +Data, single: total=80.01GiB, used=38.58GiB +System, single: total=4.00MiB, used=16.00KiB +Metadata, single: total=19.99GiB, used=19.48GiB +GlobalReserve, single: total=512.00MiB, used=50.11MiB + +CC: stable@vger.kernel.org # 4.4+ +Reviewed-by: Johannes Thumshirn +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/volumes.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/fs/btrfs/volumes.c ++++ b/fs/btrfs/volumes.c +@@ -3231,7 +3231,7 @@ static int del_balance_item(struct btrfs + if (!path) + return -ENOMEM; + +- trans = btrfs_start_transaction(root, 0); ++ trans = btrfs_start_transaction_fallback_global_rsv(root, 0); + if (IS_ERR(trans)) { + btrfs_free_path(path); + return PTR_ERR(trans); diff --git a/queue-5.8/btrfs-avoid-possible-signal-interruption-of-btrfs_drop_snapshot-on-relocation-tree.patch b/queue-5.8/btrfs-avoid-possible-signal-interruption-of-btrfs_drop_snapshot-on-relocation-tree.patch new file mode 100644 index 00000000000..ad7f43ce430 --- /dev/null +++ b/queue-5.8/btrfs-avoid-possible-signal-interruption-of-btrfs_drop_snapshot-on-relocation-tree.patch @@ -0,0 +1,86 @@ +From f3e3d9cc35252a70a2fd698762c9687718268ec6 Mon Sep 17 00:00:00 2001 +From: Qu Wenruo +Date: Mon, 13 Jul 2020 09:03:20 +0800 +Subject: btrfs: avoid possible signal interruption of btrfs_drop_snapshot() on relocation tree + +From: Qu Wenruo + +commit f3e3d9cc35252a70a2fd698762c9687718268ec6 upstream. 
+ +[BUG] +There is a bug report about bad signal timing could lead to read-only +fs during balance: + + BTRFS info (device xvdb): balance: start -d -m -s + BTRFS info (device xvdb): relocating block group 73001861120 flags metadata + BTRFS info (device xvdb): found 12236 extents, stage: move data extents + BTRFS info (device xvdb): relocating block group 71928119296 flags data + BTRFS info (device xvdb): found 3 extents, stage: move data extents + BTRFS info (device xvdb): found 3 extents, stage: update data pointers + BTRFS info (device xvdb): relocating block group 60922265600 flags metadata + BTRFS: error (device xvdb) in btrfs_drop_snapshot:5505: errno=-4 unknown + BTRFS info (device xvdb): forced readonly + BTRFS info (device xvdb): balance: ended with status: -4 + +[CAUSE] +The direct cause is the -EINTR from the following call chain when a +fatal signal is pending: + + relocate_block_group() + |- clean_dirty_subvols() + |- btrfs_drop_snapshot() + |- btrfs_start_transaction() + |- btrfs_delayed_refs_rsv_refill() + |- btrfs_reserve_metadata_bytes() + |- __reserve_metadata_bytes() + |- wait_reserve_ticket() + |- prepare_to_wait_event(); + |- ticket->error = -EINTR; + +Normally this behavior is fine for most btrfs_start_transaction() +callers, as they need to catch any other error, same for the signal, and +exit ASAP. + +However for balance, especially for the clean_dirty_subvols() case, we're +already doing cleanup works, getting -EINTR from btrfs_drop_snapshot() +could cause a lot of unexpected problems. + +From the mentioned forced read-only report, to later balance error due +to half dropped reloc trees. + +[FIX] +Fix this problem by using btrfs_join_transaction() if +btrfs_drop_snapshot() is called from relocation context. + +Since btrfs_join_transaction() won't get interrupted by signal, we can +continue the cleanup. 
+ +CC: stable@vger.kernel.org # 5.4+ +Reviewed-by: Josef Bacik +Signed-off-by: Qu Wenruo +Reviewed-by: David Sterba +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/extent-tree.c | 9 ++++++++- + 1 file changed, 8 insertions(+), 1 deletion(-) + +--- a/fs/btrfs/extent-tree.c ++++ b/fs/btrfs/extent-tree.c +@@ -5298,7 +5298,14 @@ int btrfs_drop_snapshot(struct btrfs_roo + goto out; + } + +- trans = btrfs_start_transaction(tree_root, 0); ++ /* ++ * Use join to avoid potential EINTR from transaction start. See ++ * wait_reserve_ticket and the whole reservation callchain. ++ */ ++ if (for_reloc) ++ trans = btrfs_join_transaction(tree_root); ++ else ++ trans = btrfs_start_transaction(tree_root, 0); + if (IS_ERR(trans)) { + err = PTR_ERR(trans); + goto out_free; diff --git a/queue-5.8/btrfs-don-t-allocate-anonymous-block-device-for-user-invisible-roots.patch b/queue-5.8/btrfs-don-t-allocate-anonymous-block-device-for-user-invisible-roots.patch new file mode 100644 index 00000000000..7cd86551520 --- /dev/null +++ b/queue-5.8/btrfs-don-t-allocate-anonymous-block-device-for-user-invisible-roots.patch @@ -0,0 +1,90 @@ +From 851fd730a743e072badaf67caf39883e32439431 Mon Sep 17 00:00:00 2001 +From: Qu Wenruo +Date: Tue, 16 Jun 2020 10:17:34 +0800 +Subject: btrfs: don't allocate anonymous block device for user invisible roots + +From: Qu Wenruo + +commit 851fd730a743e072badaf67caf39883e32439431 upstream. 
+ +[BUG] +When a lot of subvolumes are created, there is a user report about +transaction aborted: + + BTRFS: Transaction aborted (error -24) + WARNING: CPU: 17 PID: 17041 at fs/btrfs/transaction.c:1576 create_pending_snapshot+0xbc4/0xd10 [btrfs] + RIP: 0010:create_pending_snapshot+0xbc4/0xd10 [btrfs] + Call Trace: + create_pending_snapshots+0x82/0xa0 [btrfs] + btrfs_commit_transaction+0x275/0x8c0 [btrfs] + btrfs_mksubvol+0x4b9/0x500 [btrfs] + btrfs_ioctl_snap_create_transid+0x174/0x180 [btrfs] + btrfs_ioctl_snap_create_v2+0x11c/0x180 [btrfs] + btrfs_ioctl+0x11a4/0x2da0 [btrfs] + do_vfs_ioctl+0xa9/0x640 + ksys_ioctl+0x67/0x90 + __x64_sys_ioctl+0x1a/0x20 + do_syscall_64+0x5a/0x110 + entry_SYSCALL_64_after_hwframe+0x44/0xa9 + ---[ end trace 33f2f83f3d5250e9 ]--- + BTRFS: error (device sda1) in create_pending_snapshot:1576: errno=-24 unknown + BTRFS info (device sda1): forced readonly + BTRFS warning (device sda1): Skipping commit of aborted transaction. + BTRFS: error (device sda1) in cleanup_transaction:1831: errno=-24 unknown + +[CAUSE] +The error is EMFILE (Too many files open) and comes from the anonymous +block device allocation. The ids are in a shared pool of size 1<<20. + +The ids are assigned to live subvolumes, ie. the root structure exists +in memory (eg. after creation or after the root appears in some path). +The pool could be exhausted if the numbers are not reclaimed fast +enough, after subvolume deletion or if other system component uses the +anon block devices. + +[WORKAROUND] +Since it's not possible to completely solve the problem, we can only +minimize the time the id is allocated to a subvolume root. + +Firstly, we can reduce the use of anon_dev by trees that are not +subvolume roots, like data reloc tree. + +This patch will do extra check on root objectid, to skip roots that +don't need anon_dev. Currently it's only data reloc tree and orphan +roots. 
+ +Reported-by: Greed Rong +Link: https://lore.kernel.org/linux-btrfs/CA+UqX+NTrZ6boGnWHhSeZmEY5J76CTqmYjO2S+=tHJX7nb9DPw@mail.gmail.com/ +CC: stable@vger.kernel.org # 4.4+ +Reviewed-by: Josef Bacik +Signed-off-by: Qu Wenruo +Reviewed-by: David Sterba +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/disk-io.c | 13 ++++++++++--- + 1 file changed, 10 insertions(+), 3 deletions(-) + +--- a/fs/btrfs/disk-io.c ++++ b/fs/btrfs/disk-io.c +@@ -1428,9 +1428,16 @@ static int btrfs_init_fs_root(struct btr + spin_lock_init(&root->ino_cache_lock); + init_waitqueue_head(&root->ino_cache_wait); + +- ret = get_anon_bdev(&root->anon_dev); +- if (ret) +- goto fail; ++ /* ++ * Don't assign anonymous block device to roots that are not exposed to ++ * userspace, the id pool is limited to 1M ++ */ ++ if (is_fstree(root->root_key.objectid) && ++ btrfs_root_refs(&root->root_item) > 0) { ++ ret = get_anon_bdev(&root->anon_dev); ++ if (ret) ++ goto fail; ++ } + + mutex_lock(&root->objectid_mutex); + ret = btrfs_find_highest_objectid(root, diff --git a/queue-5.8/btrfs-don-t-traverse-into-the-seed-devices-in-show_devname.patch b/queue-5.8/btrfs-don-t-traverse-into-the-seed-devices-in-show_devname.patch new file mode 100644 index 00000000000..42d8b88a1a4 --- /dev/null +++ b/queue-5.8/btrfs-don-t-traverse-into-the-seed-devices-in-show_devname.patch @@ -0,0 +1,123 @@ +From 4faf55b03823e96c44dc4e364520000ed3b12fdb Mon Sep 17 00:00:00 2001 +From: Anand Jain +Date: Fri, 10 Jul 2020 14:37:38 +0800 +Subject: btrfs: don't traverse into the seed devices in show_devname + +From: Anand Jain + +commit 4faf55b03823e96c44dc4e364520000ed3b12fdb upstream. + +->show_devname currently shows the lowest devid in the list. As the seed +devices have the lowest devid in the sprouted filesystem, the userland +tool such as findmnt end up seeing seed device instead of the device from +the read-writable sprouted filesystem. As shown below. 
+ + mount /dev/sda /btrfs + mount: /btrfs: WARNING: device write-protected, mounted read-only. + + findmnt --output SOURCE,TARGET,UUID /btrfs + SOURCE TARGET UUID + /dev/sda /btrfs 899f7027-3e46-4626-93e7-7d4c9ad19111 + + btrfs dev add -f /dev/sdb /btrfs + + umount /btrfs + mount /dev/sdb /btrfs + + findmnt --output SOURCE,TARGET,UUID /btrfs + SOURCE TARGET UUID + /dev/sda /btrfs 899f7027-3e46-4626-93e7-7d4c9ad19111 + +All sprouts from a single seed will show the same seed device and the +same fsid. That's confusing. +This is causing problems in our prototype as there isn't any reference +to the sprout file-system(s) which is being used for actual read and +write. + +This was added in the patch which implemented the show_devname in btrfs +commit 9c5085c14798 ("Btrfs: implement ->show_devname"). +I tried to look for any particular reason that we need to show the seed +device, there isn't any. + +So instead, do not traverse through the seed devices, just show the +lowest devid in the sprouted fsid. + +After the patch: + + mount /dev/sda /btrfs + mount: /btrfs: WARNING: device write-protected, mounted read-only. + + findmnt --output SOURCE,TARGET,UUID /btrfs + SOURCE TARGET UUID + /dev/sda /btrfs 899f7027-3e46-4626-93e7-7d4c9ad19111 + + btrfs dev add -f /dev/sdb /btrfs + mount -o rw,remount /dev/sdb /btrfs + + findmnt --output SOURCE,TARGET,UUID /btrfs + SOURCE TARGET UUID + /dev/sdb /btrfs 595ca0e6-b82e-46b5-b9e2-c72a6928be48 + + mount /dev/sda /btrfs1 + mount: /btrfs1: WARNING: device write-protected, mounted read-only. + + btrfs dev add -f /dev/sdc /btrfs1 + + findmnt --output SOURCE,TARGET,UUID /btrfs1 + SOURCE TARGET UUID + /dev/sdc /btrfs1 ca1dbb7a-8446-4f95-853c-a20f3f82bdbb + + cat /proc/self/mounts | grep btrfs + /dev/sdb /btrfs btrfs rw,relatime,noacl,space_cache,subvolid=5,subvol=/ 0 0 + /dev/sdc /btrfs1 btrfs ro,relatime,noacl,space_cache,subvolid=5,subvol=/ 0 0 + +Reported-by: Martin K. Petersen +CC: stable@vger.kernel.org # 4.19+ +Tested-by: Martin K. 
Petersen +Signed-off-by: Anand Jain +Reviewed-by: David Sterba +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/super.c | 21 +++++++-------------- + 1 file changed, 7 insertions(+), 14 deletions(-) + +--- a/fs/btrfs/super.c ++++ b/fs/btrfs/super.c +@@ -2296,9 +2296,7 @@ static int btrfs_unfreeze(struct super_b + static int btrfs_show_devname(struct seq_file *m, struct dentry *root) + { + struct btrfs_fs_info *fs_info = btrfs_sb(root->d_sb); +- struct btrfs_fs_devices *cur_devices; + struct btrfs_device *dev, *first_dev = NULL; +- struct list_head *head; + + /* + * Lightweight locking of the devices. We should not need +@@ -2308,18 +2306,13 @@ static int btrfs_show_devname(struct seq + * least until the rcu_read_unlock. + */ + rcu_read_lock(); +- cur_devices = fs_info->fs_devices; +- while (cur_devices) { +- head = &cur_devices->devices; +- list_for_each_entry_rcu(dev, head, dev_list) { +- if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state)) +- continue; +- if (!dev->name) +- continue; +- if (!first_dev || dev->devid < first_dev->devid) +- first_dev = dev; +- } +- cur_devices = cur_devices->seed; ++ list_for_each_entry_rcu(dev, &fs_info->fs_devices->devices, dev_list) { ++ if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state)) ++ continue; ++ if (!dev->name) ++ continue; ++ if (!first_dev || dev->devid < first_dev->devid) ++ first_dev = dev; + } + + if (first_dev) diff --git a/queue-5.8/btrfs-don-t-warn-if-we-abort-a-transaction-with-erofs.patch b/queue-5.8/btrfs-don-t-warn-if-we-abort-a-transaction-with-erofs.patch new file mode 100644 index 00000000000..27c12e21e01 --- /dev/null +++ b/queue-5.8/btrfs-don-t-warn-if-we-abort-a-transaction-with-erofs.patch @@ -0,0 +1,37 @@ +From f95ebdbed46a4d8b9fdb7bff109fdbb6fc9a6dc8 Mon Sep 17 00:00:00 2001 +From: Josef Bacik +Date: Tue, 21 Jul 2020 11:24:27 -0400 +Subject: btrfs: don't WARN if we abort a transaction with EROFS + +From: Josef Bacik + +commit 
f95ebdbed46a4d8b9fdb7bff109fdbb6fc9a6dc8 upstream. + +If we got some sort of corruption via a read and call +btrfs_handle_fs_error() we'll set BTRFS_FS_STATE_ERROR on the fs and +complain. If a subsequent trans handle trips over this it'll get EROFS +and then abort. However at that point we're not aborting for the +original reason, we're aborting because we've been flipped read only. +We do not need to WARN_ON() here. + +CC: stable@vger.kernel.org # 5.4+ +Signed-off-by: Josef Bacik +Reviewed-by: David Sterba +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/ctree.h | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/fs/btrfs/ctree.h ++++ b/fs/btrfs/ctree.h +@@ -3198,7 +3198,7 @@ do { \ + /* Report first abort since mount */ \ + if (!test_and_set_bit(BTRFS_FS_STATE_TRANS_ABORTED, \ + &((trans)->fs_info->fs_state))) { \ +- if ((errno) != -EIO) { \ ++ if ((errno) != -EIO && (errno) != -EROFS) { \ + WARN(1, KERN_DEBUG \ + "BTRFS: Transaction aborted (error %d)\n", \ + (errno)); \ diff --git a/queue-5.8/btrfs-fix-race-between-page-release-and-a-fast-fsync.patch b/queue-5.8/btrfs-fix-race-between-page-release-and-a-fast-fsync.patch new file mode 100644 index 00000000000..95cba149dea --- /dev/null +++ b/queue-5.8/btrfs-fix-race-between-page-release-and-a-fast-fsync.patch @@ -0,0 +1,92 @@ +From 3d6448e631591756da36efb3ea6355ff6f383c3a Mon Sep 17 00:00:00 2001 +From: Filipe Manana +Date: Wed, 22 Jul 2020 12:28:37 +0100 +Subject: btrfs: fix race between page release and a fast fsync + +From: Filipe Manana + +commit 3d6448e631591756da36efb3ea6355ff6f383c3a upstream. + +When releasing an extent map, done through the page release callback, we +can race with an ongoing fast fsync and cause the fsync to miss a new +extent and not log it. 
The steps for this to happen are the following: + +1) A page is dirtied for some inode I; + +2) Writeback for that page is triggered by a path other than fsync, for + example by the system due to memory pressure; + +3) When the ordered extent for the extent (a single 4K page) finishes, + we unpin the corresponding extent map and set its generation to N, + the current transaction's generation; + +4) The btrfs_releasepage() callback is invoked by the system due to + memory pressure for that no longer dirty page of inode I; + +5) At the same time, some task calls fsync on inode I, joins transaction + N, and at btrfs_log_inode() it sees that the inode does not have the + full sync flag set, so we proceed with a fast fsync. But before we get + into btrfs_log_changed_extents() and lock the inode's extent map tree: + +6) Through btrfs_releasepage() we end up at try_release_extent_mapping() + and we remove the extent map for the new 4Kb extent, because it is + neither pinned anymore nor locked. By calling remove_extent_mapping(), + we remove the extent map from the list of modified extents, since the + extent map does not have the logging flag set. We unlock the inode's + extent map tree; + +7) The task doing the fast fsync now enters btrfs_log_changed_extents(), + locks the inode's extent map tree and iterates its list of modified + extents, which no longer has the 4Kb extent in it, so it does not log + the extent; + +8) The fsync finishes; + +9) Before transaction N is committed, a power failure happens. After + replaying the log, the 4K extent of inode I will be missing, since + it was not logged due to the race with try_release_extent_mapping(). + +So fix this by teaching try_release_extent_mapping() to not remove an +extent map if it's still in the list of modified extents. 
+ +Fixes: ff44c6e36dc9dc ("Btrfs: do not hold the write_lock on the extent tree while logging") +CC: stable@vger.kernel.org # 5.4+ +Signed-off-by: Filipe Manana +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/extent_io.c | 16 +++++++++++++--- + 1 file changed, 13 insertions(+), 3 deletions(-) + +--- a/fs/btrfs/extent_io.c ++++ b/fs/btrfs/extent_io.c +@@ -4502,15 +4502,25 @@ int try_release_extent_mapping(struct pa + free_extent_map(em); + break; + } +- if (!test_range_bit(tree, em->start, +- extent_map_end(em) - 1, +- EXTENT_LOCKED, 0, NULL)) { ++ if (test_range_bit(tree, em->start, ++ extent_map_end(em) - 1, ++ EXTENT_LOCKED, 0, NULL)) ++ goto next; ++ /* ++ * If it's not in the list of modified extents, used ++ * by a fast fsync, we can remove it. If it's being ++ * logged we can safely remove it since fsync took an ++ * extra reference on the em. ++ */ ++ if (list_empty(&em->list) || ++ test_bit(EXTENT_FLAG_LOGGING, &em->flags)) { + set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, + &btrfs_inode->runtime_flags); + remove_extent_mapping(map, em); + /* once for the rb tree */ + free_extent_map(em); + } ++next: + start = extent_map_end(em); + write_unlock(&map->lock); + diff --git a/queue-5.8/btrfs-free-anon-block-device-right-after-subvolume-deletion.patch b/queue-5.8/btrfs-free-anon-block-device-right-after-subvolume-deletion.patch new file mode 100644 index 00000000000..c0ffb0341ff --- /dev/null +++ b/queue-5.8/btrfs-free-anon-block-device-right-after-subvolume-deletion.patch @@ -0,0 +1,68 @@ +From 082b6c970f02fefd278c7833880cda29691a5f34 Mon Sep 17 00:00:00 2001 +From: Qu Wenruo +Date: Tue, 16 Jun 2020 10:17:37 +0800 +Subject: btrfs: free anon block device right after subvolume deletion + +From: Qu Wenruo + +commit 082b6c970f02fefd278c7833880cda29691a5f34 upstream. 
+ +[BUG] +When a lot of subvolumes are created, there is a user report about +transaction aborted caused by slow anonymous block device reclaim: + + BTRFS: Transaction aborted (error -24) + WARNING: CPU: 17 PID: 17041 at fs/btrfs/transaction.c:1576 create_pending_snapshot+0xbc4/0xd10 [btrfs] + RIP: 0010:create_pending_snapshot+0xbc4/0xd10 [btrfs] + Call Trace: + create_pending_snapshots+0x82/0xa0 [btrfs] + btrfs_commit_transaction+0x275/0x8c0 [btrfs] + btrfs_mksubvol+0x4b9/0x500 [btrfs] + btrfs_ioctl_snap_create_transid+0x174/0x180 [btrfs] + btrfs_ioctl_snap_create_v2+0x11c/0x180 [btrfs] + btrfs_ioctl+0x11a4/0x2da0 [btrfs] + do_vfs_ioctl+0xa9/0x640 + ksys_ioctl+0x67/0x90 + __x64_sys_ioctl+0x1a/0x20 + do_syscall_64+0x5a/0x110 + entry_SYSCALL_64_after_hwframe+0x44/0xa9 + ---[ end trace 33f2f83f3d5250e9 ]--- + BTRFS: error (device sda1) in create_pending_snapshot:1576: errno=-24 unknown + BTRFS info (device sda1): forced readonly + BTRFS warning (device sda1): Skipping commit of aborted transaction. + BTRFS: error (device sda1) in cleanup_transaction:1831: errno=-24 unknown + +[CAUSE] +The anonymous device pool is shared and its size is 1M. It's possible to +hit that limit if the subvolume deletion is not fast enough and the +subvolumes to be cleaned keep the ids allocated. + +[WORKAROUND] +We can't avoid the anon device pool exhaustion but we can shorten the +time the id is attached to the subvolume root once the subvolume becomes +invisible to the user. 
+ +Reported-by: Greed Rong +Link: https://lore.kernel.org/linux-btrfs/CA+UqX+NTrZ6boGnWHhSeZmEY5J76CTqmYjO2S+=tHJX7nb9DPw@mail.gmail.com/ +CC: stable@vger.kernel.org # 4.4+ +Reviewed-by: Josef Bacik +Signed-off-by: Qu Wenruo +Reviewed-by: David Sterba +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/inode.c | 2 ++ + 1 file changed, 2 insertions(+) + +--- a/fs/btrfs/inode.c ++++ b/fs/btrfs/inode.c +@@ -4041,6 +4041,8 @@ int btrfs_delete_subvolume(struct inode + } + } + ++ free_anon_bdev(dest->anon_dev); ++ dest->anon_dev = 0; + out_end_trans: + trans->block_rsv = NULL; + trans->bytes_reserved = 0; diff --git a/queue-5.8/btrfs-move-the-chunk_mutex-in-btrfs_read_chunk_tree.patch b/queue-5.8/btrfs-move-the-chunk_mutex-in-btrfs_read_chunk_tree.patch new file mode 100644 index 00000000000..19e27ec8f55 --- /dev/null +++ b/queue-5.8/btrfs-move-the-chunk_mutex-in-btrfs_read_chunk_tree.patch @@ -0,0 +1,166 @@ +From 01d01caf19ff7c537527d352d169c4368375c0a1 Mon Sep 17 00:00:00 2001 +From: Josef Bacik +Date: Fri, 17 Jul 2020 15:12:28 -0400 +Subject: btrfs: move the chunk_mutex in btrfs_read_chunk_tree + +From: Josef Bacik + +commit 01d01caf19ff7c537527d352d169c4368375c0a1 upstream. + +We are currently getting this lockdep splat in btrfs/161: + + ====================================================== + WARNING: possible circular locking dependency detected + 5.8.0-rc5+ #20 Tainted: G E + ------------------------------------------------------ + mount/678048 is trying to acquire lock: + ffff9b769f15b6e0 (&fs_devs->device_list_mutex){+.+.}-{3:3}, at: clone_fs_devices+0x4d/0x170 [btrfs] + + but task is already holding lock: + ffff9b76abdb08d0 (&fs_info->chunk_mutex){+.+.}-{3:3}, at: btrfs_read_chunk_tree+0x6a/0x800 [btrfs] + + which lock already depends on the new lock. 
+ + the existing dependency chain (in reverse order) is: + + -> #1 (&fs_info->chunk_mutex){+.+.}-{3:3}: + __mutex_lock+0x8b/0x8f0 + btrfs_init_new_device+0x2d2/0x1240 [btrfs] + btrfs_ioctl+0x1de/0x2d20 [btrfs] + ksys_ioctl+0x87/0xc0 + __x64_sys_ioctl+0x16/0x20 + do_syscall_64+0x52/0xb0 + entry_SYSCALL_64_after_hwframe+0x44/0xa9 + + -> #0 (&fs_devs->device_list_mutex){+.+.}-{3:3}: + __lock_acquire+0x1240/0x2460 + lock_acquire+0xab/0x360 + __mutex_lock+0x8b/0x8f0 + clone_fs_devices+0x4d/0x170 [btrfs] + btrfs_read_chunk_tree+0x330/0x800 [btrfs] + open_ctree+0xb7c/0x18ce [btrfs] + btrfs_mount_root.cold+0x13/0xfa [btrfs] + legacy_get_tree+0x30/0x50 + vfs_get_tree+0x28/0xc0 + fc_mount+0xe/0x40 + vfs_kern_mount.part.0+0x71/0x90 + btrfs_mount+0x13b/0x3e0 [btrfs] + legacy_get_tree+0x30/0x50 + vfs_get_tree+0x28/0xc0 + do_mount+0x7de/0xb30 + __x64_sys_mount+0x8e/0xd0 + do_syscall_64+0x52/0xb0 + entry_SYSCALL_64_after_hwframe+0x44/0xa9 + + other info that might help us debug this: + + Possible unsafe locking scenario: + + CPU0 CPU1 + ---- ---- + lock(&fs_info->chunk_mutex); + lock(&fs_devs->device_list_mutex); + lock(&fs_info->chunk_mutex); + lock(&fs_devs->device_list_mutex); + + *** DEADLOCK *** + + 3 locks held by mount/678048: + #0: ffff9b75ff5fb0e0 (&type->s_umount_key#63/1){+.+.}-{3:3}, at: alloc_super+0xb5/0x380 + #1: ffffffffc0c2fbc8 (uuid_mutex){+.+.}-{3:3}, at: btrfs_read_chunk_tree+0x54/0x800 [btrfs] + #2: ffff9b76abdb08d0 (&fs_info->chunk_mutex){+.+.}-{3:3}, at: btrfs_read_chunk_tree+0x6a/0x800 [btrfs] + + stack backtrace: + CPU: 2 PID: 678048 Comm: mount Tainted: G E 5.8.0-rc5+ #20 + Hardware name: To Be Filled By O.E.M. To Be Filled By O.E.M./890FX Deluxe5, BIOS P1.40 05/03/2011 + Call Trace: + dump_stack+0x96/0xd0 + check_noncircular+0x162/0x180 + __lock_acquire+0x1240/0x2460 + ? asm_sysvec_apic_timer_interrupt+0x12/0x20 + lock_acquire+0xab/0x360 + ? clone_fs_devices+0x4d/0x170 [btrfs] + __mutex_lock+0x8b/0x8f0 + ? clone_fs_devices+0x4d/0x170 [btrfs] + ? 
rcu_read_lock_sched_held+0x52/0x60 + ? cpumask_next+0x16/0x20 + ? module_assert_mutex_or_preempt+0x14/0x40 + ? __module_address+0x28/0xf0 + ? clone_fs_devices+0x4d/0x170 [btrfs] + ? static_obj+0x4f/0x60 + ? lockdep_init_map_waits+0x43/0x200 + ? clone_fs_devices+0x4d/0x170 [btrfs] + clone_fs_devices+0x4d/0x170 [btrfs] + btrfs_read_chunk_tree+0x330/0x800 [btrfs] + open_ctree+0xb7c/0x18ce [btrfs] + ? super_setup_bdi_name+0x79/0xd0 + btrfs_mount_root.cold+0x13/0xfa [btrfs] + ? vfs_parse_fs_string+0x84/0xb0 + ? rcu_read_lock_sched_held+0x52/0x60 + ? kfree+0x2b5/0x310 + legacy_get_tree+0x30/0x50 + vfs_get_tree+0x28/0xc0 + fc_mount+0xe/0x40 + vfs_kern_mount.part.0+0x71/0x90 + btrfs_mount+0x13b/0x3e0 [btrfs] + ? cred_has_capability+0x7c/0x120 + ? rcu_read_lock_sched_held+0x52/0x60 + ? legacy_get_tree+0x30/0x50 + legacy_get_tree+0x30/0x50 + vfs_get_tree+0x28/0xc0 + do_mount+0x7de/0xb30 + ? memdup_user+0x4e/0x90 + __x64_sys_mount+0x8e/0xd0 + do_syscall_64+0x52/0xb0 + entry_SYSCALL_64_after_hwframe+0x44/0xa9 + +This is because btrfs_read_chunk_tree() can come upon DEV_EXTENT's and +then read the device, which takes the device_list_mutex. The +device_list_mutex needs to be taken before the chunk_mutex, so this is a +problem. We only really need the chunk mutex around adding the chunk, +so move the mutex around read_one_chunk. + +An argument could be made that we don't even need the chunk_mutex here +as it's during mount, and we are protected by various other locks. +However we already have special rules for ->device_list_mutex, and I'd +rather not have another special case for ->chunk_mutex. 
+ +CC: stable@vger.kernel.org # 4.19+ +Reviewed-by: Anand Jain +Signed-off-by: Josef Bacik +Reviewed-by: David Sterba +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/volumes.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +--- a/fs/btrfs/volumes.c ++++ b/fs/btrfs/volumes.c +@@ -7064,7 +7064,6 @@ int btrfs_read_chunk_tree(struct btrfs_f + * otherwise we don't need it. + */ + mutex_lock(&uuid_mutex); +- mutex_lock(&fs_info->chunk_mutex); + + /* + * It is possible for mount and umount to race in such a way that +@@ -7109,7 +7108,9 @@ int btrfs_read_chunk_tree(struct btrfs_f + } else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) { + struct btrfs_chunk *chunk; + chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk); ++ mutex_lock(&fs_info->chunk_mutex); + ret = read_one_chunk(&found_key, leaf, chunk); ++ mutex_unlock(&fs_info->chunk_mutex); + if (ret) + goto error; + } +@@ -7139,7 +7140,6 @@ int btrfs_read_chunk_tree(struct btrfs_f + } + ret = 0; + error: +- mutex_unlock(&fs_info->chunk_mutex); + mutex_unlock(&uuid_mutex); + + btrfs_free_path(path); diff --git a/queue-5.8/btrfs-only-commit-delayed-items-at-fsync-if-we-are-logging-a-directory.patch b/queue-5.8/btrfs-only-commit-delayed-items-at-fsync-if-we-are-logging-a-directory.patch new file mode 100644 index 00000000000..4d75382a23c --- /dev/null +++ b/queue-5.8/btrfs-only-commit-delayed-items-at-fsync-if-we-are-logging-a-directory.patch @@ -0,0 +1,93 @@ +From 5aa7d1a7f4a2f8ca6be1f32415e9365d026e8fa7 Mon Sep 17 00:00:00 2001 +From: Filipe Manana +Date: Thu, 2 Jul 2020 12:32:20 +0100 +Subject: btrfs: only commit delayed items at fsync if we are logging a directory + +From: Filipe Manana + +commit 5aa7d1a7f4a2f8ca6be1f32415e9365d026e8fa7 upstream. + +When logging an inode we are committing its delayed items if either the +inode is a directory or if it is a new inode, created in the current +transaction. 
+ +We need to do it for directories, since new directory indexes are stored +as delayed items of the inode and when logging a directory we need to be +able to access all indexes from the fs/subvolume tree in order to figure +out which index ranges need to be logged. + +However for new inodes that are not directories, we do not need to do it +because the only type of delayed item they can have is the inode item, and +we are guaranteed to always log an up to date version of the inode item: + +*) for a full fsync we do it by committing the delayed inode and then + copying the item from the fs/subvolume tree with + copy_inode_items_to_log(); + +*) for a fast fsync we always log the inode item based on the contents of + the in-memory struct btrfs_inode. We guarantee this is always done since + commit e4545de5b035c7 ("Btrfs: fix fsync data loss after append write"). + +So stop running delayed items for new inodes that are not directories, +since that forces committing the delayed inode into the fs/subvolume tree, +wasting time and adding contention to the tree when a full fsync is not +required. We will only do it in case a full fsync is needed. + +This patch is part of a series that has the following patches: + +1/4 btrfs: only commit the delayed inode when doing a full fsync +2/4 btrfs: only commit delayed items at fsync if we are logging a directory +3/4 btrfs: stop incremening log_batch for the log root tree when syncing log +4/4 btrfs: remove no longer needed use of log_writers for the log root tree + +After the entire patchset was applied I saw about 12% decrease on max latency +reported by dbench. The test was done on a qemu vm, with 8 cores, 16Gb of +ram, using kvm and using a raw NVMe device directly (no intermediary fs on +the host). 
The test was invoked like the following: + + mkfs.btrfs -f /dev/sdk + mount -o ssd -o nospace_cache /dev/sdk /mnt/sdk + dbench -D /mnt/sdk -t 300 8 + umount /mnt/sdk + +CC: stable@vger.kernel.org # 5.4+ +Reviewed-by: Josef Bacik +Signed-off-by: Filipe Manana +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/tree-log.c | 9 +++++---- + 1 file changed, 5 insertions(+), 4 deletions(-) + +--- a/fs/btrfs/tree-log.c ++++ b/fs/btrfs/tree-log.c +@@ -5122,7 +5122,6 @@ static int btrfs_log_inode(struct btrfs_ + const loff_t end, + struct btrfs_log_ctx *ctx) + { +- struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_path *path; + struct btrfs_path *dst_path; + struct btrfs_key min_key; +@@ -5165,15 +5164,17 @@ static int btrfs_log_inode(struct btrfs_ + max_key.offset = (u64)-1; + + /* +- * Only run delayed items if we are a dir or a new file. ++ * Only run delayed items if we are a directory. We want to make sure ++ * all directory indexes hit the fs/subvolume tree so we can find them ++ * and figure out which index ranges have to be logged. ++ * + * Otherwise commit the delayed inode only if the full sync flag is set, + * as we want to make sure an up to date version is in the subvolume + * tree so copy_inode_items_to_log() / copy_items() can find it and copy + * it to the log tree. For a non full sync, we always log the inode item + * based on the in-memory struct btrfs_inode which is always up to date. 
+ */ +- if (S_ISDIR(inode->vfs_inode.i_mode) || +- inode->generation > fs_info->last_trans_committed) ++ if (S_ISDIR(inode->vfs_inode.i_mode)) + ret = btrfs_commit_inode_delayed_items(trans, inode); + else if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags)) + ret = btrfs_commit_inode_delayed_inode(inode); diff --git a/queue-5.8/btrfs-only-commit-the-delayed-inode-when-doing-a-full-fsync.patch b/queue-5.8/btrfs-only-commit-the-delayed-inode-when-doing-a-full-fsync.patch new file mode 100644 index 00000000000..dcf746fc4e9 --- /dev/null +++ b/queue-5.8/btrfs-only-commit-the-delayed-inode-when-doing-a-full-fsync.patch @@ -0,0 +1,88 @@ +From 8c8648dd1f6d62aeb912deeb788b6ac33cb782e7 Mon Sep 17 00:00:00 2001 +From: Filipe Manana +Date: Thu, 2 Jul 2020 12:31:59 +0100 +Subject: btrfs: only commit the delayed inode when doing a full fsync + +From: Filipe Manana + +commit 8c8648dd1f6d62aeb912deeb788b6ac33cb782e7 upstream. + +Commit 2c2c452b0cafdc ("Btrfs: fix fsync when extend references are added +to an inode") forced a commit of the delayed inode when logging an inode +in order to ensure we would end up logging the inode item during a full +fsync. By committing the delayed inode, we updated the inode item in the +fs/subvolume tree and then later when copying items from leafs modified in +the current transaction into the log tree (with copy_inode_items_to_log()) +we ended up copying the inode item from the fs/subvolume tree into the log +tree. Logging an up to date version of the inode item is required to make +sure at log replay time we get the link count fixup triggered among other +things (replay xattr deletes, etc). The test case generic/040 from fstests +exercises the bug which that commit fixed. + +However for a fast fsync we don't need to commit the delayed inode because +we always log an up to date version of the inode item based on the struct +btrfs_inode we have in-memory. 
We started doing this for fast fsyncs since +commit e4545de5b035c7 ("Btrfs: fix fsync data loss after append write"). + +So just stop committing the delayed inode if we are doing a fast fsync, +we are only wasting time and adding contention on fs/subvolume tree. + +This patch is part of a series that has the following patches: + +1/4 btrfs: only commit the delayed inode when doing a full fsync +2/4 btrfs: only commit delayed items at fsync if we are logging a directory +3/4 btrfs: stop incremening log_batch for the log root tree when syncing log +4/4 btrfs: remove no longer needed use of log_writers for the log root tree + +After the entire patchset was applied I saw about 12% decrease on max latency +reported by dbench. The test was done on a qemu vm, with 8 cores, 16Gb of +ram, using kvm and using a raw NVMe device directly (no intermediary fs on +the host). The test was invoked like the following: + + mkfs.btrfs -f /dev/sdk + mount -o ssd -o nospace_cache /dev/sdk /mnt/sdk + dbench -D /mnt/sdk -t 300 8 + umount /mnt/sdk + +CC: stable@vger.kernel.org # 5.4+ +Reviewed-by: Josef Bacik +Signed-off-by: Filipe Manana +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/tree-log.c | 12 +++++++----- + 1 file changed, 7 insertions(+), 5 deletions(-) + +--- a/fs/btrfs/tree-log.c ++++ b/fs/btrfs/tree-log.c +@@ -5130,7 +5130,7 @@ static int btrfs_log_inode(struct btrfs_ + struct btrfs_key max_key; + struct btrfs_root *log = root->log_root; + int err = 0; +- int ret; ++ int ret = 0; + bool fast_search = false; + u64 ino = btrfs_ino(inode); + struct extent_map_tree *em_tree = &inode->extent_tree; +@@ -5167,14 +5167,16 @@ static int btrfs_log_inode(struct btrfs_ + + /* + * Only run delayed items if we are a dir or a new file. +- * Otherwise commit the delayed inode only, which is needed in +- * order for the log replay code to mark inodes for link count +- * fixup (create temporary BTRFS_TREE_LOG_FIXUP_OBJECTID items). 
++ * Otherwise commit the delayed inode only if the full sync flag is set, ++ * as we want to make sure an up to date version is in the subvolume ++ * tree so copy_inode_items_to_log() / copy_items() can find it and copy ++ * it to the log tree. For a non full sync, we always log the inode item ++ * based on the in-memory struct btrfs_inode which is always up to date. + */ + if (S_ISDIR(inode->vfs_inode.i_mode) || + inode->generation > fs_info->last_trans_committed) + ret = btrfs_commit_inode_delayed_items(trans, inode); +- else ++ else if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags)) + ret = btrfs_commit_inode_delayed_inode(inode); + + if (ret) { diff --git a/queue-5.8/btrfs-open-device-without-device_list_mutex.patch b/queue-5.8/btrfs-open-device-without-device_list_mutex.patch new file mode 100644 index 00000000000..abfb3defa38 --- /dev/null +++ b/queue-5.8/btrfs-open-device-without-device_list_mutex.patch @@ -0,0 +1,253 @@ +From 18c850fdc5a801bad4977b0f1723761d42267e45 Mon Sep 17 00:00:00 2001 +From: Josef Bacik +Date: Fri, 17 Jul 2020 15:12:27 -0400 +Subject: btrfs: open device without device_list_mutex + +From: Josef Bacik + +commit 18c850fdc5a801bad4977b0f1723761d42267e45 upstream. + +There's long existed a lockdep splat because we open our bdev's under +the ->device_list_mutex at mount time, which acquires the bd_mutex. +Usually this goes unnoticed, but if you do loopback devices at all +suddenly the bd_mutex comes with a whole host of other dependencies, +which results in the splat when you mount a btrfs file system. 
+ +====================================================== +WARNING: possible circular locking dependency detected +5.8.0-0.rc3.1.fc33.x86_64+debug #1 Not tainted +------------------------------------------------------ +systemd-journal/509 is trying to acquire lock: +ffff970831f84db0 (&fs_info->reloc_mutex){+.+.}-{3:3}, at: btrfs_record_root_in_trans+0x44/0x70 [btrfs] + +but task is already holding lock: +ffff97083144d598 (sb_pagefaults){.+.+}-{0:0}, at: btrfs_page_mkwrite+0x59/0x560 [btrfs] + +which lock already depends on the new lock. + +the existing dependency chain (in reverse order) is: + + -> #6 (sb_pagefaults){.+.+}-{0:0}: + __sb_start_write+0x13e/0x220 + btrfs_page_mkwrite+0x59/0x560 [btrfs] + do_page_mkwrite+0x4f/0x130 + do_wp_page+0x3b0/0x4f0 + handle_mm_fault+0xf47/0x1850 + do_user_addr_fault+0x1fc/0x4b0 + exc_page_fault+0x88/0x300 + asm_exc_page_fault+0x1e/0x30 + + -> #5 (&mm->mmap_lock#2){++++}-{3:3}: + __might_fault+0x60/0x80 + _copy_from_user+0x20/0xb0 + get_sg_io_hdr+0x9a/0xb0 + scsi_cmd_ioctl+0x1ea/0x2f0 + cdrom_ioctl+0x3c/0x12b4 + sr_block_ioctl+0xa4/0xd0 + block_ioctl+0x3f/0x50 + ksys_ioctl+0x82/0xc0 + __x64_sys_ioctl+0x16/0x20 + do_syscall_64+0x52/0xb0 + entry_SYSCALL_64_after_hwframe+0x44/0xa9 + + -> #4 (&cd->lock){+.+.}-{3:3}: + __mutex_lock+0x7b/0x820 + sr_block_open+0xa2/0x180 + __blkdev_get+0xdd/0x550 + blkdev_get+0x38/0x150 + do_dentry_open+0x16b/0x3e0 + path_openat+0x3c9/0xa00 + do_filp_open+0x75/0x100 + do_sys_openat2+0x8a/0x140 + __x64_sys_openat+0x46/0x70 + do_syscall_64+0x52/0xb0 + entry_SYSCALL_64_after_hwframe+0x44/0xa9 + + -> #3 (&bdev->bd_mutex){+.+.}-{3:3}: + __mutex_lock+0x7b/0x820 + __blkdev_get+0x6a/0x550 + blkdev_get+0x85/0x150 + blkdev_get_by_path+0x2c/0x70 + btrfs_get_bdev_and_sb+0x1b/0xb0 [btrfs] + open_fs_devices+0x88/0x240 [btrfs] + btrfs_open_devices+0x92/0xa0 [btrfs] + btrfs_mount_root+0x250/0x490 [btrfs] + legacy_get_tree+0x30/0x50 + vfs_get_tree+0x28/0xc0 + vfs_kern_mount.part.0+0x71/0xb0 + btrfs_mount+0x119/0x380 
[btrfs] + legacy_get_tree+0x30/0x50 + vfs_get_tree+0x28/0xc0 + do_mount+0x8c6/0xca0 + __x64_sys_mount+0x8e/0xd0 + do_syscall_64+0x52/0xb0 + entry_SYSCALL_64_after_hwframe+0x44/0xa9 + + -> #2 (&fs_devs->device_list_mutex){+.+.}-{3:3}: + __mutex_lock+0x7b/0x820 + btrfs_run_dev_stats+0x36/0x420 [btrfs] + commit_cowonly_roots+0x91/0x2d0 [btrfs] + btrfs_commit_transaction+0x4e6/0x9f0 [btrfs] + btrfs_sync_file+0x38a/0x480 [btrfs] + __x64_sys_fdatasync+0x47/0x80 + do_syscall_64+0x52/0xb0 + entry_SYSCALL_64_after_hwframe+0x44/0xa9 + + -> #1 (&fs_info->tree_log_mutex){+.+.}-{3:3}: + __mutex_lock+0x7b/0x820 + btrfs_commit_transaction+0x48e/0x9f0 [btrfs] + btrfs_sync_file+0x38a/0x480 [btrfs] + __x64_sys_fdatasync+0x47/0x80 + do_syscall_64+0x52/0xb0 + entry_SYSCALL_64_after_hwframe+0x44/0xa9 + + -> #0 (&fs_info->reloc_mutex){+.+.}-{3:3}: + __lock_acquire+0x1241/0x20c0 + lock_acquire+0xb0/0x400 + __mutex_lock+0x7b/0x820 + btrfs_record_root_in_trans+0x44/0x70 [btrfs] + start_transaction+0xd2/0x500 [btrfs] + btrfs_dirty_inode+0x44/0xd0 [btrfs] + file_update_time+0xc6/0x120 + btrfs_page_mkwrite+0xda/0x560 [btrfs] + do_page_mkwrite+0x4f/0x130 + do_wp_page+0x3b0/0x4f0 + handle_mm_fault+0xf47/0x1850 + do_user_addr_fault+0x1fc/0x4b0 + exc_page_fault+0x88/0x300 + asm_exc_page_fault+0x1e/0x30 + +other info that might help us debug this: + +Chain exists of: + &fs_info->reloc_mutex --> &mm->mmap_lock#2 --> sb_pagefaults + +Possible unsafe locking scenario: + + CPU0 CPU1 + ---- ---- + lock(sb_pagefaults); + lock(&mm->mmap_lock#2); + lock(sb_pagefaults); + lock(&fs_info->reloc_mutex); + + *** DEADLOCK *** + +3 locks held by systemd-journal/509: + #0: ffff97083bdec8b8 (&mm->mmap_lock#2){++++}-{3:3}, at: do_user_addr_fault+0x12e/0x4b0 + #1: ffff97083144d598 (sb_pagefaults){.+.+}-{0:0}, at: btrfs_page_mkwrite+0x59/0x560 [btrfs] + #2: ffff97083144d6a8 (sb_internal){.+.+}-{0:0}, at: start_transaction+0x3f8/0x500 [btrfs] + +stack backtrace: +CPU: 0 PID: 509 Comm: systemd-journal Not tainted 
5.8.0-0.rc3.1.fc33.x86_64+debug #1 +Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 0.0.0 02/06/2015 +Call Trace: + dump_stack+0x92/0xc8 + check_noncircular+0x134/0x150 + __lock_acquire+0x1241/0x20c0 + lock_acquire+0xb0/0x400 + ? btrfs_record_root_in_trans+0x44/0x70 [btrfs] + ? lock_acquire+0xb0/0x400 + ? btrfs_record_root_in_trans+0x44/0x70 [btrfs] + __mutex_lock+0x7b/0x820 + ? btrfs_record_root_in_trans+0x44/0x70 [btrfs] + ? kvm_sched_clock_read+0x14/0x30 + ? sched_clock+0x5/0x10 + ? sched_clock_cpu+0xc/0xb0 + btrfs_record_root_in_trans+0x44/0x70 [btrfs] + start_transaction+0xd2/0x500 [btrfs] + btrfs_dirty_inode+0x44/0xd0 [btrfs] + file_update_time+0xc6/0x120 + btrfs_page_mkwrite+0xda/0x560 [btrfs] + ? sched_clock+0x5/0x10 + do_page_mkwrite+0x4f/0x130 + do_wp_page+0x3b0/0x4f0 + handle_mm_fault+0xf47/0x1850 + do_user_addr_fault+0x1fc/0x4b0 + exc_page_fault+0x88/0x300 + ? asm_exc_page_fault+0x8/0x30 + asm_exc_page_fault+0x1e/0x30 +RIP: 0033:0x7fa3972fdbfe +Code: Bad RIP value. + +Fix this by not holding the ->device_list_mutex at this point. The +device_list_mutex exists to protect us from modifying the device list +while the file system is running. + +However it can also be modified by doing a scan on a device. But this +action is specifically protected by the uuid_mutex, which we are holding +here. We cannot race with opening at this point because we have the +->s_mount lock held during the mount. Not having the +->device_list_mutex here is perfectly safe as we're not going to change +the devices at this point. 
+ +CC: stable@vger.kernel.org # 4.19+ +Signed-off-by: Josef Bacik +Reviewed-by: David Sterba +[ add some comments ] +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/volumes.c | 21 ++++++++++++++++++--- + 1 file changed, 18 insertions(+), 3 deletions(-) + +--- a/fs/btrfs/volumes.c ++++ b/fs/btrfs/volumes.c +@@ -245,7 +245,9 @@ static int __btrfs_map_block(struct btrf + * + * global::fs_devs - add, remove, updates to the global list + * +- * does not protect: manipulation of the fs_devices::devices list! ++ * does not protect: manipulation of the fs_devices::devices list in general ++ * but in mount context it could be used to exclude list modifications by eg. ++ * scan ioctl + * + * btrfs_device::name - renames (write side), read is RCU + * +@@ -258,6 +260,9 @@ static int __btrfs_map_block(struct btrf + * may be used to exclude some operations from running concurrently without any + * modifications to the list (see write_all_supers) + * ++ * Is not required at mount and close times, because our device list is ++ * protected by the uuid_mutex at that point. ++ * + * balance_mutex + * ------------- + * protects balance structures (status, state) and context accessed from +@@ -602,6 +607,11 @@ static int btrfs_free_stale_devices(cons + return ret; + } + ++/* ++ * This is only used on mount, and we are protected from competing things ++ * messing with our fs_devices by the uuid_mutex, thus we do not need the ++ * fs_devices->device_list_mutex here. ++ */ + static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices, + struct btrfs_device *device, fmode_t flags, + void *holder) +@@ -1229,8 +1239,14 @@ int btrfs_open_devices(struct btrfs_fs_d + int ret; + + lockdep_assert_held(&uuid_mutex); ++ /* ++ * The device_list_mutex cannot be taken here in case opening the ++ * underlying device takes further locks like bd_mutex. 
++ * ++ * We also don't need the lock here as this is called during mount and ++ * exclusion is provided by uuid_mutex ++ */ + +- mutex_lock(&fs_devices->device_list_mutex); + if (fs_devices->opened) { + fs_devices->opened++; + ret = 0; +@@ -1238,7 +1254,6 @@ int btrfs_open_devices(struct btrfs_fs_d + list_sort(NULL, &fs_devices->devices, devid_cmp); + ret = open_fs_devices(fs_devices, flags, holder); + } +- mutex_unlock(&fs_devices->device_list_mutex); + + return ret; + } diff --git a/queue-5.8/btrfs-pass-checksum-type-via-btrfs_ioc_fs_info-ioctl.patch b/queue-5.8/btrfs-pass-checksum-type-via-btrfs_ioc_fs_info-ioctl.patch new file mode 100644 index 00000000000..916e536e29a --- /dev/null +++ b/queue-5.8/btrfs-pass-checksum-type-via-btrfs_ioc_fs_info-ioctl.patch @@ -0,0 +1,125 @@ +From 137c541821a83debb63b3fa8abdd1cbc41bdf3a1 Mon Sep 17 00:00:00 2001 +From: Johannes Thumshirn +Date: Mon, 13 Jul 2020 21:28:58 +0900 +Subject: btrfs: pass checksum type via BTRFS_IOC_FS_INFO ioctl + +From: Johannes Thumshirn + +commit 137c541821a83debb63b3fa8abdd1cbc41bdf3a1 upstream. + +With the recent addition of filesystem checksum types other than CRC32c, +it is not anymore hard-coded which checksum type a btrfs filesystem uses. + +Up to now there is no good way to read the filesystem checksum, apart from +reading the filesystem UUID and then query sysfs for the checksum type. + +Add a new csum_type and csum_size fields to the BTRFS_IOC_FS_INFO ioctl +command which usually is used to query filesystem features. Also add a +flags member indicating that the kernel responded with a set csum_type and +csum_size field. + +For compatibility reasons, only return the csum_type and csum_size if +the BTRFS_FS_INFO_FLAG_CSUM_INFO flag was passed to the kernel. Also +clear any unknown flags so we don't pass false positives to user-space +newer than the kernel. + +To simplify further additions to the ioctl, also switch the padding to a +u8 array. 
Pahole was used to verify the result of this switch: + +The csum members are added before flags, which might look odd, but this +is to keep the alignment requirements and not to introduce holes in the +structure. + + $ pahole -C btrfs_ioctl_fs_info_args fs/btrfs/btrfs.ko + struct btrfs_ioctl_fs_info_args { + __u64 max_id; /* 0 8 */ + __u64 num_devices; /* 8 8 */ + __u8 fsid[16]; /* 16 16 */ + __u32 nodesize; /* 32 4 */ + __u32 sectorsize; /* 36 4 */ + __u32 clone_alignment; /* 40 4 */ + __u16 csum_type; /* 44 2 */ + __u16 csum_size; /* 46 2 */ + __u64 flags; /* 48 8 */ + __u8 reserved[968]; /* 56 968 */ + + /* size: 1024, cachelines: 16, members: 10 */ + }; + +Fixes: 3951e7f050ac ("btrfs: add xxhash64 to checksumming algorithms") +Fixes: 3831bf0094ab ("btrfs: add sha256 to checksumming algorithm") +CC: stable@vger.kernel.org # 5.5+ +Signed-off-by: Johannes Thumshirn +Reviewed-by: David Sterba +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/ioctl.c | 16 +++++++++++++--- + include/uapi/linux/btrfs.h | 14 ++++++++++++-- + 2 files changed, 25 insertions(+), 5 deletions(-) + +--- a/fs/btrfs/ioctl.c ++++ b/fs/btrfs/ioctl.c +@@ -3217,11 +3217,15 @@ static long btrfs_ioctl_fs_info(struct b + struct btrfs_ioctl_fs_info_args *fi_args; + struct btrfs_device *device; + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; ++ u64 flags_in; + int ret = 0; + +- fi_args = kzalloc(sizeof(*fi_args), GFP_KERNEL); +- if (!fi_args) +- return -ENOMEM; ++ fi_args = memdup_user(arg, sizeof(*fi_args)); ++ if (IS_ERR(fi_args)) ++ return PTR_ERR(fi_args); ++ ++ flags_in = fi_args->flags; ++ memset(fi_args, 0, sizeof(*fi_args)); + + rcu_read_lock(); + fi_args->num_devices = fs_devices->num_devices; +@@ -3237,6 +3241,12 @@ static long btrfs_ioctl_fs_info(struct b + fi_args->sectorsize = fs_info->sectorsize; + fi_args->clone_alignment = fs_info->sectorsize; + ++ if (flags_in & BTRFS_FS_INFO_FLAG_CSUM_INFO) { ++ fi_args->csum_type = 
btrfs_super_csum_type(fs_info->super_copy); ++ fi_args->csum_size = btrfs_super_csum_size(fs_info->super_copy); ++ fi_args->flags |= BTRFS_FS_INFO_FLAG_CSUM_INFO; ++ } ++ + if (copy_to_user(arg, fi_args, sizeof(*fi_args))) + ret = -EFAULT; + +--- a/include/uapi/linux/btrfs.h ++++ b/include/uapi/linux/btrfs.h +@@ -243,6 +243,13 @@ struct btrfs_ioctl_dev_info_args { + __u8 path[BTRFS_DEVICE_PATH_NAME_MAX]; /* out */ + }; + ++/* ++ * Retrieve information about the filesystem ++ */ ++ ++/* Request information about checksum type and size */ ++#define BTRFS_FS_INFO_FLAG_CSUM_INFO (1 << 0) ++ + struct btrfs_ioctl_fs_info_args { + __u64 max_id; /* out */ + __u64 num_devices; /* out */ +@@ -250,8 +257,11 @@ struct btrfs_ioctl_fs_info_args { + __u32 nodesize; /* out */ + __u32 sectorsize; /* out */ + __u32 clone_alignment; /* out */ +- __u32 reserved32; +- __u64 reserved[122]; /* pad to 1k */ ++ /* See BTRFS_FS_INFO_FLAG_* */ ++ __u16 csum_type; /* out */ ++ __u16 csum_size; /* out */ ++ __u64 flags; /* in/out */ ++ __u8 reserved[968]; /* pad to 1k */ + }; + + /* diff --git a/queue-5.8/btrfs-preallocate-anon-block-device-at-first-phase-of-snapshot-creation.patch b/queue-5.8/btrfs-preallocate-anon-block-device-at-first-phase-of-snapshot-creation.patch new file mode 100644 index 00000000000..dc115441569 --- /dev/null +++ b/queue-5.8/btrfs-preallocate-anon-block-device-at-first-phase-of-snapshot-creation.patch @@ -0,0 +1,293 @@ +From 2dfb1e43f57dd3aeaa66f7cf05d068db2d4c8788 Mon Sep 17 00:00:00 2001 +From: Qu Wenruo +Date: Tue, 16 Jun 2020 10:17:36 +0800 +Subject: btrfs: preallocate anon block device at first phase of snapshot creation + +From: Qu Wenruo + +commit 2dfb1e43f57dd3aeaa66f7cf05d068db2d4c8788 upstream. + +[BUG] +When the anonymous block device pool is exhausted, subvolume/snapshot +creation fails with EMFILE (Too many files open). This has been reported +by a user. 
The allocation happens in the second phase during transaction +commit where its only way out is to abort the transaction + + BTRFS: Transaction aborted (error -24) + WARNING: CPU: 17 PID: 17041 at fs/btrfs/transaction.c:1576 create_pending_snapshot+0xbc4/0xd10 [btrfs] + RIP: 0010:create_pending_snapshot+0xbc4/0xd10 [btrfs] + Call Trace: + create_pending_snapshots+0x82/0xa0 [btrfs] + btrfs_commit_transaction+0x275/0x8c0 [btrfs] + btrfs_mksubvol+0x4b9/0x500 [btrfs] + btrfs_ioctl_snap_create_transid+0x174/0x180 [btrfs] + btrfs_ioctl_snap_create_v2+0x11c/0x180 [btrfs] + btrfs_ioctl+0x11a4/0x2da0 [btrfs] + do_vfs_ioctl+0xa9/0x640 + ksys_ioctl+0x67/0x90 + __x64_sys_ioctl+0x1a/0x20 + do_syscall_64+0x5a/0x110 + entry_SYSCALL_64_after_hwframe+0x44/0xa9 + ---[ end trace 33f2f83f3d5250e9 ]--- + BTRFS: error (device sda1) in create_pending_snapshot:1576: errno=-24 unknown + BTRFS info (device sda1): forced readonly + BTRFS warning (device sda1): Skipping commit of aborted transaction. + BTRFS: error (device sda1) in cleanup_transaction:1831: errno=-24 unknown + +[CAUSE] +When the global anonymous block device pool is exhausted, the following +call chain will fail, and lead to transaction abort: + + btrfs_ioctl_snap_create_v2() + |- btrfs_ioctl_snap_create_transid() + |- btrfs_mksubvol() + |- btrfs_commit_transaction() + |- create_pending_snapshot() + |- btrfs_get_fs_root() + |- btrfs_init_fs_root() + |- get_anon_bdev() + +[FIX] +Although we can't enlarge the anonymous block device pool, at least we +can preallocate anon_dev for subvolume/snapshot in the first phase, +outside of transaction context and exactly at the moment the user calls +the creation ioctl. 
+ +Reported-by: Greed Rong +Link: https://lore.kernel.org/linux-btrfs/CA+UqX+NTrZ6boGnWHhSeZmEY5J76CTqmYjO2S+=tHJX7nb9DPw@mail.gmail.com/ +CC: stable@vger.kernel.org # 4.4+ +Signed-off-by: Qu Wenruo +Reviewed-by: David Sterba +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/disk-io.c | 71 ++++++++++++++++++++++++++++++++++++++++++++----- + fs/btrfs/disk-io.h | 2 + + fs/btrfs/ioctl.c | 21 +++++++++++++- + fs/btrfs/transaction.c | 2 - + fs/btrfs/transaction.h | 2 + + 5 files changed, 89 insertions(+), 9 deletions(-) + +--- a/fs/btrfs/disk-io.c ++++ b/fs/btrfs/disk-io.c +@@ -1395,7 +1395,12 @@ alloc_fail: + goto out; + } + +-static int btrfs_init_fs_root(struct btrfs_root *root) ++/* ++ * Initialize subvolume root in-memory structure ++ * ++ * @anon_dev: anonymous device to attach to the root, if zero, allocate new ++ */ ++static int btrfs_init_fs_root(struct btrfs_root *root, dev_t anon_dev) + { + int ret; + unsigned int nofs_flag; +@@ -1434,9 +1439,13 @@ static int btrfs_init_fs_root(struct btr + */ + if (is_fstree(root->root_key.objectid) && + btrfs_root_refs(&root->root_item) > 0) { +- ret = get_anon_bdev(&root->anon_dev); +- if (ret) +- goto fail; ++ if (!anon_dev) { ++ ret = get_anon_bdev(&root->anon_dev); ++ if (ret) ++ goto fail; ++ } else { ++ root->anon_dev = anon_dev; ++ } + } + + mutex_lock(&root->objectid_mutex); +@@ -1541,8 +1550,27 @@ void btrfs_free_fs_info(struct btrfs_fs_ + } + + +-struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info, +- u64 objectid, bool check_ref) ++/* ++ * Get an in-memory reference of a root structure. ++ * ++ * For essential trees like root/extent tree, we grab it from fs_info directly. ++ * For subvolume trees, we check the cached filesystem roots first. If not ++ * found, then read it from disk and add it to cached fs roots. ++ * ++ * Caller should release the root by calling btrfs_put_root() after the usage. 
++ * ++ * NOTE: Reloc and log trees can't be read by this function as they share the ++ * same root objectid. ++ * ++ * @objectid: root id ++ * @anon_dev: preallocated anonymous block device number for new roots, ++ * pass 0 for new allocation. ++ * @check_ref: whether to check root item references, If true, return -ENOENT ++ * for orphan roots ++ */ ++static struct btrfs_root *btrfs_get_root_ref(struct btrfs_fs_info *fs_info, ++ u64 objectid, dev_t anon_dev, ++ bool check_ref) + { + struct btrfs_root *root; + struct btrfs_path *path; +@@ -1571,6 +1599,8 @@ struct btrfs_root *btrfs_get_fs_root(str + again: + root = btrfs_lookup_fs_root(fs_info, objectid); + if (root) { ++ /* Shouldn't get preallocated anon_dev for cached roots */ ++ ASSERT(!anon_dev); + if (check_ref && btrfs_root_refs(&root->root_item) == 0) { + btrfs_put_root(root); + return ERR_PTR(-ENOENT); +@@ -1590,7 +1620,7 @@ again: + goto fail; + } + +- ret = btrfs_init_fs_root(root); ++ ret = btrfs_init_fs_root(root, anon_dev); + if (ret) + goto fail; + +@@ -1623,6 +1653,33 @@ fail: + return ERR_PTR(ret); + } + ++/* ++ * Get in-memory reference of a root structure ++ * ++ * @objectid: tree objectid ++ * @check_ref: if set, verify that the tree exists and the item has at least ++ * one reference ++ */ ++struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info, ++ u64 objectid, bool check_ref) ++{ ++ return btrfs_get_root_ref(fs_info, objectid, 0, check_ref); ++} ++ ++/* ++ * Get in-memory reference of a root structure, created as new, optionally pass ++ * the anonymous block device id ++ * ++ * @objectid: tree objectid ++ * @anon_dev: if zero, allocate a new anonymous block device or use the ++ * parameter value ++ */ ++struct btrfs_root *btrfs_get_new_fs_root(struct btrfs_fs_info *fs_info, ++ u64 objectid, dev_t anon_dev) ++{ ++ return btrfs_get_root_ref(fs_info, objectid, anon_dev, true); ++} ++ + static int btrfs_congested_fn(void *congested_data, int bdi_bits) + { + struct btrfs_fs_info *info 
= (struct btrfs_fs_info *)congested_data; +--- a/fs/btrfs/disk-io.h ++++ b/fs/btrfs/disk-io.h +@@ -67,6 +67,8 @@ void btrfs_free_fs_roots(struct btrfs_fs + + struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info, + u64 objectid, bool check_ref); ++struct btrfs_root *btrfs_get_new_fs_root(struct btrfs_fs_info *fs_info, ++ u64 objectid, dev_t anon_dev); + + void btrfs_free_fs_info(struct btrfs_fs_info *fs_info); + int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info); +--- a/fs/btrfs/ioctl.c ++++ b/fs/btrfs/ioctl.c +@@ -566,6 +566,7 @@ static noinline int create_subvol(struct + struct inode *inode; + int ret; + int err; ++ dev_t anon_dev = 0; + u64 objectid; + u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID; + u64 index = 0; +@@ -578,6 +579,10 @@ static noinline int create_subvol(struct + if (ret) + goto fail_free; + ++ ret = get_anon_bdev(&anon_dev); ++ if (ret < 0) ++ goto fail_free; ++ + /* + * Don't create subvolume whose level is not zero. Or qgroup will be + * screwed up since it assumes subvolume qgroup's level to be 0. 
+@@ -660,12 +665,15 @@ static noinline int create_subvol(struct + goto fail; + + key.offset = (u64)-1; +- new_root = btrfs_get_fs_root(fs_info, objectid, true); ++ new_root = btrfs_get_new_fs_root(fs_info, objectid, anon_dev); + if (IS_ERR(new_root)) { ++ free_anon_bdev(anon_dev); + ret = PTR_ERR(new_root); + btrfs_abort_transaction(trans, ret); + goto fail; + } ++ /* Freeing will be done in btrfs_put_root() of new_root */ ++ anon_dev = 0; + + btrfs_record_root_in_trans(trans, new_root); + +@@ -735,6 +743,8 @@ fail: + return ret; + + fail_free: ++ if (anon_dev) ++ free_anon_bdev(anon_dev); + kfree(root_item); + return ret; + } +@@ -762,6 +772,9 @@ static int create_snapshot(struct btrfs_ + if (!pending_snapshot) + return -ENOMEM; + ++ ret = get_anon_bdev(&pending_snapshot->anon_dev); ++ if (ret < 0) ++ goto free_pending; + pending_snapshot->root_item = kzalloc(sizeof(struct btrfs_root_item), + GFP_KERNEL); + pending_snapshot->path = btrfs_alloc_path(); +@@ -823,10 +836,16 @@ static int create_snapshot(struct btrfs_ + + d_instantiate(dentry, inode); + ret = 0; ++ pending_snapshot->anon_dev = 0; + fail: ++ /* Prevent double freeing of anon_dev */ ++ if (ret && pending_snapshot->snap) ++ pending_snapshot->snap->anon_dev = 0; + btrfs_put_root(pending_snapshot->snap); + btrfs_subvolume_release_metadata(fs_info, &pending_snapshot->block_rsv); + free_pending: ++ if (pending_snapshot->anon_dev) ++ free_anon_bdev(pending_snapshot->anon_dev); + kfree(pending_snapshot->root_item); + btrfs_free_path(pending_snapshot->path); + kfree(pending_snapshot); +--- a/fs/btrfs/transaction.c ++++ b/fs/btrfs/transaction.c +@@ -1630,7 +1630,7 @@ static noinline int create_pending_snaps + } + + key.offset = (u64)-1; +- pending->snap = btrfs_get_fs_root(fs_info, objectid, true); ++ pending->snap = btrfs_get_new_fs_root(fs_info, objectid, pending->anon_dev); + if (IS_ERR(pending->snap)) { + ret = PTR_ERR(pending->snap); + btrfs_abort_transaction(trans, ret); +--- a/fs/btrfs/transaction.h ++++ 
b/fs/btrfs/transaction.h +@@ -151,6 +151,8 @@ struct btrfs_pending_snapshot { + struct btrfs_block_rsv block_rsv; + /* extra metadata reservation for relocation */ + int error; ++ /* Preallocated anonymous block device number */ ++ dev_t anon_dev; + bool readonly; + struct list_head list; + }; diff --git a/queue-5.8/btrfs-ref-verify-fix-memory-leak-in-add_block_entry.patch b/queue-5.8/btrfs-ref-verify-fix-memory-leak-in-add_block_entry.patch new file mode 100644 index 00000000000..6cf068cd448 --- /dev/null +++ b/queue-5.8/btrfs-ref-verify-fix-memory-leak-in-add_block_entry.patch @@ -0,0 +1,50 @@ +From d60ba8de1164e1b42e296ff270c622a070ef8fe7 Mon Sep 17 00:00:00 2001 +From: Tom Rix +Date: Tue, 7 Jul 2020 06:29:08 -0700 +Subject: btrfs: ref-verify: fix memory leak in add_block_entry + +From: Tom Rix + +commit d60ba8de1164e1b42e296ff270c622a070ef8fe7 upstream. + +clang static analysis flags this error + +fs/btrfs/ref-verify.c:290:3: warning: Potential leak of memory pointed to by 're' [unix.Malloc] + kfree(be); + ^~~~~ + +The problem is in this block of code: + + if (root_objectid) { + struct root_entry *exist_re; + + exist_re = insert_root_entry(&exist->roots, re); + if (exist_re) + kfree(re); + } + +There is no 'else' block freeing when root_objectid is 0. Add the +missing kfree to the else branch. 
+ +Fixes: fd708b81d972 ("Btrfs: add a extent ref verify tool") +CC: stable@vger.kernel.org # 4.19+ +Signed-off-by: Tom Rix +Reviewed-by: David Sterba +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/ref-verify.c | 2 ++ + 1 file changed, 2 insertions(+) + +--- a/fs/btrfs/ref-verify.c ++++ b/fs/btrfs/ref-verify.c +@@ -286,6 +286,8 @@ static struct block_entry *add_block_ent + exist_re = insert_root_entry(&exist->roots, re); + if (exist_re) + kfree(re); ++ } else { ++ kfree(re); + } + kfree(be); + return exist; diff --git a/queue-5.8/btrfs-relocation-review-the-call-sites-which-can-be-interrupted-by-signal.patch b/queue-5.8/btrfs-relocation-review-the-call-sites-which-can-be-interrupted-by-signal.patch new file mode 100644 index 00000000000..2921a3728c1 --- /dev/null +++ b/queue-5.8/btrfs-relocation-review-the-call-sites-which-can-be-interrupted-by-signal.patch @@ -0,0 +1,104 @@ +From 44d354abf33e92a5e73b965c84caf5a5d5e58a0b Mon Sep 17 00:00:00 2001 +From: Qu Wenruo +Date: Mon, 13 Jul 2020 09:03:21 +0800 +Subject: btrfs: relocation: review the call sites which can be interrupted by signal + +From: Qu Wenruo + +commit 44d354abf33e92a5e73b965c84caf5a5d5e58a0b upstream. + +Since most metadata reservation calls can return -EINTR when they get +interrupted by a fatal signal, we need to review all the metadata +reservation call sites. + +In relocation code, the metadata reservation happens in the following +sites: + +- btrfs_block_rsv_refill() in merge_reloc_root() + merge_reloc_root() is a pretty critical section, we don't want to be + interrupted by signal, so change the flush status to + BTRFS_RESERVE_FLUSH_LIMIT, so it won't get interrupted by signal. + Since such change can be ENOSPC-prone, also shrink the amount of + metadata to reserve to the least amount, to avoid deadly ENOSPC there. + +- btrfs_block_rsv_refill() in reserve_metadata_space() + It calls with BTRFS_RESERVE_FLUSH_LIMIT, which won't get interrupted + by signal. 
+ +- btrfs_block_rsv_refill() in prepare_to_relocate() + +- btrfs_block_rsv_add() in prepare_to_relocate() + +- btrfs_block_rsv_refill() in relocate_block_group() + +- btrfs_delalloc_reserve_metadata() in relocate_file_extent_cluster() + +- btrfs_start_transaction() in relocate_block_group() + +- btrfs_start_transaction() in create_reloc_inode() + Can be interrupted by fatal signal and we can handle it easily. + For these call sites, just catch the -EINTR value in btrfs_balance() + and count them as canceled. + +CC: stable@vger.kernel.org # 5.4+ +Signed-off-by: Qu Wenruo +Reviewed-by: David Sterba +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/relocation.c | 12 ++++++++++-- + fs/btrfs/volumes.c | 17 ++++++++++++++++- + 2 files changed, 26 insertions(+), 3 deletions(-) + +--- a/fs/btrfs/relocation.c ++++ b/fs/btrfs/relocation.c +@@ -1686,12 +1686,20 @@ static noinline_for_stack int merge_relo + btrfs_unlock_up_safe(path, 0); + } + +- min_reserved = fs_info->nodesize * (BTRFS_MAX_LEVEL - 1) * 2; ++ /* ++ * In merge_reloc_root(), we modify the upper level pointer to swap the ++ * tree blocks between reloc tree and subvolume tree. Thus for tree ++ * block COW, we COW at most from level 1 to root level for each tree. ++ * ++ * Thus the needed metadata size is at most root_level * nodesize, ++ * and * 2 since we have two trees to COW. 
++ */ ++ min_reserved = fs_info->nodesize * btrfs_root_level(root_item) * 2; + memset(&next_key, 0, sizeof(next_key)); + + while (1) { + ret = btrfs_block_rsv_refill(root, rc->block_rsv, min_reserved, +- BTRFS_RESERVE_FLUSH_ALL); ++ BTRFS_RESERVE_FLUSH_LIMIT); + if (ret) { + err = ret; + goto out; +--- a/fs/btrfs/volumes.c ++++ b/fs/btrfs/volumes.c +@@ -4150,7 +4150,22 @@ int btrfs_balance(struct btrfs_fs_info * + mutex_lock(&fs_info->balance_mutex); + if (ret == -ECANCELED && atomic_read(&fs_info->balance_pause_req)) + btrfs_info(fs_info, "balance: paused"); +- else if (ret == -ECANCELED && atomic_read(&fs_info->balance_cancel_req)) ++ /* ++ * Balance can be canceled by: ++ * ++ * - Regular cancel request ++ * Then ret == -ECANCELED and balance_cancel_req > 0 ++ * ++ * - Fatal signal to "btrfs" process ++ * Either the signal caught by wait_reserve_ticket() and callers ++ * got -EINTR, or caught by btrfs_should_cancel_balance() and ++ * got -ECANCELED. ++ * Either way, in this case balance_cancel_req = 0, and ++ * ret == -EINTR or ret == -ECANCELED. ++ * ++ * So here we only check the return value to catch canceled balance. ++ */ ++ else if (ret == -ECANCELED || ret == -EINTR) + btrfs_info(fs_info, "balance: canceled"); + else + btrfs_info(fs_info, "balance: ended with status: %d", ret); diff --git a/queue-5.8/btrfs-remove-no-longer-needed-use-of-log_writers-for-the-log-root-tree.patch b/queue-5.8/btrfs-remove-no-longer-needed-use-of-log_writers-for-the-log-root-tree.patch new file mode 100644 index 00000000000..5b6c945a87f --- /dev/null +++ b/queue-5.8/btrfs-remove-no-longer-needed-use-of-log_writers-for-the-log-root-tree.patch @@ -0,0 +1,122 @@ +From a93e01682e283f6de09d6ce8f805dc52a2e942fb Mon Sep 17 00:00:00 2001 +From: Filipe Manana +Date: Thu, 2 Jul 2020 12:32:40 +0100 +Subject: btrfs: remove no longer needed use of log_writers for the log root tree + +From: Filipe Manana + +commit a93e01682e283f6de09d6ce8f805dc52a2e942fb upstream. 
+ +When syncing the log, we used to update the log root tree without holding +either the log_mutex of the subvolume root or the log_mutex of log root +tree. + +We used to have two critical sections delimited by the log_mutex of the +log root tree, so in the first one we incremented the log_writers of the +log root tree and on the second one we decremented it and waited for the +log_writers counter to go down to zero. This was because the update of +the log root tree happened between the two critical sections. + +The use of two critical sections allowed a little bit more of parallelism +and required the use of the log_writers counter, necessary to make sure +we didn't miss any log root tree update when we have multiple tasks trying +to sync the log in parallel. + +However after commit 06989c799f0481 ("Btrfs: fix race updating log root +item during fsync") the log root tree update was moved into a critical +section delimited by the subvolume's log_mutex. Later another commit +moved the log tree update from that critical section into the second +critical section delimited by the log_mutex of the log root tree. Both +commits addressed different bugs. + +The end result is that the first critical section delimited by the +log_mutex of the log root tree became pointless, since there's nothing +done between it and the second critical section, we just have an unlock +of the log_mutex followed by a lock operation. This means we can merge +both critical sections, as the first one does almost nothing now, and we +can stop using the log_writers counter of the log root tree, which was +incremented in the first critical section and decremented in the second +critical section, used to make sure no one in the second critical section +started writeback of the log root tree before some other task updated it. + +So just remove the mutex_unlock() followed by mutex_lock() of the log root +tree, as well as the use of the log_writers counter for the log root tree. 
+ +This patch is part of a series that has the following patches: + +1/4 btrfs: only commit the delayed inode when doing a full fsync +2/4 btrfs: only commit delayed items at fsync if we are logging a directory +3/4 btrfs: stop incremening log_batch for the log root tree when syncing log +4/4 btrfs: remove no longer needed use of log_writers for the log root tree + +After the entire patchset applied I saw about 12% decrease on max latency +reported by dbench. The test was done on a qemu vm, with 8 cores, 16Gb of +ram, using kvm and using a raw NVMe device directly (no intermediary fs on +the host). The test was invoked like the following: + + mkfs.btrfs -f /dev/sdk + mount -o ssd -o nospace_cache /dev/sdk /mnt/sdk + dbench -D /mnt/sdk -t 300 8 + umount /mnt/dsk + +CC: stable@vger.kernel.org # 5.4+ +Reviewed-by: Josef Bacik +Signed-off-by: Filipe Manana +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/ctree.h | 1 + + fs/btrfs/tree-log.c | 13 ------------- + 2 files changed, 1 insertion(+), 13 deletions(-) + +--- a/fs/btrfs/ctree.h ++++ b/fs/btrfs/ctree.h +@@ -1059,6 +1059,7 @@ struct btrfs_root { + wait_queue_head_t log_writer_wait; + wait_queue_head_t log_commit_wait[2]; + struct list_head log_ctxs[2]; ++ /* Used only for log trees of subvolumes, not for the log root tree */ + atomic_t log_writers; + atomic_t log_commit[2]; + /* Used only for log trees of subvolumes, not for the log root tree */ +--- a/fs/btrfs/tree-log.c ++++ b/fs/btrfs/tree-log.c +@@ -3116,28 +3116,17 @@ int btrfs_sync_log(struct btrfs_trans_ha + btrfs_init_log_ctx(&root_log_ctx, NULL); + + mutex_lock(&log_root_tree->log_mutex); +- atomic_inc(&log_root_tree->log_writers); + + index2 = log_root_tree->log_transid % 2; + list_add_tail(&root_log_ctx.list, &log_root_tree->log_ctxs[index2]); + root_log_ctx.log_transid = log_root_tree->log_transid; + +- mutex_unlock(&log_root_tree->log_mutex); +- +- mutex_lock(&log_root_tree->log_mutex); +- + /* + * Now we are safe to 
update the log_root_tree because we're under the + * log_mutex, and we're a current writer so we're holding the commit + * open until we drop the log_mutex. + */ + ret = update_log_root(trans, log, &new_root_item); +- +- if (atomic_dec_and_test(&log_root_tree->log_writers)) { +- /* atomic_dec_and_test implies a barrier */ +- cond_wake_up_nomb(&log_root_tree->log_writer_wait); +- } +- + if (ret) { + if (!list_empty(&root_log_ctx.list)) + list_del_init(&root_log_ctx.list); +@@ -3183,8 +3172,6 @@ int btrfs_sync_log(struct btrfs_trans_ha + root_log_ctx.log_transid - 1); + } + +- wait_for_writer(log_root_tree); +- + /* + * now that we've moved on to the tree of log tree roots, + * check the full commit flag again diff --git a/queue-5.8/btrfs-return-erofs-for-btrfs_fs_state_error-cases.patch b/queue-5.8/btrfs-return-erofs-for-btrfs_fs_state_error-cases.patch new file mode 100644 index 00000000000..b3bd34fd008 --- /dev/null +++ b/queue-5.8/btrfs-return-erofs-for-btrfs_fs_state_error-cases.patch @@ -0,0 +1,152 @@ +From fbabd4a36faaf74c83142d0b3d950c11ec14fda1 Mon Sep 17 00:00:00 2001 +From: Josef Bacik +Date: Tue, 21 Jul 2020 10:38:37 -0400 +Subject: btrfs: return EROFS for BTRFS_FS_STATE_ERROR cases + +From: Josef Bacik + +commit fbabd4a36faaf74c83142d0b3d950c11ec14fda1 upstream. + +Eric reported seeing this message while running generic/475 + + BTRFS: error (device dm-3) in btrfs_sync_log:3084: errno=-117 Filesystem corrupted + +Full stack trace: + + BTRFS: error (device dm-0) in btrfs_commit_transaction:2323: errno=-5 IO failure (Error while writing out transaction) + BTRFS info (device dm-0): forced readonly + BTRFS warning (device dm-0): Skipping commit of aborted transaction. 
+ ------------[ cut here ]------------ + BTRFS: error (device dm-0) in cleanup_transaction:1894: errno=-5 IO failure + BTRFS: Transaction aborted (error -117) + BTRFS warning (device dm-0): direct IO failed ino 3555 rw 0,0 sector 0x1c6480 len 4096 err no 10 + BTRFS warning (device dm-0): direct IO failed ino 3555 rw 0,0 sector 0x1c6488 len 4096 err no 10 + BTRFS warning (device dm-0): direct IO failed ino 3555 rw 0,0 sector 0x1c6490 len 4096 err no 10 + BTRFS warning (device dm-0): direct IO failed ino 3555 rw 0,0 sector 0x1c6498 len 4096 err no 10 + BTRFS warning (device dm-0): direct IO failed ino 3555 rw 0,0 sector 0x1c64a0 len 4096 err no 10 + BTRFS warning (device dm-0): direct IO failed ino 3555 rw 0,0 sector 0x1c64a8 len 4096 err no 10 + BTRFS warning (device dm-0): direct IO failed ino 3555 rw 0,0 sector 0x1c64b0 len 4096 err no 10 + BTRFS warning (device dm-0): direct IO failed ino 3555 rw 0,0 sector 0x1c64b8 len 4096 err no 10 + BTRFS warning (device dm-0): direct IO failed ino 3555 rw 0,0 sector 0x1c64c0 len 4096 err no 10 + BTRFS warning (device dm-0): direct IO failed ino 3572 rw 0,0 sector 0x1b85e8 len 4096 err no 10 + BTRFS warning (device dm-0): direct IO failed ino 3572 rw 0,0 sector 0x1b85f0 len 4096 err no 10 + WARNING: CPU: 3 PID: 23985 at fs/btrfs/tree-log.c:3084 btrfs_sync_log+0xbc8/0xd60 [btrfs] + BTRFS warning (device dm-0): direct IO failed ino 3548 rw 0,0 sector 0x1d4288 len 4096 err no 10 + BTRFS warning (device dm-0): direct IO failed ino 3548 rw 0,0 sector 0x1d4290 len 4096 err no 10 + BTRFS warning (device dm-0): direct IO failed ino 3548 rw 0,0 sector 0x1d4298 len 4096 err no 10 + BTRFS warning (device dm-0): direct IO failed ino 3548 rw 0,0 sector 0x1d42a0 len 4096 err no 10 + BTRFS warning (device dm-0): direct IO failed ino 3548 rw 0,0 sector 0x1d42a8 len 4096 err no 10 + BTRFS warning (device dm-0): direct IO failed ino 3548 rw 0,0 sector 0x1d42b0 len 4096 err no 10 + BTRFS warning (device dm-0): direct IO failed ino 3548 rw 0,0 
sector 0x1d42b8 len 4096 err no 10 + BTRFS warning (device dm-0): direct IO failed ino 3548 rw 0,0 sector 0x1d42c0 len 4096 err no 10 + BTRFS warning (device dm-0): direct IO failed ino 3548 rw 0,0 sector 0x1d42c8 len 4096 err no 10 + BTRFS warning (device dm-0): direct IO failed ino 3548 rw 0,0 sector 0x1d42d0 len 4096 err no 10 + CPU: 3 PID: 23985 Comm: fsstress Tainted: G W L 5.8.0-rc4-default+ #1181 + Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.0-59-gc9ba527-rebuilt.opensuse.org 04/01/2014 + RIP: 0010:btrfs_sync_log+0xbc8/0xd60 [btrfs] + RSP: 0018:ffff909a44d17bd0 EFLAGS: 00010286 + RAX: 0000000000000000 RBX: 0000000000000001 RCX: 0000000000000001 + RDX: ffff8f3be41cb940 RSI: ffffffffb0108d2b RDI: ffffffffb0108ff7 + RBP: ffff909a44d17e70 R08: 0000000000000000 R09: 0000000000000000 + R10: 0000000000000000 R11: 0000000000037988 R12: ffff8f3bd20e4000 + R13: ffff8f3bd20e4428 R14: 00000000ffffff8b R15: ffff909a44d17c70 + FS: 00007f6a6ed3fb80(0000) GS:ffff8f3c3dc00000(0000) knlGS:0000000000000000 + CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 + CR2: 00007f6a6ed3e000 CR3: 00000000525c0003 CR4: 0000000000160ee0 + Call Trace: + ? finish_wait+0x90/0x90 + ? __mutex_unlock_slowpath+0x45/0x2a0 + ? lock_acquire+0xa3/0x440 + ? lockref_put_or_lock+0x9/0x30 + ? dput+0x20/0x4a0 + ? dput+0x20/0x4a0 + ? do_raw_spin_unlock+0x4b/0xc0 + ? _raw_spin_unlock+0x1f/0x30 + btrfs_sync_file+0x335/0x490 [btrfs] + do_fsync+0x38/0x70 + __x64_sys_fsync+0x10/0x20 + do_syscall_64+0x50/0xe0 + entry_SYSCALL_64_after_hwframe+0x44/0xa9 + RIP: 0033:0x7f6a6ef1b6e3 + Code: Bad RIP value. 
+ RSP: 002b:00007ffd01e20038 EFLAGS: 00000246 ORIG_RAX: 000000000000004a + RAX: ffffffffffffffda RBX: 000000000007a120 RCX: 00007f6a6ef1b6e3 + RDX: 00007ffd01e1ffa0 RSI: 00007ffd01e1ffa0 RDI: 0000000000000003 + RBP: 0000000000000003 R08: 0000000000000001 R09: 00007ffd01e2004c + R10: 0000000000000000 R11: 0000000000000246 R12: 000000000000009f + R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000 + irq event stamp: 0 + hardirqs last enabled at (0): [<0000000000000000>] 0x0 + hardirqs last disabled at (0): [] copy_process+0x67b/0x1b00 + softirqs last enabled at (0): [] copy_process+0x67b/0x1b00 + softirqs last disabled at (0): [<0000000000000000>] 0x0 + ---[ end trace af146e0e38433456 ]--- + BTRFS: error (device dm-0) in btrfs_sync_log:3084: errno=-117 Filesystem corrupted + +This ret came from btrfs_write_marked_extents(). If we get an aborted +transaction via EIO before, we'll see it in btree_write_cache_pages() +and return EUCLEAN, which gets printed as "Filesystem corrupted". + +Except we shouldn't be returning EUCLEAN here, we need to be returning +EROFS because EUCLEAN is reserved for actual corruption, not IO errors. + +We are inconsistent about our handling of BTRFS_FS_STATE_ERROR +elsewhere, but we want to use EROFS for this particular case. The +original transaction abort has the real error code for why we ended up +with an aborted transaction, all subsequent actions just need to return +EROFS because they may not have a trans handle and have no idea about +the original cause of the abort. + +After patch "btrfs: don't WARN if we abort a transaction with EROFS" the +stacktrace will not be dumped either. 
+ +Reported-by: Eric Sandeen +CC: stable@vger.kernel.org # 5.4+ +Signed-off-by: Josef Bacik +Reviewed-by: David Sterba +[ add full test stacktrace ] +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/extent_io.c | 2 +- + fs/btrfs/scrub.c | 2 +- + fs/btrfs/transaction.c | 5 ++++- + 3 files changed, 6 insertions(+), 3 deletions(-) + +--- a/fs/btrfs/extent_io.c ++++ b/fs/btrfs/extent_io.c +@@ -4127,7 +4127,7 @@ retry: + if (!test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) { + ret = flush_write_bio(&epd); + } else { +- ret = -EUCLEAN; ++ ret = -EROFS; + end_write_bio(&epd, ret); + } + return ret; +--- a/fs/btrfs/scrub.c ++++ b/fs/btrfs/scrub.c +@@ -3758,7 +3758,7 @@ static noinline_for_stack int scrub_supe + struct btrfs_fs_info *fs_info = sctx->fs_info; + + if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) +- return -EIO; ++ return -EROFS; + + /* Seed devices of a new filesystem has their own generation. */ + if (scrub_dev->fs_devices != fs_info->fs_devices) +--- a/fs/btrfs/transaction.c ++++ b/fs/btrfs/transaction.c +@@ -937,7 +937,10 @@ static int __btrfs_end_transaction(struc + if (TRANS_ABORTED(trans) || + test_bit(BTRFS_FS_STATE_ERROR, &info->fs_state)) { + wake_up_process(info->transaction_kthread); +- err = -EIO; ++ if (TRANS_ABORTED(trans)) ++ err = trans->aborted; ++ else ++ err = -EROFS; + } + + kmem_cache_free(btrfs_trans_handle_cachep, trans); diff --git a/queue-5.8/btrfs-stop-incremening-log_batch-for-the-log-root-tree-when-syncing-log.patch b/queue-5.8/btrfs-stop-incremening-log_batch-for-the-log-root-tree-when-syncing-log.patch new file mode 100644 index 00000000000..1cb0b387d2a --- /dev/null +++ b/queue-5.8/btrfs-stop-incremening-log_batch-for-the-log-root-tree-when-syncing-log.patch @@ -0,0 +1,68 @@ +From 28a9579561bcb9082715e720eac93012e708ab94 Mon Sep 17 00:00:00 2001 +From: Filipe Manana +Date: Thu, 2 Jul 2020 12:32:31 +0100 +Subject: btrfs: stop incremening log_batch for the log root tree when syncing log + 
+From: Filipe Manana + +commit 28a9579561bcb9082715e720eac93012e708ab94 upstream. + +We are incrementing the log_batch atomic counter of the root log tree but +we never use that counter, it's used only for the log trees of subvolume +roots. We started doing it when we moved the log_batch and log_write +counters from the global, per fs, btrfs_fs_info structure, into the +btrfs_root structure in commit 7237f1833601dc ("Btrfs: fix tree logs +parallel sync"). + +So just stop doing it for the log root tree and add a comment over the +field declaration to inform that it's used only for log trees of subvolume +roots. + +This patch is part of a series that has the following patches: + +1/4 btrfs: only commit the delayed inode when doing a full fsync +2/4 btrfs: only commit delayed items at fsync if we are logging a directory +3/4 btrfs: stop incremening log_batch for the log root tree when syncing log +4/4 btrfs: remove no longer needed use of log_writers for the log root tree + +After the entire patchset was applied I saw about 12% decrease on max latency +reported by dbench. The test was done on a qemu vm, with 8 cores, 16Gb of +ram, using kvm and using a raw NVMe device directly (no intermediary fs on +the host). 
The test was invoked like the following: + + mkfs.btrfs -f /dev/sdk + mount -o ssd -o nospace_cache /dev/sdk /mnt/sdk + dbench -D /mnt/sdk -t 300 8 + umount /mnt/dsk + +CC: stable@vger.kernel.org # 5.4+ +Reviewed-by: Josef Bacik +Signed-off-by: Filipe Manana +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/ctree.h | 1 + + fs/btrfs/tree-log.c | 1 - + 2 files changed, 1 insertion(+), 1 deletion(-) + +--- a/fs/btrfs/ctree.h ++++ b/fs/btrfs/ctree.h +@@ -1061,6 +1061,7 @@ struct btrfs_root { + struct list_head log_ctxs[2]; + atomic_t log_writers; + atomic_t log_commit[2]; ++ /* Used only for log trees of subvolumes, not for the log root tree */ + atomic_t log_batch; + int log_transid; + /* No matter the commit succeeds or not*/ +--- a/fs/btrfs/tree-log.c ++++ b/fs/btrfs/tree-log.c +@@ -3116,7 +3116,6 @@ int btrfs_sync_log(struct btrfs_trans_ha + btrfs_init_log_ctx(&root_log_ctx, NULL); + + mutex_lock(&log_root_tree->log_mutex); +- atomic_inc(&log_root_tree->log_batch); + atomic_inc(&log_root_tree->log_writers); + + index2 = log_root_tree->log_transid % 2; diff --git a/queue-5.8/btrfs-sysfs-use-nofs-for-device-creation.patch b/queue-5.8/btrfs-sysfs-use-nofs-for-device-creation.patch new file mode 100644 index 00000000000..c7d28414e15 --- /dev/null +++ b/queue-5.8/btrfs-sysfs-use-nofs-for-device-creation.patch @@ -0,0 +1,181 @@ +From a47bd78d0c44621efb98b525d04d60dc4d1a79b0 Mon Sep 17 00:00:00 2001 +From: Josef Bacik +Date: Tue, 21 Jul 2020 10:17:50 -0400 +Subject: btrfs: sysfs: use NOFS for device creation + +From: Josef Bacik + +commit a47bd78d0c44621efb98b525d04d60dc4d1a79b0 upstream. 
+ +Dave hit this splat during testing btrfs/078: + + ====================================================== + WARNING: possible circular locking dependency detected + 5.8.0-rc6-default+ #1191 Not tainted + ------------------------------------------------------ + kswapd0/75 is trying to acquire lock: + ffffa040e9d04ff8 (&delayed_node->mutex){+.+.}-{3:3}, at: __btrfs_release_delayed_node.part.0+0x3f/0x310 [btrfs] + + but task is already holding lock: + ffffffff8b0c8040 (fs_reclaim){+.+.}-{0:0}, at: __fs_reclaim_acquire+0x5/0x30 + + which lock already depends on the new lock. + + the existing dependency chain (in reverse order) is: + + -> #2 (fs_reclaim){+.+.}-{0:0}: + __lock_acquire+0x56f/0xaa0 + lock_acquire+0xa3/0x440 + fs_reclaim_acquire.part.0+0x25/0x30 + __kmalloc_track_caller+0x49/0x330 + kstrdup+0x2e/0x60 + __kernfs_new_node.constprop.0+0x44/0x250 + kernfs_new_node+0x25/0x50 + kernfs_create_link+0x34/0xa0 + sysfs_do_create_link_sd+0x5e/0xd0 + btrfs_sysfs_add_devices_dir+0x65/0x100 [btrfs] + btrfs_init_new_device+0x44c/0x12b0 [btrfs] + btrfs_ioctl+0xc3c/0x25c0 [btrfs] + ksys_ioctl+0x68/0xa0 + __x64_sys_ioctl+0x16/0x20 + do_syscall_64+0x50/0xe0 + entry_SYSCALL_64_after_hwframe+0x44/0xa9 + + -> #1 (&fs_info->chunk_mutex){+.+.}-{3:3}: + __lock_acquire+0x56f/0xaa0 + lock_acquire+0xa3/0x440 + __mutex_lock+0xa0/0xaf0 + btrfs_chunk_alloc+0x137/0x3e0 [btrfs] + find_free_extent+0xb44/0xfb0 [btrfs] + btrfs_reserve_extent+0x9b/0x180 [btrfs] + btrfs_alloc_tree_block+0xc1/0x350 [btrfs] + alloc_tree_block_no_bg_flush+0x4a/0x60 [btrfs] + __btrfs_cow_block+0x143/0x7a0 [btrfs] + btrfs_cow_block+0x15f/0x310 [btrfs] + push_leaf_right+0x150/0x240 [btrfs] + split_leaf+0x3cd/0x6d0 [btrfs] + btrfs_search_slot+0xd14/0xf70 [btrfs] + btrfs_insert_empty_items+0x64/0xc0 [btrfs] + __btrfs_commit_inode_delayed_items+0xb2/0x840 [btrfs] + btrfs_async_run_delayed_root+0x10e/0x1d0 [btrfs] + btrfs_work_helper+0x2f9/0x650 [btrfs] + process_one_work+0x22c/0x600 + worker_thread+0x50/0x3b0 + 
kthread+0x137/0x150 + ret_from_fork+0x1f/0x30 + + -> #0 (&delayed_node->mutex){+.+.}-{3:3}: + check_prev_add+0x98/0xa20 + validate_chain+0xa8c/0x2a00 + __lock_acquire+0x56f/0xaa0 + lock_acquire+0xa3/0x440 + __mutex_lock+0xa0/0xaf0 + __btrfs_release_delayed_node.part.0+0x3f/0x310 [btrfs] + btrfs_evict_inode+0x3bf/0x560 [btrfs] + evict+0xd6/0x1c0 + dispose_list+0x48/0x70 + prune_icache_sb+0x54/0x80 + super_cache_scan+0x121/0x1a0 + do_shrink_slab+0x175/0x420 + shrink_slab+0xb1/0x2e0 + shrink_node+0x192/0x600 + balance_pgdat+0x31f/0x750 + kswapd+0x206/0x510 + kthread+0x137/0x150 + ret_from_fork+0x1f/0x30 + + other info that might help us debug this: + + Chain exists of: + &delayed_node->mutex --> &fs_info->chunk_mutex --> fs_reclaim + + Possible unsafe locking scenario: + + CPU0 CPU1 + ---- ---- + lock(fs_reclaim); + lock(&fs_info->chunk_mutex); + lock(fs_reclaim); + lock(&delayed_node->mutex); + + *** DEADLOCK *** + + 3 locks held by kswapd0/75: + #0: ffffffff8b0c8040 (fs_reclaim){+.+.}-{0:0}, at: __fs_reclaim_acquire+0x5/0x30 + #1: ffffffff8b0b50b8 (shrinker_rwsem){++++}-{3:3}, at: shrink_slab+0x54/0x2e0 + #2: ffffa040e057c0e8 (&type->s_umount_key#26){++++}-{3:3}, at: trylock_super+0x16/0x50 + + stack backtrace: + CPU: 2 PID: 75 Comm: kswapd0 Not tainted 5.8.0-rc6-default+ #1191 + Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.0-59-gc9ba527-rebuilt.opensuse.org 04/01/2014 + Call Trace: + dump_stack+0x78/0xa0 + check_noncircular+0x16f/0x190 + check_prev_add+0x98/0xa20 + validate_chain+0xa8c/0x2a00 + __lock_acquire+0x56f/0xaa0 + lock_acquire+0xa3/0x440 + ? __btrfs_release_delayed_node.part.0+0x3f/0x310 [btrfs] + __mutex_lock+0xa0/0xaf0 + ? __btrfs_release_delayed_node.part.0+0x3f/0x310 [btrfs] + ? __lock_acquire+0x56f/0xaa0 + ? __btrfs_release_delayed_node.part.0+0x3f/0x310 [btrfs] + ? lock_acquire+0xa3/0x440 + ? btrfs_evict_inode+0x138/0x560 [btrfs] + ? btrfs_evict_inode+0x2fe/0x560 [btrfs] + ? 
__btrfs_release_delayed_node.part.0+0x3f/0x310 [btrfs] + __btrfs_release_delayed_node.part.0+0x3f/0x310 [btrfs] + btrfs_evict_inode+0x3bf/0x560 [btrfs] + evict+0xd6/0x1c0 + dispose_list+0x48/0x70 + prune_icache_sb+0x54/0x80 + super_cache_scan+0x121/0x1a0 + do_shrink_slab+0x175/0x420 + shrink_slab+0xb1/0x2e0 + shrink_node+0x192/0x600 + balance_pgdat+0x31f/0x750 + kswapd+0x206/0x510 + ? _raw_spin_unlock_irqrestore+0x3e/0x50 + ? finish_wait+0x90/0x90 + ? balance_pgdat+0x750/0x750 + kthread+0x137/0x150 + ? kthread_stop+0x2a0/0x2a0 + ret_from_fork+0x1f/0x30 + +This is because we're holding the chunk_mutex while adding this device +and adding its sysfs entries. We actually hold different locks in +different places when calling this function, the dev_replace semaphore +for instance in dev replace, so instead of moving this call around +simply wrap it's operations in NOFS. + +CC: stable@vger.kernel.org # 4.14+ +Reported-by: David Sterba +Signed-off-by: Josef Bacik +Reviewed-by: David Sterba +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/sysfs.c | 3 +++ + 1 file changed, 3 insertions(+) + +--- a/fs/btrfs/sysfs.c ++++ b/fs/btrfs/sysfs.c +@@ -1273,7 +1273,9 @@ int btrfs_sysfs_add_devices_dir(struct b + { + int error = 0; + struct btrfs_device *dev; ++ unsigned int nofs_flag; + ++ nofs_flag = memalloc_nofs_save(); + list_for_each_entry(dev, &fs_devices->devices, dev_list) { + + if (one_device && one_device != dev) +@@ -1301,6 +1303,7 @@ int btrfs_sysfs_add_devices_dir(struct b + break; + } + } ++ memalloc_nofs_restore(nofs_flag); + + return error; + } diff --git a/queue-5.8/series b/queue-5.8/series index 75ce70eafd2..ba8273426cc 100644 --- a/queue-5.8/series +++ b/queue-5.8/series @@ -10,3 +10,23 @@ pci-mark-amd-navi10-gpu-rev-0x00-ats-as-broken.patch pci-add-device-even-if-driver-attach-failed.patch pci-qcom-define-some-parf-params-needed-for-ipq8064-soc.patch pci-qcom-add-support-for-tx-term-offset-for-rev-2.1.0.patch 
+btrfs-allow-use-of-global-block-reserve-for-balance-item-deletion.patch +btrfs-free-anon-block-device-right-after-subvolume-deletion.patch +btrfs-don-t-allocate-anonymous-block-device-for-user-invisible-roots.patch +btrfs-preallocate-anon-block-device-at-first-phase-of-snapshot-creation.patch +btrfs-ref-verify-fix-memory-leak-in-add_block_entry.patch +btrfs-only-commit-the-delayed-inode-when-doing-a-full-fsync.patch +btrfs-stop-incremening-log_batch-for-the-log-root-tree-when-syncing-log.patch +btrfs-only-commit-delayed-items-at-fsync-if-we-are-logging-a-directory.patch +btrfs-remove-no-longer-needed-use-of-log_writers-for-the-log-root-tree.patch +btrfs-don-t-traverse-into-the-seed-devices-in-show_devname.patch +btrfs-pass-checksum-type-via-btrfs_ioc_fs_info-ioctl.patch +btrfs-open-device-without-device_list_mutex.patch +btrfs-move-the-chunk_mutex-in-btrfs_read_chunk_tree.patch +btrfs-relocation-review-the-call-sites-which-can-be-interrupted-by-signal.patch +btrfs-add-missing-check-for-nocow-and-compression-inode-flags.patch +btrfs-avoid-possible-signal-interruption-of-btrfs_drop_snapshot-on-relocation-tree.patch +btrfs-return-erofs-for-btrfs_fs_state_error-cases.patch +btrfs-sysfs-use-nofs-for-device-creation.patch +btrfs-don-t-warn-if-we-abort-a-transaction-with-erofs.patch +btrfs-fix-race-between-page-release-and-a-fast-fsync.patch