]> git.ipfire.org Git - thirdparty/kernel/stable-queue.git/commitdiff
5.4-stable patches
authorGreg Kroah-Hartman <gregkh@linuxfoundation.org>
Fri, 13 Aug 2021 10:18:12 +0000 (12:18 +0200)
committerGreg Kroah-Hartman <gregkh@linuxfoundation.org>
Fri, 13 Aug 2021 10:18:12 +0000 (12:18 +0200)
added patches:
btrfs-fix-lockdep-splat-when-enabling-and-disabling-qgroups.patch
btrfs-make-btrfs_qgroup_reserve_data-take-btrfs_inode.patch
btrfs-make-qgroup_free_reserved_data-take-btrfs_inode.patch
btrfs-qgroup-allow-to-unreserve-range-without-releasing-other-ranges.patch
btrfs-qgroup-remove-async_commit-mechanism-in-favor-of-reserve-retry-after-edquot.patch
btrfs-qgroup-try-to-flush-qgroup-space-when-we-get-edquot.patch
btrfs-transaction-cleanup-unused-trans_state_blocked.patch

queue-5.4/btrfs-fix-lockdep-splat-when-enabling-and-disabling-qgroups.patch [new file with mode: 0644]
queue-5.4/btrfs-make-btrfs_qgroup_reserve_data-take-btrfs_inode.patch [new file with mode: 0644]
queue-5.4/btrfs-make-qgroup_free_reserved_data-take-btrfs_inode.patch [new file with mode: 0644]
queue-5.4/btrfs-qgroup-allow-to-unreserve-range-without-releasing-other-ranges.patch [new file with mode: 0644]
queue-5.4/btrfs-qgroup-remove-async_commit-mechanism-in-favor-of-reserve-retry-after-edquot.patch [new file with mode: 0644]
queue-5.4/btrfs-qgroup-try-to-flush-qgroup-space-when-we-get-edquot.patch [new file with mode: 0644]
queue-5.4/btrfs-transaction-cleanup-unused-trans_state_blocked.patch [new file with mode: 0644]
queue-5.4/series

diff --git a/queue-5.4/btrfs-fix-lockdep-splat-when-enabling-and-disabling-qgroups.patch b/queue-5.4/btrfs-fix-lockdep-splat-when-enabling-and-disabling-qgroups.patch
new file mode 100644 (file)
index 0000000..7f91ac5
--- /dev/null
@@ -0,0 +1,277 @@
+From foo@baz Fri Aug 13 12:17:11 PM CEST 2021
+From: Anand Jain <anand.jain@oracle.com>
+Date: Fri, 13 Aug 2021 17:55:30 +0800
+Subject: btrfs: fix lockdep splat when enabling and disabling qgroups
+To: linux-kernel@vger.kernel.org, stable@vger.kernel.org
+Cc: linux-btrfs@vger.kernel.org, Filipe Manana <fdmanana@suse.com>, David Sterba <dsterba@suse.com>, Anand Jain <anand.jain@oracle.com>
+Message-ID: <60ff32a043f5315ee559d4d9e1222e0f40a93917.1628845854.git.anand.jain@oracle.com>
+
+From: Filipe Manana <fdmanana@suse.com>
+
+commit a855fbe69229078cd8aecd8974fb996a5ca651e6 upstream
+
+When running test case btrfs/017 from fstests, lockdep reported the
+following splat:
+
+  [ 1297.067385] ======================================================
+  [ 1297.067708] WARNING: possible circular locking dependency detected
+  [ 1297.068022] 5.10.0-rc4-btrfs-next-73 #1 Not tainted
+  [ 1297.068322] ------------------------------------------------------
+  [ 1297.068629] btrfs/189080 is trying to acquire lock:
+  [ 1297.068929] ffff9f2725731690 (sb_internal#2){.+.+}-{0:0}, at: btrfs_quota_enable+0xaf/0xa70 [btrfs]
+  [ 1297.069274]
+                but task is already holding lock:
+  [ 1297.069868] ffff9f2702b61a08 (&fs_info->qgroup_ioctl_lock){+.+.}-{3:3}, at: btrfs_quota_enable+0x3b/0xa70 [btrfs]
+  [ 1297.070219]
+                which lock already depends on the new lock.
+
+  [ 1297.071131]
+                the existing dependency chain (in reverse order) is:
+  [ 1297.071721]
+                -> #1 (&fs_info->qgroup_ioctl_lock){+.+.}-{3:3}:
+  [ 1297.072375]        lock_acquire+0xd8/0x490
+  [ 1297.072710]        __mutex_lock+0xa3/0xb30
+  [ 1297.073061]        btrfs_qgroup_inherit+0x59/0x6a0 [btrfs]
+  [ 1297.073421]        create_subvol+0x194/0x990 [btrfs]
+  [ 1297.073780]        btrfs_mksubvol+0x3fb/0x4a0 [btrfs]
+  [ 1297.074133]        __btrfs_ioctl_snap_create+0x119/0x1a0 [btrfs]
+  [ 1297.074498]        btrfs_ioctl_snap_create+0x58/0x80 [btrfs]
+  [ 1297.074872]        btrfs_ioctl+0x1a90/0x36f0 [btrfs]
+  [ 1297.075245]        __x64_sys_ioctl+0x83/0xb0
+  [ 1297.075617]        do_syscall_64+0x33/0x80
+  [ 1297.075993]        entry_SYSCALL_64_after_hwframe+0x44/0xa9
+  [ 1297.076380]
+                -> #0 (sb_internal#2){.+.+}-{0:0}:
+  [ 1297.077166]        check_prev_add+0x91/0xc60
+  [ 1297.077572]        __lock_acquire+0x1740/0x3110
+  [ 1297.077984]        lock_acquire+0xd8/0x490
+  [ 1297.078411]        start_transaction+0x3c5/0x760 [btrfs]
+  [ 1297.078853]        btrfs_quota_enable+0xaf/0xa70 [btrfs]
+  [ 1297.079323]        btrfs_ioctl+0x2c60/0x36f0 [btrfs]
+  [ 1297.079789]        __x64_sys_ioctl+0x83/0xb0
+  [ 1297.080232]        do_syscall_64+0x33/0x80
+  [ 1297.080680]        entry_SYSCALL_64_after_hwframe+0x44/0xa9
+  [ 1297.081139]
+                other info that might help us debug this:
+
+  [ 1297.082536]  Possible unsafe locking scenario:
+
+  [ 1297.083510]        CPU0                    CPU1
+  [ 1297.084005]        ----                    ----
+  [ 1297.084500]   lock(&fs_info->qgroup_ioctl_lock);
+  [ 1297.084994]                                lock(sb_internal#2);
+  [ 1297.085485]                                lock(&fs_info->qgroup_ioctl_lock);
+  [ 1297.085974]   lock(sb_internal#2);
+  [ 1297.086454]
+                 *** DEADLOCK ***
+  [ 1297.087880] 3 locks held by btrfs/189080:
+  [ 1297.088324]  #0: ffff9f2725731470 (sb_writers#14){.+.+}-{0:0}, at: btrfs_ioctl+0xa73/0x36f0 [btrfs]
+  [ 1297.088799]  #1: ffff9f2702b60cc0 (&fs_info->subvol_sem){++++}-{3:3}, at: btrfs_ioctl+0x1f4d/0x36f0 [btrfs]
+  [ 1297.089284]  #2: ffff9f2702b61a08 (&fs_info->qgroup_ioctl_lock){+.+.}-{3:3}, at: btrfs_quota_enable+0x3b/0xa70 [btrfs]
+  [ 1297.089771]
+                stack backtrace:
+  [ 1297.090662] CPU: 5 PID: 189080 Comm: btrfs Not tainted 5.10.0-rc4-btrfs-next-73 #1
+  [ 1297.091132] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014
+  [ 1297.092123] Call Trace:
+  [ 1297.092629]  dump_stack+0x8d/0xb5
+  [ 1297.093115]  check_noncircular+0xff/0x110
+  [ 1297.093596]  check_prev_add+0x91/0xc60
+  [ 1297.094076]  ? kvm_clock_read+0x14/0x30
+  [ 1297.094553]  ? kvm_sched_clock_read+0x5/0x10
+  [ 1297.095029]  __lock_acquire+0x1740/0x3110
+  [ 1297.095510]  lock_acquire+0xd8/0x490
+  [ 1297.095993]  ? btrfs_quota_enable+0xaf/0xa70 [btrfs]
+  [ 1297.096476]  start_transaction+0x3c5/0x760 [btrfs]
+  [ 1297.096962]  ? btrfs_quota_enable+0xaf/0xa70 [btrfs]
+  [ 1297.097451]  btrfs_quota_enable+0xaf/0xa70 [btrfs]
+  [ 1297.097941]  ? btrfs_ioctl+0x1f4d/0x36f0 [btrfs]
+  [ 1297.098429]  btrfs_ioctl+0x2c60/0x36f0 [btrfs]
+  [ 1297.098904]  ? do_user_addr_fault+0x20c/0x430
+  [ 1297.099382]  ? kvm_clock_read+0x14/0x30
+  [ 1297.099854]  ? kvm_sched_clock_read+0x5/0x10
+  [ 1297.100328]  ? sched_clock+0x5/0x10
+  [ 1297.100801]  ? sched_clock_cpu+0x12/0x180
+  [ 1297.101272]  ? __x64_sys_ioctl+0x83/0xb0
+  [ 1297.101739]  __x64_sys_ioctl+0x83/0xb0
+  [ 1297.102207]  do_syscall_64+0x33/0x80
+  [ 1297.102673]  entry_SYSCALL_64_after_hwframe+0x44/0xa9
+  [ 1297.103148] RIP: 0033:0x7f773ff65d87
+
+This is because during the quota enable ioctl we lock first the mutex
+qgroup_ioctl_lock and then start a transaction, and starting a transaction
+acquires a fs freeze semaphore (at the VFS level). However, every other
+code path, except for the quota disable ioctl path, we do the opposite:
+we start a transaction and then lock the mutex.
+
+So fix this by making the quota enable and disable paths to start the
+transaction without having the mutex locked, and then, after starting the
+transaction, lock the mutex and check if some other task already enabled
+or disabled the quotas, bailing with success if that was the case.
+
+Signed-off-by: Filipe Manana <fdmanana@suse.com>
+Reviewed-by: David Sterba <dsterba@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Anand Jain <anand.jain@oracle.com>
+
+ Conflicts:
+       fs/btrfs/qgroup.c
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/btrfs/ctree.h  |    5 +++-
+ fs/btrfs/qgroup.c |   56 ++++++++++++++++++++++++++++++++++++++++++++++--------
+ 2 files changed, 52 insertions(+), 9 deletions(-)
+
+--- a/fs/btrfs/ctree.h
++++ b/fs/btrfs/ctree.h
+@@ -827,7 +827,10 @@ struct btrfs_fs_info {
+        */
+       struct ulist *qgroup_ulist;
+-      /* protect user change for quota operations */
++      /*
++       * Protect user change for quota operations. If a transaction is needed,
++       * it must be started before locking this lock.
++       */
+       struct mutex qgroup_ioctl_lock;
+       /* list of dirty qgroups to be written at next commit */
+--- a/fs/btrfs/qgroup.c
++++ b/fs/btrfs/qgroup.c
+@@ -886,6 +886,7 @@ int btrfs_quota_enable(struct btrfs_fs_i
+       struct btrfs_key found_key;
+       struct btrfs_qgroup *qgroup = NULL;
+       struct btrfs_trans_handle *trans = NULL;
++      struct ulist *ulist = NULL;
+       int ret = 0;
+       int slot;
+@@ -893,13 +894,28 @@ int btrfs_quota_enable(struct btrfs_fs_i
+       if (fs_info->quota_root)
+               goto out;
+-      fs_info->qgroup_ulist = ulist_alloc(GFP_KERNEL);
+-      if (!fs_info->qgroup_ulist) {
++      ulist = ulist_alloc(GFP_KERNEL);
++      if (!ulist) {
+               ret = -ENOMEM;
+               goto out;
+       }
+       /*
++       * Unlock qgroup_ioctl_lock before starting the transaction. This is to
++       * avoid lock acquisition inversion problems (reported by lockdep) between
++       * qgroup_ioctl_lock and the vfs freeze semaphores, acquired when we
++       * start a transaction.
++       * After we started the transaction lock qgroup_ioctl_lock again and
++       * check if someone else created the quota root in the meanwhile. If so,
++       * just return success and release the transaction handle.
++       *
++       * Also we don't need to worry about someone else calling
++       * btrfs_sysfs_add_qgroups() after we unlock and getting an error because
++       * that function returns 0 (success) when the sysfs entries already exist.
++       */
++      mutex_unlock(&fs_info->qgroup_ioctl_lock);
++
++      /*
+        * 1 for quota root item
+        * 1 for BTRFS_QGROUP_STATUS item
+        *
+@@ -908,12 +924,20 @@ int btrfs_quota_enable(struct btrfs_fs_i
+        * would be a lot of overkill.
+        */
+       trans = btrfs_start_transaction(tree_root, 2);
++
++      mutex_lock(&fs_info->qgroup_ioctl_lock);
+       if (IS_ERR(trans)) {
+               ret = PTR_ERR(trans);
+               trans = NULL;
+               goto out;
+       }
++      if (fs_info->quota_root)
++              goto out;
++
++      fs_info->qgroup_ulist = ulist;
++      ulist = NULL;
++
+       /*
+        * initially create the quota tree
+        */
+@@ -1046,10 +1070,13 @@ out:
+       if (ret) {
+               ulist_free(fs_info->qgroup_ulist);
+               fs_info->qgroup_ulist = NULL;
+-              if (trans)
+-                      btrfs_end_transaction(trans);
+       }
+       mutex_unlock(&fs_info->qgroup_ioctl_lock);
++      if (ret && trans)
++              btrfs_end_transaction(trans);
++      else if (trans)
++              ret = btrfs_end_transaction(trans);
++      ulist_free(ulist);
+       return ret;
+ }
+@@ -1062,19 +1089,29 @@ int btrfs_quota_disable(struct btrfs_fs_
+       mutex_lock(&fs_info->qgroup_ioctl_lock);
+       if (!fs_info->quota_root)
+               goto out;
++      mutex_unlock(&fs_info->qgroup_ioctl_lock);
+       /*
+        * 1 For the root item
+        *
+        * We should also reserve enough items for the quota tree deletion in
+        * btrfs_clean_quota_tree but this is not done.
++       *
++       * Also, we must always start a transaction without holding the mutex
++       * qgroup_ioctl_lock, see btrfs_quota_enable().
+        */
+       trans = btrfs_start_transaction(fs_info->tree_root, 1);
++
++      mutex_lock(&fs_info->qgroup_ioctl_lock);
+       if (IS_ERR(trans)) {
+               ret = PTR_ERR(trans);
++              trans = NULL;
+               goto out;
+       }
++      if (!fs_info->quota_root)
++              goto out;
++
+       clear_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
+       btrfs_qgroup_wait_for_completion(fs_info, false);
+       spin_lock(&fs_info->qgroup_lock);
+@@ -1088,13 +1125,13 @@ int btrfs_quota_disable(struct btrfs_fs_
+       ret = btrfs_clean_quota_tree(trans, quota_root);
+       if (ret) {
+               btrfs_abort_transaction(trans, ret);
+-              goto end_trans;
++              goto out;
+       }
+       ret = btrfs_del_root(trans, &quota_root->root_key);
+       if (ret) {
+               btrfs_abort_transaction(trans, ret);
+-              goto end_trans;
++              goto out;
+       }
+       list_del(&quota_root->dirty_list);
+@@ -1108,10 +1145,13 @@ int btrfs_quota_disable(struct btrfs_fs_
+       free_extent_buffer(quota_root->commit_root);
+       kfree(quota_root);
+-end_trans:
+-      ret = btrfs_end_transaction(trans);
+ out:
+       mutex_unlock(&fs_info->qgroup_ioctl_lock);
++      if (ret && trans)
++              btrfs_end_transaction(trans);
++      else if (trans)
++              ret = btrfs_end_transaction(trans);
++
+       return ret;
+ }
diff --git a/queue-5.4/btrfs-make-btrfs_qgroup_reserve_data-take-btrfs_inode.patch b/queue-5.4/btrfs-make-btrfs_qgroup_reserve_data-take-btrfs_inode.patch
new file mode 100644 (file)
index 0000000..d7a6bab
--- /dev/null
@@ -0,0 +1,112 @@
+From foo@baz Fri Aug 13 12:16:18 PM CEST 2021
+From: Anand Jain <anand.jain@oracle.com>
+Date: Fri, 13 Aug 2021 17:55:25 +0800
+Subject: btrfs: make btrfs_qgroup_reserve_data take btrfs_inode
+To: linux-kernel@vger.kernel.org, stable@vger.kernel.org
+Cc: linux-btrfs@vger.kernel.org, Nikolay Borisov <nborisov@suse.com>, David Sterba <dsterba@suse.com>, Anand Jain <anand.jain@oracle.com>
+Message-ID: <7df1756cf877dc6c51330a5793008453a1ee730a.1628845854.git.anand.jain@oracle.com>
+
+From: Nikolay Borisov <nborisov@suse.com>
+
+commit 7661a3e033ab782366e0e1f4b6aad0df3555fcbd upstream
+
+There's only a single use of vfs_inode in a tracepoint so let's take
+btrfs_inode directly.
+
+Signed-off-by: Nikolay Borisov <nborisov@suse.com>
+Reviewed-by: David Sterba <dsterba@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Anand Jain <anand.jain@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/btrfs/delalloc-space.c |    2 +-
+ fs/btrfs/file.c           |    7 ++++---
+ fs/btrfs/qgroup.c         |   10 +++++-----
+ fs/btrfs/qgroup.h         |    2 +-
+ 4 files changed, 11 insertions(+), 10 deletions(-)
+
+--- a/fs/btrfs/delalloc-space.c
++++ b/fs/btrfs/delalloc-space.c
+@@ -151,7 +151,7 @@ int btrfs_check_data_free_space(struct i
+               return ret;
+       /* Use new btrfs_qgroup_reserve_data to reserve precious data space. */
+-      ret = btrfs_qgroup_reserve_data(inode, reserved, start, len);
++      ret = btrfs_qgroup_reserve_data(BTRFS_I(inode), reserved, start, len);
+       if (ret < 0)
+               btrfs_free_reserved_data_space_noquota(inode, start, len);
+       else
+--- a/fs/btrfs/file.c
++++ b/fs/btrfs/file.c
+@@ -3149,7 +3149,7 @@ reserve_space:
+                                                 &cached_state);
+               if (ret)
+                       goto out;
+-              ret = btrfs_qgroup_reserve_data(inode, &data_reserved,
++              ret = btrfs_qgroup_reserve_data(BTRFS_I(inode), &data_reserved,
+                                               alloc_start, bytes_to_reserve);
+               if (ret) {
+                       unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
+@@ -3322,8 +3322,9 @@ static long btrfs_fallocate(struct file
+                               free_extent_map(em);
+                               break;
+                       }
+-                      ret = btrfs_qgroup_reserve_data(inode, &data_reserved,
+-                                      cur_offset, last_byte - cur_offset);
++                      ret = btrfs_qgroup_reserve_data(BTRFS_I(inode),
++                                      &data_reserved, cur_offset,
++                                      last_byte - cur_offset);
+                       if (ret < 0) {
+                               cur_offset = last_byte;
+                               free_extent_map(em);
+--- a/fs/btrfs/qgroup.c
++++ b/fs/btrfs/qgroup.c
+@@ -3425,11 +3425,11 @@ btrfs_qgroup_rescan_resume(struct btrfs_
+  *       same @reserved, caller must ensure when error happens it's OK
+  *       to free *ALL* reserved space.
+  */
+-int btrfs_qgroup_reserve_data(struct inode *inode,
++int btrfs_qgroup_reserve_data(struct btrfs_inode *inode,
+                       struct extent_changeset **reserved_ret, u64 start,
+                       u64 len)
+ {
+-      struct btrfs_root *root = BTRFS_I(inode)->root;
++      struct btrfs_root *root = inode->root;
+       struct ulist_node *unode;
+       struct ulist_iterator uiter;
+       struct extent_changeset *reserved;
+@@ -3452,12 +3452,12 @@ int btrfs_qgroup_reserve_data(struct ino
+       reserved = *reserved_ret;
+       /* Record already reserved space */
+       orig_reserved = reserved->bytes_changed;
+-      ret = set_record_extent_bits(&BTRFS_I(inode)->io_tree, start,
++      ret = set_record_extent_bits(&inode->io_tree, start,
+                       start + len -1, EXTENT_QGROUP_RESERVED, reserved);
+       /* Newly reserved space */
+       to_reserve = reserved->bytes_changed - orig_reserved;
+-      trace_btrfs_qgroup_reserve_data(inode, start, len,
++      trace_btrfs_qgroup_reserve_data(&inode->vfs_inode, start, len,
+                                       to_reserve, QGROUP_RESERVE);
+       if (ret < 0)
+               goto cleanup;
+@@ -3471,7 +3471,7 @@ cleanup:
+       /* cleanup *ALL* already reserved ranges */
+       ULIST_ITER_INIT(&uiter);
+       while ((unode = ulist_next(&reserved->range_changed, &uiter)))
+-              clear_extent_bit(&BTRFS_I(inode)->io_tree, unode->val,
++              clear_extent_bit(&inode->io_tree, unode->val,
+                                unode->aux, EXTENT_QGROUP_RESERVED, 0, 0, NULL);
+       /* Also free data bytes of already reserved one */
+       btrfs_qgroup_free_refroot(root->fs_info, root->root_key.objectid,
+--- a/fs/btrfs/qgroup.h
++++ b/fs/btrfs/qgroup.h
+@@ -344,7 +344,7 @@ int btrfs_verify_qgroup_counts(struct bt
+ #endif
+ /* New io_tree based accurate qgroup reserve API */
+-int btrfs_qgroup_reserve_data(struct inode *inode,
++int btrfs_qgroup_reserve_data(struct btrfs_inode *inode,
+                       struct extent_changeset **reserved, u64 start, u64 len);
+ int btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len);
+ int btrfs_qgroup_free_data(struct inode *inode,
diff --git a/queue-5.4/btrfs-make-qgroup_free_reserved_data-take-btrfs_inode.patch b/queue-5.4/btrfs-make-qgroup_free_reserved_data-take-btrfs_inode.patch
new file mode 100644 (file)
index 0000000..e3bdbd7
--- /dev/null
@@ -0,0 +1,59 @@
+From foo@baz Fri Aug 13 12:16:18 PM CEST 2021
+From: Anand Jain <anand.jain@oracle.com>
+Date: Fri, 13 Aug 2021 17:55:24 +0800
+Subject: btrfs: make qgroup_free_reserved_data take btrfs_inode
+To: linux-kernel@vger.kernel.org, stable@vger.kernel.org
+Cc: linux-btrfs@vger.kernel.org, Nikolay Borisov <nborisov@suse.com>, David Sterba <dsterba@suse.com>, Anand Jain <anand.jain@oracle.com>
+Message-ID: <8a1f31bf0f35535bb38d906432d78a7de7fdff2c.1628845854.git.anand.jain@oracle.com>
+
+From: Nikolay Borisov <nborisov@suse.com>
+
+commit df2cfd131fd33dbef1ce33be8b332b1f3d645f35 upstream
+
+It only uses btrfs_inode so can just as easily take it as an argument.
+
+Signed-off-by: Nikolay Borisov <nborisov@suse.com>
+Reviewed-by: David Sterba <dsterba@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Anand Jain <anand.jain@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/btrfs/qgroup.c |   11 ++++++-----
+ 1 file changed, 6 insertions(+), 5 deletions(-)
+
+--- a/fs/btrfs/qgroup.c
++++ b/fs/btrfs/qgroup.c
+@@ -3481,10 +3481,10 @@ cleanup:
+ }
+ /* Free ranges specified by @reserved, normally in error path */
+-static int qgroup_free_reserved_data(struct inode *inode,
++static int qgroup_free_reserved_data(struct btrfs_inode *inode,
+                       struct extent_changeset *reserved, u64 start, u64 len)
+ {
+-      struct btrfs_root *root = BTRFS_I(inode)->root;
++      struct btrfs_root *root = inode->root;
+       struct ulist_node *unode;
+       struct ulist_iterator uiter;
+       struct extent_changeset changeset;
+@@ -3520,8 +3520,8 @@ static int qgroup_free_reserved_data(str
+                * EXTENT_QGROUP_RESERVED, we won't double free.
+                * So not need to rush.
+                */
+-              ret = clear_record_extent_bits(&BTRFS_I(inode)->io_tree,
+-                              free_start, free_start + free_len - 1,
++              ret = clear_record_extent_bits(&inode->io_tree, free_start,
++                              free_start + free_len - 1,
+                               EXTENT_QGROUP_RESERVED, &changeset);
+               if (ret < 0)
+                       goto out;
+@@ -3550,7 +3550,8 @@ static int __btrfs_qgroup_release_data(s
+       /* In release case, we shouldn't have @reserved */
+       WARN_ON(!free && reserved);
+       if (free && reserved)
+-              return qgroup_free_reserved_data(inode, reserved, start, len);
++              return qgroup_free_reserved_data(BTRFS_I(inode), reserved,
++                                               start, len);
+       extent_changeset_init(&changeset);
+       ret = clear_record_extent_bits(&BTRFS_I(inode)->io_tree, start, 
+                       start + len -1, EXTENT_QGROUP_RESERVED, &changeset);
diff --git a/queue-5.4/btrfs-qgroup-allow-to-unreserve-range-without-releasing-other-ranges.patch b/queue-5.4/btrfs-qgroup-allow-to-unreserve-range-without-releasing-other-ranges.patch
new file mode 100644 (file)
index 0000000..d9f7fdc
--- /dev/null
@@ -0,0 +1,199 @@
+From foo@baz Fri Aug 13 12:16:18 PM CEST 2021
+From: Anand Jain <anand.jain@oracle.com>
+Date: Fri, 13 Aug 2021 17:55:26 +0800
+Subject: btrfs: qgroup: allow to unreserve range without releasing other ranges
+To: linux-kernel@vger.kernel.org, stable@vger.kernel.org
+Cc: linux-btrfs@vger.kernel.org, Qu Wenruo <wqu@suse.com>, Josef Bacik <josef@toxicpanda.com>, David Sterba <dsterba@suse.com>, Anand Jain <anand.jain@oracle.com>
+Message-ID: <d0ba354d6d4e08af313127e0acd44143125b3372.1628845854.git.anand.jain@oracle.com>
+
+From: Qu Wenruo <wqu@suse.com>
+
+commit 263da812e87bac4098a4778efaa32c54275641db upstream
+
+[PROBLEM]
+Before this patch, when btrfs_qgroup_reserve_data() fails, we free all
+reserved space of the changeset.
+
+For example:
+       ret = btrfs_qgroup_reserve_data(inode, changeset, 0, SZ_1M);
+       ret = btrfs_qgroup_reserve_data(inode, changeset, SZ_1M, SZ_1M);
+       ret = btrfs_qgroup_reserve_data(inode, changeset, SZ_2M, SZ_1M);
+
+If the last btrfs_qgroup_reserve_data() failed, it will release the
+entire [0, 3M) range.
+
+This behavior is kind of OK for now, as when we hit -EDQUOT, we normally
+go error handling and need to release all reserved ranges anyway.
+
+But this also means the following call is not possible:
+
+       ret = btrfs_qgroup_reserve_data();
+       if (ret == -EDQUOT) {
+               /* Do something to free some qgroup space */
+               ret = btrfs_qgroup_reserve_data();
+       }
+
+As if the first btrfs_qgroup_reserve_data() fails, it will free all
+reserved qgroup space.
+
+[CAUSE]
+This is because we release all reserved ranges when
+btrfs_qgroup_reserve_data() fails.
+
+[FIX]
+This patch will implement a new function, qgroup_unreserve_range(), to
+iterate through the ulist nodes, to find any nodes in the failure range,
+and remove the EXTENT_QGROUP_RESERVED bits from the io_tree, and
+decrease the extent_changeset::bytes_changed, so that we can revert to
+previous state.
+
+This allows later patches to retry btrfs_qgroup_reserve_data() if EDQUOT
+happens.
+
+Suggested-by: Josef Bacik <josef@toxicpanda.com>
+Reviewed-by: Josef Bacik <josef@toxicpanda.com>
+Signed-off-by: Qu Wenruo <wqu@suse.com>
+Reviewed-by: David Sterba <dsterba@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Anand Jain <anand.jain@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/btrfs/qgroup.c |   92 +++++++++++++++++++++++++++++++++++++++++++++---------
+ 1 file changed, 77 insertions(+), 15 deletions(-)
+
+--- a/fs/btrfs/qgroup.c
++++ b/fs/btrfs/qgroup.c
+@@ -3411,6 +3411,73 @@ btrfs_qgroup_rescan_resume(struct btrfs_
+       }
+ }
++#define rbtree_iterate_from_safe(node, next, start)                           \
++       for (node = start; node && ({ next = rb_next(node); 1;}); node = next)
++
++static int qgroup_unreserve_range(struct btrfs_inode *inode,
++                                struct extent_changeset *reserved, u64 start,
++                                u64 len)
++{
++      struct rb_node *node;
++      struct rb_node *next;
++      struct ulist_node *entry = NULL;
++      int ret = 0;
++
++      node = reserved->range_changed.root.rb_node;
++      while (node) {
++              entry = rb_entry(node, struct ulist_node, rb_node);
++              if (entry->val < start)
++                      node = node->rb_right;
++              else if (entry)
++                      node = node->rb_left;
++              else
++                      break;
++      }
++
++      /* Empty changeset */
++      if (!entry)
++              return 0;
++
++      if (entry->val > start && rb_prev(&entry->rb_node))
++              entry = rb_entry(rb_prev(&entry->rb_node), struct ulist_node,
++                               rb_node);
++
++      rbtree_iterate_from_safe(node, next, &entry->rb_node) {
++              u64 entry_start;
++              u64 entry_end;
++              u64 entry_len;
++              int clear_ret;
++
++              entry = rb_entry(node, struct ulist_node, rb_node);
++              entry_start = entry->val;
++              entry_end = entry->aux;
++              entry_len = entry_end - entry_start + 1;
++
++              if (entry_start >= start + len)
++                      break;
++              if (entry_start + entry_len <= start)
++                      continue;
++              /*
++               * Now the entry is in [start, start + len), revert the
++               * EXTENT_QGROUP_RESERVED bit.
++               */
++              clear_ret = clear_extent_bits(&inode->io_tree, entry_start,
++                                            entry_end, EXTENT_QGROUP_RESERVED);
++              if (!ret && clear_ret < 0)
++                      ret = clear_ret;
++
++              ulist_del(&reserved->range_changed, entry->val, entry->aux);
++              if (likely(reserved->bytes_changed >= entry_len)) {
++                      reserved->bytes_changed -= entry_len;
++              } else {
++                      WARN_ON(1);
++                      reserved->bytes_changed = 0;
++              }
++      }
++
++      return ret;
++}
++
+ /*
+  * Reserve qgroup space for range [start, start + len).
+  *
+@@ -3421,18 +3488,14 @@ btrfs_qgroup_rescan_resume(struct btrfs_
+  * Return <0 for error (including -EQUOT)
+  *
+  * NOTE: this function may sleep for memory allocation.
+- *       if btrfs_qgroup_reserve_data() is called multiple times with
+- *       same @reserved, caller must ensure when error happens it's OK
+- *       to free *ALL* reserved space.
+  */
+ int btrfs_qgroup_reserve_data(struct btrfs_inode *inode,
+                       struct extent_changeset **reserved_ret, u64 start,
+                       u64 len)
+ {
+       struct btrfs_root *root = inode->root;
+-      struct ulist_node *unode;
+-      struct ulist_iterator uiter;
+       struct extent_changeset *reserved;
++      bool new_reserved = false;
+       u64 orig_reserved;
+       u64 to_reserve;
+       int ret;
+@@ -3445,6 +3508,7 @@ int btrfs_qgroup_reserve_data(struct btr
+       if (WARN_ON(!reserved_ret))
+               return -EINVAL;
+       if (!*reserved_ret) {
++              new_reserved = true;
+               *reserved_ret = extent_changeset_alloc();
+               if (!*reserved_ret)
+                       return -ENOMEM;
+@@ -3460,7 +3524,7 @@ int btrfs_qgroup_reserve_data(struct btr
+       trace_btrfs_qgroup_reserve_data(&inode->vfs_inode, start, len,
+                                       to_reserve, QGROUP_RESERVE);
+       if (ret < 0)
+-              goto cleanup;
++              goto out;
+       ret = qgroup_reserve(root, to_reserve, true, BTRFS_QGROUP_RSV_DATA);
+       if (ret < 0)
+               goto cleanup;
+@@ -3468,15 +3532,13 @@ int btrfs_qgroup_reserve_data(struct btr
+       return ret;
+ cleanup:
+-      /* cleanup *ALL* already reserved ranges */
+-      ULIST_ITER_INIT(&uiter);
+-      while ((unode = ulist_next(&reserved->range_changed, &uiter)))
+-              clear_extent_bit(&inode->io_tree, unode->val,
+-                               unode->aux, EXTENT_QGROUP_RESERVED, 0, 0, NULL);
+-      /* Also free data bytes of already reserved one */
+-      btrfs_qgroup_free_refroot(root->fs_info, root->root_key.objectid,
+-                                orig_reserved, BTRFS_QGROUP_RSV_DATA);
+-      extent_changeset_release(reserved);
++      qgroup_unreserve_range(inode, reserved, start, len);
++out:
++      if (new_reserved) {
++              extent_changeset_release(reserved);
++              kfree(reserved);
++              *reserved_ret = NULL;
++      }
+       return ret;
+ }
diff --git a/queue-5.4/btrfs-qgroup-remove-async_commit-mechanism-in-favor-of-reserve-retry-after-edquot.patch b/queue-5.4/btrfs-qgroup-remove-async_commit-mechanism-in-favor-of-reserve-retry-after-edquot.patch
new file mode 100644 (file)
index 0000000..07b6a74
--- /dev/null
@@ -0,0 +1,171 @@
+From foo@baz Fri Aug 13 12:17:11 PM CEST 2021
+From: Anand Jain <anand.jain@oracle.com>
+Date: Fri, 13 Aug 2021 17:55:29 +0800
+Subject: btrfs: qgroup: remove ASYNC_COMMIT mechanism in favor of reserve retry-after-EDQUOT
+To: linux-kernel@vger.kernel.org, stable@vger.kernel.org
+Cc: linux-btrfs@vger.kernel.org, Qu Wenruo <wqu@suse.com>, Josef Bacik <josef@toxicpanda.com>, David Sterba <dsterba@suse.com>, Anand Jain <anand.jain@oracle.com>
+Message-ID: <f06f7e9af9501ac1b737c1152ad509b00913c6e1.1628845854.git.anand.jain@oracle.com>
+
+From: Qu Wenruo <wqu@suse.com>
+
+commit adca4d945c8dca28a85df45c5b117e6dac2e77f1 upstream
+
+commit a514d63882c3 ("btrfs: qgroup: Commit transaction in advance to
+reduce early EDQUOT") tries to reduce the early EDQUOT problems by
+checking the qgroup free against threshold and tries to wake up commit
+kthread to free some space.
+
+The problem of that mechanism is, it can only free qgroup per-trans
+metadata space, can't do anything to data, nor prealloc qgroup space.
+
+Now since we have the ability to flush qgroup space, and implemented
+retry-after-EDQUOT behavior, such mechanism can be completely replaced.
+
+So this patch will cleanup such mechanism in favor of
+retry-after-EDQUOT.
+
+Reviewed-by: Josef Bacik <josef@toxicpanda.com>
+Signed-off-by: Qu Wenruo <wqu@suse.com>
+Reviewed-by: David Sterba <dsterba@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Anand Jain <anand.jain@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/btrfs/ctree.h       |    5 -----
+ fs/btrfs/disk-io.c     |    1 -
+ fs/btrfs/qgroup.c      |   43 ++-----------------------------------------
+ fs/btrfs/transaction.c |    1 -
+ fs/btrfs/transaction.h |   14 --------------
+ 5 files changed, 2 insertions(+), 62 deletions(-)
+
+--- a/fs/btrfs/ctree.h
++++ b/fs/btrfs/ctree.h
+@@ -505,11 +505,6 @@ enum {
+        */
+       BTRFS_FS_EXCL_OP,
+       /*
+-       * To info transaction_kthread we need an immediate commit so it
+-       * doesn't need to wait for commit_interval
+-       */
+-      BTRFS_FS_NEED_ASYNC_COMMIT,
+-      /*
+        * Indicate that balance has been set up from the ioctl and is in the
+        * main phase. The fs_info::balance_ctl is initialized.
+        * Set and cleared while holding fs_info::balance_mutex.
+--- a/fs/btrfs/disk-io.c
++++ b/fs/btrfs/disk-io.c
+@@ -1749,7 +1749,6 @@ static int transaction_kthread(void *arg
+               now = ktime_get_seconds();
+               if (cur->state < TRANS_STATE_COMMIT_START &&
+-                  !test_bit(BTRFS_FS_NEED_ASYNC_COMMIT, &fs_info->flags) &&
+                   (now < cur->start_time ||
+                    now - cur->start_time < fs_info->commit_interval)) {
+                       spin_unlock(&fs_info->trans_lock);
+--- a/fs/btrfs/qgroup.c
++++ b/fs/btrfs/qgroup.c
+@@ -11,7 +11,6 @@
+ #include <linux/slab.h>
+ #include <linux/workqueue.h>
+ #include <linux/btrfs.h>
+-#include <linux/sizes.h>
+ #include "ctree.h"
+ #include "transaction.h"
+@@ -2840,20 +2839,8 @@ out:
+       return ret;
+ }
+-/*
+- * Two limits to commit transaction in advance.
+- *
+- * For RATIO, it will be 1/RATIO of the remaining limit as threshold.
+- * For SIZE, it will be in byte unit as threshold.
+- */
+-#define QGROUP_FREE_RATIO             32
+-#define QGROUP_FREE_SIZE              SZ_32M
+-static bool qgroup_check_limits(struct btrfs_fs_info *fs_info,
+-                              const struct btrfs_qgroup *qg, u64 num_bytes)
++static bool qgroup_check_limits(const struct btrfs_qgroup *qg, u64 num_bytes)
+ {
+-      u64 free;
+-      u64 threshold;
+-
+       if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_RFER) &&
+           qgroup_rsv_total(qg) + (s64)qg->rfer + num_bytes > qg->max_rfer)
+               return false;
+@@ -2862,32 +2849,6 @@ static bool qgroup_check_limits(struct b
+           qgroup_rsv_total(qg) + (s64)qg->excl + num_bytes > qg->max_excl)
+               return false;
+-      /*
+-       * Even if we passed the check, it's better to check if reservation
+-       * for meta_pertrans is pushing us near limit.
+-       * If there is too much pertrans reservation or it's near the limit,
+-       * let's try commit transaction to free some, using transaction_kthread
+-       */
+-      if ((qg->lim_flags & (BTRFS_QGROUP_LIMIT_MAX_RFER |
+-                            BTRFS_QGROUP_LIMIT_MAX_EXCL))) {
+-              if (qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_EXCL) {
+-                      free = qg->max_excl - qgroup_rsv_total(qg) - qg->excl;
+-                      threshold = min_t(u64, qg->max_excl / QGROUP_FREE_RATIO,
+-                                        QGROUP_FREE_SIZE);
+-              } else {
+-                      free = qg->max_rfer - qgroup_rsv_total(qg) - qg->rfer;
+-                      threshold = min_t(u64, qg->max_rfer / QGROUP_FREE_RATIO,
+-                                        QGROUP_FREE_SIZE);
+-              }
+-
+-              /*
+-               * Use transaction_kthread to commit transaction, so we no
+-               * longer need to bother nested transaction nor lock context.
+-               */
+-              if (free < threshold)
+-                      btrfs_commit_transaction_locksafe(fs_info);
+-      }
+-
+       return true;
+ }
+@@ -2937,7 +2898,7 @@ static int qgroup_reserve(struct btrfs_r
+               qg = unode_aux_to_qgroup(unode);
+-              if (enforce && !qgroup_check_limits(fs_info, qg, num_bytes)) {
++              if (enforce && !qgroup_check_limits(qg, num_bytes)) {
+                       ret = -EDQUOT;
+                       goto out;
+               }
+--- a/fs/btrfs/transaction.c
++++ b/fs/btrfs/transaction.c
+@@ -2297,7 +2297,6 @@ int btrfs_commit_transaction(struct btrf
+        */
+       cur_trans->state = TRANS_STATE_COMPLETED;
+       wake_up(&cur_trans->commit_wait);
+-      clear_bit(BTRFS_FS_NEED_ASYNC_COMMIT, &fs_info->flags);
+       spin_lock(&fs_info->trans_lock);
+       list_del_init(&cur_trans->list);
+--- a/fs/btrfs/transaction.h
++++ b/fs/btrfs/transaction.h
+@@ -207,20 +207,6 @@ int btrfs_clean_one_deleted_snapshot(str
+ int btrfs_commit_transaction(struct btrfs_trans_handle *trans);
+ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans,
+                                  int wait_for_unblock);
+-
+-/*
+- * Try to commit transaction asynchronously, so this is safe to call
+- * even holding a spinlock.
+- *
+- * It's done by informing transaction_kthread to commit transaction without
+- * waiting for commit interval.
+- */
+-static inline void btrfs_commit_transaction_locksafe(
+-              struct btrfs_fs_info *fs_info)
+-{
+-      set_bit(BTRFS_FS_NEED_ASYNC_COMMIT, &fs_info->flags);
+-      wake_up_process(fs_info->transaction_kthread);
+-}
+ int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans);
+ int btrfs_should_end_transaction(struct btrfs_trans_handle *trans);
+ void btrfs_throttle(struct btrfs_fs_info *fs_info);
diff --git a/queue-5.4/btrfs-qgroup-try-to-flush-qgroup-space-when-we-get-edquot.patch b/queue-5.4/btrfs-qgroup-try-to-flush-qgroup-space-when-we-get-edquot.patch
new file mode 100644 (file)
index 0000000..14d0922
--- /dev/null
@@ -0,0 +1,243 @@
+From foo@baz Fri Aug 13 12:17:11 PM CEST 2021
+From: Anand Jain <anand.jain@oracle.com>
+Date: Fri, 13 Aug 2021 17:55:27 +0800
+Subject: btrfs: qgroup: try to flush qgroup space when we get -EDQUOT
+To: linux-kernel@vger.kernel.org, stable@vger.kernel.org
+Cc: linux-btrfs@vger.kernel.org, Qu Wenruo <wqu@suse.com>, Josef Bacik <josef@toxicpanda.com>, David Sterba <dsterba@suse.com>, Anand Jain <anand.jain@oracle.com>
+Message-ID: <740e4978ebebfc08491db3f52264f7b5ba60ed96.1628845854.git.anand.jain@oracle.com>
+
+From: Qu Wenruo <wqu@suse.com>
+
+commit c53e9653605dbf708f5be02902de51831be4b009 upstream
+
+[PROBLEM]
+There are known problem related to how btrfs handles qgroup reserved
+space.  One of the most obvious case is the the test case btrfs/153,
+which do fallocate, then write into the preallocated range.
+
+#  btrfs/153 1s ... - output mismatch (see xfstests-dev/results//btrfs/153.out.bad)
+#      --- tests/btrfs/153.out     2019-10-22 15:18:14.068965341 +0800
+#      +++ xfstests-dev/results//btrfs/153.out.bad      2020-07-01 20:24:40.730000089 +0800
+#      @@ -1,2 +1,5 @@
+#       QA output created by 153
+#      +pwrite: Disk quota exceeded
+#      +/mnt/scratch/testfile2: Disk quota exceeded
+#      +/mnt/scratch/testfile2: Disk quota exceeded
+#       Silence is golden
+#      ...
+#      (Run 'diff -u xfstests-dev/tests/btrfs/153.out xfstests-dev/results//btrfs/153.out.bad'  to see the entire diff)
+
+[CAUSE]
+Since commit c6887cd11149 ("Btrfs: don't do nocow check unless we have to"),
+we always reserve space no matter if it's COW or not.
+
+Such behavior change is mostly for performance, and reverting it is not
+a good idea anyway.
+
+For preallcoated extent, we reserve qgroup data space for it already,
+and since we also reserve data space for qgroup at buffered write time,
+it needs twice the space for us to write into preallocated space.
+
+This leads to the -EDQUOT in buffered write routine.
+
+And we can't follow the same solution, unlike data/meta space check,
+qgroup reserved space is shared between data/metadata.
+The EDQUOT can happen at the metadata reservation, so doing NODATACOW
+check after qgroup reservation failure is not a solution.
+
+[FIX]
+To solve the problem, we don't return -EDQUOT directly, but every time
+we got a -EDQUOT, we try to flush qgroup space:
+
+- Flush all inodes of the root
+  NODATACOW writes will free the qgroup reserved at run_dealloc_range().
+  However we don't have the infrastructure to only flush NODATACOW
+  inodes, here we flush all inodes anyway.
+
+- Wait for ordered extents
+  This would convert the preallocated metadata space into per-trans
+  metadata, which can be freed in later transaction commit.
+
+- Commit transaction
+  This will free all per-trans metadata space.
+
+Also we don't want to trigger flush multiple times, so here we introduce
+a per-root wait list and a new root status, to ensure only one thread
+starts the flushing.
+
+Fixes: c6887cd11149 ("Btrfs: don't do nocow check unless we have to")
+Reviewed-by: Josef Bacik <josef@toxicpanda.com>
+Signed-off-by: Qu Wenruo <wqu@suse.com>
+Reviewed-by: David Sterba <dsterba@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Anand Jain <anand.jain@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/btrfs/ctree.h   |    3 +
+ fs/btrfs/disk-io.c |    1 
+ fs/btrfs/qgroup.c  |  100 ++++++++++++++++++++++++++++++++++++++++++++++++-----
+ 3 files changed, 96 insertions(+), 8 deletions(-)
+
+--- a/fs/btrfs/ctree.h
++++ b/fs/btrfs/ctree.h
+@@ -945,6 +945,8 @@ enum {
+       BTRFS_ROOT_DEAD_TREE,
+       /* The root has a log tree. Used only for subvolume roots. */
+       BTRFS_ROOT_HAS_LOG_TREE,
++      /* Qgroup flushing is in progress */
++      BTRFS_ROOT_QGROUP_FLUSHING,
+ };
+ /*
+@@ -1097,6 +1099,7 @@ struct btrfs_root {
+       spinlock_t qgroup_meta_rsv_lock;
+       u64 qgroup_meta_rsv_pertrans;
+       u64 qgroup_meta_rsv_prealloc;
++      wait_queue_head_t qgroup_flush_wait;
+       /* Number of active swapfiles */
+       atomic_t nr_swapfiles;
+--- a/fs/btrfs/disk-io.c
++++ b/fs/btrfs/disk-io.c
+@@ -1154,6 +1154,7 @@ static void __setup_root(struct btrfs_ro
+       mutex_init(&root->log_mutex);
+       mutex_init(&root->ordered_extent_mutex);
+       mutex_init(&root->delalloc_mutex);
++      init_waitqueue_head(&root->qgroup_flush_wait);
+       init_waitqueue_head(&root->log_writer_wait);
+       init_waitqueue_head(&root->log_commit_wait[0]);
+       init_waitqueue_head(&root->log_commit_wait[1]);
+--- a/fs/btrfs/qgroup.c
++++ b/fs/btrfs/qgroup.c
+@@ -3479,17 +3479,58 @@ static int qgroup_unreserve_range(struct
+ }
+ /*
+- * Reserve qgroup space for range [start, start + len).
++ * Try to free some space for qgroup.
+  *
+- * This function will either reserve space from related qgroups or doing
+- * nothing if the range is already reserved.
++ * For qgroup, there are only 3 ways to free qgroup space:
++ * - Flush nodatacow write
++ *   Any nodatacow write will free its reserved data space at run_delalloc_range().
++ *   In theory, we should only flush nodatacow inodes, but it's not yet
++ *   possible, so we need to flush the whole root.
+  *
+- * Return 0 for successful reserve
+- * Return <0 for error (including -EQUOT)
++ * - Wait for ordered extents
++ *   When ordered extents are finished, their reserved metadata is finally
++ *   converted to per_trans status, which can be freed by later commit
++ *   transaction.
+  *
+- * NOTE: this function may sleep for memory allocation.
++ * - Commit transaction
++ *   This would free the meta_per_trans space.
++ *   In theory this shouldn't provide much space, but any more qgroup space
++ *   is needed.
+  */
+-int btrfs_qgroup_reserve_data(struct btrfs_inode *inode,
++static int try_flush_qgroup(struct btrfs_root *root)
++{
++      struct btrfs_trans_handle *trans;
++      int ret;
++
++      /*
++       * We don't want to run flush again and again, so if there is a running
++       * one, we won't try to start a new flush, but exit directly.
++       */
++      if (test_and_set_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state)) {
++              wait_event(root->qgroup_flush_wait,
++                      !test_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state));
++              return 0;
++      }
++
++      ret = btrfs_start_delalloc_snapshot(root);
++      if (ret < 0)
++              goto out;
++      btrfs_wait_ordered_extents(root, U64_MAX, 0, (u64)-1);
++
++      trans = btrfs_join_transaction(root);
++      if (IS_ERR(trans)) {
++              ret = PTR_ERR(trans);
++              goto out;
++      }
++
++      ret = btrfs_commit_transaction(trans);
++out:
++      clear_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state);
++      wake_up(&root->qgroup_flush_wait);
++      return ret;
++}
++
++static int qgroup_reserve_data(struct btrfs_inode *inode,
+                       struct extent_changeset **reserved_ret, u64 start,
+                       u64 len)
+ {
+@@ -3542,6 +3583,34 @@ out:
+       return ret;
+ }
++/*
++ * Reserve qgroup space for range [start, start + len).
++ *
++ * This function will either reserve space from related qgroups or do nothing
++ * if the range is already reserved.
++ *
++ * Return 0 for successful reservation
++ * Return <0 for error (including -EQUOT)
++ *
++ * NOTE: This function may sleep for memory allocation, dirty page flushing and
++ *     commit transaction. So caller should not hold any dirty page locked.
++ */
++int btrfs_qgroup_reserve_data(struct btrfs_inode *inode,
++                      struct extent_changeset **reserved_ret, u64 start,
++                      u64 len)
++{
++      int ret;
++
++      ret = qgroup_reserve_data(inode, reserved_ret, start, len);
++      if (ret <= 0 && ret != -EDQUOT)
++              return ret;
++
++      ret = try_flush_qgroup(inode->root);
++      if (ret < 0)
++              return ret;
++      return qgroup_reserve_data(inode, reserved_ret, start, len);
++}
++
+ /* Free ranges specified by @reserved, normally in error path */
+ static int qgroup_free_reserved_data(struct btrfs_inode *inode,
+                       struct extent_changeset *reserved, u64 start, u64 len)
+@@ -3712,7 +3781,7 @@ static int sub_root_meta_rsv(struct btrf
+       return num_bytes;
+ }
+-int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
++static int qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
+                               enum btrfs_qgroup_rsv_type type, bool enforce)
+ {
+       struct btrfs_fs_info *fs_info = root->fs_info;
+@@ -3739,6 +3808,21 @@ int __btrfs_qgroup_reserve_meta(struct b
+       return ret;
+ }
++int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
++                              enum btrfs_qgroup_rsv_type type, bool enforce)
++{
++      int ret;
++
++      ret = qgroup_reserve_meta(root, num_bytes, type, enforce);
++      if (ret <= 0 && ret != -EDQUOT)
++              return ret;
++
++      ret = try_flush_qgroup(root);
++      if (ret < 0)
++              return ret;
++      return qgroup_reserve_meta(root, num_bytes, type, enforce);
++}
++
+ void btrfs_qgroup_free_meta_all_pertrans(struct btrfs_root *root)
+ {
+       struct btrfs_fs_info *fs_info = root->fs_info;
diff --git a/queue-5.4/btrfs-transaction-cleanup-unused-trans_state_blocked.patch b/queue-5.4/btrfs-transaction-cleanup-unused-trans_state_blocked.patch
new file mode 100644 (file)
index 0000000..fd44a17
--- /dev/null
@@ -0,0 +1,111 @@
+From foo@baz Fri Aug 13 12:17:11 PM CEST 2021
+From: Anand Jain <anand.jain@oracle.com>
+Date: Fri, 13 Aug 2021 17:55:28 +0800
+Subject: btrfs: transaction: Cleanup unused TRANS_STATE_BLOCKED
+To: linux-kernel@vger.kernel.org, stable@vger.kernel.org
+Cc: linux-btrfs@vger.kernel.org, Qu Wenruo <wqu@suse.com>, David Sterba <dsterba@suse.com>, Anand Jain <anand.jain@oracle.com>
+Message-ID: <7c24989ec90962679cdcf9f6f2ba6fd39fc569ee.1628845854.git.anand.jain@oracle.com>
+
+From: Qu Wenruo <wqu@suse.com>
+
+commit 3296bf562443a8ca35aaad959a76a49e9b412760 upstream
+
+The state was introduced in commit 4a9d8bdee368 ("Btrfs: make the state
+of the transaction more readable"), then in commit 302167c50b32
+("btrfs: don't end the transaction for delayed refs in throttle") the
+state is completely removed.
+
+So we can just clean up the state since it's only compared but never
+set.
+
+Signed-off-by: Qu Wenruo <wqu@suse.com>
+Reviewed-by: David Sterba <dsterba@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Anand Jain <anand.jain@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/btrfs/disk-io.c     |    2 +-
+ fs/btrfs/transaction.c |   15 +++------------
+ fs/btrfs/transaction.h |    1 -
+ 3 files changed, 4 insertions(+), 14 deletions(-)
+
+--- a/fs/btrfs/disk-io.c
++++ b/fs/btrfs/disk-io.c
+@@ -1748,7 +1748,7 @@ static int transaction_kthread(void *arg
+               }
+               now = ktime_get_seconds();
+-              if (cur->state < TRANS_STATE_BLOCKED &&
++              if (cur->state < TRANS_STATE_COMMIT_START &&
+                   !test_bit(BTRFS_FS_NEED_ASYNC_COMMIT, &fs_info->flags) &&
+                   (now < cur->start_time ||
+                    now - cur->start_time < fs_info->commit_interval)) {
+--- a/fs/btrfs/transaction.c
++++ b/fs/btrfs/transaction.c
+@@ -27,7 +27,6 @@
+ static const unsigned int btrfs_blocked_trans_types[TRANS_STATE_MAX] = {
+       [TRANS_STATE_RUNNING]           = 0U,
+-      [TRANS_STATE_BLOCKED]           =  __TRANS_START,
+       [TRANS_STATE_COMMIT_START]      = (__TRANS_START | __TRANS_ATTACH),
+       [TRANS_STATE_COMMIT_DOING]      = (__TRANS_START |
+                                          __TRANS_ATTACH |
+@@ -388,7 +387,7 @@ int btrfs_record_root_in_trans(struct bt
+ static inline int is_transaction_blocked(struct btrfs_transaction *trans)
+ {
+-      return (trans->state >= TRANS_STATE_BLOCKED &&
++      return (trans->state >= TRANS_STATE_COMMIT_START &&
+               trans->state < TRANS_STATE_UNBLOCKED &&
+               !TRANS_ABORTED(trans));
+ }
+@@ -580,7 +579,7 @@ again:
+       INIT_LIST_HEAD(&h->new_bgs);
+       smp_mb();
+-      if (cur_trans->state >= TRANS_STATE_BLOCKED &&
++      if (cur_trans->state >= TRANS_STATE_COMMIT_START &&
+           may_wait_transaction(fs_info, type)) {
+               current->journal_info = h;
+               btrfs_commit_transaction(h);
+@@ -797,7 +796,7 @@ int btrfs_should_end_transaction(struct
+       struct btrfs_transaction *cur_trans = trans->transaction;
+       smp_mb();
+-      if (cur_trans->state >= TRANS_STATE_BLOCKED ||
++      if (cur_trans->state >= TRANS_STATE_COMMIT_START ||
+           cur_trans->delayed_refs.flushing)
+               return 1;
+@@ -830,7 +829,6 @@ static int __btrfs_end_transaction(struc
+ {
+       struct btrfs_fs_info *info = trans->fs_info;
+       struct btrfs_transaction *cur_trans = trans->transaction;
+-      int lock = (trans->type != TRANS_JOIN_NOLOCK);
+       int err = 0;
+       if (refcount_read(&trans->use_count) > 1) {
+@@ -846,13 +844,6 @@ static int __btrfs_end_transaction(struc
+       btrfs_trans_release_chunk_metadata(trans);
+-      if (lock && READ_ONCE(cur_trans->state) == TRANS_STATE_BLOCKED) {
+-              if (throttle)
+-                      return btrfs_commit_transaction(trans);
+-              else
+-                      wake_up_process(info->transaction_kthread);
+-      }
+-
+       if (trans->type & __TRANS_FREEZABLE)
+               sb_end_intwrite(info->sb);
+--- a/fs/btrfs/transaction.h
++++ b/fs/btrfs/transaction.h
+@@ -13,7 +13,6 @@
+ enum btrfs_trans_state {
+       TRANS_STATE_RUNNING,
+-      TRANS_STATE_BLOCKED,
+       TRANS_STATE_COMMIT_START,
+       TRANS_STATE_COMMIT_DOING,
+       TRANS_STATE_UNBLOCKED,
index 139939da9771cebff7099f0e36de80a20eaa06b8..d9862cbf17b7b80f2eeec6d89adb975b814c106b 100644 (file)
@@ -14,3 +14,10 @@ usb-ehci-fix-kunpeng920-ehci-hardware-problem.patch
 alsa-hda-add-quirk-for-asus-flow-x13.patch
 ppp-fix-generating-ppp-unit-id-when-ifname-is-not-specified.patch
 ovl-prevent-private-clone-if-bind-mount-is-not-allowed.patch
+btrfs-make-qgroup_free_reserved_data-take-btrfs_inode.patch
+btrfs-make-btrfs_qgroup_reserve_data-take-btrfs_inode.patch
+btrfs-qgroup-allow-to-unreserve-range-without-releasing-other-ranges.patch
+btrfs-qgroup-try-to-flush-qgroup-space-when-we-get-edquot.patch
+btrfs-transaction-cleanup-unused-trans_state_blocked.patch
+btrfs-qgroup-remove-async_commit-mechanism-in-favor-of-reserve-retry-after-edquot.patch
+btrfs-fix-lockdep-splat-when-enabling-and-disabling-qgroups.patch