]> git.ipfire.org Git - thirdparty/kernel/stable-queue.git/commitdiff
5.15-stable patches
authorGreg Kroah-Hartman <gregkh@linuxfoundation.org>
Tue, 23 Aug 2022 07:21:21 +0000 (09:21 +0200)
committerGreg Kroah-Hartman <gregkh@linuxfoundation.org>
Tue, 23 Aug 2022 07:21:21 +0000 (09:21 +0200)
added patches:
xfs-always-succeed-at-setting-the-reserve-pool-size.patch
xfs-fix-overfilling-of-reserve-pool.patch
xfs-fix-soft-lockup-via-spinning-in-filestream-ag-selection-loop.patch
xfs-flush-inodegc-workqueue-tasks-before-cancel.patch
xfs-reject-crazy-array-sizes-being-fed-to-xfs_ioc_getbmap.patch
xfs-remove-infinite-loop-when-reserving-free-block-pool.patch
xfs-reserve-quota-for-dir-expansion-when-linking-unlinking-files.patch
xfs-reserve-quota-for-target-dir-expansion-when-renaming-files.patch
xfs-revert-xfs-actually-bump-warning-counts-when-we-send-warnings.patch

queue-5.15/series
queue-5.15/xfs-always-succeed-at-setting-the-reserve-pool-size.patch [new file with mode: 0644]
queue-5.15/xfs-fix-overfilling-of-reserve-pool.patch [new file with mode: 0644]
queue-5.15/xfs-fix-soft-lockup-via-spinning-in-filestream-ag-selection-loop.patch [new file with mode: 0644]
queue-5.15/xfs-flush-inodegc-workqueue-tasks-before-cancel.patch [new file with mode: 0644]
queue-5.15/xfs-reject-crazy-array-sizes-being-fed-to-xfs_ioc_getbmap.patch [new file with mode: 0644]
queue-5.15/xfs-remove-infinite-loop-when-reserving-free-block-pool.patch [new file with mode: 0644]
queue-5.15/xfs-reserve-quota-for-dir-expansion-when-linking-unlinking-files.patch [new file with mode: 0644]
queue-5.15/xfs-reserve-quota-for-target-dir-expansion-when-renaming-files.patch [new file with mode: 0644]
queue-5.15/xfs-revert-xfs-actually-bump-warning-counts-when-we-send-warnings.patch [new file with mode: 0644]

index 45b379b27ac7a4ef5f4dd5586c6bdd0e729d0c27..33cabc2bdeab17bd759c85f6db4f5c4b2993e958 100644 (file)
@@ -233,3 +233,12 @@ video-fbdev-i740fb-check-the-argument-of-i740_calc_v.patch
 mips-tlbex-explicitly-compare-_page_no_exec-against-.patch
 can-j1939-j1939_sk_queue_activate_next_locked-replace-warn_on_once-with-netdev_warn_once.patch
 scsi-ufs-ufs-mediatek-fix-build-error-and-type-mismatch.patch
+xfs-flush-inodegc-workqueue-tasks-before-cancel.patch
+xfs-reserve-quota-for-dir-expansion-when-linking-unlinking-files.patch
+xfs-reserve-quota-for-target-dir-expansion-when-renaming-files.patch
+xfs-remove-infinite-loop-when-reserving-free-block-pool.patch
+xfs-always-succeed-at-setting-the-reserve-pool-size.patch
+xfs-fix-overfilling-of-reserve-pool.patch
+xfs-fix-soft-lockup-via-spinning-in-filestream-ag-selection-loop.patch
+xfs-revert-xfs-actually-bump-warning-counts-when-we-send-warnings.patch
+xfs-reject-crazy-array-sizes-being-fed-to-xfs_ioc_getbmap.patch
diff --git a/queue-5.15/xfs-always-succeed-at-setting-the-reserve-pool-size.patch b/queue-5.15/xfs-always-succeed-at-setting-the-reserve-pool-size.patch
new file mode 100644 (file)
index 0000000..9931e3f
--- /dev/null
@@ -0,0 +1,57 @@
+From foo@baz Tue Aug 23 09:20:27 AM CEST 2022
+From: Leah Rumancik <leah.rumancik@gmail.com>
+Date: Fri, 19 Aug 2022 11:14:27 -0700
+Subject: xfs: always succeed at setting the reserve pool size
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, amir73il@gmail.com, "Darrick J. Wong" <djwong@kernel.org>, Dave Chinner <dchinner@redhat.com>, Leah Rumancik <leah.rumancik@gmail.com>
+Message-ID: <20220819181431.4113819-6-leah.rumancik@gmail.com>
+
+From: "Darrick J. Wong" <djwong@kernel.org>
+
+[ Upstream commit 0baa2657dc4d79202148be79a3dc36c35f425060 ]
+
+Nowadays, xfs_mod_fdblocks will always choose to fill the reserve pool
+with freed blocks before adding to fdblocks.  Therefore, we can change
+the behavior of xfs_reserve_blocks slightly -- setting the target size
+of the pool should always succeed, since a deficiency will eventually
+be made up as blocks get freed.
+
+Signed-off-by: Darrick J. Wong <djwong@kernel.org>
+Reviewed-by: Dave Chinner <dchinner@redhat.com>
+Signed-off-by: Leah Rumancik <leah.rumancik@gmail.com>
+Acked-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/xfs_fsops.c |    9 +++++----
+ 1 file changed, 5 insertions(+), 4 deletions(-)
+
+--- a/fs/xfs/xfs_fsops.c
++++ b/fs/xfs/xfs_fsops.c
+@@ -434,11 +434,14 @@ xfs_reserve_blocks(
+        * The code below estimates how many blocks it can request from
+        * fdblocks to stash in the reserve pool.  This is a classic TOCTOU
+        * race since fdblocks updates are not always coordinated via
+-       * m_sb_lock.
++       * m_sb_lock.  Set the reserve size even if there's not enough free
++       * space to fill it because mod_fdblocks will refill an undersized
++       * reserve when it can.
+        */
+       free = percpu_counter_sum(&mp->m_fdblocks) -
+                                               xfs_fdblocks_unavailable(mp);
+       delta = request - mp->m_resblks;
++      mp->m_resblks = request;
+       if (delta > 0 && free > 0) {
+               /*
+                * We'll either succeed in getting space from the free block
+@@ -455,10 +458,8 @@ xfs_reserve_blocks(
+                * Update the reserve counters if blocks have been successfully
+                * allocated.
+                */
+-              if (!error) {
+-                      mp->m_resblks += fdblks_delta;
++              if (!error)
+                       mp->m_resblks_avail += fdblks_delta;
+-              }
+       }
+ out:
+       if (outval) {
diff --git a/queue-5.15/xfs-fix-overfilling-of-reserve-pool.patch b/queue-5.15/xfs-fix-overfilling-of-reserve-pool.patch
new file mode 100644 (file)
index 0000000..668c37f
--- /dev/null
@@ -0,0 +1,57 @@
+From foo@baz Tue Aug 23 09:20:27 AM CEST 2022
+From: Leah Rumancik <leah.rumancik@gmail.com>
+Date: Fri, 19 Aug 2022 11:14:28 -0700
+Subject: xfs: fix overfilling of reserve pool
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, amir73il@gmail.com, "Darrick J. Wong" <djwong@kernel.org>, Dave Chinner <dchinner@redhat.com>, Leah Rumancik <leah.rumancik@gmail.com>
+Message-ID: <20220819181431.4113819-7-leah.rumancik@gmail.com>
+
+From: "Darrick J. Wong" <djwong@kernel.org>
+
+[ Upstream commit 82be38bcf8a2e056b4c99ce79a3827fa743df6ec ]
+
+Due to cycling of m_sb_lock, it's possible for multiple callers of
+xfs_reserve_blocks to race at changing the pool size, subtracting blocks
+from fdblocks, and actually putting it in the pool.  The result of all
+this is that we can overfill the reserve pool to hilarious levels.
+
+xfs_mod_fdblocks, when called with a positive value, already knows how
+to take freed blocks and either fill the reserve until it's full, or put
+them in fdblocks.  Use that instead of setting m_resblks_avail directly.
+
+Signed-off-by: Darrick J. Wong <djwong@kernel.org>
+Reviewed-by: Dave Chinner <dchinner@redhat.com>
+Signed-off-by: Leah Rumancik <leah.rumancik@gmail.com>
+Acked-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/xfs_fsops.c |   13 ++++++-------
+ 1 file changed, 6 insertions(+), 7 deletions(-)
+
+--- a/fs/xfs/xfs_fsops.c
++++ b/fs/xfs/xfs_fsops.c
+@@ -448,18 +448,17 @@ xfs_reserve_blocks(
+                * count or we'll get an ENOSPC.  Don't set the reserved flag
+                * here - we don't want to reserve the extra reserve blocks
+                * from the reserve.
++               *
++               * The desired reserve size can change after we drop the lock.
++               * Use mod_fdblocks to put the space into the reserve or into
++               * fdblocks as appropriate.
+                */
+               fdblks_delta = min(free, delta);
+               spin_unlock(&mp->m_sb_lock);
+               error = xfs_mod_fdblocks(mp, -fdblks_delta, 0);
+-              spin_lock(&mp->m_sb_lock);
+-
+-              /*
+-               * Update the reserve counters if blocks have been successfully
+-               * allocated.
+-               */
+               if (!error)
+-                      mp->m_resblks_avail += fdblks_delta;
++                      xfs_mod_fdblocks(mp, fdblks_delta, 0);
++              spin_lock(&mp->m_sb_lock);
+       }
+ out:
+       if (outval) {
diff --git a/queue-5.15/xfs-fix-soft-lockup-via-spinning-in-filestream-ag-selection-loop.patch b/queue-5.15/xfs-fix-soft-lockup-via-spinning-in-filestream-ag-selection-loop.patch
new file mode 100644 (file)
index 0000000..bb99078
--- /dev/null
@@ -0,0 +1,59 @@
+From foo@baz Tue Aug 23 09:20:27 AM CEST 2022
+From: Leah Rumancik <leah.rumancik@gmail.com>
+Date: Fri, 19 Aug 2022 11:14:29 -0700
+Subject: xfs: fix soft lockup via spinning in filestream ag selection loop
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, amir73il@gmail.com, Brian Foster <bfoster@redhat.com>, "Darrick J . Wong" <djwong@kernel.org>, Christoph Hellwig <hch@lst.de>, Dave Chinner <david@fromorbit.com>, Leah Rumancik <leah.rumancik@gmail.com>
+Message-ID: <20220819181431.4113819-8-leah.rumancik@gmail.com>
+
+From: Brian Foster <bfoster@redhat.com>
+
+[ Upstream commit f650df7171b882dca737ddbbeb414100b31f16af ]
+
+The filestream AG selection loop uses pagf data to aid in AG
+selection, which depends on pagf initialization. If the in-core
+structure is not initialized, the caller invokes the AGF read path
+to do so and carries on. If another task enters the loop and finds
+a pagf init already in progress, the AGF read returns -EAGAIN and
+the task continues the loop. This does not increment the current ag
+index, however, which means the task spins on the current AGF buffer
+until unlocked.
+
+If the AGF read I/O submitted by the initial task happens to be
+delayed for whatever reason, this results in soft lockup warnings
+via the spinning task. This is reproduced by xfs/170. To avoid this
+problem, fix the AGF trylock failure path to properly iterate to the
+next AG. If a task iterates all AGs without making progress, the
+trylock behavior is dropped in favor of blocking locks and thus a
+soft lockup is no longer possible.
+
+Fixes: f48e2df8a877ca1c ("xfs: make xfs_*read_agf return EAGAIN to ALLOC_FLAG_TRYLOCK callers")
+Signed-off-by: Brian Foster <bfoster@redhat.com>
+Reviewed-by: Darrick J. Wong <djwong@kernel.org>
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Signed-off-by: Dave Chinner <david@fromorbit.com>
+Signed-off-by: Leah Rumancik <leah.rumancik@gmail.com>
+Acked-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/xfs_filestream.c |    7 ++++---
+ 1 file changed, 4 insertions(+), 3 deletions(-)
+
+--- a/fs/xfs/xfs_filestream.c
++++ b/fs/xfs/xfs_filestream.c
+@@ -128,11 +128,12 @@ xfs_filestream_pick_ag(
+               if (!pag->pagf_init) {
+                       err = xfs_alloc_pagf_init(mp, NULL, ag, trylock);
+                       if (err) {
+-                              xfs_perag_put(pag);
+-                              if (err != -EAGAIN)
++                              if (err != -EAGAIN) {
++                                      xfs_perag_put(pag);
+                                       return err;
++                              }
+                               /* Couldn't lock the AGF, skip this AG. */
+-                              continue;
++                              goto next_ag;
+                       }
+               }
diff --git a/queue-5.15/xfs-flush-inodegc-workqueue-tasks-before-cancel.patch b/queue-5.15/xfs-flush-inodegc-workqueue-tasks-before-cancel.patch
new file mode 100644 (file)
index 0000000..c6a7db8
--- /dev/null
@@ -0,0 +1,121 @@
+From foo@baz Tue Aug 23 09:20:27 AM CEST 2022
+From: Leah Rumancik <leah.rumancik@gmail.com>
+Date: Fri, 19 Aug 2022 11:14:23 -0700
+Subject: xfs: flush inodegc workqueue tasks before cancel
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, amir73il@gmail.com, Brian Foster <bfoster@redhat.com>, "Darrick J . Wong" <djwong@kernel.org>, Dave Chinner <dchinner@redhat.com>, Leah Rumancik <leah.rumancik@gmail.com>
+Message-ID: <20220819181431.4113819-2-leah.rumancik@gmail.com>
+
+From: Brian Foster <bfoster@redhat.com>
+
+[ Upstream commit 6191cf3ad59fda5901160633fef8e41b064a5246 ]
+
+The xfs_inodegc_stop() helper performs a high level flush of pending
+work on the percpu queues and then runs a cancel_work_sync() on each
+of the percpu work tasks to ensure all work has completed before
+returning.  While cancel_work_sync() waits for wq tasks to complete,
+it does not guarantee work tasks have started. This means that the
+_stop() helper can queue and instantly cancel a wq task without
+having completed the associated work. This can be observed by
+tracepoint inspection of a simple "rm -f <file>; fsfreeze -f <mnt>"
+test:
+
+       xfs_destroy_inode: ... ino 0x83 ...
+       xfs_inode_set_need_inactive: ... ino 0x83 ...
+       xfs_inodegc_stop: ...
+       ...
+       xfs_inodegc_start: ...
+       xfs_inodegc_worker: ...
+       xfs_inode_inactivating: ... ino 0x83 ...
+
+The first few lines show that the inode is removed and need inactive
+state set, but the inactivation work has not completed before the
+inodegc mechanism stops. The inactivation doesn't actually occur
+until the fs is unfrozen and the gc mechanism starts back up. Note
+that this test requires fsfreeze to reproduce because xfs_freeze
+indirectly invokes xfs_fs_statfs(), which calls xfs_inodegc_flush().
+
+When this occurs, the workqueue try_to_grab_pending() logic first
+tries to steal the pending bit, which does not succeed because the
+bit has been set by queue_work_on(). Subsequently, it checks for
+association of a pool workqueue from the work item under the pool
+lock. This association is set at the point a work item is queued and
+cleared when dequeued for processing. If the association exists, the
+work item is removed from the queue and cancel_work_sync() returns
+true. If the pwq association is cleared, the remove attempt assumes
+the task is busy and retries (eventually returning false to the
+caller after waiting for the work task to complete).
+
+To avoid this race, we can flush each work item explicitly before
+cancel. However, since the _queue_all() already schedules each
+underlying work item, the workqueue level helpers are sufficient to
+achieve the same ordering effect. E.g., the inodegc enabled flag
+prevents scheduling any further work in the _stop() case. Use the
+drain_workqueue() helper in this particular case to make the intent
+a bit more self explanatory.
+
+Signed-off-by: Brian Foster <bfoster@redhat.com>
+Reviewed-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Darrick J. Wong <djwong@kernel.org>
+Reviewed-by: Dave Chinner <dchinner@redhat.com>
+Signed-off-by: Leah Rumancik <leah.rumancik@gmail.com>
+Acked-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/xfs_icache.c |   22 ++++------------------
+ 1 file changed, 4 insertions(+), 18 deletions(-)
+
+--- a/fs/xfs/xfs_icache.c
++++ b/fs/xfs/xfs_icache.c
+@@ -1872,28 +1872,20 @@ xfs_inodegc_worker(
+ }
+ /*
+- * Force all currently queued inode inactivation work to run immediately, and
+- * wait for the work to finish. Two pass - queue all the work first pass, wait
+- * for it in a second pass.
++ * Force all currently queued inode inactivation work to run immediately and
++ * wait for the work to finish.
+  */
+ void
+ xfs_inodegc_flush(
+       struct xfs_mount        *mp)
+ {
+-      struct xfs_inodegc      *gc;
+-      int                     cpu;
+-
+       if (!xfs_is_inodegc_enabled(mp))
+               return;
+       trace_xfs_inodegc_flush(mp, __return_address);
+       xfs_inodegc_queue_all(mp);
+-
+-      for_each_online_cpu(cpu) {
+-              gc = per_cpu_ptr(mp->m_inodegc, cpu);
+-              flush_work(&gc->work);
+-      }
++      flush_workqueue(mp->m_inodegc_wq);
+ }
+ /*
+@@ -1904,18 +1896,12 @@ void
+ xfs_inodegc_stop(
+       struct xfs_mount        *mp)
+ {
+-      struct xfs_inodegc      *gc;
+-      int                     cpu;
+-
+       if (!xfs_clear_inodegc_enabled(mp))
+               return;
+       xfs_inodegc_queue_all(mp);
++      drain_workqueue(mp->m_inodegc_wq);
+-      for_each_online_cpu(cpu) {
+-              gc = per_cpu_ptr(mp->m_inodegc, cpu);
+-              cancel_work_sync(&gc->work);
+-      }
+       trace_xfs_inodegc_stop(mp, __return_address);
+ }
diff --git a/queue-5.15/xfs-reject-crazy-array-sizes-being-fed-to-xfs_ioc_getbmap.patch b/queue-5.15/xfs-reject-crazy-array-sizes-being-fed-to-xfs_ioc_getbmap.patch
new file mode 100644 (file)
index 0000000..bb7ba99
--- /dev/null
@@ -0,0 +1,50 @@
+From foo@baz Tue Aug 23 09:20:27 AM CEST 2022
+From: Leah Rumancik <leah.rumancik@gmail.com>
+Date: Fri, 19 Aug 2022 11:14:31 -0700
+Subject: xfs: reject crazy array sizes being fed to XFS_IOC_GETBMAP*
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, amir73il@gmail.com, "Darrick J. Wong" <djwong@kernel.org>, Allison Henderson <allison.henderson@oracle.com>, Catherine Hoang <catherine.hoang@oracle.com>, Leah Rumancik <leah.rumancik@gmail.com>
+Message-ID: <20220819181431.4113819-10-leah.rumancik@gmail.com>
+
+From: "Darrick J. Wong" <djwong@kernel.org>
+
+[ Upstream commit 29d650f7e3ab55283b89c9f5883d0c256ce478b5 ]
+
+Syzbot tripped over the following complaint from the kernel:
+
+WARNING: CPU: 2 PID: 15402 at mm/util.c:597 kvmalloc_node+0x11e/0x125 mm/util.c:597
+
+While trying to run XFS_IOC_GETBMAP against the following structure:
+
+struct getbmap fubar = {
+       .bmv_count      = 0x22dae649,
+};
+
+Obviously, this is a crazy huge value since the next thing that the
+ioctl would do is allocate 37GB of memory.  This is enough to make
+kvmalloc mad, but isn't large enough to trip the validation functions.
+In other words, I'm fussing with checks that were **already sufficient**
+because that's easier than dealing with 644 internal bug reports.  Yes,
+that's right, six hundred and forty-four.
+
+Signed-off-by: Darrick J. Wong <djwong@kernel.org>
+Reviewed-by: Allison Henderson <allison.henderson@oracle.com>
+Reviewed-by: Catherine Hoang <catherine.hoang@oracle.com>
+Signed-off-by: Leah Rumancik <leah.rumancik@gmail.com>
+Acked-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/xfs_ioctl.c |    2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/fs/xfs/xfs_ioctl.c
++++ b/fs/xfs/xfs_ioctl.c
+@@ -1545,7 +1545,7 @@ xfs_ioc_getbmap(
+       if (bmx.bmv_count < 2)
+               return -EINVAL;
+-      if (bmx.bmv_count > ULONG_MAX / recsize)
++      if (bmx.bmv_count >= INT_MAX / recsize)
+               return -ENOMEM;
+       buf = kvzalloc(bmx.bmv_count * sizeof(*buf), GFP_KERNEL);
diff --git a/queue-5.15/xfs-remove-infinite-loop-when-reserving-free-block-pool.patch b/queue-5.15/xfs-remove-infinite-loop-when-reserving-free-block-pool.patch
new file mode 100644 (file)
index 0000000..9f32f7e
--- /dev/null
@@ -0,0 +1,96 @@
+From foo@baz Tue Aug 23 09:20:27 AM CEST 2022
+From: Leah Rumancik <leah.rumancik@gmail.com>
+Date: Fri, 19 Aug 2022 11:14:26 -0700
+Subject: xfs: remove infinite loop when reserving free block pool
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, amir73il@gmail.com, "Darrick J. Wong" <djwong@kernel.org>, Brian Foster <bfoster@redhat.com>, Dave Chinner <dchinner@redhat.com>, Leah Rumancik <leah.rumancik@gmail.com>
+Message-ID: <20220819181431.4113819-5-leah.rumancik@gmail.com>
+
+From: "Darrick J. Wong" <djwong@kernel.org>
+
+[ Upstream commit 15f04fdc75aaaa1cccb0b8b3af1be290e118a7bc ]
+
+Infinite loops in kernel code are scary.  Calls to xfs_reserve_blocks
+should be rare (people should just use the defaults!) so we really don't
+need to try so hard.  Simplify the logic here by removing the infinite
+loop.
+
+Cc: Brian Foster <bfoster@redhat.com>
+Signed-off-by: Darrick J. Wong <djwong@kernel.org>
+Reviewed-by: Dave Chinner <dchinner@redhat.com>
+Signed-off-by: Leah Rumancik <leah.rumancik@gmail.com>
+Acked-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/xfs_fsops.c |   50 ++++++++++++++++++++------------------------------
+ 1 file changed, 20 insertions(+), 30 deletions(-)
+
+--- a/fs/xfs/xfs_fsops.c
++++ b/fs/xfs/xfs_fsops.c
+@@ -430,46 +430,36 @@ xfs_reserve_blocks(
+        * If the request is larger than the current reservation, reserve the
+        * blocks before we update the reserve counters. Sample m_fdblocks and
+        * perform a partial reservation if the request exceeds free space.
++       *
++       * The code below estimates how many blocks it can request from
++       * fdblocks to stash in the reserve pool.  This is a classic TOCTOU
++       * race since fdblocks updates are not always coordinated via
++       * m_sb_lock.
+        */
+-      error = -ENOSPC;
+-      do {
+-              free = percpu_counter_sum(&mp->m_fdblocks) -
++      free = percpu_counter_sum(&mp->m_fdblocks) -
+                                               xfs_fdblocks_unavailable(mp);
+-              if (free <= 0)
+-                      break;
+-
+-              delta = request - mp->m_resblks;
+-              lcounter = free - delta;
+-              if (lcounter < 0)
+-                      /* We can't satisfy the request, just get what we can */
+-                      fdblks_delta = free;
+-              else
+-                      fdblks_delta = delta;
+-
++      delta = request - mp->m_resblks;
++      if (delta > 0 && free > 0) {
+               /*
+                * We'll either succeed in getting space from the free block
+-               * count or we'll get an ENOSPC. If we get a ENOSPC, it means
+-               * things changed while we were calculating fdblks_delta and so
+-               * we should try again to see if there is anything left to
+-               * reserve.
+-               *
+-               * Don't set the reserved flag here - we don't want to reserve
+-               * the extra reserve blocks from the reserve.....
++               * count or we'll get an ENOSPC.  Don't set the reserved flag
++               * here - we don't want to reserve the extra reserve blocks
++               * from the reserve.
+                */
++              fdblks_delta = min(free, delta);
+               spin_unlock(&mp->m_sb_lock);
+               error = xfs_mod_fdblocks(mp, -fdblks_delta, 0);
+               spin_lock(&mp->m_sb_lock);
+-      } while (error == -ENOSPC);
+-      /*
+-       * Update the reserve counters if blocks have been successfully
+-       * allocated.
+-       */
+-      if (!error && fdblks_delta) {
+-              mp->m_resblks += fdblks_delta;
+-              mp->m_resblks_avail += fdblks_delta;
++              /*
++               * Update the reserve counters if blocks have been successfully
++               * allocated.
++               */
++              if (!error) {
++                      mp->m_resblks += fdblks_delta;
++                      mp->m_resblks_avail += fdblks_delta;
++              }
+       }
+-
+ out:
+       if (outval) {
+               outval->resblks = mp->m_resblks;
diff --git a/queue-5.15/xfs-reserve-quota-for-dir-expansion-when-linking-unlinking-files.patch b/queue-5.15/xfs-reserve-quota-for-dir-expansion-when-linking-unlinking-files.patch
new file mode 100644 (file)
index 0000000..83b4b55
--- /dev/null
@@ -0,0 +1,242 @@
+From foo@baz Tue Aug 23 09:20:27 AM CEST 2022
+From: Leah Rumancik <leah.rumancik@gmail.com>
+Date: Fri, 19 Aug 2022 11:14:24 -0700
+Subject: xfs: reserve quota for dir expansion when linking/unlinking files
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, amir73il@gmail.com, "Darrick J. Wong" <djwong@kernel.org>, Dave Chinner <dchinner@redhat.com>, Leah Rumancik <leah.rumancik@gmail.com>
+Message-ID: <20220819181431.4113819-3-leah.rumancik@gmail.com>
+
+From: "Darrick J. Wong" <djwong@kernel.org>
+
+[ Upstream commit 871b9316e7a778ff97bdc34fdb2f2977f616651d ]
+
+XFS does not reserve quota for directory expansion when linking or
+unlinking children from a directory.  This means that we don't reject
+the expansion with EDQUOT when we're at or near a hard limit, which
+means that unprivileged userspace can use link()/unlink() to exceed
+quota.
+
+The fix for this is nuanced -- link operations don't always expand the
+directory, and we allow a link to proceed with no space reservation if
+we don't need to add a block to the directory to handle the addition.
+Unlink operations generally do not expand the directory (you'd have to
+free a block and then cause a btree split) and we can defer the
+directory block freeing if there is no space reservation.
+
+Moreover, there is a further bug in that we do not trigger the blockgc
+workers to try to clear space when we're out of quota.
+
+To fix both cases, create a new xfs_trans_alloc_dir function that
+allocates the transaction, locks and joins the inodes, and reserves
+quota for the directory.  If there isn't sufficient space or quota,
+we'll switch the caller to reservationless mode.  This should prevent
+quota usage overruns with the least restriction in functionality.
+
+Signed-off-by: Darrick J. Wong <djwong@kernel.org>
+Reviewed-by: Dave Chinner <dchinner@redhat.com>
+Signed-off-by: Leah Rumancik <leah.rumancik@gmail.com>
+Acked-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/xfs_inode.c |   46 ++++++++++------------------
+ fs/xfs/xfs_trans.c |   86 +++++++++++++++++++++++++++++++++++++++++++++++++++++
+ fs/xfs/xfs_trans.h |    3 +
+ 3 files changed, 106 insertions(+), 29 deletions(-)
+
+--- a/fs/xfs/xfs_inode.c
++++ b/fs/xfs/xfs_inode.c
+@@ -1223,7 +1223,7 @@ xfs_link(
+ {
+       xfs_mount_t             *mp = tdp->i_mount;
+       xfs_trans_t             *tp;
+-      int                     error;
++      int                     error, nospace_error = 0;
+       int                     resblks;
+       trace_xfs_link(tdp, target_name);
+@@ -1242,19 +1242,11 @@ xfs_link(
+               goto std_return;
+       resblks = XFS_LINK_SPACE_RES(mp, target_name->len);
+-      error = xfs_trans_alloc(mp, &M_RES(mp)->tr_link, resblks, 0, 0, &tp);
+-      if (error == -ENOSPC) {
+-              resblks = 0;
+-              error = xfs_trans_alloc(mp, &M_RES(mp)->tr_link, 0, 0, 0, &tp);
+-      }
++      error = xfs_trans_alloc_dir(tdp, &M_RES(mp)->tr_link, sip, &resblks,
++                      &tp, &nospace_error);
+       if (error)
+               goto std_return;
+-      xfs_lock_two_inodes(sip, XFS_ILOCK_EXCL, tdp, XFS_ILOCK_EXCL);
+-
+-      xfs_trans_ijoin(tp, sip, XFS_ILOCK_EXCL);
+-      xfs_trans_ijoin(tp, tdp, XFS_ILOCK_EXCL);
+-
+       error = xfs_iext_count_may_overflow(tdp, XFS_DATA_FORK,
+                       XFS_IEXT_DIR_MANIP_CNT(mp));
+       if (error)
+@@ -1312,6 +1304,8 @@ xfs_link(
+  error_return:
+       xfs_trans_cancel(tp);
+  std_return:
++      if (error == -ENOSPC && nospace_error)
++              error = nospace_error;
+       return error;
+ }
+@@ -2761,6 +2755,7 @@ xfs_remove(
+       xfs_mount_t             *mp = dp->i_mount;
+       xfs_trans_t             *tp = NULL;
+       int                     is_dir = S_ISDIR(VFS_I(ip)->i_mode);
++      int                     dontcare;
+       int                     error = 0;
+       uint                    resblks;
+@@ -2778,31 +2773,24 @@ xfs_remove(
+               goto std_return;
+       /*
+-       * We try to get the real space reservation first,
+-       * allowing for directory btree deletion(s) implying
+-       * possible bmap insert(s).  If we can't get the space
+-       * reservation then we use 0 instead, and avoid the bmap
+-       * btree insert(s) in the directory code by, if the bmap
+-       * insert tries to happen, instead trimming the LAST
+-       * block from the directory.
++       * We try to get the real space reservation first, allowing for
++       * directory btree deletion(s) implying possible bmap insert(s).  If we
++       * can't get the space reservation then we use 0 instead, and avoid the
++       * bmap btree insert(s) in the directory code by, if the bmap insert
++       * tries to happen, instead trimming the LAST block from the directory.
++       *
++       * Ignore EDQUOT and ENOSPC being returned via nospace_error because
++       * the directory code can handle a reservationless update and we don't
++       * want to prevent a user from trying to free space by deleting things.
+        */
+       resblks = XFS_REMOVE_SPACE_RES(mp);
+-      error = xfs_trans_alloc(mp, &M_RES(mp)->tr_remove, resblks, 0, 0, &tp);
+-      if (error == -ENOSPC) {
+-              resblks = 0;
+-              error = xfs_trans_alloc(mp, &M_RES(mp)->tr_remove, 0, 0, 0,
+-                              &tp);
+-      }
++      error = xfs_trans_alloc_dir(dp, &M_RES(mp)->tr_remove, ip, &resblks,
++                      &tp, &dontcare);
+       if (error) {
+               ASSERT(error != -ENOSPC);
+               goto std_return;
+       }
+-      xfs_lock_two_inodes(dp, XFS_ILOCK_EXCL, ip, XFS_ILOCK_EXCL);
+-
+-      xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
+-      xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+-
+       /*
+        * If we're removing a directory perform some additional validation.
+        */
+--- a/fs/xfs/xfs_trans.c
++++ b/fs/xfs/xfs_trans.c
+@@ -1201,3 +1201,89 @@ out_cancel:
+       xfs_trans_cancel(tp);
+       return error;
+ }
++
++/*
++ * Allocate an transaction, lock and join the directory and child inodes to it,
++ * and reserve quota for a directory update.  If there isn't sufficient space,
++ * @dblocks will be set to zero for a reservationless directory update and
++ * @nospace_error will be set to a negative errno describing the space
++ * constraint we hit.
++ *
++ * The caller must ensure that the on-disk dquots attached to this inode have
++ * already been allocated and initialized.  The ILOCKs will be dropped when the
++ * transaction is committed or cancelled.
++ */
++int
++xfs_trans_alloc_dir(
++      struct xfs_inode        *dp,
++      struct xfs_trans_res    *resv,
++      struct xfs_inode        *ip,
++      unsigned int            *dblocks,
++      struct xfs_trans        **tpp,
++      int                     *nospace_error)
++{
++      struct xfs_trans        *tp;
++      struct xfs_mount        *mp = ip->i_mount;
++      unsigned int            resblks;
++      bool                    retried = false;
++      int                     error;
++
++retry:
++      *nospace_error = 0;
++      resblks = *dblocks;
++      error = xfs_trans_alloc(mp, resv, resblks, 0, 0, &tp);
++      if (error == -ENOSPC) {
++              *nospace_error = error;
++              resblks = 0;
++              error = xfs_trans_alloc(mp, resv, resblks, 0, 0, &tp);
++      }
++      if (error)
++              return error;
++
++      xfs_lock_two_inodes(dp, XFS_ILOCK_EXCL, ip, XFS_ILOCK_EXCL);
++
++      xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
++      xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
++
++      error = xfs_qm_dqattach_locked(dp, false);
++      if (error) {
++              /* Caller should have allocated the dquots! */
++              ASSERT(error != -ENOENT);
++              goto out_cancel;
++      }
++
++      error = xfs_qm_dqattach_locked(ip, false);
++      if (error) {
++              /* Caller should have allocated the dquots! */
++              ASSERT(error != -ENOENT);
++              goto out_cancel;
++      }
++
++      if (resblks == 0)
++              goto done;
++
++      error = xfs_trans_reserve_quota_nblks(tp, dp, resblks, 0, false);
++      if (error == -EDQUOT || error == -ENOSPC) {
++              if (!retried) {
++                      xfs_trans_cancel(tp);
++                      xfs_blockgc_free_quota(dp, 0);
++                      retried = true;
++                      goto retry;
++              }
++
++              *nospace_error = error;
++              resblks = 0;
++              error = 0;
++      }
++      if (error)
++              goto out_cancel;
++
++done:
++      *tpp = tp;
++      *dblocks = resblks;
++      return 0;
++
++out_cancel:
++      xfs_trans_cancel(tp);
++      return error;
++}
+--- a/fs/xfs/xfs_trans.h
++++ b/fs/xfs/xfs_trans.h
+@@ -265,6 +265,9 @@ int xfs_trans_alloc_icreate(struct xfs_m
+ int xfs_trans_alloc_ichange(struct xfs_inode *ip, struct xfs_dquot *udqp,
+               struct xfs_dquot *gdqp, struct xfs_dquot *pdqp, bool force,
+               struct xfs_trans **tpp);
++int xfs_trans_alloc_dir(struct xfs_inode *dp, struct xfs_trans_res *resv,
++              struct xfs_inode *ip, unsigned int *dblocks,
++              struct xfs_trans **tpp, int *nospace_error);
+ static inline void
+ xfs_trans_set_context(
diff --git a/queue-5.15/xfs-reserve-quota-for-target-dir-expansion-when-renaming-files.patch b/queue-5.15/xfs-reserve-quota-for-target-dir-expansion-when-renaming-files.patch
new file mode 100644 (file)
index 0000000..12e7210
--- /dev/null
@@ -0,0 +1,109 @@
+From foo@baz Tue Aug 23 09:20:27 AM CEST 2022
+From: Leah Rumancik <leah.rumancik@gmail.com>
+Date: Fri, 19 Aug 2022 11:14:25 -0700
+Subject: xfs: reserve quota for target dir expansion when renaming files
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, amir73il@gmail.com, "Darrick J. Wong" <djwong@kernel.org>, Dave Chinner <dchinner@redhat.com>, Leah Rumancik <leah.rumancik@gmail.com>
+Message-ID: <20220819181431.4113819-4-leah.rumancik@gmail.com>
+
+From: "Darrick J. Wong" <djwong@kernel.org>
+
+[ Upstream commit 41667260bc84db4dfe566e3f6ab6da5293d60d8d ]
+
+XFS does not reserve quota for directory expansion when renaming
+children into a directory.  This means that we don't reject the
+expansion with EDQUOT when we're at or near a hard limit, which means
+that unprivileged userspace can use rename() to exceed quota.
+
+Rename operations don't always expand the target directory, and we allow
+a rename to proceed with no space reservation if we don't need to add a
+block to the target directory to handle the addition.  Moreover, the
+unlink operation on the source directory generally does not expand the
+directory (you'd have to free a block and then cause a btree split) and
+it's probably of little consequence to leave the corner case that
+renaming a file out of a directory can increase its size.
+
+As with link and unlink, there is a further bug in that we do not
+trigger the blockgc workers to try to clear space when we're out of
+quota.
+
+Because rename is its own special tricky animal, we'll patch xfs_rename
+directly to reserve quota to the rename transaction.  We'll leave
+cleaning up the rest of xfs_rename for the metadata directory tree
+patchset.
+
+Signed-off-by: Darrick J. Wong <djwong@kernel.org>
+Reviewed-by: Dave Chinner <dchinner@redhat.com>
+Signed-off-by: Leah Rumancik <leah.rumancik@gmail.com>
+Acked-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/xfs_inode.c |   33 ++++++++++++++++++++++++++++++++-
+ 1 file changed, 32 insertions(+), 1 deletion(-)
+
+--- a/fs/xfs/xfs_inode.c
++++ b/fs/xfs/xfs_inode.c
+@@ -3103,7 +3103,8 @@ xfs_rename(
+       bool                    new_parent = (src_dp != target_dp);
+       bool                    src_is_directory = S_ISDIR(VFS_I(src_ip)->i_mode);
+       int                     spaceres;
+-      int                     error;
++      bool                    retried = false;
++      int                     error, nospace_error = 0;
+       trace_xfs_rename(src_dp, target_dp, src_name, target_name);
+@@ -3127,9 +3128,12 @@ xfs_rename(
+       xfs_sort_for_rename(src_dp, target_dp, src_ip, target_ip, wip,
+                               inodes, &num_inodes);
++retry:
++      nospace_error = 0;
+       spaceres = XFS_RENAME_SPACE_RES(mp, target_name->len);
+       error = xfs_trans_alloc(mp, &M_RES(mp)->tr_rename, spaceres, 0, 0, &tp);
+       if (error == -ENOSPC) {
++              nospace_error = error;
+               spaceres = 0;
+               error = xfs_trans_alloc(mp, &M_RES(mp)->tr_rename, 0, 0, 0,
+                               &tp);
+@@ -3184,6 +3188,31 @@ xfs_rename(
+                                       spaceres);
+       /*
++       * Try to reserve quota to handle an expansion of the target directory.
++       * We'll allow the rename to continue in reservationless mode if we hit
++       * a space usage constraint.  If we trigger reservationless mode, save
++       * the errno if there isn't any free space in the target directory.
++       */
++      if (spaceres != 0) {
++              error = xfs_trans_reserve_quota_nblks(tp, target_dp, spaceres,
++                              0, false);
++              if (error == -EDQUOT || error == -ENOSPC) {
++                      if (!retried) {
++                              xfs_trans_cancel(tp);
++                              xfs_blockgc_free_quota(target_dp, 0);
++                              retried = true;
++                              goto retry;
++                      }
++
++                      nospace_error = error;
++                      spaceres = 0;
++                      error = 0;
++              }
++              if (error)
++                      goto out_trans_cancel;
++      }
++
++      /*
+        * Check for expected errors before we dirty the transaction
+        * so we can return an error without a transaction abort.
+        *
+@@ -3429,6 +3458,8 @@ out_trans_cancel:
+ out_release_wip:
+       if (wip)
+               xfs_irele(wip);
++      if (error == -ENOSPC && nospace_error)
++              error = nospace_error;
+       return error;
+ }
diff --git a/queue-5.15/xfs-revert-xfs-actually-bump-warning-counts-when-we-send-warnings.patch b/queue-5.15/xfs-revert-xfs-actually-bump-warning-counts-when-we-send-warnings.patch
new file mode 100644 (file)
index 0000000..5dc028f
--- /dev/null
@@ -0,0 +1,54 @@
+From foo@baz Tue Aug 23 09:20:27 AM CEST 2022
+From: Leah Rumancik <leah.rumancik@gmail.com>
+Date: Fri, 19 Aug 2022 11:14:30 -0700
+Subject: xfs: revert "xfs: actually bump warning counts when we send warnings"
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, amir73il@gmail.com, Eric Sandeen <sandeen@redhat.com>, "Darrick J . Wong" <djwong@kernel.org>, Dave Chinner <dchinner@redhat.com>, Dave Chinner <david@fromorbit.com>, Leah Rumancik <leah.rumancik@gmail.com>
+Message-ID: <20220819181431.4113819-9-leah.rumancik@gmail.com>
+
+From: Eric Sandeen <sandeen@redhat.com>
+
+[ Upstream commit bc37e4fb5cac2925b2e286b1f1d4fc2b519f7d92 ]
+
+This reverts commit 4b8628d57b725b32616965e66975fcdebe008fe7.
+
+XFS quota has had the concept of a "quota warning limit" since
+the earliest Irix implementation, but a mechanism for incrementing
+the warning counter was never implemented, as documented in the
+xfs_quota(8) man page. We do know from the historical archive that
+it was never incremented at runtime during quota reservation
+operations.
+
+With this commit, the warning counter quickly increments for every
+allocation attempt after the user has crossed a quota soft
+limit threshold, and this in turn transitions the user to hard
+quota failures, rendering soft quota thresholds and timers useless.
+This was reported as a regression by users.
+
+Because the intended behavior of this warning counter has never been
+understood or documented, and the result of this change is a regression
+in soft quota functionality, revert this commit to make soft quota
+limits and timers operable again.
+
+Fixes: 4b8628d57b72 ("xfs: actually bump warning counts when we send warnings")
+Signed-off-by: Eric Sandeen <sandeen@redhat.com>
+Reviewed-by: Darrick J. Wong <djwong@kernel.org>
+Reviewed-by: Dave Chinner <dchinner@redhat.com>
+Signed-off-by: Dave Chinner <david@fromorbit.com>
+Signed-off-by: Leah Rumancik <leah.rumancik@gmail.com>
+Acked-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/xfs_trans_dquot.c |    1 -
+ 1 file changed, 1 deletion(-)
+
+--- a/fs/xfs/xfs_trans_dquot.c
++++ b/fs/xfs/xfs_trans_dquot.c
+@@ -603,7 +603,6 @@ xfs_dqresv_check(
+                       return QUOTA_NL_ISOFTLONGWARN;
+               }
+-              res->warnings++;
+               return QUOTA_NL_ISOFTWARN;
+       }