]> git.ipfire.org Git - thirdparty/kernel/stable-queue.git/commitdiff
5.4-stable patches
author: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Fri, 15 Aug 2025 16:50:16 +0000 (18:50 +0200)
committer: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Fri, 15 Aug 2025 16:50:16 +0000 (18:50 +0200)
added patches:
fs-prevent-file-descriptor-table-allocations-exceeding-int_max.patch
nfsd-detect-mismatch-of-file-handle-and-delegation-stateid-in-open-op.patch
sunvdc-balance-device-refcount-in-vdc_port_mpgroup_check.patch

queue-5.4/fs-prevent-file-descriptor-table-allocations-exceeding-int_max.patch [new file with mode: 0644]
queue-5.4/nfsd-detect-mismatch-of-file-handle-and-delegation-stateid-in-open-op.patch [new file with mode: 0644]
queue-5.4/series
queue-5.4/sunvdc-balance-device-refcount-in-vdc_port_mpgroup_check.patch [new file with mode: 0644]

diff --git a/queue-5.4/fs-prevent-file-descriptor-table-allocations-exceeding-int_max.patch b/queue-5.4/fs-prevent-file-descriptor-table-allocations-exceeding-int_max.patch
new file mode 100644 (file)
index 0000000..6e22a92
--- /dev/null
@@ -0,0 +1,104 @@
+From 04a2c4b4511d186b0fce685da21085a5d4acd370 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Sun, 29 Jun 2025 03:40:21 -0400
+Subject: fs: Prevent file descriptor table allocations exceeding INT_MAX
+
+From: Sasha Levin <sashal@kernel.org>
+
+commit 04a2c4b4511d186b0fce685da21085a5d4acd370 upstream.
+
+When sysctl_nr_open is set to a very high value (for example, 1073741816
+as set by systemd), processes attempting to use file descriptors near
+the limit can trigger massive memory allocation attempts that exceed
+INT_MAX, resulting in a WARNING in mm/slub.c:
+
+  WARNING: CPU: 0 PID: 44 at mm/slub.c:5027 __kvmalloc_node_noprof+0x21a/0x288
+
+This happens because kvmalloc_array() and kvmalloc() check if the
+requested size exceeds INT_MAX and emit a warning when the allocation is
+not flagged with __GFP_NOWARN.
+
+Specifically, when nr_open is set to 1073741816 (0x3ffffff8) and a
+process calls dup2(oldfd, 1073741880), the kernel attempts to allocate:
+- File descriptor array: 1073741880 * 8 bytes = 8,589,935,040 bytes
+- Multiple bitmaps: ~400MB
+- Total allocation size: > 8GB (exceeding INT_MAX = 2,147,483,647)
+
+Reproducer:
+1. Set /proc/sys/fs/nr_open to 1073741816:
+   # echo 1073741816 > /proc/sys/fs/nr_open
+
+2. Run a program that uses a high file descriptor:
+   #include <unistd.h>
+   #include <sys/resource.h>
+
+   int main() {
+       struct rlimit rlim = {1073741824, 1073741824};
+       setrlimit(RLIMIT_NOFILE, &rlim);
+       dup2(2, 1073741880);  // Triggers the warning
+       return 0;
+   }
+
+3. Observe WARNING in dmesg at mm/slub.c:5027
+
+systemd commit a8b627a introduced automatic bumping of fs.nr_open to the
+maximum possible value. The rationale was that systems with memory
+control groups (memcg) no longer need separate file descriptor limits
+since memory is properly accounted. However, this change overlooked
+that:
+
+1. The kernel's allocation functions still enforce INT_MAX as a maximum
+   size regardless of memcg accounting
+2. Programs and tests that legitimately test file descriptor limits can
+   inadvertently trigger massive allocations
+3. The resulting allocations (>8GB) are impractical and will always fail
+
+systemd's algorithm starts with INT_MAX and keeps halving the value
+until the kernel accepts it. On most systems, this results in nr_open
+being set to 1073741816 (0x3ffffff8), which is just under 2^30 (about one
+billion) file descriptors.
+
+While processes rarely use file descriptors near this limit in normal
+operation, certain selftests (like
+tools/testing/selftests/core/unshare_test.c) and programs that test file
+descriptor limits can trigger this issue.
+
+Fix this by adding a check in alloc_fdtable() to ensure the requested
+allocation size does not exceed INT_MAX. This causes the operation to
+fail with -EMFILE instead of triggering a kernel warning and avoids the
+impractical >8GB memory allocation request.
+
+Fixes: 9cfe015aa424 ("get rid of NR_OPEN and introduce a sysctl_nr_open")
+Cc: stable@vger.kernel.org
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+Link: https://lore.kernel.org/20250629074021.1038845-1-sashal@kernel.org
+Signed-off-by: Christian Brauner <brauner@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/file.c |   15 +++++++++++++++
+ 1 file changed, 15 insertions(+)
+
+--- a/fs/file.c
++++ b/fs/file.c
+@@ -104,6 +104,21 @@ static struct fdtable * alloc_fdtable(un
+       if (unlikely(nr > sysctl_nr_open))
+               nr = ((sysctl_nr_open - 1) | (BITS_PER_LONG - 1)) + 1;
++      /*
++       * Check if the allocation size would exceed INT_MAX. kvmalloc_array()
++       * and kvmalloc() will warn if the allocation size is greater than
++       * INT_MAX, as filp_cachep objects are not __GFP_NOWARN.
++       *
++       * This can happen when sysctl_nr_open is set to a very high value and
++       * a process tries to use a file descriptor near that limit. For example,
++       * if sysctl_nr_open is set to 1073741816 (0x3ffffff8) - which is what
++       * systemd typically sets it to - then trying to use a file descriptor
++       * close to that value will require allocating a file descriptor table
++       * that exceeds 8GB in size.
++       */
++      if (unlikely(nr > INT_MAX / sizeof(struct file *)))
++              return ERR_PTR(-EMFILE);
++
+       fdt = kmalloc(sizeof(struct fdtable), GFP_KERNEL_ACCOUNT);
+       if (!fdt)
+               goto out;
diff --git a/queue-5.4/nfsd-detect-mismatch-of-file-handle-and-delegation-stateid-in-open-op.patch b/queue-5.4/nfsd-detect-mismatch-of-file-handle-and-delegation-stateid-in-open-op.patch
new file mode 100644 (file)
index 0000000..824d9d0
--- /dev/null
@@ -0,0 +1,54 @@
+From 9c65001c57164033ad08b654c8b5ae35512ddf4a Mon Sep 17 00:00:00 2001
+From: Dai Ngo <dai.ngo@oracle.com>
+Date: Tue, 10 Jun 2025 08:35:28 -0700
+Subject: NFSD: detect mismatch of file handle and delegation stateid in OPEN op
+
+From: Dai Ngo <dai.ngo@oracle.com>
+
+commit 9c65001c57164033ad08b654c8b5ae35512ddf4a upstream.
+
+When the client sends an OPEN with claim type CLAIM_DELEG_CUR_FH or
+CLAIM_DELEGATION_CUR, the delegation stateid and the file handle
+must belong to the same file, otherwise return NFS4ERR_INVAL.
+
+Note that RFC8881, section 8.2.4, mandates the server to return
+NFS4ERR_BAD_STATEID if the selected table entry does not match the
+current filehandle. However returning NFS4ERR_BAD_STATEID in the
+OPEN causes the client to retry the operation and therefore gets the
+client into a loop. To avoid this situation we return NFS4ERR_INVAL
+instead.
+
+Reported-by: Petro Pavlov <petro.pavlov@vastdata.com>
+Fixes: c44c5eeb2c02 ("[PATCH] nfsd4: add open state code for CLAIM_DELEGATE_CUR")
+Cc: stable@vger.kernel.org
+Signed-off-by: Dai Ngo <dai.ngo@oracle.com>
+Reviewed-by: Jeff Layton <jlayton@kernel.org>
+Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/nfsd/nfs4state.c |   14 ++++++++++++++
+ 1 file changed, 14 insertions(+)
+
+--- a/fs/nfsd/nfs4state.c
++++ b/fs/nfsd/nfs4state.c
+@@ -5081,6 +5081,20 @@ nfsd4_process_open2(struct svc_rqst *rqs
+               status = nfs4_check_deleg(cl, open, &dp);
+               if (status)
+                       goto out;
++              if (dp && nfsd4_is_deleg_cur(open) &&
++                              (dp->dl_stid.sc_file != fp)) {
++                      /*
++                       * RFC8881 section 8.2.4 mandates the server to return
++                       * NFS4ERR_BAD_STATEID if the selected table entry does
++                       * not match the current filehandle. However returning
++                       * NFS4ERR_BAD_STATEID in the OPEN can cause the client
++                       * to repeatedly retry the operation with the same
++                       * stateid, since the stateid itself is valid. To avoid
++                       * this situation NFSD returns NFS4ERR_INVAL instead.
++                       */
++                      status = nfserr_inval;
++                      goto out;
++              }
+               stp = nfsd4_find_and_lock_existing_open(fp, open);
+       } else {
+               open->op_file = NULL;
diff --git a/queue-5.4/series b/queue-5.4/series
index 42e9ffcdb01369dcdacd32f68b98fcc1d4734e3b..dd9e2ded30a64e333f1f8ce2b298b11512052718 100644 (file)
@@ -159,3 +159,6 @@ alsa-usb-audio-validate-uac3-cluster-segment-descriptors.patch
 netlink-avoid-infinite-retry-looping-in-netlink_unicast.patch
 net-gianfar-fix-device-leak-when-querying-time-stamp-info.patch
 net-dpaa-fix-device-leak-when-querying-time-stamp-info.patch
+nfsd-detect-mismatch-of-file-handle-and-delegation-stateid-in-open-op.patch
+sunvdc-balance-device-refcount-in-vdc_port_mpgroup_check.patch
+fs-prevent-file-descriptor-table-allocations-exceeding-int_max.patch
diff --git a/queue-5.4/sunvdc-balance-device-refcount-in-vdc_port_mpgroup_check.patch b/queue-5.4/sunvdc-balance-device-refcount-in-vdc_port_mpgroup_check.patch
new file mode 100644 (file)
index 0000000..d2a5323
--- /dev/null
@@ -0,0 +1,48 @@
+From 63ce53724637e2e7ba51fe3a4f78351715049905 Mon Sep 17 00:00:00 2001
+From: Ma Ke <make24@iscas.ac.cn>
+Date: Sat, 19 Jul 2025 15:58:56 +0800
+Subject: sunvdc: Balance device refcount in vdc_port_mpgroup_check
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+From: Ma Ke <make24@iscas.ac.cn>
+
+commit 63ce53724637e2e7ba51fe3a4f78351715049905 upstream.
+
+Using device_find_child() to locate a probed virtual-device-port node
+causes a device refcount imbalance, as device_find_child() internally
+calls get_device() to increment the device’s reference count before
+returning its pointer. vdc_port_mpgroup_check() directly returns true
+upon finding a matching device without releasing the reference via
+put_device(). We should call put_device() to decrement refcount.
+
+As the comment on device_find_child() says, 'NOTE: you will need to drop
+the reference with put_device() after use'.
+
+Found by code review.
+
+Cc: stable@vger.kernel.org
+Fixes: 3ee70591d6c4 ("sunvdc: prevent sunvdc panic when mpgroup disk added to guest domain")
+Signed-off-by: Ma Ke <make24@iscas.ac.cn>
+Link: https://lore.kernel.org/r/20250719075856.3447953-1-make24@iscas.ac.cn
+Signed-off-by: Jens Axboe <axboe@kernel.dk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/block/sunvdc.c |    4 +++-
+ 1 file changed, 3 insertions(+), 1 deletion(-)
+
+--- a/drivers/block/sunvdc.c
++++ b/drivers/block/sunvdc.c
+@@ -967,8 +967,10 @@ static bool vdc_port_mpgroup_check(struc
+       dev = device_find_child(vdev->dev.parent, &port_data,
+                               vdc_device_probed);
+-      if (dev)
++      if (dev) {
++              put_device(dev);
+               return true;
++      }
+       return false;
+ }