git.ipfire.org Git - thirdparty/systemd.git/commitdiff
cgroup-util: Always open cgroupv2 attribute files in O_NONBLOCK mode
author: Daan De Meyer <daan.j.demeyer@gmail.com>
Tue, 22 Apr 2025 08:36:21 +0000 (10:36 +0200)
committer: Daan De Meyer <daan.j.demeyer@gmail.com>
Tue, 17 Jun 2025 14:07:32 +0000 (15:07 +0100)
As explained in https://lore.kernel.org/all/20250419183545.1982187-1-shakeel.butt@linux.dev/,
writing to memory.max or memory.high triggers synchronous memory reclaim
if the limit is lowered. This can end up taking non-negligible amounts
of time, completely blocking pid1 from doing any other work while the
reclaim is ongoing.

To address this problem, the kernel is going to add O_NONBLOCK semantics
to memory.max and memory.high. If the file is opened with O_NONBLOCK,
the synchronous memory reclaim is skipped and only triggered later
without blocking the process writing the file. Let's make sure we make
use of this by opening cgroupv2 attribute files with O_NONBLOCK.

We opt to do this for all cgroupv2 attribute files, to make sure that
if the same problem happens elsewhere in the future and is fixed in the
same way, we immediately take advantage of that fix without having to
make changes in systemd as well. We probably never want to block when
writing cgroupv2 attributes and any cases where we do want to block should
indicate so explicitly instead of blocking by default.

src/basic/cgroup-util.c
src/basic/fileio.c
src/basic/fileio.h

index 97c05e18f94967305f54041b5be8e123db19cb5d..8073925b4a45640fb56606c645b9b78854cde7fe 100644 (file)
@@ -1594,7 +1594,12 @@ int cg_set_attribute(const char *controller, const char *path, const char *attri
         if (r < 0)
                 return r;
 
-        return write_string_file(p, value, WRITE_STRING_FILE_DISABLE_BUFFER);
+        /* https://lore.kernel.org/all/20250419183545.1982187-1-shakeel.butt@linux.dev/ adds O_NONBLOCK
+         * semantics to memory.max and memory.high to skip synchronous memory reclaim when O_NONBLOCK is
+         * enabled. Let's always open cgroupv2 attribute files in nonblocking mode to immediately take
+         * advantage of this and any other asynchronous resource reclaim that's added to the cgroupv2 API in
+         * the future. */
+        return write_string_file(p, value, WRITE_STRING_FILE_DISABLE_BUFFER|WRITE_STRING_FILE_OPEN_NONBLOCKING);
 }
 
 int cg_get_attribute(const char *controller, const char *path, const char *attribute, char **ret) {
index eaadfaa6accc48b75610b52c8e8b3472a921d98a..97f92213da57966583ca34903dc37e1dd2fa2fe2 100644 (file)
@@ -317,7 +317,8 @@ int write_string_file_full(
                 r = fd = fd_reopen(
                                 ASSERT_FD(dir_fd), O_CLOEXEC | O_NOCTTY |
                                 (FLAGS_SET(flags, WRITE_STRING_FILE_TRUNCATE) ? O_TRUNC : 0) |
-                                (FLAGS_SET(flags, WRITE_STRING_FILE_SUPPRESS_REDUNDANT_VIRTUAL) ? O_RDWR : O_WRONLY));
+                                (FLAGS_SET(flags, WRITE_STRING_FILE_SUPPRESS_REDUNDANT_VIRTUAL) ? O_RDWR : O_WRONLY) |
+                                (FLAGS_SET(flags, WRITE_STRING_FILE_OPEN_NONBLOCKING) ? O_NONBLOCK : 0));
         else {
                 mode_t mode = write_string_file_flags_to_mode(flags);
                 bool call_label_ops_post = false;
@@ -335,7 +336,8 @@ int write_string_file_full(
                                 (FLAGS_SET(flags, WRITE_STRING_FILE_NOFOLLOW) ? O_NOFOLLOW : 0) |
                                 (FLAGS_SET(flags, WRITE_STRING_FILE_CREATE) ? O_CREAT : 0) |
                                 (FLAGS_SET(flags, WRITE_STRING_FILE_TRUNCATE) ? O_TRUNC : 0) |
-                                (FLAGS_SET(flags, WRITE_STRING_FILE_SUPPRESS_REDUNDANT_VIRTUAL) ? O_RDWR : O_WRONLY),
+                                (FLAGS_SET(flags, WRITE_STRING_FILE_SUPPRESS_REDUNDANT_VIRTUAL) ? O_RDWR : O_WRONLY) |
+                                (FLAGS_SET(flags, WRITE_STRING_FILE_OPEN_NONBLOCKING) ? O_NONBLOCK : 0),
                                 mode,
                                 &made_file);
                 if (call_label_ops_post)
index b6513fe3f884842e2c822ab2e474b358d9da3c2c..0297cd07e985f0f894fdde315413d21195f592aa 100644 (file)
@@ -20,6 +20,7 @@ typedef enum {
         WRITE_STRING_FILE_MODE_0444                  = 1 << 11,
         WRITE_STRING_FILE_SUPPRESS_REDUNDANT_VIRTUAL = 1 << 12,
         WRITE_STRING_FILE_LABEL                      = 1 << 13,
+        WRITE_STRING_FILE_OPEN_NONBLOCKING           = 1 << 14,
 } WriteStringFileFlags;
 
 typedef enum {