From: Daan De Meyer Date: Sat, 7 Mar 2026 22:37:55 +0000 (+0100) Subject: core: Add I/O pressure support X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=594659da06be7398b2dc1efebae575353d20fd34;p=thirdparty%2Fsystemd.git core: Add I/O pressure support --- diff --git a/man/org.freedesktop.systemd1.xml b/man/org.freedesktop.systemd1.xml index 027a8deeb46..76a8dd045f6 100644 --- a/man/org.freedesktop.systemd1.xml +++ b/man/org.freedesktop.systemd1.xml @@ -556,6 +556,10 @@ node /org/freedesktop/systemd1 { readonly t DefaultCPUPressureThresholdUSec = ...; @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly s DefaultCPUPressureWatch = '...'; + @org.freedesktop.DBus.Property.EmitsChangedSignal("false") + readonly t DefaultIOPressureThresholdUSec = ...; + @org.freedesktop.DBus.Property.EmitsChangedSignal("false") + readonly s DefaultIOPressureWatch = '...'; @org.freedesktop.DBus.Property.EmitsChangedSignal("const") readonly t TimerSlackNSec = ...; @org.freedesktop.DBus.Property.EmitsChangedSignal("const") @@ -801,6 +805,10 @@ node /org/freedesktop/systemd1 { + + + + @@ -1255,6 +1263,10 @@ node /org/freedesktop/systemd1 { + + + + @@ -3082,6 +3094,10 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice { @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly t CPUPressureThresholdUSec = ...; @org.freedesktop.DBus.Property.EmitsChangedSignal("false") + readonly s IOPressureWatch = '...'; + @org.freedesktop.DBus.Property.EmitsChangedSignal("false") + readonly t IOPressureThresholdUSec = ...; + @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly a(iiss) NFTSet = [...]; @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly b CoredumpReceive = ...; @@ -3755,6 +3771,10 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice { + + + + @@ -4451,6 +4471,10 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice { + + + + @@ -5354,6 +5378,10 @@ node 
/org/freedesktop/systemd1/unit/avahi_2ddaemon_2esocket { @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly t CPUPressureThresholdUSec = ...; @org.freedesktop.DBus.Property.EmitsChangedSignal("false") + readonly s IOPressureWatch = '...'; + @org.freedesktop.DBus.Property.EmitsChangedSignal("false") + readonly t IOPressureThresholdUSec = ...; + @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly a(iiss) NFTSet = [...]; @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly b CoredumpReceive = ...; @@ -6043,6 +6071,10 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2esocket { + + + + @@ -6713,6 +6745,10 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2esocket { + + + + @@ -7439,6 +7475,10 @@ node /org/freedesktop/systemd1/unit/home_2emount { @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly t CPUPressureThresholdUSec = ...; @org.freedesktop.DBus.Property.EmitsChangedSignal("false") + readonly s IOPressureWatch = '...'; + @org.freedesktop.DBus.Property.EmitsChangedSignal("false") + readonly t IOPressureThresholdUSec = ...; + @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly a(iiss) NFTSet = [...]; @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly b CoredumpReceive = ...; @@ -8052,6 +8092,10 @@ node /org/freedesktop/systemd1/unit/home_2emount { + + + + @@ -8630,6 +8674,10 @@ node /org/freedesktop/systemd1/unit/home_2emount { + + + + @@ -9489,6 +9537,10 @@ node /org/freedesktop/systemd1/unit/dev_2dsda3_2eswap { @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly t CPUPressureThresholdUSec = ...; @org.freedesktop.DBus.Property.EmitsChangedSignal("false") + readonly s IOPressureWatch = '...'; + @org.freedesktop.DBus.Property.EmitsChangedSignal("false") + readonly t IOPressureThresholdUSec = ...; + @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly a(iiss) NFTSet = [...]; @org.freedesktop.DBus.Property.EmitsChangedSignal("false") 
readonly b CoredumpReceive = ...; @@ -10084,6 +10136,10 @@ node /org/freedesktop/systemd1/unit/dev_2dsda3_2eswap { + + + + @@ -10644,6 +10700,10 @@ node /org/freedesktop/systemd1/unit/dev_2dsda3_2eswap { + + + + @@ -11356,6 +11416,10 @@ node /org/freedesktop/systemd1/unit/system_2eslice { @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly t CPUPressureThresholdUSec = ...; @org.freedesktop.DBus.Property.EmitsChangedSignal("false") + readonly s IOPressureWatch = '...'; + @org.freedesktop.DBus.Property.EmitsChangedSignal("false") + readonly t IOPressureThresholdUSec = ...; + @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly a(iiss) NFTSet = [...]; @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly b CoredumpReceive = ...; @@ -11533,6 +11597,10 @@ node /org/freedesktop/systemd1/unit/system_2eslice { + + + + @@ -11725,6 +11793,10 @@ node /org/freedesktop/systemd1/unit/system_2eslice { + + + + @@ -11940,6 +12012,10 @@ node /org/freedesktop/systemd1/unit/session_2d1_2escope { @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly t CPUPressureThresholdUSec = ...; @org.freedesktop.DBus.Property.EmitsChangedSignal("false") + readonly s IOPressureWatch = '...'; + @org.freedesktop.DBus.Property.EmitsChangedSignal("false") + readonly t IOPressureThresholdUSec = ...; + @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly a(iiss) NFTSet = [...]; @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly b CoredumpReceive = ...; @@ -12131,6 +12207,10 @@ node /org/freedesktop/systemd1/unit/session_2d1_2escope { + + + + @@ -12347,6 +12427,10 @@ node /org/freedesktop/systemd1/unit/session_2d1_2escope { + + + + @@ -12560,8 +12644,10 @@ $ gdbus introspect --system --dest org.freedesktop.systemd1 \ KillUnitSubgroup() were added in version 258. TransactionsWithOrderingCycle was added in version 259. 
DefaultMemoryZSwapWriteback, - DefaultCPUPressureThresholdUSec and - DefaultCPUPressureWatch were added in version 261. + DefaultCPUPressureThresholdUSec, + DefaultCPUPressureWatch, + DefaultIOPressureThresholdUSec, and + DefaultIOPressureWatch were added in version 261. Unit Objects @@ -12653,8 +12739,10 @@ $ gdbus introspect --system --dest org.freedesktop.systemd1 \ ExecReloadPostEx were added in version 259. BindNetworkInterface, MemoryTHP, RefreshOnReload, and RootMStack were added in version 260. - CPUPressureThresholdUSec and - CPUPressureWatch were added in version 261. + CPUPressureThresholdUSec, + CPUPressureWatch, + IOPressureThresholdUSec, and + IOPressureWatch were added in version 261. Socket Unit Objects @@ -12725,8 +12813,10 @@ $ gdbus introspect --system --dest org.freedesktop.systemd1 \ ManagedOOMKills were added in 259. BindNetworkInterface MemoryTHP, and RootMStack were added in version 260. - CPUPressureThresholdUSec and - CPUPressureWatch were added in version 261. + CPUPressureThresholdUSec, + CPUPressureWatch, + IOPressureThresholdUSec, and + IOPressureWatch were added in version 261. Mount Unit Objects @@ -12792,8 +12882,10 @@ $ gdbus introspect --system --dest org.freedesktop.systemd1 \ ManagedOOMKills were added in 259. BindNetworkInterface MemoryTHP, and RootMStack were added in version 260. - CPUPressureThresholdUSec and - CPUPressureWatch were added in version 261. + CPUPressureThresholdUSec, + CPUPressureWatch, + IOPressureThresholdUSec, and + IOPressureWatch were added in version 261. Swap Unit Objects @@ -12857,8 +12949,10 @@ $ gdbus introspect --system --dest org.freedesktop.systemd1 \ ManagedOOMKills were added in 259. BindNetworkInterface, MemoryTHP, and RootMStack were added in version 260. - CPUPressureThresholdUSec and - CPUPressureWatch were added in version 261. + CPUPressureThresholdUSec, + CPUPressureWatch, + IOPressureThresholdUSec, and + IOPressureWatch were added in version 261. 
Slice Unit Objects @@ -12892,8 +12986,10 @@ $ gdbus introspect --system --dest org.freedesktop.systemd1 \ OOMKills, and ManagedOOMKills were added in 259. BindNetworkInterface was added in version 260. - CPUPressureThresholdUSec and - CPUPressureWatch were added in version 261. + CPUPressureThresholdUSec, + CPUPressureWatch, + IOPressureThresholdUSec, and + IOPressureWatch were added in version 261. Scope Unit Objects @@ -12925,8 +13021,10 @@ $ gdbus introspect --system --dest org.freedesktop.systemd1 \ OOMKills, and ManagedOOMKills were added in 259. BindNetworkInterface was added in version 260. - CPUPressureThresholdUSec and - CPUPressureWatch were added in version 261. + CPUPressureThresholdUSec, + CPUPressureWatch, + IOPressureThresholdUSec, and + IOPressureWatch were added in version 261. Job Objects diff --git a/man/rules/meson.build b/man/rules/meson.build index 81e7ef4f882..525c56b1d3b 100644 --- a/man/rules/meson.build +++ b/man/rules/meson.build @@ -609,8 +609,11 @@ manpages = [ ['sd_event_add_memory_pressure', '3', ['sd_event_add_cpu_pressure', + 'sd_event_add_io_pressure', 'sd_event_source_set_cpu_pressure_period', 'sd_event_source_set_cpu_pressure_type', + 'sd_event_source_set_io_pressure_period', + 'sd_event_source_set_io_pressure_type', 'sd_event_source_set_memory_pressure_period', 'sd_event_source_set_memory_pressure_type', 'sd_event_trim_memory'], diff --git a/man/sd_event_add_memory_pressure.xml b/man/sd_event_add_memory_pressure.xml index 1e6b734738f..05f2ff2b745 100644 --- a/man/sd_event_add_memory_pressure.xml +++ b/man/sd_event_add_memory_pressure.xml @@ -25,7 +25,11 @@ sd_event_source_set_cpu_pressure_type sd_event_source_set_cpu_pressure_period - Add and configure an event source run as result of memory or CPU pressure + sd_event_add_io_pressure + sd_event_source_set_io_pressure_type + sd_event_source_set_io_pressure_period + + Add and configure an event source for memory, CPU, or IO pressure notifications @@ -76,6 +80,27 @@ uint64_t 
window_usec + + int sd_event_add_io_pressure + sd_event *event + sd_event_source **ret_source + sd_event_handler_t handler + void *userdata + + + + int sd_event_source_set_io_pressure_type + sd_event_source *source + const char *type + + + + int sd_event_source_set_io_pressure_period + sd_event_source *source + uint64_t threshold_usec + uint64_t window_usec + + int sd_event_trim_memory void @@ -88,19 +113,24 @@ sd_event_add_memory_pressure() adds a new event source that is triggered whenever memory pressure is seen. Similarly, - sd_event_add_cpu_pressure() adds a new event source that is triggered whenever CPU - pressure is seen. This functionality is built around the Linux kernel's sd_event_add_cpu_pressure() and sd_event_add_io_pressure() add + new event sources that are triggered whenever CPU or IO pressure is seen, respectively. This functionality + is built around the Linux kernel's Pressure Stall Information (PSI) logic. - Both functions expect an event loop object as first parameter, and return the allocated event source + These functions expect an event loop object as first parameter, and return the allocated event source object in the second parameter, on success. The handler parameter is a function to call when pressure is seen, or NULL. The handler function will be passed the userdata pointer, which may be chosen freely by the caller. The handler may return negative to signal an error (see below), other return values are ignored. If - handler is NULL, a default handler that compacts allocation - caches maintained by libsystemd as well as glibc (via malloc_trim3) - will be used. + handler is NULL, a default handler is used. For + sd_event_add_memory_pressure(), the default handler compacts allocation caches + maintained by libsystemd as well as glibc (via malloc_trim3). + For sd_event_add_cpu_pressure() and + sd_event_add_io_pressure(), the default handler is a no-op. 
It is recommended to + pass a custom handler for CPU and IO pressure to take meaningful action when pressure is + detected. To destroy an event source object use sd_event_source_unref3, @@ -110,8 +140,8 @@ sd_event_source_set_enabled3 with SD_EVENT_OFF. - If the second parameter of sd_event_add_memory_pressure() or - sd_event_add_cpu_pressure() is + If the second parameter of sd_event_add_memory_pressure(), + sd_event_add_cpu_pressure(), or sd_event_add_io_pressure() is NULL no reference to the event source object is returned. In this case, the event source is considered "floating", and will be destroyed implicitly when the event loop itself is destroyed. @@ -146,6 +176,11 @@ provides the some line, not the full line, so only some is valid when watching at the system level. + The IO pressure event source follows the same logic, but uses the + $IO_PRESSURE_WATCH/$IO_PRESSURE_WRITE environment variables, + the io.pressure cgroup file, and the system-wide PSI interface file + /proc/pressure/io instead. + Or in other words: preferably any explicit configuration passed in by an invoking service manager (or similar) is used as notification source, before falling back to local notifications of the service, and finally to global notifications of the system. @@ -189,12 +224,15 @@ Similarly, sd_event_source_set_cpu_pressure_type() and sd_event_source_set_cpu_pressure_period() can be used to fine-tune the PSI - parameters for CPU pressure notifications. They work identically to their memory pressure counterparts. + parameters for CPU pressure notifications, and + sd_event_source_set_io_pressure_type() and + sd_event_source_set_io_pressure_period() can be used to fine-tune the PSI + parameters for IO pressure notifications. They work identically to their memory pressure counterparts. The type parameter takes either some or full, and the period function takes threshold and period times in microseconds. 
The same constraints apply: these calls must - be invoked immediately after allocating the event source, and will fail if CPU pressure parameterization - has been passed in via the - $CPU_PRESSURE_WATCH/$CPU_PRESSURE_WRITE environment + be invoked immediately after allocating the event source, and will fail if pressure parameterization + has been passed in via the corresponding + $*_PRESSURE_WATCH/$*_PRESSURE_WRITE environment variables. The sd_event_trim_memory() function releases various internal allocation @@ -242,9 +280,9 @@ -EHOSTDOWN - The $MEMORY_PRESSURE_WATCH or - $CPU_PRESSURE_WATCH variable has been set to the literal - string /dev/null, in order to explicitly disable pressure + The $MEMORY_PRESSURE_WATCH, + $CPU_PRESSURE_WATCH, or $IO_PRESSURE_WATCH variable has been + set to the literal string /dev/null, in order to explicitly disable pressure handling. @@ -253,9 +291,9 @@ -EBADMSG - The $MEMORY_PRESSURE_WATCH or - $CPU_PRESSURE_WATCH variable has been set to an invalid - string, for example a relative rather than an absolute path. + The $MEMORY_PRESSURE_WATCH, + $CPU_PRESSURE_WATCH, or $IO_PRESSURE_WATCH variable has been + set to an invalid string, for example a relative rather than an absolute path. @@ -263,9 +301,9 @@ -ENOTTY - The $MEMORY_PRESSURE_WATCH or - $CPU_PRESSURE_WATCH variable points to a regular file - outside of the procfs or cgroupfs file systems. + The $MEMORY_PRESSURE_WATCH, + $CPU_PRESSURE_WATCH, or $IO_PRESSURE_WATCH variable points + to a regular file outside of the procfs or cgroupfs file systems. @@ -273,9 +311,9 @@ -EOPNOTSUPP - No configuration via $MEMORY_PRESSURE_WATCH or - $CPU_PRESSURE_WATCH has been specified and the local kernel does not support the - PSI interface. + No configuration via $MEMORY_PRESSURE_WATCH, + $CPU_PRESSURE_WATCH, or $IO_PRESSURE_WATCH has been specified + and the local kernel does not support the PSI interface. 
@@ -286,7 +324,9 @@ This is returned by sd_event_source_set_memory_pressure_type(), sd_event_source_set_memory_pressure_period(), sd_event_source_set_cpu_pressure_type(), - and sd_event_source_set_cpu_pressure_period() if invoked on event sources + sd_event_source_set_cpu_pressure_period(), + sd_event_source_set_io_pressure_type(), + and sd_event_source_set_io_pressure_period() if invoked on event sources at a time later than immediately after allocating them. @@ -329,8 +369,11 @@ sd_event_source_set_memory_pressure_period(), and sd_event_trim_memory() were added in version 254. sd_event_add_cpu_pressure(), - sd_event_source_set_cpu_pressure_type(), and - sd_event_source_set_cpu_pressure_period() were added in version 261. + sd_event_source_set_cpu_pressure_type(), + sd_event_source_set_cpu_pressure_period(), + sd_event_add_io_pressure(), + sd_event_source_set_io_pressure_type(), and + sd_event_source_set_io_pressure_period() were added in version 261. diff --git a/man/systemd-system.conf.xml b/man/systemd-system.conf.xml index 79133dc15eb..eb14cb7f307 100644 --- a/man/systemd-system.conf.xml +++ b/man/systemd-system.conf.xml @@ -340,6 +340,20 @@ + + + DefaultIOPressureWatch= + DefaultIOPressureThresholdSec= + + Configures the default settings for the per-unit + IOPressureWatch= and IOPressureThresholdSec= + settings. See + systemd.resource-control5 + for details. Defaults to auto and 200ms, respectively. This + also sets the IO pressure monitoring threshold for the service manager itself. + + + diff --git a/man/systemd.exec.xml b/man/systemd.exec.xml index 1048fcadfc3..455f666374f 100644 --- a/man/systemd.exec.xml +++ b/man/systemd.exec.xml @@ -4717,6 +4717,18 @@ StandardInputData=V2XigLJyZSBubyBzdHJhbmdlcnMgdG8gbG92ZQpZb3Uga25vdyB0aGUgcnVsZX + + $IO_PRESSURE_WATCH + $IO_PRESSURE_WRITE + + If IO pressure monitoring is enabled for this service unit, the path to watch + and the data to write into it. 
See Resource Pressure + Handling for details about these variables and the service protocol data they + convey. + + + + $FDSTORE diff --git a/man/systemd.resource-control.xml b/man/systemd.resource-control.xml index 8d9e27f3d3a..f8a2e14e1b6 100644 --- a/man/systemd.resource-control.xml +++ b/man/systemd.resource-control.xml @@ -1706,6 +1706,55 @@ DeviceAllow=/dev/loop-control + + + IOPressureWatch= + + Controls IO pressure monitoring for invoked processes. Takes a boolean or one of + auto and skip. If no, tells the service not + to watch for IO pressure events, by setting the $IO_PRESSURE_WATCH + environment variable to the literal string /dev/null. If yes, + tells the service to watch for IO pressure events. This enables IO accounting for the + service, and ensures the io.pressure cgroup attribute file is accessible for + reading and writing by the service's user. It then sets the $IO_PRESSURE_WATCH + environment variable for processes invoked by the unit to the file system path to this file. The + threshold information configured with IOPressureThresholdSec= is encoded in + the $IO_PRESSURE_WRITE environment variable. If the auto + value is set the protocol is enabled if IO accounting is anyway enabled for the unit (e.g. because + IOWeight= or IODeviceWeight= is set), and + disabled otherwise. If set to skip the logic is neither enabled, nor disabled and + the two environment variables are not set. + + Note that services are free to use the two environment variables, but it is unproblematic if + they ignore them. IO pressure handling must be implemented individually in each service, and + usually means different things for different software. + + Services implemented using + sd-event3 may use + sd_event_add_io_pressure3 + to watch for and handle IO pressure events. + + If not explicitly set, defaults to the DefaultIOPressureWatch= setting in + systemd-system.conf5. 
+ + + + + + IOPressureThresholdSec= + + Sets the IO pressure threshold time for IO pressure monitor as configured via + IOPressureWatch=. Specifies the maximum IO stall time before an IO + pressure event is signalled to the service, per 2s window. If not specified, defaults to the + DefaultIOPressureThresholdSec= setting in + systemd-system.conf5 + (which in turn defaults to 200ms). The specified value expects a time unit such as + ms or μs, see + systemd.time7 for + details on the permitted syntax. + + + Coredump Control diff --git a/src/basic/psi-util.c b/src/basic/psi-util.c index cf05485dc7b..f2a93e674f0 100644 --- a/src/basic/psi-util.c +++ b/src/basic/psi-util.c @@ -116,11 +116,17 @@ const PressureResourceInfo pressure_resource_info[_PRESSURE_RESOURCE_MAX] = { .env_watch = "CPU_PRESSURE_WATCH", .env_write = "CPU_PRESSURE_WRITE", }, + [PRESSURE_IO] = { + .name = "io", + .env_watch = "IO_PRESSURE_WATCH", + .env_write = "IO_PRESSURE_WRITE", + }, }; static const char* const pressure_resource_table[_PRESSURE_RESOURCE_MAX] = { [PRESSURE_MEMORY] = "memory", [PRESSURE_CPU] = "cpu", + [PRESSURE_IO] = "io", }; DEFINE_STRING_TABLE_LOOKUP(pressure_resource, PressureResource); diff --git a/src/basic/psi-util.h b/src/basic/psi-util.h index b8737b8976b..8716767ca59 100644 --- a/src/basic/psi-util.h +++ b/src/basic/psi-util.h @@ -12,6 +12,7 @@ typedef enum PressureType { typedef enum PressureResource { PRESSURE_MEMORY, PRESSURE_CPU, + PRESSURE_IO, _PRESSURE_RESOURCE_MAX, _PRESSURE_RESOURCE_INVALID = -EINVAL, } PressureResource; diff --git a/src/core/cgroup.c b/src/core/cgroup.c index a9982de659f..c64521a7e65 100644 --- a/src/core/cgroup.c +++ b/src/core/cgroup.c @@ -188,6 +188,7 @@ void cgroup_context_init(CGroupContext *c) { .pressure = { [PRESSURE_MEMORY] = { .watch = _CGROUP_PRESSURE_WATCH_INVALID, .threshold_usec = USEC_INFINITY }, [PRESSURE_CPU] = { .watch = _CGROUP_PRESSURE_WATCH_INVALID, .threshold_usec = USEC_INFINITY }, + [PRESSURE_IO] = { .watch = 
_CGROUP_PRESSURE_WATCH_INVALID, .threshold_usec = USEC_INFINITY }, }, }; } @@ -531,6 +532,7 @@ void cgroup_context_dump(Unit *u, FILE* f, const char *prefix) { "%sManagedOOMPreference: %s\n" "%sMemoryPressureWatch: %s\n" "%sCPUPressureWatch: %s\n" + "%sIOPressureWatch: %s\n" "%sCoredumpReceive: %s\n", prefix, yes_no(c->io_accounting), prefix, yes_no(c->memory_accounting), @@ -568,6 +570,7 @@ void cgroup_context_dump(Unit *u, FILE* f, const char *prefix) { prefix, managed_oom_preference_to_string(c->moom_preference), prefix, cgroup_pressure_watch_to_string(c->pressure[PRESSURE_MEMORY].watch), prefix, cgroup_pressure_watch_to_string(c->pressure[PRESSURE_CPU].watch), + prefix, cgroup_pressure_watch_to_string(c->pressure[PRESSURE_IO].watch), prefix, yes_no(c->coredump_receive)); if (c->delegate_subgroup) @@ -586,6 +589,10 @@ void cgroup_context_dump(Unit *u, FILE* f, const char *prefix) { fprintf(f, "%sCPUPressureThresholdSec: %s\n", prefix, FORMAT_TIMESPAN(c->pressure[PRESSURE_CPU].threshold_usec, 1)); + if (c->pressure[PRESSURE_IO].threshold_usec != USEC_INFINITY) + fprintf(f, "%sIOPressureThresholdSec: %s\n", + prefix, FORMAT_TIMESPAN(c->pressure[PRESSURE_IO].threshold_usec, 1)); + if (c->moom_mem_pressure_duration_usec != USEC_INFINITY) fprintf(f, "%sManagedOOMMemoryPressureDurationSec: %s\n", prefix, FORMAT_TIMESPAN(c->moom_mem_pressure_duration_usec, 1)); diff --git a/src/core/cgroup.h b/src/core/cgroup.h index c4a22765678..ce98f4ba7cd 100644 --- a/src/core/cgroup.h +++ b/src/core/cgroup.h @@ -376,6 +376,14 @@ static inline bool cgroup_context_want_pressure(const CGroupContext *c, Pressure c->startup_cpu_weight != CGROUP_WEIGHT_INVALID || c->cpu_quota_per_sec_usec != USEC_INFINITY; + case PRESSURE_IO: + return c->io_accounting || + c->io_weight != CGROUP_WEIGHT_INVALID || + c->startup_io_weight != CGROUP_WEIGHT_INVALID || + c->io_device_weights || + c->io_device_latencies || + c->io_device_limits; + default: assert_not_reached(); } diff --git 
a/src/core/dbus-cgroup.c b/src/core/dbus-cgroup.c index c5a3302e08e..927c133dd9e 100644 --- a/src/core/dbus-cgroup.c +++ b/src/core/dbus-cgroup.c @@ -431,6 +431,8 @@ const sd_bus_vtable bus_cgroup_vtable[] = { SD_BUS_PROPERTY("MemoryPressureThresholdUSec", "t", bus_property_get_usec, offsetof(CGroupContext, pressure[PRESSURE_MEMORY].threshold_usec), 0), SD_BUS_PROPERTY("CPUPressureWatch", "s", bus_property_get_cgroup_pressure_watch, offsetof(CGroupContext, pressure[PRESSURE_CPU].watch), 0), SD_BUS_PROPERTY("CPUPressureThresholdUSec", "t", bus_property_get_usec, offsetof(CGroupContext, pressure[PRESSURE_CPU].threshold_usec), 0), + SD_BUS_PROPERTY("IOPressureWatch", "s", bus_property_get_cgroup_pressure_watch, offsetof(CGroupContext, pressure[PRESSURE_IO].watch), 0), + SD_BUS_PROPERTY("IOPressureThresholdUSec", "t", bus_property_get_usec, offsetof(CGroupContext, pressure[PRESSURE_IO].threshold_usec), 0), SD_BUS_PROPERTY("NFTSet", "a(iiss)", property_get_cgroup_nft_set, 0, 0), SD_BUS_PROPERTY("CoredumpReceive", "b", bus_property_get_bool, offsetof(CGroupContext, coredump_receive), 0), @@ -714,11 +716,12 @@ static int bus_cgroup_set_transient_property( return 1; - } else if (STR_IN_SET(name, "MemoryPressureWatch", "CPUPressureWatch")) { + } else if (STR_IN_SET(name, "MemoryPressureWatch", "CPUPressureWatch", "IOPressureWatch")) { CGroupPressureWatch p; const char *t; - PressureResource pt = streq(name, "MemoryPressureWatch") ? PRESSURE_MEMORY : PRESSURE_CPU; + PressureResource pt = streq(name, "MemoryPressureWatch") ? PRESSURE_MEMORY : + streq(name, "CPUPressureWatch") ? 
PRESSURE_CPU : PRESSURE_IO; r = sd_bus_message_read(message, "s", &t); if (r < 0) @@ -739,10 +742,11 @@ static int bus_cgroup_set_transient_property( return 1; - } else if (STR_IN_SET(name, "MemoryPressureThresholdUSec", "CPUPressureThresholdUSec")) { + } else if (STR_IN_SET(name, "MemoryPressureThresholdUSec", "CPUPressureThresholdUSec", "IOPressureThresholdUSec")) { uint64_t t; - PressureResource pt = streq(name, "MemoryPressureThresholdUSec") ? PRESSURE_MEMORY : PRESSURE_CPU; + PressureResource pt = streq(name, "MemoryPressureThresholdUSec") ? PRESSURE_MEMORY : + streq(name, "CPUPressureThresholdUSec") ? PRESSURE_CPU : PRESSURE_IO; r = sd_bus_message_read(message, "t", &t); if (r < 0) diff --git a/src/core/dbus-manager.c b/src/core/dbus-manager.c index 23f4c4c3de8..78cab48f852 100644 --- a/src/core/dbus-manager.c +++ b/src/core/dbus-manager.c @@ -2984,6 +2984,8 @@ const sd_bus_vtable bus_manager_vtable[] = { SD_BUS_PROPERTY("DefaultMemoryPressureWatch", "s", bus_property_get_cgroup_pressure_watch, offsetof(Manager, defaults.pressure[PRESSURE_MEMORY].watch), 0), SD_BUS_PROPERTY("DefaultCPUPressureThresholdUSec", "t", bus_property_get_usec, offsetof(Manager, defaults.pressure[PRESSURE_CPU].threshold_usec), 0), SD_BUS_PROPERTY("DefaultCPUPressureWatch", "s", bus_property_get_cgroup_pressure_watch, offsetof(Manager, defaults.pressure[PRESSURE_CPU].watch), 0), + SD_BUS_PROPERTY("DefaultIOPressureThresholdUSec", "t", bus_property_get_usec, offsetof(Manager, defaults.pressure[PRESSURE_IO].threshold_usec), 0), + SD_BUS_PROPERTY("DefaultIOPressureWatch", "s", bus_property_get_cgroup_pressure_watch, offsetof(Manager, defaults.pressure[PRESSURE_IO].watch), 0), SD_BUS_PROPERTY("TimerSlackNSec", "t", property_get_timer_slack_nsec, 0, SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("DefaultOOMPolicy", "s", bus_property_get_oom_policy, offsetof(Manager, defaults.oom_policy), SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("DefaultOOMScoreAdjust", "i", 
property_get_oom_score_adjust, 0, SD_BUS_VTABLE_PROPERTY_CONST), diff --git a/src/core/execute-serialize.c b/src/core/execute-serialize.c index d3d23500a91..143cfe6286b 100644 --- a/src/core/execute-serialize.c +++ b/src/core/execute-serialize.c @@ -287,6 +287,10 @@ static int exec_cgroup_context_serialize(const CGroupContext *c, FILE *f) { if (r < 0) return r; + r = serialize_item(f, "exec-cgroup-context-io-pressure-watch", cgroup_pressure_watch_to_string(c->pressure[PRESSURE_IO].watch)); + if (r < 0) + return r; + r = serialize_item(f, "exec-cgroup-context-delegate-subgroup", c->delegate_subgroup); if (r < 0) return r; @@ -303,6 +307,12 @@ static int exec_cgroup_context_serialize(const CGroupContext *c, FILE *f) { return r; } + if (c->pressure[PRESSURE_IO].threshold_usec != USEC_INFINITY) { + r = serialize_usec(f, "exec-cgroup-context-io-pressure-threshold-usec", c->pressure[PRESSURE_IO].threshold_usec); + if (r < 0) + return r; + } + LIST_FOREACH(device_allow, a, c->device_allow) { r = serialize_item_format(f, "exec-cgroup-context-device-allow", "%s %s", a->path, @@ -638,6 +648,10 @@ static int exec_cgroup_context_deserialize(CGroupContext *c, FILE *f) { c->pressure[PRESSURE_CPU].watch = cgroup_pressure_watch_from_string(val); if (c->pressure[PRESSURE_CPU].watch < 0) return -EINVAL; + } else if ((val = startswith(l, "exec-cgroup-context-io-pressure-watch="))) { + c->pressure[PRESSURE_IO].watch = cgroup_pressure_watch_from_string(val); + if (c->pressure[PRESSURE_IO].watch < 0) + return -EINVAL; } else if ((val = startswith(l, "exec-cgroup-context-delegate-subgroup="))) { r = free_and_strdup(&c->delegate_subgroup, val); if (r < 0) @@ -650,6 +664,10 @@ static int exec_cgroup_context_deserialize(CGroupContext *c, FILE *f) { r = deserialize_usec(val, &c->pressure[PRESSURE_CPU].threshold_usec); if (r < 0) return r; + } else if ((val = startswith(l, "exec-cgroup-context-io-pressure-threshold-usec="))) { + r = deserialize_usec(val, 
&c->pressure[PRESSURE_IO].threshold_usec); + if (r < 0) + return r; } else if ((val = startswith(l, "exec-cgroup-context-device-allow="))) { _cleanup_free_ char *path = NULL, *rwm = NULL; CGroupDevicePermissions p; diff --git a/src/core/load-fragment-gperf.gperf.in b/src/core/load-fragment-gperf.gperf.in index 297836def17..17ac9c5138b 100644 --- a/src/core/load-fragment-gperf.gperf.in +++ b/src/core/load-fragment-gperf.gperf.in @@ -280,6 +280,8 @@ {{type}}.MemoryPressureWatch, config_parse_pressure_watch, 0, offsetof({{type}}, cgroup_context.pressure[PRESSURE_MEMORY].watch) {{type}}.CPUPressureThresholdSec, config_parse_sec, 0, offsetof({{type}}, cgroup_context.pressure[PRESSURE_CPU].threshold_usec) {{type}}.CPUPressureWatch, config_parse_pressure_watch, 0, offsetof({{type}}, cgroup_context.pressure[PRESSURE_CPU].watch) +{{type}}.IOPressureThresholdSec, config_parse_sec, 0, offsetof({{type}}, cgroup_context.pressure[PRESSURE_IO].threshold_usec) +{{type}}.IOPressureWatch, config_parse_pressure_watch, 0, offsetof({{type}}, cgroup_context.pressure[PRESSURE_IO].watch) {{type}}.NFTSet, config_parse_cgroup_nft_set, NFT_SET_PARSE_CGROUP, offsetof({{type}}, cgroup_context) {{type}}.CoredumpReceive, config_parse_bool, 0, offsetof({{type}}, cgroup_context.coredump_receive) {{type}}.BindNetworkInterface, config_parse_bind_network_interface, 0, offsetof({{type}}, cgroup_context) diff --git a/src/core/main.c b/src/core/main.c index 7fcd0fa672d..655f0ac6659 100644 --- a/src/core/main.c +++ b/src/core/main.c @@ -821,6 +821,8 @@ static int parse_config_file(void) { { "Manager", "DefaultMemoryPressureWatch", config_parse_pressure_watch, 0, &arg_defaults.pressure[PRESSURE_MEMORY].watch }, { "Manager", "DefaultCPUPressureThresholdSec", config_parse_sec, 0, &arg_defaults.pressure[PRESSURE_CPU].threshold_usec }, { "Manager", "DefaultCPUPressureWatch", config_parse_pressure_watch, 0, &arg_defaults.pressure[PRESSURE_CPU].watch }, + { "Manager", "DefaultIOPressureThresholdSec", 
config_parse_sec, 0, &arg_defaults.pressure[PRESSURE_IO].threshold_usec }, + { "Manager", "DefaultIOPressureWatch", config_parse_pressure_watch, 0, &arg_defaults.pressure[PRESSURE_IO].watch }, { "Manager", "CtrlAltDelBurstAction", config_parse_emergency_action, arg_runtime_scope, &arg_cad_burst_action }, { "Manager", "DefaultOOMPolicy", config_parse_oom_policy, 0, &arg_defaults.oom_policy }, { "Manager", "DefaultOOMScoreAdjust", config_parse_oom_score_adjust, 0, NULL }, diff --git a/src/core/manager.c b/src/core/manager.c index c71d1a5d69a..73368ec18ae 100644 --- a/src/core/manager.c +++ b/src/core/manager.c @@ -621,6 +621,8 @@ static char** sanitize_environment(char **l) { "CREDENTIALS_DIRECTORY", "EXIT_CODE", "EXIT_STATUS", + "IO_PRESSURE_WATCH", + "IO_PRESSURE_WRITE", "INVOCATION_ID", "JOURNAL_STREAM", "LISTEN_FDNAMES", @@ -807,6 +809,7 @@ static const struct { } pressure_dispatch_table[_PRESSURE_RESOURCE_MAX] = { [PRESSURE_MEMORY] = { sd_event_add_memory_pressure, sd_event_source_set_memory_pressure_period }, [PRESSURE_CPU] = { sd_event_add_cpu_pressure, sd_event_source_set_cpu_pressure_period }, + [PRESSURE_IO] = { sd_event_add_io_pressure, sd_event_source_set_io_pressure_period }, }; int manager_setup_pressure_event_source(Manager *m, PressureResource t) { @@ -5213,6 +5216,7 @@ void unit_defaults_init(UnitDefaults *defaults, RuntimeScope scope) { .pressure = { [PRESSURE_MEMORY] = { .watch = CGROUP_PRESSURE_WATCH_AUTO, .threshold_usec = PRESSURE_DEFAULT_THRESHOLD_USEC }, [PRESSURE_CPU] = { .watch = CGROUP_PRESSURE_WATCH_AUTO, .threshold_usec = PRESSURE_DEFAULT_THRESHOLD_USEC }, + [PRESSURE_IO] = { .watch = CGROUP_PRESSURE_WATCH_AUTO, .threshold_usec = PRESSURE_DEFAULT_THRESHOLD_USEC }, }, .oom_policy = OOM_STOP, diff --git a/src/core/system.conf.in b/src/core/system.conf.in index d3cb0160a01..63d28059305 100644 --- a/src/core/system.conf.in +++ b/src/core/system.conf.in @@ -80,6 +80,8 @@ #DefaultMemoryPressureWatch=auto #DefaultCPUPressureThresholdSec=200ms 
#DefaultCPUPressureWatch=auto +#DefaultIOPressureThresholdSec=200ms +#DefaultIOPressureWatch=auto #DefaultOOMPolicy=stop #DefaultSmackProcessLabel= #DefaultRestrictSUIDSGID= diff --git a/src/core/user.conf.in b/src/core/user.conf.in index fe45c00b74e..33c6733268c 100644 --- a/src/core/user.conf.in +++ b/src/core/user.conf.in @@ -56,6 +56,8 @@ #DefaultMemoryPressureWatch=auto #DefaultCPUPressureThresholdSec=200ms #DefaultCPUPressureWatch=auto +#DefaultIOPressureThresholdSec=200ms +#DefaultIOPressureWatch=auto #DefaultSmackProcessLabel= #DefaultRestrictSUIDSGID= #ReloadLimitIntervalSec= diff --git a/src/core/varlink-cgroup.c b/src/core/varlink-cgroup.c index d4ec6049e66..ab32def28b7 100644 --- a/src/core/varlink-cgroup.c +++ b/src/core/varlink-cgroup.c @@ -327,6 +327,8 @@ int unit_cgroup_context_build_json(sd_json_variant **ret, const char *name, void JSON_BUILD_PAIR_FINITE_USEC("MemoryPressureThresholdUSec", c->pressure[PRESSURE_MEMORY].threshold_usec), SD_JSON_BUILD_PAIR_STRING("CPUPressureWatch", cgroup_pressure_watch_to_string(c->pressure[PRESSURE_CPU].watch)), JSON_BUILD_PAIR_FINITE_USEC("CPUPressureThresholdUSec", c->pressure[PRESSURE_CPU].threshold_usec), + SD_JSON_BUILD_PAIR_STRING("IOPressureWatch", cgroup_pressure_watch_to_string(c->pressure[PRESSURE_IO].watch)), + JSON_BUILD_PAIR_FINITE_USEC("IOPressureThresholdUSec", c->pressure[PRESSURE_IO].threshold_usec), /* Others */ SD_JSON_BUILD_PAIR_BOOLEAN("CoredumpReceive", c->coredump_receive)); diff --git a/src/core/varlink-manager.c b/src/core/varlink-manager.c index 3953b8619f7..997bdc08d01 100644 --- a/src/core/varlink-manager.c +++ b/src/core/varlink-manager.c @@ -110,6 +110,8 @@ static int manager_context_build_json(sd_json_variant **ret, const char *name, v SD_JSON_BUILD_PAIR_STRING("DefaultMemoryPressureWatch", cgroup_pressure_watch_to_string(m->defaults.pressure[PRESSURE_MEMORY].watch)), JSON_BUILD_PAIR_FINITE_USEC("DefaultCPUPressureThresholdUSec", m->defaults.pressure[PRESSURE_CPU].threshold_usec), 
SD_JSON_BUILD_PAIR_STRING("DefaultCPUPressureWatch", cgroup_pressure_watch_to_string(m->defaults.pressure[PRESSURE_CPU].watch)), + JSON_BUILD_PAIR_FINITE_USEC("DefaultIOPressureThresholdUSec", m->defaults.pressure[PRESSURE_IO].threshold_usec), + SD_JSON_BUILD_PAIR_STRING("DefaultIOPressureWatch", cgroup_pressure_watch_to_string(m->defaults.pressure[PRESSURE_IO].watch)), JSON_BUILD_PAIR_FINITE_USEC("RuntimeWatchdogUSec", manager_get_watchdog(m, WATCHDOG_RUNTIME)), JSON_BUILD_PAIR_FINITE_USEC("RebootWatchdogUSec", manager_get_watchdog(m, WATCHDOG_REBOOT)), JSON_BUILD_PAIR_FINITE_USEC("KExecWatchdogUSec", manager_get_watchdog(m, WATCHDOG_KEXEC)), diff --git a/src/libsystemd/libsystemd.sym b/src/libsystemd/libsystemd.sym index 5f5eca60833..38ab92dea12 100644 --- a/src/libsystemd/libsystemd.sym +++ b/src/libsystemd/libsystemd.sym @@ -1099,4 +1099,7 @@ global: sd_event_add_cpu_pressure; sd_event_source_set_cpu_pressure_type; sd_event_source_set_cpu_pressure_period; + sd_event_add_io_pressure; + sd_event_source_set_io_pressure_type; + sd_event_source_set_io_pressure_period; } LIBSYSTEMD_260; diff --git a/src/libsystemd/sd-event/event-source.h b/src/libsystemd/sd-event/event-source.h index c7d5ba166da..8487c966ab4 100644 --- a/src/libsystemd/sd-event/event-source.h +++ b/src/libsystemd/sd-event/event-source.h @@ -27,6 +27,7 @@ typedef enum EventSourceType { SOURCE_INOTIFY, SOURCE_MEMORY_PRESSURE, SOURCE_CPU_PRESSURE, + SOURCE_IO_PRESSURE, _SOURCE_EVENT_SOURCE_TYPE_MAX, _SOURCE_EVENT_SOURCE_TYPE_INVALID = -EINVAL, } EventSourceType; diff --git a/src/libsystemd/sd-event/sd-event.c b/src/libsystemd/sd-event/sd-event.c index aba6bf9b478..9256ddd81bf 100644 --- a/src/libsystemd/sd-event/sd-event.c +++ b/src/libsystemd/sd-event/sd-event.c @@ -77,6 +77,7 @@ static const char* const event_source_type_table[_SOURCE_EVENT_SOURCE_TYPE_MAX] [SOURCE_INOTIFY] = "inotify", [SOURCE_MEMORY_PRESSURE] = "memory-pressure", [SOURCE_CPU_PRESSURE] = "cpu-pressure", + [SOURCE_IO_PRESSURE] = 
"io-pressure", }; DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(event_source_type, int); @@ -101,7 +102,8 @@ DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(event_source_type, int); SOURCE_DEFER, \ SOURCE_INOTIFY, \ SOURCE_MEMORY_PRESSURE, \ - SOURCE_CPU_PRESSURE) + SOURCE_CPU_PRESSURE, \ + SOURCE_IO_PRESSURE) /* This is used to assert that we didn't pass an unexpected source type to event_source_time_prioq_put(). * Time sources and ratelimited sources can be passed, so effectively this is the same as the @@ -566,7 +568,7 @@ static int source_child_pidfd_register(sd_event_source *s, int enabled) { return 0; } -#define EVENT_SOURCE_IS_PRESSURE(s) IN_SET((s)->type, SOURCE_MEMORY_PRESSURE, SOURCE_CPU_PRESSURE) +#define EVENT_SOURCE_IS_PRESSURE(s) IN_SET((s)->type, SOURCE_MEMORY_PRESSURE, SOURCE_CPU_PRESSURE, SOURCE_IO_PRESSURE) static void source_pressure_unregister(sd_event_source *s) { assert(s); @@ -1052,6 +1054,7 @@ static void source_disconnect(sd_event_source *s) { case SOURCE_MEMORY_PRESSURE: case SOURCE_CPU_PRESSURE: + case SOURCE_IO_PRESSURE: source_pressure_remove_from_write_list(s); source_pressure_unregister(s); break; @@ -1198,6 +1201,7 @@ static sd_event_source* source_new(sd_event *e, bool floating, EventSourceType t [SOURCE_INOTIFY] = endoffsetof_field(sd_event_source, inotify), [SOURCE_MEMORY_PRESSURE] = endoffsetof_field(sd_event_source, pressure), [SOURCE_CPU_PRESSURE] = endoffsetof_field(sd_event_source, pressure), + [SOURCE_IO_PRESSURE] = endoffsetof_field(sd_event_source, pressure), }; sd_event_source *s; @@ -2110,8 +2114,8 @@ static int event_add_pressure( * fd with the epoll right-away. Instead, we just add the event source to a list of pressure event * sources on which writes must be executed before the first event loop iteration is executed. 
(We * could also write the data here, right away, but we want to give the caller the freedom to call - * sd_event_source_set_{memory,cpu}_pressure_type() and - * sd_event_source_set_{memory,cpu}_pressure_period() before we write it. */ + * sd_event_source_set_{memory,cpu,io}_pressure_type() and + * sd_event_source_set_{memory,cpu,io}_pressure_period() before we write it. */ if (s->pressure.write_buffer_size > 0) source_pressure_add_to_write_list(s); @@ -2160,6 +2164,25 @@ _public_ int sd_event_add_cpu_pressure( PRESSURE_CPU); } +static int io_pressure_callback(sd_event_source *s, void *userdata) { + assert(s); + + return 0; +} + +_public_ int sd_event_add_io_pressure( + sd_event *e, + sd_event_source **ret, + sd_event_handler_t callback, + void *userdata) { + + return event_add_pressure( + e, ret, callback, userdata, + SOURCE_IO_PRESSURE, + io_pressure_callback, + PRESSURE_IO); +} + static void event_free_inotify_data(sd_event *e, InotifyData *d) { assert(e); @@ -2962,6 +2985,7 @@ static int event_source_offline( case SOURCE_MEMORY_PRESSURE: case SOURCE_CPU_PRESSURE: + case SOURCE_IO_PRESSURE: source_pressure_unregister(s); break; @@ -3054,6 +3078,7 @@ static int event_source_online( case SOURCE_MEMORY_PRESSURE: case SOURCE_CPU_PRESSURE: + case SOURCE_IO_PRESSURE: /* As documented in sd_event_add_{memory,cpu,io}_pressure(), we can only register the PSI fd * with epoll after writing the watch string. 
*/ if (s->pressure.write_buffer_size == 0) { @@ -4308,6 +4333,7 @@ static int source_dispatch(sd_event_source *s) { case SOURCE_MEMORY_PRESSURE: case SOURCE_CPU_PRESSURE: + case SOURCE_IO_PRESSURE: r = s->pressure.callback(s, s->userdata); break; @@ -4723,6 +4749,7 @@ static int process_epoll(sd_event *e, usec_t timeout, int64_t threshold, int64_t case SOURCE_MEMORY_PRESSURE: case SOURCE_CPU_PRESSURE: + case SOURCE_IO_PRESSURE: r = process_pressure(s, i->events); break; @@ -5418,6 +5445,13 @@ _public_ int sd_event_source_set_cpu_pressure_type(sd_event_source *s, const cha return event_source_set_pressure_type(s, ty); } +_public_ int sd_event_source_set_io_pressure_type(sd_event_source *s, const char *ty) { + assert_return(s, -EINVAL); + assert_return(s->type == SOURCE_IO_PRESSURE, -EDOM); + + return event_source_set_pressure_type(s, ty); +} + static int event_source_set_pressure_period(sd_event_source *s, uint64_t threshold_usec, uint64_t window_usec) { _cleanup_free_ char *b = NULL; _cleanup_free_ void *w = NULL; @@ -5478,3 +5512,10 @@ _public_ int sd_event_source_set_cpu_pressure_period(sd_event_source *s, uint64_ return event_source_set_pressure_period(s, threshold_usec, window_usec); } + +_public_ int sd_event_source_set_io_pressure_period(sd_event_source *s, uint64_t threshold_usec, uint64_t window_usec) { + assert_return(s, -EINVAL); + assert_return(s->type == SOURCE_IO_PRESSURE, -EDOM); + + return event_source_set_pressure_period(s, threshold_usec, window_usec); +} diff --git a/src/shared/bus-unit-util.c b/src/shared/bus-unit-util.c index 9c732543fac..1a6bc7370f8 100644 --- a/src/shared/bus-unit-util.c +++ b/src/shared/bus-unit-util.c @@ -2384,6 +2384,7 @@ static const BusProperty cgroup_properties[] = { { "ManagedOOMPreference", bus_append_string }, { "MemoryPressureWatch", bus_append_string }, { "CPUPressureWatch", bus_append_string }, + { "IOPressureWatch", bus_append_string }, { "DelegateSubgroup", bus_append_string }, { "ManagedOOMMemoryPressureLimit", 
bus_append_parse_permyriad }, { "MemoryAccounting", bus_append_parse_boolean }, @@ -2423,6 +2424,7 @@ static const BusProperty cgroup_properties[] = { { "SocketBindDeny", bus_append_socket_filter }, { "MemoryPressureThresholdSec", bus_append_parse_sec_rename }, { "CPUPressureThresholdSec", bus_append_parse_sec_rename }, + { "IOPressureThresholdSec", bus_append_parse_sec_rename }, { "NFTSet", bus_append_nft_set }, { "BindNetworkInterface", bus_append_string }, diff --git a/src/shared/varlink-io.systemd.Manager.c b/src/shared/varlink-io.systemd.Manager.c index f947c0a0561..9ce1b8350ab 100644 --- a/src/shared/varlink-io.systemd.Manager.c +++ b/src/shared/varlink-io.systemd.Manager.c @@ -68,6 +68,10 @@ static SD_VARLINK_DEFINE_STRUCT_TYPE( SD_VARLINK_DEFINE_FIELD(DefaultCPUPressureThresholdUSec, SD_VARLINK_INT, 0), SD_VARLINK_FIELD_COMMENT("https://www.freedesktop.org/software/systemd/man/"PROJECT_VERSION_STR"/systemd-system.conf.html#DefaultCPUPressureWatch="), SD_VARLINK_DEFINE_FIELD(DefaultCPUPressureWatch, SD_VARLINK_STRING, 0), + SD_VARLINK_FIELD_COMMENT("https://www.freedesktop.org/software/systemd/man/"PROJECT_VERSION_STR"/systemd-system.conf.html#DefaultIOPressureThresholdSec="), + SD_VARLINK_DEFINE_FIELD(DefaultIOPressureThresholdUSec, SD_VARLINK_INT, 0), + SD_VARLINK_FIELD_COMMENT("https://www.freedesktop.org/software/systemd/man/"PROJECT_VERSION_STR"/systemd-system.conf.html#DefaultIOPressureWatch="), + SD_VARLINK_DEFINE_FIELD(DefaultIOPressureWatch, SD_VARLINK_STRING, 0), SD_VARLINK_FIELD_COMMENT("https://www.freedesktop.org/software/systemd/man/"PROJECT_VERSION_STR"/systemd-system.conf.html#RuntimeWatchdogSec="), SD_VARLINK_DEFINE_FIELD(RuntimeWatchdogUSec, SD_VARLINK_INT, SD_VARLINK_NULLABLE), SD_VARLINK_FIELD_COMMENT("https://www.freedesktop.org/software/systemd/man/"PROJECT_VERSION_STR"/systemd-system.conf.html#RebootWatchdogSec="), diff --git a/src/shared/varlink-io.systemd.Unit.c b/src/shared/varlink-io.systemd.Unit.c index a230f29daba..c1ff4ebc5a7
100644 --- a/src/shared/varlink-io.systemd.Unit.c +++ b/src/shared/varlink-io.systemd.Unit.c @@ -232,6 +232,10 @@ static SD_VARLINK_DEFINE_STRUCT_TYPE( SD_VARLINK_DEFINE_FIELD(CPUPressureWatch, SD_VARLINK_STRING, 0), SD_VARLINK_FIELD_COMMENT("https://www.freedesktop.org/software/systemd/man/"PROJECT_VERSION_STR"/systemd.resource-control.html#CPUPressureThresholdSec="), SD_VARLINK_DEFINE_FIELD(CPUPressureThresholdUSec, SD_VARLINK_INT, SD_VARLINK_NULLABLE), + SD_VARLINK_FIELD_COMMENT("https://www.freedesktop.org/software/systemd/man/"PROJECT_VERSION_STR"/systemd.resource-control.html#IOPressureWatch="), + SD_VARLINK_DEFINE_FIELD(IOPressureWatch, SD_VARLINK_STRING, 0), + SD_VARLINK_FIELD_COMMENT("https://www.freedesktop.org/software/systemd/man/"PROJECT_VERSION_STR"/systemd.resource-control.html#IOPressureThresholdSec="), + SD_VARLINK_DEFINE_FIELD(IOPressureThresholdUSec, SD_VARLINK_INT, SD_VARLINK_NULLABLE), /* Others */ SD_VARLINK_FIELD_COMMENT("Reflects whether to forward coredumps for processes that crash within this cgroup"), diff --git a/src/systemd/sd-event.h b/src/systemd/sd-event.h index 71fc9504889..34bd396080d 100644 --- a/src/systemd/sd-event.h +++ b/src/systemd/sd-event.h @@ -98,6 +98,7 @@ int sd_event_add_post(sd_event *e, sd_event_source **ret, sd_event_handler_t cal int sd_event_add_exit(sd_event *e, sd_event_source **ret, sd_event_handler_t callback, void *userdata); int sd_event_add_memory_pressure(sd_event *e, sd_event_source **ret, sd_event_handler_t callback, void *userdata); int sd_event_add_cpu_pressure(sd_event *e, sd_event_source **ret, sd_event_handler_t callback, void *userdata); +int sd_event_add_io_pressure(sd_event *e, sd_event_source **ret, sd_event_handler_t callback, void *userdata); int sd_event_prepare(sd_event *e); int sd_event_wait(sd_event *e, uint64_t timeout); @@ -165,6 +166,8 @@ int sd_event_source_set_memory_pressure_type(sd_event_source *s, const char *ty) int sd_event_source_set_memory_pressure_period(sd_event_source *s, 
uint64_t threshold_usec, uint64_t window_usec); int sd_event_source_set_cpu_pressure_type(sd_event_source *s, const char *ty); int sd_event_source_set_cpu_pressure_period(sd_event_source *s, uint64_t threshold_usec, uint64_t window_usec); +int sd_event_source_set_io_pressure_type(sd_event_source *s, const char *ty); +int sd_event_source_set_io_pressure_period(sd_event_source *s, uint64_t threshold_usec, uint64_t window_usec); int sd_event_source_set_destroy_callback(sd_event_source *s, sd_event_destroy_t callback); int sd_event_source_get_destroy_callback(sd_event_source *s, sd_event_destroy_t *ret); int sd_event_source_get_floating(sd_event_source *s); diff --git a/src/test/test-pressure.c b/src/test/test-pressure.c index 44ff810753e..318b73e4fd6 100644 --- a/src/test/test-pressure.c +++ b/src/test/test-pressure.c @@ -154,6 +154,14 @@ TEST(fake_cpu_pressure) { test_fake_pressure("cpu", fake_cpu_pressure_wrapper); } +static int fake_io_pressure_wrapper(sd_event *e, sd_event_source **ret, sd_event_handler_t callback, void *userdata) { + return sd_event_add_io_pressure(e, ret, callback, userdata); +} + +TEST(fake_io_pressure) { + test_fake_pressure("io", fake_io_pressure_wrapper); +} + /* Shared infrastructure for real pressure tests */ struct real_pressure_context { @@ -452,7 +460,142 @@ TEST(real_cpu_pressure) { ASSERT_EQ(ex, 31); } +/* IO pressure real test */ + +static int real_io_pressure_callback(sd_event_source *s, void *userdata) { + struct real_pressure_context *c = ASSERT_PTR(userdata); + const char *d; + + ASSERT_NOT_NULL(s); + ASSERT_OK(sd_event_source_get_description(s, &d)); + + log_notice("real io pressure event: %s", d); + + ASSERT_NOT_NULL(c->pid); + ASSERT_OK(sd_event_source_send_child_signal(c->pid, SIGKILL, NULL, 0)); + c->pid = NULL; + + return 0; +} + +_noreturn_ static void real_pressure_eat_io(int pipe_fd) { + char x; + ASSERT_EQ(read(pipe_fd, &x, 1), 1); /* Wait for the GO! 
*/ + + /* Write and fsync in a loop to generate IO pressure */ + for (;;) { + _cleanup_close_ int fd = -EBADF; + + fd = open("/var/tmp/.io-pressure-test", O_WRONLY|O_CREAT|O_TRUNC|O_CLOEXEC, 0600); + if (fd < 0) + continue; + + char buf[4096]; + memset(buf, 'x', sizeof(buf)); + for (int i = 0; i < 256; i++) + if (write(fd, buf, sizeof(buf)) < 0) + break; + (void) fsync(fd); + } +} + +TEST(real_io_pressure) { + _cleanup_(sd_bus_message_unrefp) sd_bus_message *m = NULL, *reply = NULL; + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + _cleanup_(sd_event_source_unrefp) sd_event_source *es = NULL, *cs = NULL; + _cleanup_(bus_wait_for_jobs_freep) BusWaitForJobs *w = NULL; + _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL; + _cleanup_close_pair_ int pipe_fd[2] = EBADF_PAIR; + _cleanup_(sd_event_unrefp) sd_event *e = NULL; + _cleanup_free_ char *scope = NULL; + const char *object; + int r; + + if (getuid() == 0) + r = sd_bus_open_system(&bus); + else + r = sd_bus_open_user(&bus); + if (r < 0) + return (void) log_tests_skipped_errno(r, "can't connect to bus"); + + ASSERT_OK(bus_wait_for_jobs_new(bus, &w)); + + ASSERT_OK(bus_message_new_method_call(bus, &m, bus_systemd_mgr, "StartTransientUnit")); + ASSERT_OK(asprintf(&scope, "test-%" PRIu64 ".scope", random_u64())); + ASSERT_OK(sd_bus_message_append(m, "ss", scope, "fail")); + ASSERT_OK(sd_bus_message_open_container(m, 'a', "(sv)")); + ASSERT_OK(sd_bus_message_append(m, "(sv)", "PIDs", "au", 1, 0)); + ASSERT_OK(sd_bus_message_append(m, "(sv)", "IOAccounting", "b", true)); + ASSERT_OK(sd_bus_message_close_container(m)); + ASSERT_OK(sd_bus_message_append(m, "a(sa(sv))", 0)); + + r = sd_bus_call(bus, m, 0, &error, &reply); + if (r < 0) + return (void) log_tests_skipped_errno(r, "can't issue transient unit call"); + + ASSERT_OK(sd_bus_message_read(reply, "o", &object)); + + ASSERT_OK(bus_wait_for_jobs_one(w, object, /* flags= */ BUS_WAIT_JOBS_LOG_ERROR, /* extra_args= */ NULL)); + + 
ASSERT_OK(sd_event_default(&e)); + + ASSERT_OK_ERRNO(pipe2(pipe_fd, O_CLOEXEC)); + + _cleanup_(pidref_done) PidRef pidref = PIDREF_NULL; + r = pidref_safe_fork("(eat-io)", FORK_RESET_SIGNALS|FORK_DEATHSIG_SIGTERM, &pidref); + ASSERT_OK(r); + if (r == 0) { + real_pressure_eat_io(pipe_fd[0]); + _exit(EXIT_SUCCESS); + } + + ASSERT_OK(event_add_child_pidref(e, &cs, &pidref, WEXITED, real_pressure_child_callback, NULL)); + ASSERT_OK(sd_event_source_set_child_process_own(cs, true)); + + ASSERT_OK_ERRNO(unsetenv("IO_PRESSURE_WATCH")); + ASSERT_OK_ERRNO(unsetenv("IO_PRESSURE_WRITE")); + + struct real_pressure_context context = { + .pid = cs, + }; + + r = sd_event_add_io_pressure(e, &es, real_io_pressure_callback, &context); + if (r < 0) + return (void) log_tests_skipped_errno(r, "can't allocate io pressure fd"); + + ASSERT_OK(sd_event_source_set_description(es, "real pressure event source")); + ASSERT_OK_ZERO(sd_event_source_set_io_pressure_type(es, "some")); + /* Unprivileged writes require a minimum of 2s otherwise the kernel will refuse the write. 
*/ + ASSERT_OK_POSITIVE(sd_event_source_set_io_pressure_period(es, 70 * USEC_PER_MSEC, 2 * USEC_PER_SEC)); + ASSERT_OK_ZERO(sd_event_source_set_io_pressure_period(es, 70 * USEC_PER_MSEC, 2 * USEC_PER_SEC)); + ASSERT_OK(sd_event_source_set_enabled(es, SD_EVENT_ONESHOT)); + + m = sd_bus_message_unref(m); + + ASSERT_OK(bus_message_new_method_call(bus, &m, bus_systemd_mgr, "SetUnitProperties")); + ASSERT_OK(sd_bus_message_append(m, "sb", scope, true)); + ASSERT_OK(sd_bus_message_open_container(m, 'a', "(sv)")); + ASSERT_OK(sd_bus_message_open_container(m, 'r', "sv")); + ASSERT_OK(sd_bus_message_append(m, "s", "IOWriteBandwidthMax")); + ASSERT_OK(sd_bus_message_open_container(m, 'v', "a(st)")); + ASSERT_OK(sd_bus_message_append(m, "a(st)", 1, "/var/tmp", (uint64_t) 1024*1024)); /* 1M/s */ + ASSERT_OK(sd_bus_message_close_container(m)); + ASSERT_OK(sd_bus_message_close_container(m)); + ASSERT_OK(sd_bus_message_close_container(m)); + + ASSERT_OK(sd_bus_call(bus, m, 0, NULL, NULL)); + + /* Now start eating IO */ + ASSERT_EQ(write(pipe_fd[1], &(const char) { 'x' }, 1), 1); + + ASSERT_OK(sd_event_loop(e)); + int ex = 0; + ASSERT_OK(sd_event_get_exit_code(e, &ex)); + ASSERT_EQ(ex, 31); +} + static int outro(void) { + (void) unlink("/var/tmp/.io-pressure-test"); hashmap_trim_pools(); return 0; } diff --git a/test/units/TEST-79-PRESSURE.sh b/test/units/TEST-79-PRESSURE.sh index d4e4a9e06b5..72de8a1d9d1 100755 --- a/test/units/TEST-79-PRESSURE.sh +++ b/test/units/TEST-79-PRESSURE.sh @@ -114,4 +114,57 @@ systemd-run \ rm "$SCRIPT" +# Now test IO pressure + +if ! cat /proc/pressure/io >/dev/null ; then + echo "kernel has no IO PSI support." >&2 + echo OK >/testok + exit 0 +fi + +if ! test -f "$CGROUP"/io.pressure ; then + echo "No IO accounting/PSI delegated via cgroup, can't test." 
>&2 + echo OK >/testok + exit 0 +fi + +UNIT="test-iopress-$RANDOM.service" +SCRIPT="/tmp/iopress-$RANDOM.sh" + +cat >"$SCRIPT" <<'EOF' +#!/usr/bin/env bash + +set -ex + +export +id + +test -n "$IO_PRESSURE_WATCH" +test "$IO_PRESSURE_WATCH" != /dev/null +test -w "$IO_PRESSURE_WATCH" + +ls -al "$IO_PRESSURE_WATCH" + +EXPECTED="$(echo -n -e "some 123000 2000000\x00" | base64)" + +test "$EXPECTED" = "$IO_PRESSURE_WRITE" + +EOF + +chmod +x "$SCRIPT" + +systemd-run \ + -u "$UNIT" \ + -p Type=exec \ + -p ProtectControlGroups=1 \ + -p DynamicUser=1 \ + -p IOPressureWatch=on \ + -p IOPressureThresholdSec=123ms \ + -p BindPaths=$SCRIPT \ + `# Make sanitizers happy when DynamicUser=1 pulls in instrumented systemd NSS modules` \ + -p EnvironmentFile=-/usr/lib/systemd/systemd-asan-env \ + --wait "$SCRIPT" + +rm "$SCRIPT" + touch /testok