git.ipfire.org Git - thirdparty/systemd.git/commitdiff
core: Add I/O pressure support
author: Daan De Meyer <daan@amutable.com>
Sat, 7 Mar 2026 22:37:55 +0000 (23:37 +0100)
committer: Daan De Meyer <daan.j.demeyer@gmail.com>
Thu, 9 Apr 2026 20:47:10 +0000 (22:47 +0200)
29 files changed:
man/org.freedesktop.systemd1.xml
man/rules/meson.build
man/sd_event_add_memory_pressure.xml
man/systemd-system.conf.xml
man/systemd.exec.xml
man/systemd.resource-control.xml
src/basic/psi-util.c
src/basic/psi-util.h
src/core/cgroup.c
src/core/cgroup.h
src/core/dbus-cgroup.c
src/core/dbus-manager.c
src/core/execute-serialize.c
src/core/load-fragment-gperf.gperf.in
src/core/main.c
src/core/manager.c
src/core/system.conf.in
src/core/user.conf.in
src/core/varlink-cgroup.c
src/core/varlink-manager.c
src/libsystemd/libsystemd.sym
src/libsystemd/sd-event/event-source.h
src/libsystemd/sd-event/sd-event.c
src/shared/bus-unit-util.c
src/shared/varlink-io.systemd.Manager.c
src/shared/varlink-io.systemd.Unit.c
src/systemd/sd-event.h
src/test/test-pressure.c
test/units/TEST-79-PRESSURE.sh

index 027a8deeb46538de2802bb2ea173e409e461e14e..76a8dd045f6c660f74719799dce056d62fa92ade 100644 (file)
@@ -556,6 +556,10 @@ node /org/freedesktop/systemd1 {
       readonly t DefaultCPUPressureThresholdUSec = ...;
       @org.freedesktop.DBus.Property.EmitsChangedSignal("false")
       readonly s DefaultCPUPressureWatch = '...';
+      @org.freedesktop.DBus.Property.EmitsChangedSignal("false")
+      readonly t DefaultIOPressureThresholdUSec = ...;
+      @org.freedesktop.DBus.Property.EmitsChangedSignal("false")
+      readonly s DefaultIOPressureWatch = '...';
       @org.freedesktop.DBus.Property.EmitsChangedSignal("const")
       readonly t TimerSlackNSec = ...;
       @org.freedesktop.DBus.Property.EmitsChangedSignal("const")
@@ -801,6 +805,10 @@ node /org/freedesktop/systemd1 {
 
     <!--property DefaultCPUPressureWatch is not documented!-->
 
+    <!--property DefaultIOPressureThresholdUSec is not documented!-->
+
+    <!--property DefaultIOPressureWatch is not documented!-->
+
     <!--property TimerSlackNSec is not documented!-->
 
     <!--property DefaultOOMPolicy is not documented!-->
@@ -1255,6 +1263,10 @@ node /org/freedesktop/systemd1 {
 
     <variablelist class="dbus-property" generated="True" extra-ref="DefaultCPUPressureWatch"/>
 
+    <variablelist class="dbus-property" generated="True" extra-ref="DefaultIOPressureThresholdUSec"/>
+
+    <variablelist class="dbus-property" generated="True" extra-ref="DefaultIOPressureWatch"/>
+
     <variablelist class="dbus-property" generated="True" extra-ref="TimerSlackNSec"/>
 
     <variablelist class="dbus-property" generated="True" extra-ref="DefaultOOMPolicy"/>
@@ -3082,6 +3094,10 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice {
       @org.freedesktop.DBus.Property.EmitsChangedSignal("false")
       readonly t CPUPressureThresholdUSec = ...;
       @org.freedesktop.DBus.Property.EmitsChangedSignal("false")
+      readonly s IOPressureWatch = '...';
+      @org.freedesktop.DBus.Property.EmitsChangedSignal("false")
+      readonly t IOPressureThresholdUSec = ...;
+      @org.freedesktop.DBus.Property.EmitsChangedSignal("false")
       readonly a(iiss) NFTSet = [...];
       @org.freedesktop.DBus.Property.EmitsChangedSignal("false")
       readonly b CoredumpReceive = ...;
@@ -3755,6 +3771,10 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice {
 
     <!--property CPUPressureThresholdUSec is not documented!-->
 
+    <!--property IOPressureWatch is not documented!-->
+
+    <!--property IOPressureThresholdUSec is not documented!-->
+
     <!--property NFTSet is not documented!-->
 
     <!--property CoredumpReceive is not documented!-->
@@ -4451,6 +4471,10 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice {
 
     <variablelist class="dbus-property" generated="True" extra-ref="CPUPressureThresholdUSec"/>
 
+    <variablelist class="dbus-property" generated="True" extra-ref="IOPressureWatch"/>
+
+    <variablelist class="dbus-property" generated="True" extra-ref="IOPressureThresholdUSec"/>
+
     <variablelist class="dbus-property" generated="True" extra-ref="NFTSet"/>
 
     <variablelist class="dbus-property" generated="True" extra-ref="CoredumpReceive"/>
@@ -5354,6 +5378,10 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2esocket {
       @org.freedesktop.DBus.Property.EmitsChangedSignal("false")
       readonly t CPUPressureThresholdUSec = ...;
       @org.freedesktop.DBus.Property.EmitsChangedSignal("false")
+      readonly s IOPressureWatch = '...';
+      @org.freedesktop.DBus.Property.EmitsChangedSignal("false")
+      readonly t IOPressureThresholdUSec = ...;
+      @org.freedesktop.DBus.Property.EmitsChangedSignal("false")
       readonly a(iiss) NFTSet = [...];
       @org.freedesktop.DBus.Property.EmitsChangedSignal("false")
       readonly b CoredumpReceive = ...;
@@ -6043,6 +6071,10 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2esocket {
 
     <!--property CPUPressureThresholdUSec is not documented!-->
 
+    <!--property IOPressureWatch is not documented!-->
+
+    <!--property IOPressureThresholdUSec is not documented!-->
+
     <!--property NFTSet is not documented!-->
 
     <!--property CoredumpReceive is not documented!-->
@@ -6713,6 +6745,10 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2esocket {
 
     <variablelist class="dbus-property" generated="True" extra-ref="CPUPressureThresholdUSec"/>
 
+    <variablelist class="dbus-property" generated="True" extra-ref="IOPressureWatch"/>
+
+    <variablelist class="dbus-property" generated="True" extra-ref="IOPressureThresholdUSec"/>
+
     <variablelist class="dbus-property" generated="True" extra-ref="NFTSet"/>
 
     <variablelist class="dbus-property" generated="True" extra-ref="CoredumpReceive"/>
@@ -7439,6 +7475,10 @@ node /org/freedesktop/systemd1/unit/home_2emount {
       @org.freedesktop.DBus.Property.EmitsChangedSignal("false")
       readonly t CPUPressureThresholdUSec = ...;
       @org.freedesktop.DBus.Property.EmitsChangedSignal("false")
+      readonly s IOPressureWatch = '...';
+      @org.freedesktop.DBus.Property.EmitsChangedSignal("false")
+      readonly t IOPressureThresholdUSec = ...;
+      @org.freedesktop.DBus.Property.EmitsChangedSignal("false")
       readonly a(iiss) NFTSet = [...];
       @org.freedesktop.DBus.Property.EmitsChangedSignal("false")
       readonly b CoredumpReceive = ...;
@@ -8052,6 +8092,10 @@ node /org/freedesktop/systemd1/unit/home_2emount {
 
     <!--property CPUPressureThresholdUSec is not documented!-->
 
+    <!--property IOPressureWatch is not documented!-->
+
+    <!--property IOPressureThresholdUSec is not documented!-->
+
     <!--property NFTSet is not documented!-->
 
     <!--property CoredumpReceive is not documented!-->
@@ -8630,6 +8674,10 @@ node /org/freedesktop/systemd1/unit/home_2emount {
 
     <variablelist class="dbus-property" generated="True" extra-ref="CPUPressureThresholdUSec"/>
 
+    <variablelist class="dbus-property" generated="True" extra-ref="IOPressureWatch"/>
+
+    <variablelist class="dbus-property" generated="True" extra-ref="IOPressureThresholdUSec"/>
+
     <variablelist class="dbus-property" generated="True" extra-ref="NFTSet"/>
 
     <variablelist class="dbus-property" generated="True" extra-ref="CoredumpReceive"/>
@@ -9489,6 +9537,10 @@ node /org/freedesktop/systemd1/unit/dev_2dsda3_2eswap {
       @org.freedesktop.DBus.Property.EmitsChangedSignal("false")
       readonly t CPUPressureThresholdUSec = ...;
       @org.freedesktop.DBus.Property.EmitsChangedSignal("false")
+      readonly s IOPressureWatch = '...';
+      @org.freedesktop.DBus.Property.EmitsChangedSignal("false")
+      readonly t IOPressureThresholdUSec = ...;
+      @org.freedesktop.DBus.Property.EmitsChangedSignal("false")
       readonly a(iiss) NFTSet = [...];
       @org.freedesktop.DBus.Property.EmitsChangedSignal("false")
       readonly b CoredumpReceive = ...;
@@ -10084,6 +10136,10 @@ node /org/freedesktop/systemd1/unit/dev_2dsda3_2eswap {
 
     <!--property CPUPressureThresholdUSec is not documented!-->
 
+    <!--property IOPressureWatch is not documented!-->
+
+    <!--property IOPressureThresholdUSec is not documented!-->
+
     <!--property NFTSet is not documented!-->
 
     <!--property CoredumpReceive is not documented!-->
@@ -10644,6 +10700,10 @@ node /org/freedesktop/systemd1/unit/dev_2dsda3_2eswap {
 
     <variablelist class="dbus-property" generated="True" extra-ref="CPUPressureThresholdUSec"/>
 
+    <variablelist class="dbus-property" generated="True" extra-ref="IOPressureWatch"/>
+
+    <variablelist class="dbus-property" generated="True" extra-ref="IOPressureThresholdUSec"/>
+
     <variablelist class="dbus-property" generated="True" extra-ref="NFTSet"/>
 
     <variablelist class="dbus-property" generated="True" extra-ref="CoredumpReceive"/>
@@ -11356,6 +11416,10 @@ node /org/freedesktop/systemd1/unit/system_2eslice {
       @org.freedesktop.DBus.Property.EmitsChangedSignal("false")
       readonly t CPUPressureThresholdUSec = ...;
       @org.freedesktop.DBus.Property.EmitsChangedSignal("false")
+      readonly s IOPressureWatch = '...';
+      @org.freedesktop.DBus.Property.EmitsChangedSignal("false")
+      readonly t IOPressureThresholdUSec = ...;
+      @org.freedesktop.DBus.Property.EmitsChangedSignal("false")
       readonly a(iiss) NFTSet = [...];
       @org.freedesktop.DBus.Property.EmitsChangedSignal("false")
       readonly b CoredumpReceive = ...;
@@ -11533,6 +11597,10 @@ node /org/freedesktop/systemd1/unit/system_2eslice {
 
     <!--property CPUPressureThresholdUSec is not documented!-->
 
+    <!--property IOPressureWatch is not documented!-->
+
+    <!--property IOPressureThresholdUSec is not documented!-->
+
     <!--property NFTSet is not documented!-->
 
     <!--property CoredumpReceive is not documented!-->
@@ -11725,6 +11793,10 @@ node /org/freedesktop/systemd1/unit/system_2eslice {
 
     <variablelist class="dbus-property" generated="True" extra-ref="CPUPressureThresholdUSec"/>
 
+    <variablelist class="dbus-property" generated="True" extra-ref="IOPressureWatch"/>
+
+    <variablelist class="dbus-property" generated="True" extra-ref="IOPressureThresholdUSec"/>
+
     <variablelist class="dbus-property" generated="True" extra-ref="NFTSet"/>
 
     <variablelist class="dbus-property" generated="True" extra-ref="CoredumpReceive"/>
@@ -11940,6 +12012,10 @@ node /org/freedesktop/systemd1/unit/session_2d1_2escope {
       @org.freedesktop.DBus.Property.EmitsChangedSignal("false")
       readonly t CPUPressureThresholdUSec = ...;
       @org.freedesktop.DBus.Property.EmitsChangedSignal("false")
+      readonly s IOPressureWatch = '...';
+      @org.freedesktop.DBus.Property.EmitsChangedSignal("false")
+      readonly t IOPressureThresholdUSec = ...;
+      @org.freedesktop.DBus.Property.EmitsChangedSignal("false")
       readonly a(iiss) NFTSet = [...];
       @org.freedesktop.DBus.Property.EmitsChangedSignal("false")
       readonly b CoredumpReceive = ...;
@@ -12131,6 +12207,10 @@ node /org/freedesktop/systemd1/unit/session_2d1_2escope {
 
     <!--property CPUPressureThresholdUSec is not documented!-->
 
+    <!--property IOPressureWatch is not documented!-->
+
+    <!--property IOPressureThresholdUSec is not documented!-->
+
     <!--property NFTSet is not documented!-->
 
     <!--property CoredumpReceive is not documented!-->
@@ -12347,6 +12427,10 @@ node /org/freedesktop/systemd1/unit/session_2d1_2escope {
 
     <variablelist class="dbus-property" generated="True" extra-ref="CPUPressureThresholdUSec"/>
 
+    <variablelist class="dbus-property" generated="True" extra-ref="IOPressureWatch"/>
+
+    <variablelist class="dbus-property" generated="True" extra-ref="IOPressureThresholdUSec"/>
+
     <variablelist class="dbus-property" generated="True" extra-ref="NFTSet"/>
 
     <variablelist class="dbus-property" generated="True" extra-ref="CoredumpReceive"/>
@@ -12560,8 +12644,10 @@ $ gdbus introspect --system --dest org.freedesktop.systemd1 \
       <function>KillUnitSubgroup()</function> were added in version 258.</para>
       <para><varname>TransactionsWithOrderingCycle</varname> was added in version 259.</para>
       <para><varname>DefaultMemoryZSwapWriteback</varname>,
-      <varname>DefaultCPUPressureThresholdUSec</varname> and
-      <varname>DefaultCPUPressureWatch</varname> were added in version 261.</para>
+      <varname>DefaultCPUPressureThresholdUSec</varname>,
+      <varname>DefaultCPUPressureWatch</varname>,
+      <varname>DefaultIOPressureThresholdUSec</varname>, and
+      <varname>DefaultIOPressureWatch</varname> were added in version 261.</para>
     </refsect2>
     <refsect2>
       <title>Unit Objects</title>
@@ -12653,8 +12739,10 @@ $ gdbus introspect --system --dest org.freedesktop.systemd1 \
       <varname>ExecReloadPostEx</varname> were added in version 259.</para>
       <para><varname>BindNetworkInterface</varname>, <varname>MemoryTHP</varname>,
       <varname>RefreshOnReload</varname>, and <varname>RootMStack</varname> were added in version 260.</para>
-      <para><varname>CPUPressureThresholdUSec</varname> and
-      <varname>CPUPressureWatch</varname> were added in version 261.</para>
+      <para><varname>CPUPressureThresholdUSec</varname>,
+      <varname>CPUPressureWatch</varname>,
+      <varname>IOPressureThresholdUSec</varname>, and
+      <varname>IOPressureWatch</varname> were added in version 261.</para>
     </refsect2>
     <refsect2>
       <title>Socket Unit Objects</title>
@@ -12725,8 +12813,10 @@ $ gdbus introspect --system --dest org.freedesktop.systemd1 \
       <varname>ManagedOOMKills</varname> were added in 259.</para>
       <para><varname>BindNetworkInterface</varname> <varname>MemoryTHP</varname>, and
       <varname>RootMStack</varname> were added in version 260.</para>
-      <para><varname>CPUPressureThresholdUSec</varname> and
-      <varname>CPUPressureWatch</varname> were added in version 261.</para>
+      <para><varname>CPUPressureThresholdUSec</varname>,
+      <varname>CPUPressureWatch</varname>,
+      <varname>IOPressureThresholdUSec</varname>, and
+      <varname>IOPressureWatch</varname> were added in version 261.</para>
     </refsect2>
     <refsect2>
       <title>Mount Unit Objects</title>
@@ -12792,8 +12882,10 @@ $ gdbus introspect --system --dest org.freedesktop.systemd1 \
       <varname>ManagedOOMKills</varname> were added in 259.</para>
       <para><varname>BindNetworkInterface</varname> <varname>MemoryTHP</varname>, and
       <varname>RootMStack</varname> were added in version 260.</para>
-      <para><varname>CPUPressureThresholdUSec</varname> and
-      <varname>CPUPressureWatch</varname> were added in version 261.</para>
+      <para><varname>CPUPressureThresholdUSec</varname>,
+      <varname>CPUPressureWatch</varname>,
+      <varname>IOPressureThresholdUSec</varname>, and
+      <varname>IOPressureWatch</varname> were added in version 261.</para>
     </refsect2>
     <refsect2>
       <title>Swap Unit Objects</title>
@@ -12857,8 +12949,10 @@ $ gdbus introspect --system --dest org.freedesktop.systemd1 \
       <varname>ManagedOOMKills</varname> were added in 259.</para>
       <para><varname>BindNetworkInterface</varname>, <varname>MemoryTHP</varname>, and
       <varname>RootMStack</varname> were added in version 260.</para>
-      <para><varname>CPUPressureThresholdUSec</varname> and
-      <varname>CPUPressureWatch</varname> were added in version 261.</para>
+      <para><varname>CPUPressureThresholdUSec</varname>,
+      <varname>CPUPressureWatch</varname>,
+      <varname>IOPressureThresholdUSec</varname>, and
+      <varname>IOPressureWatch</varname> were added in version 261.</para>
     </refsect2>
     <refsect2>
       <title>Slice Unit Objects</title>
@@ -12892,8 +12986,10 @@ $ gdbus introspect --system --dest org.freedesktop.systemd1 \
       <para><varname>OOMKills</varname>, and
       <varname>ManagedOOMKills</varname> were added in 259.</para>
       <para><varname>BindNetworkInterface</varname> was added in version 260.</para>
-      <para><varname>CPUPressureThresholdUSec</varname> and
-      <varname>CPUPressureWatch</varname> were added in version 261.</para>
+      <para><varname>CPUPressureThresholdUSec</varname>,
+      <varname>CPUPressureWatch</varname>,
+      <varname>IOPressureThresholdUSec</varname>, and
+      <varname>IOPressureWatch</varname> were added in version 261.</para>
     </refsect2>
     <refsect2>
       <title>Scope Unit Objects</title>
@@ -12925,8 +13021,10 @@ $ gdbus introspect --system --dest org.freedesktop.systemd1 \
       <para><varname>OOMKills</varname>, and
       <varname>ManagedOOMKills</varname> were added in 259.</para>
       <para><varname>BindNetworkInterface</varname> was added in version 260.</para>
-      <para><varname>CPUPressureThresholdUSec</varname> and
-      <varname>CPUPressureWatch</varname> were added in version 261.</para>
+      <para><varname>CPUPressureThresholdUSec</varname>,
+      <varname>CPUPressureWatch</varname>,
+      <varname>IOPressureThresholdUSec</varname>, and
+      <varname>IOPressureWatch</varname> were added in version 261.</para>
     </refsect2>
     <refsect2>
       <title>Job Objects</title>
index 81e7ef4f882620cf2c9f7c8f218bf14e0e910e08..525c56b1d3b34786ec599882d276ec13868ff42c 100644 (file)
@@ -609,8 +609,11 @@ manpages = [
  ['sd_event_add_memory_pressure',
   '3',
   ['sd_event_add_cpu_pressure',
+   'sd_event_add_io_pressure',
    'sd_event_source_set_cpu_pressure_period',
    'sd_event_source_set_cpu_pressure_type',
+   'sd_event_source_set_io_pressure_period',
+   'sd_event_source_set_io_pressure_type',
    'sd_event_source_set_memory_pressure_period',
    'sd_event_source_set_memory_pressure_type',
    'sd_event_trim_memory'],
index 1e6b734738f6c862b86e271b67e24ae82607e17f..05f2ff2b7452883010a9232d69c1b58e75a691a1 100644 (file)
     <refname>sd_event_source_set_cpu_pressure_type</refname>
     <refname>sd_event_source_set_cpu_pressure_period</refname>
 
-    <refpurpose>Add and configure an event source run as result of memory or CPU pressure</refpurpose>
+    <refname>sd_event_add_io_pressure</refname>
+    <refname>sd_event_source_set_io_pressure_type</refname>
+    <refname>sd_event_source_set_io_pressure_period</refname>
+
+    <refpurpose>Add and configure an event source for memory, CPU, or IO pressure notifications</refpurpose>
   </refnamediv>
 
   <refsynopsisdiv>
         <paramdef>uint64_t <parameter>window_usec</parameter></paramdef>
       </funcprototype>
 
+      <funcprototype>
+        <funcdef>int <function>sd_event_add_io_pressure</function></funcdef>
+        <paramdef>sd_event *<parameter>event</parameter></paramdef>
+        <paramdef>sd_event_source **<parameter>ret_source</parameter></paramdef>
+        <paramdef>sd_event_handler_t <parameter>handler</parameter></paramdef>
+        <paramdef>void *<parameter>userdata</parameter></paramdef>
+      </funcprototype>
+
+      <funcprototype>
+        <funcdef>int <function>sd_event_source_set_io_pressure_type</function></funcdef>
+        <paramdef>sd_event_source *<parameter>source</parameter></paramdef>
+        <paramdef>const char *<parameter>type</parameter></paramdef>
+      </funcprototype>
+
+      <funcprototype>
+        <funcdef>int <function>sd_event_source_set_io_pressure_period</function></funcdef>
+        <paramdef>sd_event_source *<parameter>source</parameter></paramdef>
+        <paramdef>uint64_t <parameter>threshold_usec</parameter></paramdef>
+        <paramdef>uint64_t <parameter>window_usec</parameter></paramdef>
+      </funcprototype>
+
       <funcprototype>
         <funcdef>int <function>sd_event_trim_memory</function></funcdef>
         <paramdef>void</paramdef>
 
     <para><function>sd_event_add_memory_pressure()</function> adds a new event source that is triggered
     whenever memory pressure is seen. Similarly,
-    <function>sd_event_add_cpu_pressure()</function> adds a new event source that is triggered whenever CPU
-    pressure is seen. This functionality is built around the Linux kernel's <ulink
+    <function>sd_event_add_cpu_pressure()</function> and <function>sd_event_add_io_pressure()</function> add
+    new event sources that are triggered whenever CPU or IO pressure is seen, respectively. This functionality
+    is built around the Linux kernel's <ulink
     url="https://docs.kernel.org/accounting/psi.html">Pressure Stall Information (PSI)</ulink> logic.</para>
 
-    <para>Both functions expect an event loop object as first parameter, and return the allocated event source
+    <para>These functions expect an event loop object as first parameter, and return the allocated event source
     object in the second parameter, on success. The <parameter>handler</parameter> parameter is a function to
     call when pressure is seen, or <constant>NULL</constant>. The handler function will be passed the
     <parameter>userdata</parameter> pointer, which may be chosen freely by the caller. The handler may return
     negative to signal an error (see below), other return values are ignored. If
-    <parameter>handler</parameter> is <constant>NULL</constant>, a default handler that compacts allocation
-    caches maintained by <filename>libsystemd</filename> as well as glibc (via <citerefentry
-    project='man-pages'><refentrytitle>malloc_trim</refentrytitle><manvolnum>3</manvolnum></citerefentry>)
-    will be used.</para>
+    <parameter>handler</parameter> is <constant>NULL</constant>, a default handler is used. For
+    <function>sd_event_add_memory_pressure()</function>, the default handler compacts allocation caches
+    maintained by <filename>libsystemd</filename> as well as glibc (via <citerefentry
+    project='man-pages'><refentrytitle>malloc_trim</refentrytitle><manvolnum>3</manvolnum></citerefentry>).
+    For <function>sd_event_add_cpu_pressure()</function> and
+    <function>sd_event_add_io_pressure()</function>, the default handler is a no-op. It is recommended to
+    pass a custom handler for CPU and IO pressure to take meaningful action when pressure is
+    detected.</para>
 
     <para>To destroy an event source object use
     <citerefentry><refentrytitle>sd_event_source_unref</refentrytitle><manvolnum>3</manvolnum></citerefentry>,
     <citerefentry><refentrytitle>sd_event_source_set_enabled</refentrytitle><manvolnum>3</manvolnum></citerefentry>
     with <constant>SD_EVENT_OFF</constant>.</para>
 
-    <para>If the second parameter of <function>sd_event_add_memory_pressure()</function> or
-    <function>sd_event_add_cpu_pressure()</function> is
+    <para>If the second parameter of <function>sd_event_add_memory_pressure()</function>,
+    <function>sd_event_add_cpu_pressure()</function>, or <function>sd_event_add_io_pressure()</function> is
     <constant>NULL</constant> no reference to the event source object is returned. In this case, the event
     source is considered "floating", and will be destroyed implicitly when the event loop itself is
     destroyed.</para>
     provides the <literal>some</literal> line, not the <literal>full</literal> line, so only
     <literal>some</literal> is valid when watching at the system level.</para>
 
+    <para>The IO pressure event source follows the same logic, but uses the
+    <varname>$IO_PRESSURE_WATCH</varname>/<varname>$IO_PRESSURE_WRITE</varname> environment variables,
+    the <filename>io.pressure</filename> cgroup file, and the system-wide PSI interface file
+    <filename>/proc/pressure/io</filename> instead.</para>
+
     <para>Or in other words: preferably any explicit configuration passed in by an invoking service manager
     (or similar) is used as notification source, before falling back to local notifications of the service,
     and finally to global notifications of the system.</para>
 
     <para>Similarly, <function>sd_event_source_set_cpu_pressure_type()</function> and
     <function>sd_event_source_set_cpu_pressure_period()</function> can be used to fine-tune the PSI
-    parameters for CPU pressure notifications. They work identically to their memory pressure counterparts.
+    parameters for CPU pressure notifications, and
+    <function>sd_event_source_set_io_pressure_type()</function> and
+    <function>sd_event_source_set_io_pressure_period()</function> can be used to fine-tune the PSI
+    parameters for IO pressure notifications. They work identically to their memory pressure counterparts.
     The type parameter takes either <literal>some</literal> or <literal>full</literal>, and the period
     function takes threshold and period times in microseconds. The same constraints apply: these calls must
-    be invoked immediately after allocating the event source, and will fail if CPU pressure parameterization
-    has been passed in via the
-    <varname>$CPU_PRESSURE_WATCH</varname>/<varname>$CPU_PRESSURE_WRITE</varname> environment
+    be invoked immediately after allocating the event source, and will fail if pressure parameterization
+    has been passed in via the corresponding
+    <varname>$*_PRESSURE_WATCH</varname>/<varname>$*_PRESSURE_WRITE</varname> environment
     variables.</para>
 
     <para>The <function>sd_event_trim_memory()</function> function releases various internal allocation
         <varlistentry>
           <term><constant>-EHOSTDOWN</constant></term>
 
-          <listitem><para>The <varname>$MEMORY_PRESSURE_WATCH</varname> or
-          <varname>$CPU_PRESSURE_WATCH</varname> variable has been set to the literal
-          string <filename>/dev/null</filename>, in order to explicitly disable pressure
+          <listitem><para>The <varname>$MEMORY_PRESSURE_WATCH</varname>,
+          <varname>$CPU_PRESSURE_WATCH</varname>, or <varname>$IO_PRESSURE_WATCH</varname> variable has been
+          set to the literal string <filename>/dev/null</filename>, in order to explicitly disable pressure
           handling.</para>
 
           <xi:include href="version-info.xml" xpointer="v254"/></listitem>
         <varlistentry>
           <term><constant>-EBADMSG</constant></term>
 
-          <listitem><para>The <varname>$MEMORY_PRESSURE_WATCH</varname> or
-          <varname>$CPU_PRESSURE_WATCH</varname> variable has been set to an invalid
-          string, for example a relative rather than an absolute path.</para>
+          <listitem><para>The <varname>$MEMORY_PRESSURE_WATCH</varname>,
+          <varname>$CPU_PRESSURE_WATCH</varname>, or <varname>$IO_PRESSURE_WATCH</varname> variable has been
+          set to an invalid string, for example a relative rather than an absolute path.</para>
 
           <xi:include href="version-info.xml" xpointer="v254"/></listitem>
         </varlistentry>
         <varlistentry>
           <term><constant>-ENOTTY</constant></term>
 
-          <listitem><para>The <varname>$MEMORY_PRESSURE_WATCH</varname> or
-          <varname>$CPU_PRESSURE_WATCH</varname> variable points to a regular file
-          outside of the procfs or cgroupfs file systems.</para>
+          <listitem><para>The <varname>$MEMORY_PRESSURE_WATCH</varname>,
+          <varname>$CPU_PRESSURE_WATCH</varname>, or <varname>$IO_PRESSURE_WATCH</varname> variable points
+          to a regular file outside of the procfs or cgroupfs file systems.</para>
 
           <xi:include href="version-info.xml" xpointer="v254"/></listitem>
         </varlistentry>
         <varlistentry>
           <term><constant>-EOPNOTSUPP</constant></term>
 
-          <listitem><para>No configuration via <varname>$MEMORY_PRESSURE_WATCH</varname> or
-          <varname>$CPU_PRESSURE_WATCH</varname> has been specified and the local kernel does not support the
-          PSI interface.</para>
+          <listitem><para>No configuration via <varname>$MEMORY_PRESSURE_WATCH</varname>,
+          <varname>$CPU_PRESSURE_WATCH</varname>, or <varname>$IO_PRESSURE_WATCH</varname> has been specified
+          and the local kernel does not support the PSI interface.</para>
 
           <xi:include href="version-info.xml" xpointer="v254"/></listitem>
         </varlistentry>
           <listitem><para>This is returned by <function>sd_event_source_set_memory_pressure_type()</function>,
           <function>sd_event_source_set_memory_pressure_period()</function>,
           <function>sd_event_source_set_cpu_pressure_type()</function>,
-          and <function>sd_event_source_set_cpu_pressure_period()</function> if invoked on event sources
+          <function>sd_event_source_set_cpu_pressure_period()</function>,
+          <function>sd_event_source_set_io_pressure_type()</function>,
+          and <function>sd_event_source_set_io_pressure_period()</function> if invoked on event sources
           at a time later than immediately after allocating them.</para>
 
           <xi:include href="version-info.xml" xpointer="v254"/></listitem>
     <function>sd_event_source_set_memory_pressure_period()</function>, and
     <function>sd_event_trim_memory()</function> were added in version 254.</para>
     <para><function>sd_event_add_cpu_pressure()</function>,
-    <function>sd_event_source_set_cpu_pressure_type()</function>, and
-    <function>sd_event_source_set_cpu_pressure_period()</function> were added in version 261.</para>
+    <function>sd_event_source_set_cpu_pressure_type()</function>,
+    <function>sd_event_source_set_cpu_pressure_period()</function>,
+    <function>sd_event_add_io_pressure()</function>,
+    <function>sd_event_source_set_io_pressure_type()</function>, and
+    <function>sd_event_source_set_io_pressure_period()</function> were added in version 261.</para>
   </refsect1>
 
   <refsect1>
index 79133dc15ebcad1e7985a41d72712d0ad4d5f644..eb14cb7f3074636822c6b6a671c27945e8fe6cab 100644 (file)
 
         <xi:include href="version-info.xml" xpointer="v261"/></listitem>
       </varlistentry>
+
+      <varlistentry>
+        <term><varname>DefaultIOPressureWatch=</varname></term>
+        <term><varname>DefaultIOPressureThresholdSec=</varname></term>
+
+        <listitem><para>Configures the default settings for the per-unit
+        <varname>IOPressureWatch=</varname> and <varname>IOPressureThresholdSec=</varname>
+        settings. See
+        <citerefentry><refentrytitle>systemd.resource-control</refentrytitle><manvolnum>5</manvolnum></citerefentry>
+        for details. Defaults to <literal>auto</literal> and <literal>200ms</literal>, respectively. This
+        also sets the IO pressure monitoring threshold for the service manager itself.</para>
+
+        <xi:include href="version-info.xml" xpointer="v261"/></listitem>
+      </varlistentry>
     </variablelist>
   </refsect1>
 
index 1048fcadfc376620e86c05c8941e7dfecba8400c..455f666374f99c5606ccc02855e602a3e3ba4b4f 100644 (file)
@@ -4717,6 +4717,18 @@ StandardInputData=V2XigLJyZSBubyBzdHJhbmdlcnMgdG8gbG92ZQpZb3Uga25vdyB0aGUgcnVsZX
           <xi:include href="version-info.xml" xpointer="v261"/></listitem>
         </varlistentry>
 
+        <varlistentry>
+          <term><varname>$IO_PRESSURE_WATCH</varname></term>
+          <term><varname>$IO_PRESSURE_WRITE</varname></term>
+
+          <listitem><para>If IO pressure monitoring is enabled for this service unit, the path to watch
+          and the data to write into it. See <ulink url="https://systemd.io/PRESSURE">Resource Pressure
+          Handling</ulink> for details about these variables and the service protocol data they
+          convey.</para>
+
+          <xi:include href="version-info.xml" xpointer="v261"/></listitem>
+        </varlistentry>
+
         <varlistentry>
           <term><varname>$FDSTORE</varname></term>
 
index 8d9e27f3d3a26e747c1d5a98730bb492aa56d736..f8a2e14e1b68603cc8d5a8135417986d226ec71b 100644 (file)
@@ -1706,6 +1706,55 @@ DeviceAllow=/dev/loop-control
 
         <xi:include href="version-info.xml" xpointer="v261"/></listitem>
       </varlistentry>
+
+      <varlistentry>
+        <term><varname>IOPressureWatch=</varname></term>
+
+        <listitem><para>Controls IO pressure monitoring for invoked processes. Takes a boolean or one of
+        <literal>auto</literal> and <literal>skip</literal>. If <literal>no</literal>, tells the service not
+        to watch for IO pressure events, by setting the <varname>$IO_PRESSURE_WATCH</varname>
+        environment variable to the literal string <filename>/dev/null</filename>. If <literal>yes</literal>,
+        tells the service to watch for IO pressure events. This enables IO accounting for the
+        service, and ensures the <filename>io.pressure</filename> cgroup attribute file is accessible for
+        reading and writing by the service's user. It then sets the <varname>$IO_PRESSURE_WATCH</varname>
+        environment variable for processes invoked by the unit to the file system path to this file. The
+        threshold information configured with <varname>IOPressureThresholdSec=</varname> is encoded in
+        the <varname>$IO_PRESSURE_WRITE</varname> environment variable. If the <literal>auto</literal>
+        value is set, the protocol is enabled if IO accounting is enabled for the unit anyway (e.g. because
+        <varname>IOWeight=</varname> or <varname>IODeviceWeight=</varname> is set), and
+        disabled otherwise. If set to <literal>skip</literal>, the logic is neither enabled nor disabled, and
+        the two environment variables are not set.</para>
+
+        <para>Note that services are free to use the two environment variables, but it is unproblematic if
+        they ignore them. IO pressure handling must be implemented individually in each service, and
+        usually means different things for different software.</para>
+
+        <para>Services implemented using
+        <citerefentry><refentrytitle>sd-event</refentrytitle><manvolnum>3</manvolnum></citerefentry> may use
+        <citerefentry><refentrytitle>sd_event_add_io_pressure</refentrytitle><manvolnum>3</manvolnum></citerefentry>
+        to watch for and handle IO pressure events.</para>
+
+        <para>If not explicitly set, defaults to the <varname>DefaultIOPressureWatch=</varname> setting in
+        <citerefentry><refentrytitle>systemd-system.conf</refentrytitle><manvolnum>5</manvolnum></citerefentry>.</para>
+
+        <xi:include href="version-info.xml" xpointer="v261"/></listitem>
+      </varlistentry>
+
+      <varlistentry>
+        <term><varname>IOPressureThresholdSec=</varname></term>
+
+        <listitem><para>Sets the IO pressure threshold time for the IO pressure monitoring configured via
+        <varname>IOPressureWatch=</varname>. Specifies the maximum IO stall time before an IO
+        pressure event is signalled to the service, per 2s window. If not specified, defaults to the
+        <varname>DefaultIOPressureThresholdSec=</varname> setting in
+        <citerefentry><refentrytitle>systemd-system.conf</refentrytitle><manvolnum>5</manvolnum></citerefentry>
+        (which in turn defaults to 200ms). The specified value expects a time unit such as
+        <literal>ms</literal> or <literal>μs</literal>, see
+        <citerefentry><refentrytitle>systemd.time</refentrytitle><manvolnum>7</manvolnum></citerefentry> for
+        details on the permitted syntax.</para>
+
+        <xi:include href="version-info.xml" xpointer="v261"/></listitem>
+      </varlistentry>
     </variablelist>
 
     </refsect2><refsect2><title>Coredump Control</title>
index cf05485dc7b674dbf34c0050882b9c19cc92f368..f2a93e674f0d9b20499088005cbd8e375509a26c 100644 (file)
@@ -116,11 +116,17 @@ const PressureResourceInfo pressure_resource_info[_PRESSURE_RESOURCE_MAX] = {
                 .env_watch = "CPU_PRESSURE_WATCH",
                 .env_write = "CPU_PRESSURE_WRITE",
         },
+        [PRESSURE_IO] = {
+                .name      = "io",
+                .env_watch = "IO_PRESSURE_WATCH",
+                .env_write = "IO_PRESSURE_WRITE",
+        },
 };
 
 static const char* const pressure_resource_table[_PRESSURE_RESOURCE_MAX] = {
         [PRESSURE_MEMORY] = "memory",
         [PRESSURE_CPU]    = "cpu",
+        [PRESSURE_IO]     = "io",
 };
 
 DEFINE_STRING_TABLE_LOOKUP(pressure_resource, PressureResource);
index b8737b8976bf4c1fee15fcdef30e0cbfeb71bb32..8716767ca5931eb6d10e1f9abbb083fa14cdd7b7 100644 (file)
@@ -12,6 +12,7 @@ typedef enum PressureType {
 typedef enum PressureResource {
         PRESSURE_MEMORY,
         PRESSURE_CPU,
+        PRESSURE_IO,
         _PRESSURE_RESOURCE_MAX,
         _PRESSURE_RESOURCE_INVALID = -EINVAL,
 } PressureResource;
index a9982de659ffd70b5b29644b1158ea15915620e4..c64521a7e657e5fb13dc1120e8a7537047837f23 100644 (file)
@@ -188,6 +188,7 @@ void cgroup_context_init(CGroupContext *c) {
                 .pressure = {
                         [PRESSURE_MEMORY] = { .watch = _CGROUP_PRESSURE_WATCH_INVALID, .threshold_usec = USEC_INFINITY },
                         [PRESSURE_CPU]    = { .watch = _CGROUP_PRESSURE_WATCH_INVALID, .threshold_usec = USEC_INFINITY },
+                        [PRESSURE_IO]     = { .watch = _CGROUP_PRESSURE_WATCH_INVALID, .threshold_usec = USEC_INFINITY },
                 },
         };
 }
@@ -531,6 +532,7 @@ void cgroup_context_dump(Unit *u, FILE* f, const char *prefix) {
                 "%sManagedOOMPreference: %s\n"
                 "%sMemoryPressureWatch: %s\n"
                 "%sCPUPressureWatch: %s\n"
+                "%sIOPressureWatch: %s\n"
                 "%sCoredumpReceive: %s\n",
                 prefix, yes_no(c->io_accounting),
                 prefix, yes_no(c->memory_accounting),
@@ -568,6 +570,7 @@ void cgroup_context_dump(Unit *u, FILE* f, const char *prefix) {
                 prefix, managed_oom_preference_to_string(c->moom_preference),
                 prefix, cgroup_pressure_watch_to_string(c->pressure[PRESSURE_MEMORY].watch),
                 prefix, cgroup_pressure_watch_to_string(c->pressure[PRESSURE_CPU].watch),
+                prefix, cgroup_pressure_watch_to_string(c->pressure[PRESSURE_IO].watch),
                 prefix, yes_no(c->coredump_receive));
 
         if (c->delegate_subgroup)
@@ -586,6 +589,10 @@ void cgroup_context_dump(Unit *u, FILE* f, const char *prefix) {
                 fprintf(f, "%sCPUPressureThresholdSec: %s\n",
                         prefix, FORMAT_TIMESPAN(c->pressure[PRESSURE_CPU].threshold_usec, 1));
 
+        if (c->pressure[PRESSURE_IO].threshold_usec != USEC_INFINITY)
+                fprintf(f, "%sIOPressureThresholdSec: %s\n",
+                        prefix, FORMAT_TIMESPAN(c->pressure[PRESSURE_IO].threshold_usec, 1));
+
         if (c->moom_mem_pressure_duration_usec != USEC_INFINITY)
                 fprintf(f, "%sManagedOOMMemoryPressureDurationSec: %s\n",
                         prefix, FORMAT_TIMESPAN(c->moom_mem_pressure_duration_usec, 1));
index c4a22765678ebef7f377d5e6962135694a58f1eb..ce98f4ba7cd3b2f8661b9ca63886a226db2482ed 100644 (file)
@@ -376,6 +376,14 @@ static inline bool cgroup_context_want_pressure(const CGroupContext *c, Pressure
                         c->startup_cpu_weight != CGROUP_WEIGHT_INVALID ||
                         c->cpu_quota_per_sec_usec != USEC_INFINITY;
 
+        case PRESSURE_IO:
+                return c->io_accounting ||
+                        c->io_weight != CGROUP_WEIGHT_INVALID ||
+                        c->startup_io_weight != CGROUP_WEIGHT_INVALID ||
+                        c->io_device_weights ||
+                        c->io_device_latencies ||
+                        c->io_device_limits;
+
         default:
                 assert_not_reached();
         }
index c5a3302e08e84fc1a40184034fde9683b82aa095..927c133dd9e47bcaeaa4faf47b1f87725a9a8004 100644 (file)
@@ -431,6 +431,8 @@ const sd_bus_vtable bus_cgroup_vtable[] = {
         SD_BUS_PROPERTY("MemoryPressureThresholdUSec", "t", bus_property_get_usec, offsetof(CGroupContext, pressure[PRESSURE_MEMORY].threshold_usec), 0),
         SD_BUS_PROPERTY("CPUPressureWatch", "s", bus_property_get_cgroup_pressure_watch, offsetof(CGroupContext, pressure[PRESSURE_CPU].watch), 0),
         SD_BUS_PROPERTY("CPUPressureThresholdUSec", "t", bus_property_get_usec, offsetof(CGroupContext, pressure[PRESSURE_CPU].threshold_usec), 0),
+        SD_BUS_PROPERTY("IOPressureWatch", "s", bus_property_get_cgroup_pressure_watch, offsetof(CGroupContext, pressure[PRESSURE_IO].watch), 0),
+        SD_BUS_PROPERTY("IOPressureThresholdUSec", "t", bus_property_get_usec, offsetof(CGroupContext, pressure[PRESSURE_IO].threshold_usec), 0),
         SD_BUS_PROPERTY("NFTSet", "a(iiss)", property_get_cgroup_nft_set, 0, 0),
         SD_BUS_PROPERTY("CoredumpReceive", "b", bus_property_get_bool, offsetof(CGroupContext, coredump_receive), 0),
 
@@ -714,11 +716,12 @@ static int bus_cgroup_set_transient_property(
 
                 return 1;
 
-        } else if (STR_IN_SET(name, "MemoryPressureWatch", "CPUPressureWatch")) {
+        } else if (STR_IN_SET(name, "MemoryPressureWatch", "CPUPressureWatch", "IOPressureWatch")) {
                 CGroupPressureWatch p;
                 const char *t;
 
-                PressureResource pt = streq(name, "MemoryPressureWatch") ? PRESSURE_MEMORY : PRESSURE_CPU;
+                PressureResource pt = streq(name, "MemoryPressureWatch") ? PRESSURE_MEMORY :
+                                      streq(name, "CPUPressureWatch") ? PRESSURE_CPU : PRESSURE_IO;
 
                 r = sd_bus_message_read(message, "s", &t);
                 if (r < 0)
@@ -739,10 +742,11 @@ static int bus_cgroup_set_transient_property(
 
                 return 1;
 
-        } else if (STR_IN_SET(name, "MemoryPressureThresholdUSec", "CPUPressureThresholdUSec")) {
+        } else if (STR_IN_SET(name, "MemoryPressureThresholdUSec", "CPUPressureThresholdUSec", "IOPressureThresholdUSec")) {
                 uint64_t t;
 
-                PressureResource pt = streq(name, "MemoryPressureThresholdUSec") ? PRESSURE_MEMORY : PRESSURE_CPU;
+                PressureResource pt = streq(name, "MemoryPressureThresholdUSec") ? PRESSURE_MEMORY :
+                                      streq(name, "CPUPressureThresholdUSec") ? PRESSURE_CPU : PRESSURE_IO;
 
                 r = sd_bus_message_read(message, "t", &t);
                 if (r < 0)
index 23f4c4c3de8519d8b4a9db8c677be20d8fc7ca69..78cab48f852fc35a5772631531f799a5c166f00f 100644 (file)
@@ -2984,6 +2984,8 @@ const sd_bus_vtable bus_manager_vtable[] = {
         SD_BUS_PROPERTY("DefaultMemoryPressureWatch", "s", bus_property_get_cgroup_pressure_watch, offsetof(Manager, defaults.pressure[PRESSURE_MEMORY].watch), 0),
         SD_BUS_PROPERTY("DefaultCPUPressureThresholdUSec", "t", bus_property_get_usec, offsetof(Manager, defaults.pressure[PRESSURE_CPU].threshold_usec), 0),
         SD_BUS_PROPERTY("DefaultCPUPressureWatch", "s", bus_property_get_cgroup_pressure_watch, offsetof(Manager, defaults.pressure[PRESSURE_CPU].watch), 0),
+        SD_BUS_PROPERTY("DefaultIOPressureThresholdUSec", "t", bus_property_get_usec, offsetof(Manager, defaults.pressure[PRESSURE_IO].threshold_usec), 0),
+        SD_BUS_PROPERTY("DefaultIOPressureWatch", "s", bus_property_get_cgroup_pressure_watch, offsetof(Manager, defaults.pressure[PRESSURE_IO].watch), 0),
         SD_BUS_PROPERTY("TimerSlackNSec", "t", property_get_timer_slack_nsec, 0, SD_BUS_VTABLE_PROPERTY_CONST),
         SD_BUS_PROPERTY("DefaultOOMPolicy", "s", bus_property_get_oom_policy, offsetof(Manager, defaults.oom_policy), SD_BUS_VTABLE_PROPERTY_CONST),
         SD_BUS_PROPERTY("DefaultOOMScoreAdjust", "i", property_get_oom_score_adjust, 0, SD_BUS_VTABLE_PROPERTY_CONST),
index d3d23500a91f7693ec09140f2e75dfd608b72c37..143cfe6286b91636e00f79c8fe2085f18e6edcb2 100644 (file)
@@ -287,6 +287,10 @@ static int exec_cgroup_context_serialize(const CGroupContext *c, FILE *f) {
         if (r < 0)
                 return r;
 
+        r = serialize_item(f, "exec-cgroup-context-io-pressure-watch", cgroup_pressure_watch_to_string(c->pressure[PRESSURE_IO].watch));
+        if (r < 0)
+                return r;
+
         r = serialize_item(f, "exec-cgroup-context-delegate-subgroup", c->delegate_subgroup);
         if (r < 0)
                 return r;
@@ -303,6 +307,12 @@ static int exec_cgroup_context_serialize(const CGroupContext *c, FILE *f) {
                         return r;
         }
 
+        if (c->pressure[PRESSURE_IO].threshold_usec != USEC_INFINITY) {
+                r = serialize_usec(f, "exec-cgroup-context-io-pressure-threshold-usec", c->pressure[PRESSURE_IO].threshold_usec);
+                if (r < 0)
+                        return r;
+        }
+
         LIST_FOREACH(device_allow, a, c->device_allow) {
                 r = serialize_item_format(f, "exec-cgroup-context-device-allow", "%s %s",
                                           a->path,
@@ -638,6 +648,10 @@ static int exec_cgroup_context_deserialize(CGroupContext *c, FILE *f) {
                         c->pressure[PRESSURE_CPU].watch = cgroup_pressure_watch_from_string(val);
                         if (c->pressure[PRESSURE_CPU].watch < 0)
                                 return -EINVAL;
+                } else if ((val = startswith(l, "exec-cgroup-context-io-pressure-watch="))) {
+                        c->pressure[PRESSURE_IO].watch = cgroup_pressure_watch_from_string(val);
+                        if (c->pressure[PRESSURE_IO].watch < 0)
+                                return -EINVAL;
                 } else if ((val = startswith(l, "exec-cgroup-context-delegate-subgroup="))) {
                         r = free_and_strdup(&c->delegate_subgroup, val);
                         if (r < 0)
@@ -650,6 +664,10 @@ static int exec_cgroup_context_deserialize(CGroupContext *c, FILE *f) {
                         r = deserialize_usec(val, &c->pressure[PRESSURE_CPU].threshold_usec);
                         if (r < 0)
                                 return r;
+                } else if ((val = startswith(l, "exec-cgroup-context-io-pressure-threshold-usec="))) {
+                        r = deserialize_usec(val, &c->pressure[PRESSURE_IO].threshold_usec);
+                        if (r < 0)
+                                return r;
                 } else if ((val = startswith(l, "exec-cgroup-context-device-allow="))) {
                         _cleanup_free_ char *path = NULL, *rwm = NULL;
                         CGroupDevicePermissions p;
index 297836def17e7cad3ccb6dc109781156b3552dd7..17ac9c5138b263d4df56c557cc2b3b447a5e1b77 100644 (file)
 {{type}}.MemoryPressureWatch,                 config_parse_pressure_watch,                        0,                                  offsetof({{type}}, cgroup_context.pressure[PRESSURE_MEMORY].watch)
 {{type}}.CPUPressureThresholdSec,             config_parse_sec,                                   0,                                  offsetof({{type}}, cgroup_context.pressure[PRESSURE_CPU].threshold_usec)
 {{type}}.CPUPressureWatch,                    config_parse_pressure_watch,                        0,                                  offsetof({{type}}, cgroup_context.pressure[PRESSURE_CPU].watch)
+{{type}}.IOPressureThresholdSec,              config_parse_sec,                                   0,                                  offsetof({{type}}, cgroup_context.pressure[PRESSURE_IO].threshold_usec)
+{{type}}.IOPressureWatch,                     config_parse_pressure_watch,                        0,                                  offsetof({{type}}, cgroup_context.pressure[PRESSURE_IO].watch)
 {{type}}.NFTSet,                              config_parse_cgroup_nft_set,                        NFT_SET_PARSE_CGROUP,               offsetof({{type}}, cgroup_context)
 {{type}}.CoredumpReceive,                     config_parse_bool,                                  0,                                  offsetof({{type}}, cgroup_context.coredump_receive)
 {{type}}.BindNetworkInterface,                config_parse_bind_network_interface,                0,                                  offsetof({{type}}, cgroup_context)
index 7fcd0fa672dba5c73226cd9c966bbf5655fccc2f..655f0ac6659c62d4969843e439c74cc5b447683b 100644 (file)
@@ -821,6 +821,8 @@ static int parse_config_file(void) {
                 { "Manager", "DefaultMemoryPressureWatch",        config_parse_pressure_watch,        0,                        &arg_defaults.pressure[PRESSURE_MEMORY].watch          },
                 { "Manager", "DefaultCPUPressureThresholdSec",    config_parse_sec,                   0,                        &arg_defaults.pressure[PRESSURE_CPU].threshold_usec    },
                 { "Manager", "DefaultCPUPressureWatch",           config_parse_pressure_watch,        0,                        &arg_defaults.pressure[PRESSURE_CPU].watch             },
+                { "Manager", "DefaultIOPressureThresholdSec",     config_parse_sec,                   0,                        &arg_defaults.pressure[PRESSURE_IO].threshold_usec     },
+                { "Manager", "DefaultIOPressureWatch",            config_parse_pressure_watch,        0,                        &arg_defaults.pressure[PRESSURE_IO].watch              },
                 { "Manager", "CtrlAltDelBurstAction",             config_parse_emergency_action,      arg_runtime_scope,        &arg_cad_burst_action                                  },
                 { "Manager", "DefaultOOMPolicy",                  config_parse_oom_policy,            0,                        &arg_defaults.oom_policy                               },
                 { "Manager", "DefaultOOMScoreAdjust",             config_parse_oom_score_adjust,      0,                        NULL                                                   },
index c71d1a5d69a6ef0ec31180706472f48f86642438..73368ec18aec960158f664c1843303db489c4670 100644 (file)
@@ -621,6 +621,8 @@ static char** sanitize_environment(char **l) {
                         "CREDENTIALS_DIRECTORY",
                         "EXIT_CODE",
                         "EXIT_STATUS",
+                        "IO_PRESSURE_WATCH",
+                        "IO_PRESSURE_WRITE",
                         "INVOCATION_ID",
                         "JOURNAL_STREAM",
                         "LISTEN_FDNAMES",
@@ -807,6 +809,7 @@ static const struct {
 } pressure_dispatch_table[_PRESSURE_RESOURCE_MAX] = {
         [PRESSURE_MEMORY] = { sd_event_add_memory_pressure, sd_event_source_set_memory_pressure_period },
         [PRESSURE_CPU]    = { sd_event_add_cpu_pressure,    sd_event_source_set_cpu_pressure_period    },
+        [PRESSURE_IO]     = { sd_event_add_io_pressure,     sd_event_source_set_io_pressure_period     },
 };
 
 int manager_setup_pressure_event_source(Manager *m, PressureResource t) {
@@ -5213,6 +5216,7 @@ void unit_defaults_init(UnitDefaults *defaults, RuntimeScope scope) {
                 .pressure = {
                         [PRESSURE_MEMORY] = { .watch = CGROUP_PRESSURE_WATCH_AUTO, .threshold_usec = PRESSURE_DEFAULT_THRESHOLD_USEC },
                         [PRESSURE_CPU]    = { .watch = CGROUP_PRESSURE_WATCH_AUTO, .threshold_usec = PRESSURE_DEFAULT_THRESHOLD_USEC },
+                        [PRESSURE_IO]     = { .watch = CGROUP_PRESSURE_WATCH_AUTO, .threshold_usec = PRESSURE_DEFAULT_THRESHOLD_USEC },
                 },
 
                 .oom_policy = OOM_STOP,
index d3cb0160a01ea0f75e8db7644477a24f5f376233..63d28059305fe2419edf8f86412bf642309a0ccb 100644 (file)
@@ -80,6 +80,8 @@
 #DefaultMemoryPressureWatch=auto
 #DefaultCPUPressureThresholdSec=200ms
 #DefaultCPUPressureWatch=auto
+#DefaultIOPressureThresholdSec=200ms
+#DefaultIOPressureWatch=auto
 #DefaultOOMPolicy=stop
 #DefaultSmackProcessLabel=
 #DefaultRestrictSUIDSGID=
index fe45c00b74e4c6c30dfac89a0f32fd73744baa8f..33c6733268c08cd56e6364171704e246d89c838a 100644 (file)
@@ -56,6 +56,8 @@
 #DefaultMemoryPressureWatch=auto
 #DefaultCPUPressureThresholdSec=200ms
 #DefaultCPUPressureWatch=auto
+#DefaultIOPressureThresholdSec=200ms
+#DefaultIOPressureWatch=auto
 #DefaultSmackProcessLabel=
 #DefaultRestrictSUIDSGID=
 #ReloadLimitIntervalSec=
index d4ec6049e66dce2de0293e842949ce6b38433a52..ab32def28b7bb37a832ae30b61092b2d1db78835 100644 (file)
@@ -327,6 +327,8 @@ int unit_cgroup_context_build_json(sd_json_variant **ret, const char *name, void
                         JSON_BUILD_PAIR_FINITE_USEC("MemoryPressureThresholdUSec", c->pressure[PRESSURE_MEMORY].threshold_usec),
                         SD_JSON_BUILD_PAIR_STRING("CPUPressureWatch", cgroup_pressure_watch_to_string(c->pressure[PRESSURE_CPU].watch)),
                         JSON_BUILD_PAIR_FINITE_USEC("CPUPressureThresholdUSec", c->pressure[PRESSURE_CPU].threshold_usec),
+                        SD_JSON_BUILD_PAIR_STRING("IOPressureWatch", cgroup_pressure_watch_to_string(c->pressure[PRESSURE_IO].watch)),
+                        JSON_BUILD_PAIR_FINITE_USEC("IOPressureThresholdUSec", c->pressure[PRESSURE_IO].threshold_usec),
 
                         /* Others */
                         SD_JSON_BUILD_PAIR_BOOLEAN("CoredumpReceive", c->coredump_receive));
index 3953b8619f7afd0023afdea7e8201e0e39b9c366..997bdc08d01220ac00a75b01212451ed1aae21ea 100644 (file)
@@ -110,6 +110,8 @@ static int manager_context_build_json(sd_json_variant **ret, const char *name, v
                         SD_JSON_BUILD_PAIR_STRING("DefaultMemoryPressureWatch", cgroup_pressure_watch_to_string(m->defaults.pressure[PRESSURE_MEMORY].watch)),
                         JSON_BUILD_PAIR_FINITE_USEC("DefaultCPUPressureThresholdUSec", m->defaults.pressure[PRESSURE_CPU].threshold_usec),
                         SD_JSON_BUILD_PAIR_STRING("DefaultCPUPressureWatch", cgroup_pressure_watch_to_string(m->defaults.pressure[PRESSURE_CPU].watch)),
+                        JSON_BUILD_PAIR_FINITE_USEC("DefaultIOPressureThresholdUSec", m->defaults.pressure[PRESSURE_IO].threshold_usec),
+                        SD_JSON_BUILD_PAIR_STRING("DefaultIOPressureWatch", cgroup_pressure_watch_to_string(m->defaults.pressure[PRESSURE_IO].watch)),
                         JSON_BUILD_PAIR_FINITE_USEC("RuntimeWatchdogUSec", manager_get_watchdog(m, WATCHDOG_RUNTIME)),
                         JSON_BUILD_PAIR_FINITE_USEC("RebootWatchdogUSec", manager_get_watchdog(m, WATCHDOG_REBOOT)),
                         JSON_BUILD_PAIR_FINITE_USEC("KExecWatchdogUSec", manager_get_watchdog(m, WATCHDOG_KEXEC)),
index 5f5eca60833b207ecf5632dfbce04752feb05988..38ab92dea124b25242d79ced8b39a985d8da80d5 100644 (file)
@@ -1099,4 +1099,7 @@ global:
         sd_event_add_cpu_pressure;
         sd_event_source_set_cpu_pressure_type;
         sd_event_source_set_cpu_pressure_period;
+        sd_event_add_io_pressure;
+        sd_event_source_set_io_pressure_type;
+        sd_event_source_set_io_pressure_period;
 } LIBSYSTEMD_260;
index c7d5ba166da31fea2d68281d1cf800e74ecefa18..8487c966ab4096b73836058d034b60641bb50de8 100644 (file)
@@ -27,6 +27,7 @@ typedef enum EventSourceType {
         SOURCE_INOTIFY,
         SOURCE_MEMORY_PRESSURE,
         SOURCE_CPU_PRESSURE,
+        SOURCE_IO_PRESSURE,
         _SOURCE_EVENT_SOURCE_TYPE_MAX,
         _SOURCE_EVENT_SOURCE_TYPE_INVALID = -EINVAL,
 } EventSourceType;
index aba6bf9b4787b317c8a2717f42bea802c5c17ed6..9256ddd81bfeaa08801871a98b662f9510b0becb 100644 (file)
@@ -77,6 +77,7 @@ static const char* const event_source_type_table[_SOURCE_EVENT_SOURCE_TYPE_MAX]
         [SOURCE_INOTIFY]             = "inotify",
         [SOURCE_MEMORY_PRESSURE]     = "memory-pressure",
         [SOURCE_CPU_PRESSURE]        = "cpu-pressure",
+        [SOURCE_IO_PRESSURE]         = "io-pressure",
 };
 
 DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(event_source_type, int);
@@ -101,7 +102,8 @@ DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(event_source_type, int);
                SOURCE_DEFER,                    \
                SOURCE_INOTIFY,                  \
                SOURCE_MEMORY_PRESSURE,          \
-               SOURCE_CPU_PRESSURE)
+               SOURCE_CPU_PRESSURE,             \
+               SOURCE_IO_PRESSURE)
 
 /* This is used to assert that we didn't pass an unexpected source type to event_source_time_prioq_put().
  * Time sources and ratelimited sources can be passed, so effectively this is the same as the
@@ -566,7 +568,7 @@ static int source_child_pidfd_register(sd_event_source *s, int enabled) {
         return 0;
 }
 
-#define EVENT_SOURCE_IS_PRESSURE(s) IN_SET((s)->type, SOURCE_MEMORY_PRESSURE, SOURCE_CPU_PRESSURE)
+#define EVENT_SOURCE_IS_PRESSURE(s) IN_SET((s)->type, SOURCE_MEMORY_PRESSURE, SOURCE_CPU_PRESSURE, SOURCE_IO_PRESSURE)
 
 static void source_pressure_unregister(sd_event_source *s) {
         assert(s);
@@ -1052,6 +1054,7 @@ static void source_disconnect(sd_event_source *s) {
 
         case SOURCE_MEMORY_PRESSURE:
         case SOURCE_CPU_PRESSURE:
+        case SOURCE_IO_PRESSURE:
                 source_pressure_remove_from_write_list(s);
                 source_pressure_unregister(s);
                 break;
@@ -1198,6 +1201,7 @@ static sd_event_source* source_new(sd_event *e, bool floating, EventSourceType t
                 [SOURCE_INOTIFY]             = endoffsetof_field(sd_event_source, inotify),
                 [SOURCE_MEMORY_PRESSURE]     = endoffsetof_field(sd_event_source, pressure),
                 [SOURCE_CPU_PRESSURE]        = endoffsetof_field(sd_event_source, pressure),
+                [SOURCE_IO_PRESSURE]         = endoffsetof_field(sd_event_source, pressure),
         };
 
         sd_event_source *s;
@@ -2110,8 +2114,8 @@ static int event_add_pressure(
          * fd with the epoll right-away. Instead, we just add the event source to a list of pressure event
          * sources on which writes must be executed before the first event loop iteration is executed. (We
          * could also write the data here, right away, but we want to give the caller the freedom to call
-         * sd_event_source_set_{memory,cpu}_pressure_type() and
-         * sd_event_source_set_{memory,cpu}_pressure_period() before we write it. */
+         * sd_event_source_set_{memory,cpu,io}_pressure_type() and
+         * sd_event_source_set_{memory,cpu,io}_pressure_period() before we write it. */
 
         if (s->pressure.write_buffer_size > 0)
                 source_pressure_add_to_write_list(s);
@@ -2160,6 +2164,25 @@ _public_ int sd_event_add_cpu_pressure(
                         PRESSURE_CPU);
 }
 
+static int io_pressure_callback(sd_event_source *s, void *userdata) {
+        assert(s);
+        /* Nothing else is done here: after checking the source, report success. */
+        return 0;
+}
+
+_public_ int sd_event_add_io_pressure(
+                sd_event *e,
+                sd_event_source **ret,
+                sd_event_handler_t callback,
+                void *userdata) {
+        /* Delegate to event_add_pressure(), selecting the IO source type, handler and resource. */
+        return event_add_pressure(
+                        e, ret, callback, userdata,
+                        SOURCE_IO_PRESSURE,
+                        io_pressure_callback,
+                        PRESSURE_IO);
+}
+
 static void event_free_inotify_data(sd_event *e, InotifyData *d) {
         assert(e);
 
@@ -2962,6 +2985,7 @@ static int event_source_offline(
 
         case SOURCE_MEMORY_PRESSURE:
         case SOURCE_CPU_PRESSURE:
+        case SOURCE_IO_PRESSURE:
                 source_pressure_unregister(s);
                 break;
 
@@ -3054,6 +3078,7 @@ static int event_source_online(
 
         case SOURCE_MEMORY_PRESSURE:
         case SOURCE_CPU_PRESSURE:
+        case SOURCE_IO_PRESSURE:
                 /* As documented in sd_event_add_{memory,cpu,io}_pressure(), we can only register the PSI fd
                  * with epoll after writing the watch string. */
                 if (s->pressure.write_buffer_size == 0) {
@@ -4308,6 +4333,7 @@ static int source_dispatch(sd_event_source *s) {
 
         case SOURCE_MEMORY_PRESSURE:
         case SOURCE_CPU_PRESSURE:
+        case SOURCE_IO_PRESSURE:
                 r = s->pressure.callback(s, s->userdata);
                 break;
 
@@ -4723,6 +4749,7 @@ static int process_epoll(sd_event *e, usec_t timeout, int64_t threshold, int64_t
 
                                 case SOURCE_MEMORY_PRESSURE:
                                 case SOURCE_CPU_PRESSURE:
+                                case SOURCE_IO_PRESSURE:
                                         r = process_pressure(s, i->events);
                                         break;
 
@@ -5418,6 +5445,13 @@ _public_ int sd_event_source_set_cpu_pressure_type(sd_event_source *s, const cha
         return event_source_set_pressure_type(s, ty);
 }
 
+_public_ int sd_event_source_set_io_pressure_type(sd_event_source *s, const char *ty) {
+        assert_return(s, -EINVAL);
+        assert_return(s->type == SOURCE_IO_PRESSURE, -EDOM);
+        /* Checks done: forward to the type setter that the CPU pressure variant also uses. */
+        return event_source_set_pressure_type(s, ty);
+}
+
 static int event_source_set_pressure_period(sd_event_source *s, uint64_t threshold_usec, uint64_t window_usec) {
         _cleanup_free_ char *b = NULL;
         _cleanup_free_ void *w = NULL;
@@ -5478,3 +5512,10 @@ _public_ int sd_event_source_set_cpu_pressure_period(sd_event_source *s, uint64_
 
         return event_source_set_pressure_period(s, threshold_usec, window_usec);
 }
+
+_public_ int sd_event_source_set_io_pressure_period(sd_event_source *s, uint64_t threshold_usec, uint64_t window_usec) {
+        assert_return(s, -EINVAL);
+        assert_return(s->type == SOURCE_IO_PRESSURE, -EDOM);
+        /* Checks done: forward to the period setter that the CPU pressure variant also uses. */
+        return event_source_set_pressure_period(s, threshold_usec, window_usec);
+}
index 9c732543fac7d16acc517f4fb47ea1ad94bef761..1a6bc7370f81e061b929fa16e380823368219219 100644 (file)
@@ -2384,6 +2384,7 @@ static const BusProperty cgroup_properties[] = {
         { "ManagedOOMPreference",                  bus_append_string                             },
         { "MemoryPressureWatch",                   bus_append_string                             },
         { "CPUPressureWatch",                      bus_append_string                             },
+        { "IOPressureWatch",                       bus_append_string                             },
         { "DelegateSubgroup",                      bus_append_string                             },
         { "ManagedOOMMemoryPressureLimit",         bus_append_parse_permyriad                    },
         { "MemoryAccounting",                      bus_append_parse_boolean                      },
@@ -2423,6 +2424,7 @@ static const BusProperty cgroup_properties[] = {
         { "SocketBindDeny",                        bus_append_socket_filter                      },
         { "MemoryPressureThresholdSec",            bus_append_parse_sec_rename                   },
         { "CPUPressureThresholdSec",               bus_append_parse_sec_rename                   },
+        { "IOPressureThresholdSec",                bus_append_parse_sec_rename                   },
         { "NFTSet",                                bus_append_nft_set                            },
         { "BindNetworkInterface",                  bus_append_string                             },
 
index f947c0a05615c8d5ef4ebbaefe34eb83dc5a5440..9ce1b8350abeee9f9e2d613bd0cfc6087c359c94 100644 (file)
@@ -68,6 +68,10 @@ static SD_VARLINK_DEFINE_STRUCT_TYPE(
                 SD_VARLINK_DEFINE_FIELD(DefaultCPUPressureThresholdUSec, SD_VARLINK_INT, 0),
                 SD_VARLINK_FIELD_COMMENT("https://www.freedesktop.org/software/systemd/man/"PROJECT_VERSION_STR"/systemd-system.conf.html#DefaultCPUPressureWatch="),
                 SD_VARLINK_DEFINE_FIELD(DefaultCPUPressureWatch, SD_VARLINK_STRING, 0),
+                SD_VARLINK_FIELD_COMMENT("https://www.freedesktop.org/software/systemd/man/"PROJECT_VERSION_STR"/systemd-system.conf.html#DefaultIOPressureThresholdSec="),
+                SD_VARLINK_DEFINE_FIELD(DefaultIOPressureThresholdUSec, SD_VARLINK_INT, 0),
+                SD_VARLINK_FIELD_COMMENT("https://www.freedesktop.org/software/systemd/man/"PROJECT_VERSION_STR"/systemd-system.conf.html#DefaultIOPressureWatch="),
+                SD_VARLINK_DEFINE_FIELD(DefaultIOPressureWatch, SD_VARLINK_STRING, 0),
                 SD_VARLINK_FIELD_COMMENT("https://www.freedesktop.org/software/systemd/man/"PROJECT_VERSION_STR"/systemd-system.conf.html#RuntimeWatchdogSec="),
                 SD_VARLINK_DEFINE_FIELD(RuntimeWatchdogUSec, SD_VARLINK_INT, SD_VARLINK_NULLABLE),
                 SD_VARLINK_FIELD_COMMENT("https://www.freedesktop.org/software/systemd/man/"PROJECT_VERSION_STR"/systemd-system.conf.html#RebootWatchdogSec="),
index a230f29daba8b711b98f151a2368402ebadf5820..c1ff4ebc5a76cd718c5c2dc99e3da04bc4ab0fc4 100644 (file)
@@ -232,6 +232,10 @@ static SD_VARLINK_DEFINE_STRUCT_TYPE(
                 SD_VARLINK_DEFINE_FIELD(CPUPressureWatch, SD_VARLINK_STRING, 0),
                 SD_VARLINK_FIELD_COMMENT("https://www.freedesktop.org/software/systemd/man/"PROJECT_VERSION_STR"/systemd.resource-control.html#CPUPressureThresholdSec="),
                 SD_VARLINK_DEFINE_FIELD(CPUPressureThresholdUSec, SD_VARLINK_INT, SD_VARLINK_NULLABLE),
+                SD_VARLINK_FIELD_COMMENT("https://www.freedesktop.org/software/systemd/man/"PROJECT_VERSION_STR"/systemd.resource-control.html#IOPressureWatch="),
+                SD_VARLINK_DEFINE_FIELD(IOPressureWatch, SD_VARLINK_STRING, 0),
+                SD_VARLINK_FIELD_COMMENT("https://www.freedesktop.org/software/systemd/man/"PROJECT_VERSION_STR"/systemd.resource-control.html#IOPressureThresholdSec="),
+                SD_VARLINK_DEFINE_FIELD(IOPressureThresholdUSec, SD_VARLINK_INT, SD_VARLINK_NULLABLE),
 
                 /* Others */
                 SD_VARLINK_FIELD_COMMENT("Reflects whether to forward coredumps for processes that crash within this cgroup"),
index 71fc9504889e621da4f29f23a0a5cb17addf495d..34bd396080dc32d56d692749b89a28891f379e49 100644 (file)
@@ -98,6 +98,7 @@ int sd_event_add_post(sd_event *e, sd_event_source **ret, sd_event_handler_t cal
 int sd_event_add_exit(sd_event *e, sd_event_source **ret, sd_event_handler_t callback, void *userdata);
 int sd_event_add_memory_pressure(sd_event *e, sd_event_source **ret, sd_event_handler_t callback, void *userdata);
 int sd_event_add_cpu_pressure(sd_event *e, sd_event_source **ret, sd_event_handler_t callback, void *userdata);
+int sd_event_add_io_pressure(sd_event *e, sd_event_source **ret, sd_event_handler_t callback, void *userdata);
 
 int sd_event_prepare(sd_event *e);
 int sd_event_wait(sd_event *e, uint64_t timeout);
@@ -165,6 +166,8 @@ int sd_event_source_set_memory_pressure_type(sd_event_source *s, const char *ty)
 int sd_event_source_set_memory_pressure_period(sd_event_source *s, uint64_t threshold_usec, uint64_t window_usec);
 int sd_event_source_set_cpu_pressure_type(sd_event_source *s, const char *ty);
 int sd_event_source_set_cpu_pressure_period(sd_event_source *s, uint64_t threshold_usec, uint64_t window_usec);
+int sd_event_source_set_io_pressure_type(sd_event_source *s, const char *ty);
+int sd_event_source_set_io_pressure_period(sd_event_source *s, uint64_t threshold_usec, uint64_t window_usec);
 int sd_event_source_set_destroy_callback(sd_event_source *s, sd_event_destroy_t callback);
 int sd_event_source_get_destroy_callback(sd_event_source *s, sd_event_destroy_t *ret);
 int sd_event_source_get_floating(sd_event_source *s);
index 44ff810753e8aefd0ef98c3a7d60c33d6172e5ca..318b73e4fd6ccdea586ccca6271d7ddaf889ba11 100644 (file)
@@ -154,6 +154,14 @@ TEST(fake_cpu_pressure) {
         test_fake_pressure("cpu", fake_cpu_pressure_wrapper);
 }
 
+/* Forwards all four arguments unchanged to sd_event_add_io_pressure() and returns its result. Handed to
+ * test_fake_pressure() by TEST(fake_io_pressure). */
+static int fake_io_pressure_wrapper(
+                sd_event *e,
+                sd_event_source **ret,
+                sd_event_handler_t callback,
+                void *userdata) {
+
+        return sd_event_add_io_pressure(e, ret, callback, userdata);
+}
+
+TEST(fake_io_pressure) {
+        test_fake_pressure("io", fake_io_pressure_wrapper); /* shared fake-pressure test, run here for "io" */
+}
+
 /* Shared infrastructure for real pressure tests */
 
 struct real_pressure_context {
@@ -452,7 +460,142 @@ TEST(real_cpu_pressure) {
         ASSERT_EQ(ex, 31);
 }
 
+/* IO pressure real test */
+
+/* Handler for the real IO pressure event source: logs the source's description, sends SIGKILL to the child
+ * event source stored in the context passed as userdata, then clears that reference. Returns 0. */
+static int real_io_pressure_callback(sd_event_source *s, void *userdata) {
+        struct real_pressure_context *context = ASSERT_PTR(userdata);
+        const char *description;
+
+        ASSERT_NOT_NULL(s);
+        ASSERT_OK(sd_event_source_get_description(s, &description));
+        log_notice("real io pressure event: %s", description);
+
+        /* Signal the child through its event source, and only then drop our reference to it. */
+        sd_event_source *child = context->pid;
+        ASSERT_NOT_NULL(child);
+        ASSERT_OK(sd_event_source_send_child_signal(child, SIGKILL, NULL, 0));
+        context->pid = NULL;
+
+        return 0;
+}
+
+/* Child-side IO load generator: blocks until one byte arrives on pipe_fd, then keeps rewriting (up to 1 MiB
+ * per pass) and fsync()ing a file below /var/tmp. Never returns. */
+_noreturn_ static void real_pressure_eat_io(int pipe_fd) {
+        char buf[4096];
+        char x;
+
+        ASSERT_EQ(read(pipe_fd, &x, 1), 1); /* Wait for the GO! */
+
+        /* The payload never changes, hence fill it once instead of on every pass. */
+        memset(buf, 'x', sizeof(buf));
+
+        /* Write and fsync in a loop to generate IO pressure */
+        for (;;) {
+                _cleanup_close_ int fd = -EBADF;
+
+                /* The path is fixed and guessable, and /var/tmp is typically world-writable, so refuse to
+                 * follow a symlink planted there. If the file cannot be opened no IO would ever be
+                 * generated, hence fail loudly instead of spinning on open() forever. */
+                fd = open("/var/tmp/.io-pressure-test", O_WRONLY|O_CREAT|O_TRUNC|O_CLOEXEC|O_NOFOLLOW, 0600);
+                ASSERT_OK_ERRNO(fd);
+
+                /* 256 * 4 KiB per pass; a failed write merely ends this pass early. */
+                for (int i = 0; i < 256; i++)
+                        if (write(fd, buf, sizeof(buf)) < 0)
+                                break;
+                (void) fsync(fd);
+        }
+}
+
+TEST(real_io_pressure) { /* End-to-end: a write-throttled child has to trigger a real IO pressure event */
+        _cleanup_(sd_bus_message_unrefp) sd_bus_message *m = NULL, *reply = NULL;
+        _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
+        _cleanup_(sd_event_source_unrefp) sd_event_source *es = NULL, *cs = NULL;
+        _cleanup_(bus_wait_for_jobs_freep) BusWaitForJobs *w = NULL;
+        _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL;
+        _cleanup_close_pair_ int pipe_fd[2] = EBADF_PAIR;
+        _cleanup_(sd_event_unrefp) sd_event *e = NULL;
+        _cleanup_free_ char *scope = NULL;
+        const char *object;
+        int r;
+        /* Use the system bus when running as root, the user bus otherwise; skip if the connection fails. */
+        if (getuid() == 0)
+                r = sd_bus_open_system(&bus);
+        else
+                r = sd_bus_open_user(&bus);
+        if (r < 0)
+                return (void) log_tests_skipped_errno(r, "can't connect to bus");
+
+        ASSERT_OK(bus_wait_for_jobs_new(bus, &w));
+        /* Request a randomly named transient scope with PIDs=[0] (presumably "the caller" — confirm) and IOAccounting on. */
+        ASSERT_OK(bus_message_new_method_call(bus, &m, bus_systemd_mgr, "StartTransientUnit"));
+        ASSERT_OK(asprintf(&scope, "test-%" PRIu64 ".scope", random_u64()));
+        ASSERT_OK(sd_bus_message_append(m, "ss", scope, "fail"));
+        ASSERT_OK(sd_bus_message_open_container(m, 'a', "(sv)"));
+        ASSERT_OK(sd_bus_message_append(m, "(sv)", "PIDs", "au", 1, 0));
+        ASSERT_OK(sd_bus_message_append(m, "(sv)", "IOAccounting", "b", true));
+        ASSERT_OK(sd_bus_message_close_container(m));
+        ASSERT_OK(sd_bus_message_append(m, "a(sa(sv))", 0));
+
+        r = sd_bus_call(bus, m, 0, &error, &reply);
+        if (r < 0)
+                return (void) log_tests_skipped_errno(r, "can't issue transient unit call");
+
+        ASSERT_OK(sd_bus_message_read(reply, "o", &object));
+        /* Wait for the job whose object path the StartTransientUnit reply carried. */
+        ASSERT_OK(bus_wait_for_jobs_one(w, object, /* flags= */ BUS_WAIT_JOBS_LOG_ERROR, /* extra_args= */ NULL));
+
+        ASSERT_OK(sd_event_default(&e));
+        /* The child blocks reading this pipe until the parent writes the go-ahead byte further down. */
+        ASSERT_OK_ERRNO(pipe2(pipe_fd, O_CLOEXEC));
+
+        _cleanup_(pidref_done) PidRef pidref = PIDREF_NULL;
+        r = pidref_safe_fork("(eat-io)", FORK_RESET_SIGNALS|FORK_DEATHSIG_SIGTERM, &pidref);
+        ASSERT_OK(r);
+        if (r == 0) {
+                real_pressure_eat_io(pipe_fd[0]);
+                _exit(EXIT_SUCCESS);
+        }
+        /* Have the event loop report the child's exit (WEXITED) to real_pressure_child_callback. */
+        ASSERT_OK(event_add_child_pidref(e, &cs, &pidref, WEXITED, real_pressure_child_callback, NULL));
+        ASSERT_OK(sd_event_source_set_child_process_own(cs, true));
+        /* Drop both variables first, presumably so inherited settings cannot influence the new source — confirm. */
+        ASSERT_OK_ERRNO(unsetenv("IO_PRESSURE_WATCH"));
+        ASSERT_OK_ERRNO(unsetenv("IO_PRESSURE_WRITE"));
+
+        struct real_pressure_context context = {
+                .pid = cs,
+        };
+
+        r = sd_event_add_io_pressure(e, &es, real_io_pressure_callback, &context);
+        if (r < 0)
+                return (void) log_tests_skipped_errno(r, "can't allocate io pressure fd");
+        /* Configure the source: "some" stalls, 70ms threshold in a 2s window; repeating the same period must return 0. */
+        ASSERT_OK(sd_event_source_set_description(es, "real pressure event source"));
+        ASSERT_OK_ZERO(sd_event_source_set_io_pressure_type(es, "some"));
+        /* Unprivileged writes require a minimum of 2s otherwise the kernel will refuse the write. */
+        ASSERT_OK_POSITIVE(sd_event_source_set_io_pressure_period(es, 70 * USEC_PER_MSEC, 2 * USEC_PER_SEC));
+        ASSERT_OK_ZERO(sd_event_source_set_io_pressure_period(es, 70 * USEC_PER_MSEC, 2 * USEC_PER_SEC));
+        ASSERT_OK(sd_event_source_set_enabled(es, SD_EVENT_ONESHOT));
+
+        m = sd_bus_message_unref(m);
+        /* Set IOWriteBandwidthMax for "/var/tmp" on the scope so that the child's writes get throttled. */
+        ASSERT_OK(bus_message_new_method_call(bus, &m, bus_systemd_mgr, "SetUnitProperties"));
+        ASSERT_OK(sd_bus_message_append(m, "sb", scope, true));
+        ASSERT_OK(sd_bus_message_open_container(m, 'a', "(sv)"));
+        ASSERT_OK(sd_bus_message_open_container(m, 'r', "sv"));
+        ASSERT_OK(sd_bus_message_append(m, "s", "IOWriteBandwidthMax"));
+        ASSERT_OK(sd_bus_message_open_container(m, 'v', "a(st)"));
+        ASSERT_OK(sd_bus_message_append(m, "a(st)", 1, "/var/tmp", (uint64_t) 1024*1024)); /* 1M/s */
+        ASSERT_OK(sd_bus_message_close_container(m));
+        ASSERT_OK(sd_bus_message_close_container(m));
+        ASSERT_OK(sd_bus_message_close_container(m));
+
+        ASSERT_OK(sd_bus_call(bus, m, 0, NULL, NULL));
+
+        /* Now start eating IO */
+        ASSERT_EQ(write(pipe_fd[1], &(const char) { 'x' }, 1), 1);
+        /* The loop has to end with exit code 31 (presumably set by real_pressure_child_callback — confirm). */
+        ASSERT_OK(sd_event_loop(e));
+        int ex = 0;
+        ASSERT_OK(sd_event_get_exit_code(e, &ex));
+        ASSERT_EQ(ex, 31);
+}
+
 static int outro(void) {
+        (void) unlink("/var/tmp/.io-pressure-test"); /* best-effort removal of the file real_pressure_eat_io() writes */
         hashmap_trim_pools();
         return 0;
 }
index d4e4a9e06b5b44b93c4804e1bd3e9983289b77e2..72de8a1d9d189b79a49c028c61f66981f4eb119e 100755 (executable)
@@ -114,4 +114,57 @@ systemd-run \
 
 rm "$SCRIPT"
 
+# Now test IO pressure
+
+if ! cat /proc/pressure/io >/dev/null ; then
+    echo "kernel has no IO PSI support." >&2
+    echo OK >/testok
+    exit 0
+fi
+
+if ! test -f "$CGROUP"/io.pressure ; then
+    echo "No IO accounting/PSI delegated via cgroup, can't test." >&2
+    echo OK >/testok
+    exit 0
+fi
+
+UNIT="test-iopress-$RANDOM.service"
+SCRIPT="/tmp/iopress-$RANDOM.sh"
+# Payload: asserts $IO_PRESSURE_WATCH is set, not /dev/null and writable, and $IO_PRESSURE_WRITE is base64 of "some 123000 2000000\0".
+cat >"$SCRIPT" <<'EOF'
+#!/usr/bin/env bash
+
+set -ex
+
+export
+id
+
+test -n "$IO_PRESSURE_WATCH"
+test "$IO_PRESSURE_WATCH" != /dev/null
+test -w "$IO_PRESSURE_WATCH"
+
+ls -al "$IO_PRESSURE_WATCH"
+
+EXPECTED="$(echo -n -e "some 123000 2000000\x00" | base64)"
+
+test "$EXPECTED" = "$IO_PRESSURE_WRITE"
+
+EOF
+
+chmod +x "$SCRIPT"
+
+systemd-run \
+    -u "$UNIT" \
+    -p Type=exec \
+    -p ProtectControlGroups=1 \
+    -p DynamicUser=1 \
+    -p IOPressureWatch=on \
+    -p IOPressureThresholdSec=123ms \
+    -p BindPaths=$SCRIPT \
+    `# Make sanitizers happy when DynamicUser=1 pulls in instrumented systemd NSS modules` \
+    -p EnvironmentFile=-/usr/lib/systemd/systemd-asan-env \
+    --wait "$SCRIPT"
+
+rm "$SCRIPT"
+
 touch /testok