]> git.ipfire.org Git - thirdparty/systemd.git/blobdiff - man/systemd.exec.xml
man: document relationship of .socket units and network namespaces
[thirdparty/systemd.git] / man / systemd.exec.xml
index a17db8d8505d134fc0dcfe2357fcb044cef88108..5c043497bbe1da7fb90867df9f4209330d3e1048 100644 (file)
@@ -1,4 +1,4 @@
-<?xml version='1.0'?> <!--*- Mode: nxml; nxml-child-indent: 2; indent-tabs-mode: nil -*-->
+<?xml version='1.0'?>
 <!DOCTYPE refentry PUBLIC "-//OASIS//DTD DocBook XML V4.2//EN"
   "http://www.oasis-open.org/docbook/xml/4.2/docbookx.dtd">
 
@@ -81,6 +81,9 @@
   <refsect1>
     <title>Paths</title>
 
+    <para>The following settings may be used to change a service's view of the filesystem. Please note that the paths
+    must be absolute and must not contain a <literal>..</literal> path component.</para>
+
     <variablelist class='unit-directives'>
 
       <varlistentry>
         partition table, or a file system within an MBR/MS-DOS or GPT partition table with only a single
         Linux-compatible partition, or a set of file systems within a GPT partition table that follows the <ulink
         url="https://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/">Discoverable Partitions
-        Specification</ulink>.</para></listitem>
+        Specification</ulink>.</para>
+
+        <para>When <varname>DevicePolicy=</varname> is set to <literal>closed</literal> or <literal>strict</literal>,
+        or set to <literal>auto</literal> and <varname>DeviceAllow=</varname> is set, then this setting adds
+        <filename>/dev/loop-control</filename> with <constant>rw</constant> mode, <literal>block-loop</literal> and
+        <literal>block-blkext</literal> with <constant>rwm</constant> mode to <varname>DeviceAllow=</varname>. See
+        <citerefentry><refentrytitle>systemd.resource-control</refentrytitle><manvolnum>5</manvolnum></citerefentry>
+        for the details about <varname>DevicePolicy=</varname> or <varname>DeviceAllow=</varname>. Also, see
+        <varname>PrivateDevices=</varname> below, as it may change the setting of <varname>DevicePolicy=</varname>.
+        </para></listitem>
       </varlistentry>
 
       <varlistentry>
@@ -738,6 +750,15 @@ CapabilityBoundingSet=~CAP_B CAP_C</programlisting>
   <refsect1>
     <title>Sandboxing</title>
 
+    <para>The following sandboxing options are an effective way to limit the exposure of the system towards the unit's
+    processes. It is recommended to turn on as many of these options for each unit as is possible without negatively
+    affecting the process' ability to operate. Note that many of these sandboxing features are gracefully turned off on
+    systems where the underlying security mechanism is not available. For example, <varname>ProtectSystem=</varname>
+    has no effect if the kernel is built without file system namespacing or if the service manager runs in a container
+    manager that makes file system namespacing unavailable to its payload. Similarly,
+    <varname>RestrictRealtime=</varname> has no effect on systems that lack support for SECCOMP system call filtering,
+    or in containers where support for this is turned off.</para>
+
     <variablelist>
 
       <varlistentry>
@@ -793,15 +814,18 @@ CapabilityBoundingSet=~CAP_B CAP_C</programlisting>
         <listitem><para>These options take a whitespace-separated list of directory names. The specified directory
         names must be relative, and may not include <literal>..</literal>. If set, one or more
         directories by the specified names will be created (including their parents) below the locations
-        defined in the following table, when the unit is started.</para>
+        defined in the following table, when the unit is started. Also, the corresponding environment variable
+        is defined with the full paths of the directories. If multiple directories are set, then in the environment variable
+        the paths are concatenated with colon (<literal>:</literal>).</para>
         <table>
-          <title>Automatic directory creation</title>
-          <tgroup cols='3'>
+          <title>Automatic directory creation and environment variables</title>
+          <tgroup cols='4'>
             <thead>
               <row>
                 <entry>Locations</entry>
                 <entry>for system</entry>
                 <entry>for users</entry>
+                <entry>Environment variable</entry>
               </row>
             </thead>
             <tbody>
@@ -809,26 +833,31 @@ CapabilityBoundingSet=~CAP_B CAP_C</programlisting>
                 <entry><varname>RuntimeDirectory=</varname></entry>
                 <entry><filename>/run</filename></entry>
                 <entry><varname>$XDG_RUNTIME_DIR</varname></entry>
+                <entry><varname>$RUNTIME_DIRECTORY</varname></entry>
               </row>
               <row>
                 <entry><varname>StateDirectory=</varname></entry>
                 <entry><filename>/var/lib</filename></entry>
                 <entry><varname>$XDG_CONFIG_HOME</varname></entry>
+                <entry><varname>$STATE_DIRECTORY</varname></entry>
               </row>
               <row>
                 <entry><varname>CacheDirectory=</varname></entry>
                 <entry><filename>/var/cache</filename></entry>
                 <entry><varname>$XDG_CACHE_HOME</varname></entry>
+                <entry><varname>$CACHE_DIRECTORY</varname></entry>
               </row>
               <row>
                 <entry><varname>LogsDirectory=</varname></entry>
                 <entry><filename>/var/log</filename></entry>
                 <entry><varname>$XDG_CONFIG_HOME</varname><filename>/log</filename></entry>
+                <entry><varname>$LOGS_DIRECTORY</varname></entry>
               </row>
               <row>
                 <entry><varname>ConfigurationDirectory=</varname></entry>
                 <entry><filename>/etc</filename></entry>
                 <entry><varname>$XDG_CONFIG_HOME</varname></entry>
+                <entry><varname>$CONFIGURATION_DIRECTORY</varname></entry>
               </row>
             </tbody>
           </tgroup>
@@ -878,7 +907,13 @@ CapabilityBoundingSet=~CAP_B CAP_C</programlisting>
         <filename>/run/foo/bar</filename>, and <filename>/run/baz</filename>. The directories
         <filename>/run/foo/bar</filename> and <filename>/run/baz</filename> except <filename>/run/foo</filename> are
         owned by the user and group specified in <varname>User=</varname> and <varname>Group=</varname>, and removed
-        when the service is stopped.</para></listitem>
+        when the service is stopped.</para>
+
+        <para>Example: if a system service unit has the following,
+        <programlisting>RuntimeDirectory=foo/bar
+StateDirectory=aaa/bbb ccc</programlisting>
+        then the environment variable <literal>RUNTIME_DIRECTORY</literal> is set to <literal>/run/foo/bar</literal>, and
+        <literal>STATE_DIRECTORY</literal> is set to <literal>/var/lib/aaa/bbb:/var/lib/ccc</literal>.</para></listitem>
       </varlistentry>
 
       <varlistentry>
@@ -1043,9 +1078,13 @@ BindReadOnlyPaths=/var/lib/systemd</programlisting>
         Defaults to false. It is possible to run two or more units within the same private network namespace by using
         the <varname>JoinsNamespaceOf=</varname> directive, see
         <citerefentry><refentrytitle>systemd.unit</refentrytitle><manvolnum>5</manvolnum></citerefentry> for
-        details. Note that this option will disconnect all socket families from the host, this includes AF_NETLINK and
-        AF_UNIX.  The latter has the effect that AF_UNIX sockets in the abstract socket namespace will become
-        unavailable to the processes (however, those located in the file system will continue to be accessible).</para>
+        details. Note that this option will disconnect all socket families from the host, including
+        <constant>AF_NETLINK</constant> and <constant>AF_UNIX</constant>. Effectively, for
+        <constant>AF_NETLINK</constant> this means that device configuration events received from
+        <citerefentry><refentrytitle>systemd-udevd.service</refentrytitle><manvolnum>8</manvolnum></citerefentry> are
+        not delivered to the unit's processes. And for <constant>AF_UNIX</constant> this has the effect that
+        <constant>AF_UNIX</constant> sockets in the abstract socket namespace of the host will become unavailable to
+        the unit's processes (however, those located in the file system will continue to be accessible).</para>
 
         <para>Note that the implementation of this setting might be impossible (for example if network namespaces are
         not available), and the unit should be written in a way that does not solely rely on this setting for
@@ -1311,10 +1350,7 @@ RestrictNamespaces=~cgroup net</programlisting>
         settings (see the discussion in <varname>PrivateMounts=</varname> above) will implicitly disable mount and
         unmount propagation from the unit's processes towards the host by changing the propagation setting of all mount
         points in the unit's file system namepace to <option>slave</option> first. Setting this option to
-        <option>shared</option> does not reestablish propagation in that case. Conversely, if this option is set, but
-        no other file system namespace setting is used, then new file system namespaces will be created for the unit's
-        processes and this propagation flag will be applied right away to all mounts within it, without the
-        intermediary application of <option>slave</option>.</para>
+        <option>shared</option> does not reestablish propagation in that case.</para>
 
         <para>If not set – but file system namespaces are enabled through another file system namespace unit setting –
         <option>shared</option> mount propagation is used, but — as mentioned — as <option>slave</option> is applied
@@ -1490,6 +1526,10 @@ RestrictNamespaces=~cgroup net</programlisting>
                 <entry>@sync</entry>
                 <entry>Synchronizing files and memory to disk: (<citerefentry project='man-pages'><refentrytitle>fsync</refentrytitle><manvolnum>2</manvolnum></citerefentry>, <citerefentry project='man-pages'><refentrytitle>msync</refentrytitle><manvolnum>2</manvolnum></citerefentry>, and related calls)</entry>
               </row>
+              <row>
+                <entry>@system-service</entry>
+                <entry>A reasonable set of system calls used by common system services, excluding any special purpose calls. This is the recommended starting point for whitelisting system calls for system services, as it contains what is typically needed by system services, but excludes overly specific interfaces. For example, the following APIs are excluded: <literal>@clock</literal>, <literal>@mount</literal>, <literal>@swap</literal>, <literal>@reboot</literal>.</entry>
+              </row>
               <row>
                 <entry>@timer</entry>
                 <entry>System calls for scheduling operations by time (<citerefentry project='man-pages'><refentrytitle>alarm</refentrytitle><manvolnum>2</manvolnum></citerefentry>, <citerefentry project='man-pages'><refentrytitle>timer_create</refentrytitle><manvolnum>2</manvolnum></citerefentry>, …)</entry>
@@ -1504,6 +1544,14 @@ RestrictNamespaces=~cgroup net</programlisting>
         <command>systemd-analyze syscall-filter</command> to list the actual list of system calls in each
         filter.</para>
 
+        <para>Generally, whitelisting system calls (rather than blacklisting) is the safer mode of operation. It is
+        recommended to enforce system call whitelists for all long-running system services. Specifically, the
+        following lines are a relatively safe basic choice for the majority of system services:</para>
+
+        <programlisting>[Service]
+SystemCallFilter=@system-service
+SystemCallErrorNumber=EPERM</programlisting>
+
         <para>It is recommended to combine the file system namespacing related options with
         <varname>SystemCallFilter=~@mount</varname>, in order to prohibit the unit's processes to undo the
         mappings. Specifically these are the options <varname>PrivateTmp=</varname>,
@@ -1737,8 +1785,8 @@ RestrictNamespaces=~cgroup net</programlisting>
         of <option>inherit</option>, <option>null</option>, <option>tty</option>, <option>journal</option>,
         <option>syslog</option>, <option>kmsg</option>, <option>journal+console</option>,
         <option>syslog+console</option>, <option>kmsg+console</option>,
-        <option>file:<replaceable>path</replaceable></option>, <option>socket</option> or
-        <option>fd:<replaceable>name</replaceable></option>.</para>
+        <option>file:<replaceable>path</replaceable></option>, <option>append:<replaceable>path</replaceable></option>,
+        <option>socket</option> or <option>fd:<replaceable>name</replaceable></option>.</para>
 
         <para><option>inherit</option> duplicates the file descriptor of standard input for standard output.</para>
 
@@ -1769,11 +1817,17 @@ RestrictNamespaces=~cgroup net</programlisting>
 
         <para>The <option>file:<replaceable>path</replaceable></option> option may be used to connect a specific file
         system object to standard output. The semantics are similar to the same option of
-        <varname>StandardInput=</varname>, see above. If standard input and output are directed to the same file path,
-        it is opened only once, for reading as well as writing and duplicated. This is particular useful when the
-        specified path refers to an <constant>AF_UNIX</constant> socket in the file system, as in that case only a
+        <varname>StandardInput=</varname>, see above. If <replaceable>path</replaceable> refers to a regular file
+        on the filesystem, it is opened (created if it doesn't exist yet) for writing at the beginning of the file,
+        but without truncating it.
+        If standard input and output are directed to the same file path, it is opened only once, for reading as well
+        as writing and duplicated. This is particularly useful when the specified path refers to an
+        <constant>AF_UNIX</constant> socket in the file system, as in that case only a
         single stream connection is created for both input and output.</para>
 
+        <para><option>append:<replaceable>path</replaceable></option> is similar to
+        <option>file:<replaceable>path</replaceable></option> above, but it opens the file in append mode.</para>
+
         <para><option>socket</option> connects standard output to a socket acquired via socket activation. The
         semantics are similar to the same option of <varname>StandardInput=</varname>, see above.</para>
 
@@ -1893,6 +1947,22 @@ StandardInputData=SWNrIHNpdHplIGRhIHVuJyBlc3NlIEtsb3BzLAp1ZmYgZWVtYWwga2xvcHAncy
         matching. Assign an empty string to reset the list.</para></listitem>
       </varlistentry>
 
+      <varlistentry>
+        <term><varname>LogRateLimitIntervalSec=</varname></term>
+        <term><varname>LogRateLimitBurst=</varname></term>
+
+        <listitem><para>Configures the rate limiting that is applied to messages generated by this unit. If, in the
+        time interval defined by <varname>LogRateLimitIntervalSec=</varname>, more messages than specified in
+        <varname>LogRateLimitBurst=</varname> are logged by a service, all further messages within the interval are
+        dropped until the interval is over. A message about the number of dropped messages is generated. The time
+        specification for <varname>LogRateLimitIntervalSec=</varname> may be specified in the following units: "s",
+        "min", "h", "ms", "us" (see
+        <citerefentry><refentrytitle>systemd.time</refentrytitle><manvolnum>7</manvolnum></citerefentry> for details).
+        The default settings are set by <varname>RateLimitIntervalSec=</varname> and <varname>RateLimitBurst=</varname>
+        configured in <citerefentry><refentrytitle>journald.conf</refentrytitle><manvolnum>5</manvolnum></citerefentry>.
+        </para></listitem>
+      </varlistentry>
+
       <varlistentry>
         <term><varname>SyslogIdentifier=</varname></term>