]> git.ipfire.org Git - thirdparty/systemd.git/commitdiff
nspawn: add new --rlimit= switch, and always set resource limits explicitly for our...
authorLennart Poettering <lennart@poettering.net>
Mon, 7 May 2018 15:59:18 +0000 (17:59 +0200)
committerLennart Poettering <lennart@poettering.net>
Thu, 17 May 2018 18:45:54 +0000 (20:45 +0200)
This ensures we set the various resource limits of our container
explicitly on each invocation so that we inherit less from our callers
into the payload.

By default resource limits are now set to the same values Linux
generally passes to the host PID 1, thus minimizing needless differences
between host and container environments.

The limits are now also configurable using a new --rlimit= switch. This
is preparation for teaching nspawn native OCI runtime support as OCI
permits setting resource limits for container payloads, and it hence
probably makes sense if we do too.

man/systemd-nspawn.xml
man/systemd.nspawn.xml
src/nspawn/nspawn-gperf.gperf
src/nspawn/nspawn-settings.c
src/nspawn/nspawn-settings.h
src/nspawn/nspawn.c

index 713782b85955788a625e57847646f6842ec28705..c6b027c58f72931429eaa929651e7f29d4048d8c 100644 (file)
         capabilities are passed using the <command>--capabilities=</command>.</para></listitem>
       </varlistentry>
 
+      <varlistentry>
+        <term><option>--rlimit=</option></term>
+
+        <listitem><para>Sets the specified POSIX resource limit for the container payload. Expects an assignment of the
+        form
+        <literal><replaceable>LIMIT</replaceable>=<replaceable>SOFT</replaceable>:<replaceable>HARD</replaceable></literal>
+        or <literal><replaceable>LIMIT</replaceable>=<replaceable>VALUE</replaceable></literal>, where
+        <replaceable>LIMIT</replaceable> should refer to a resource limit type, such as
+        <constant>RLIMIT_NOFILE</constant> or <constant>RLIMIT_NICE</constant>. The <replaceable>SOFT</replaceable> and
+        <replaceable>HARD</replaceable> fields should refer to the numeric soft and hard resource limit values. If the
+        second form is used, <replaceable>VALUE</replaceable> may specify a value that is used both as soft and hard
+        limit. In place of a numeric value the special string <literal>infinity</literal> may be used to turn off
+        resource limiting for the specific type of resource. This command line option may be used multiple times to
+        control limits on multiple limit types. If used multiple times for the same limit type, the last use
+        wins. For details about resource limits see <citerefentry
+        project='man-pages'><refentrytitle>setrlimit</refentrytitle><manvolnum>2</manvolnum></citerefentry>. By default
+        resource limits for the container's init process (PID 1) are set to the same values the Linux kernel originally
+        passed to the host init system. Note that some resource limits are enforced on resources counted per user, in
+        particular <constant>RLIMIT_NPROC</constant>. This means that unless user namespacing is deployed
+        (i.e. <option>--private-users=</option> is used, see above), any limits set will be applied to the resource
+        usage of the same user on all local containers as well as the host. This means particular care needs to be
+        taken with these limits as they might be triggered by possibly less trusted code. Example:
+        <literal>--rlimit=RLIMIT_NOFILE=8192:16384</literal>.</para></listitem>
+      </varlistentry>
+
       <varlistentry>
         <term><option>--kill-signal=</option></term>
 
index b5c60a33e0b0ac2f2b642f80b0bdb7400511e14f..6bd7b33b34b432c15173944f8725bbc8b16fdfc4 100644 (file)
         details.</para></listitem>
       </varlistentry>
 
+      <varlistentry>
+        <term><varname>LimitCPU=</varname></term>
+        <term><varname>LimitFSIZE=</varname></term>
+        <term><varname>LimitDATA=</varname></term>
+        <term><varname>LimitSTACK=</varname></term>
+        <term><varname>LimitCORE=</varname></term>
+        <term><varname>LimitRSS=</varname></term>
+        <term><varname>LimitNOFILE=</varname></term>
+        <term><varname>LimitAS=</varname></term>
+        <term><varname>LimitNPROC=</varname></term>
+        <term><varname>LimitMEMLOCK=</varname></term>
+        <term><varname>LimitLOCKS=</varname></term>
+        <term><varname>LimitSIGPENDING=</varname></term>
+        <term><varname>LimitMSGQUEUE=</varname></term>
+        <term><varname>LimitNICE=</varname></term>
+        <term><varname>LimitRTPRIO=</varname></term>
+        <term><varname>LimitRTTIME=</varname></term>
+
+        <listitem><para>Configures various types of resource limits applied to containers. This is equivalent to the
+        <option>--rlimit=</option> command line switch, and takes the same arguments. See
+        <citerefentry><refentrytitle>systemd-nspawn</refentrytitle><manvolnum>1</manvolnum></citerefentry> for
+        details.</para></listitem>
+      </varlistentry>
+
     </variablelist>
   </refsect1>
 
index ea66971fac710b42cdfaeece952a219641ccef74..58184d412d16c110700747c82d0cbd0739887912 100644 (file)
@@ -18,35 +18,51 @@ struct ConfigPerfItem;
 %struct-type
 %includes
 %%
-Exec.Boot,                    config_parse_boot,          0, 0
-Exec.ProcessTwo,              config_parse_pid2,          0, 0
-Exec.Parameters,              config_parse_strv,          0, offsetof(Settings, parameters)
-Exec.Environment,             config_parse_strv,          0, offsetof(Settings, environment)
-Exec.User,                    config_parse_string,        0, offsetof(Settings, user)
-Exec.Capability,              config_parse_capability,    0, offsetof(Settings, capability)
-Exec.DropCapability,          config_parse_capability,    0, offsetof(Settings, drop_capability)
-Exec.KillSignal,              config_parse_signal,        0, offsetof(Settings, kill_signal)
-Exec.Personality,             config_parse_personality,   0, offsetof(Settings, personality)
-Exec.MachineID,               config_parse_id128,         0, offsetof(Settings, machine_id)
-Exec.WorkingDirectory,        config_parse_path,          0, offsetof(Settings, working_directory)
-Exec.PivotRoot,               config_parse_pivot_root,    0, 0
-Exec.PrivateUsers,            config_parse_private_users, 0, 0
-Exec.NotifyReady,             config_parse_bool,          0, offsetof(Settings, notify_ready)
-Exec.SystemCallFilter,        config_parse_syscall_filter,0, 0,
-Files.ReadOnly,               config_parse_tristate,      0, offsetof(Settings, read_only)
-Files.Volatile,               config_parse_volatile_mode, 0, offsetof(Settings, volatile_mode)
-Files.Bind,                   config_parse_bind,          0, 0
-Files.BindReadOnly,           config_parse_bind,          1, 0
-Files.TemporaryFileSystem,    config_parse_tmpfs,         0, 0
-Files.Overlay,                config_parse_overlay,       0, 0
-Files.OverlayReadOnly,        config_parse_overlay,       1, 0
-Files.PrivateUsersChown,      config_parse_tristate,      0, offsetof(Settings, userns_chown)
-Network.Private,              config_parse_tristate,      0, offsetof(Settings, private_network)
-Network.Interface,            config_parse_strv,          0, offsetof(Settings, network_interfaces)
-Network.MACVLAN,              config_parse_strv,          0, offsetof(Settings, network_macvlan)
-Network.IPVLAN,               config_parse_strv,          0, offsetof(Settings, network_ipvlan)
-Network.VirtualEthernet,      config_parse_tristate,      0, offsetof(Settings, network_veth)
-Network.VirtualEthernetExtra, config_parse_veth_extra,    0, 0
-Network.Bridge,               config_parse_ifname,        0, offsetof(Settings, network_bridge)
-Network.Zone,                 config_parse_network_zone,  0, 0
-Network.Port,                 config_parse_expose_port,   0, 0
+Exec.Boot,                    config_parse_boot,           0,                 0
+Exec.ProcessTwo,              config_parse_pid2,           0,                 0
+Exec.Parameters,              config_parse_strv,           0,                 offsetof(Settings, parameters)
+Exec.Environment,             config_parse_strv,           0,                 offsetof(Settings, environment)
+Exec.User,                    config_parse_string,         0,                 offsetof(Settings, user)
+Exec.Capability,              config_parse_capability,     0,                 offsetof(Settings, capability)
+Exec.DropCapability,          config_parse_capability,     0,                 offsetof(Settings, drop_capability)
+Exec.KillSignal,              config_parse_signal,         0,                 offsetof(Settings, kill_signal)
+Exec.Personality,             config_parse_personality,    0,                 offsetof(Settings, personality)
+Exec.MachineID,               config_parse_id128,          0,                 offsetof(Settings, machine_id)
+Exec.WorkingDirectory,        config_parse_path,           0,                 offsetof(Settings, working_directory)
+Exec.PivotRoot,               config_parse_pivot_root,     0,                 0
+Exec.PrivateUsers,            config_parse_private_users,  0,                 0
+Exec.NotifyReady,             config_parse_bool,           0,                 offsetof(Settings, notify_ready)
+Exec.SystemCallFilter,        config_parse_syscall_filter, 0,                 0,
+Exec.LimitCPU,                config_parse_rlimit,         RLIMIT_CPU,        offsetof(Settings, rlimit)
+Exec.LimitFSIZE,              config_parse_rlimit,         RLIMIT_FSIZE,      offsetof(Settings, rlimit)
+Exec.LimitDATA,               config_parse_rlimit,         RLIMIT_DATA,       offsetof(Settings, rlimit)
+Exec.LimitSTACK,              config_parse_rlimit,         RLIMIT_STACK,      offsetof(Settings, rlimit)
+Exec.LimitCORE,               config_parse_rlimit,         RLIMIT_CORE,       offsetof(Settings, rlimit)
+Exec.LimitRSS,                config_parse_rlimit,         RLIMIT_RSS,        offsetof(Settings, rlimit)
+Exec.LimitNOFILE,             config_parse_rlimit,         RLIMIT_NOFILE,     offsetof(Settings, rlimit)
+Exec.LimitAS,                 config_parse_rlimit,         RLIMIT_AS,         offsetof(Settings, rlimit)
+Exec.LimitNPROC,              config_parse_rlimit,         RLIMIT_NPROC,      offsetof(Settings, rlimit)
+Exec.LimitMEMLOCK,            config_parse_rlimit,         RLIMIT_MEMLOCK,    offsetof(Settings, rlimit)
+Exec.LimitLOCKS,              config_parse_rlimit,         RLIMIT_LOCKS,      offsetof(Settings, rlimit)
+Exec.LimitSIGPENDING,         config_parse_rlimit,         RLIMIT_SIGPENDING, offsetof(Settings, rlimit)
+Exec.LimitMSGQUEUE,           config_parse_rlimit,         RLIMIT_MSGQUEUE,   offsetof(Settings, rlimit)
+Exec.LimitNICE,               config_parse_rlimit,         RLIMIT_NICE,       offsetof(Settings, rlimit)
+Exec.LimitRTPRIO,             config_parse_rlimit,         RLIMIT_RTPRIO,     offsetof(Settings, rlimit)
+Exec.LimitRTTIME,             config_parse_rlimit,         RLIMIT_RTTIME,     offsetof(Settings, rlimit)
+Files.ReadOnly,               config_parse_tristate,       0,                 offsetof(Settings, read_only)
+Files.Volatile,               config_parse_volatile_mode,  0,                 offsetof(Settings, volatile_mode)
+Files.Bind,                   config_parse_bind,           0,                 0
+Files.BindReadOnly,           config_parse_bind,           1,                 0
+Files.TemporaryFileSystem,    config_parse_tmpfs,          0,                 0
+Files.Overlay,                config_parse_overlay,        0,                 0
+Files.OverlayReadOnly,        config_parse_overlay,        1,                 0
+Files.PrivateUsersChown,      config_parse_tristate,       0,                 offsetof(Settings, userns_chown)
+Network.Private,              config_parse_tristate,       0,                 offsetof(Settings, private_network)
+Network.Interface,            config_parse_strv,           0,                 offsetof(Settings, network_interfaces)
+Network.MACVLAN,              config_parse_strv,           0,                 offsetof(Settings, network_macvlan)
+Network.IPVLAN,               config_parse_strv,           0,                 offsetof(Settings, network_ipvlan)
+Network.VirtualEthernet,      config_parse_tristate,       0,                 offsetof(Settings, network_veth)
+Network.VirtualEthernetExtra, config_parse_veth_extra,     0,                 0
+Network.Bridge,               config_parse_ifname,         0,                 offsetof(Settings, network_bridge)
+Network.Zone,                 config_parse_network_zone,   0,                 0
+Network.Port,                 config_parse_expose_port,    0,                 0
index 487514d40a956f4132e7d5fae6fe456cb241cf7c..19bf1d4b941e015b99799a1e7c62ae648277f467 100644 (file)
@@ -12,6 +12,7 @@
 #include "nspawn-settings.h"
 #include "parse-util.h"
 #include "process-util.h"
+#include "rlimit-util.h"
 #include "socket-util.h"
 #include "string-util.h"
 #include "strv.h"
@@ -80,6 +81,7 @@ Settings* settings_free(Settings *s) {
         free(s->working_directory);
         strv_free(s->syscall_whitelist);
         strv_free(s->syscall_blacklist);
+        rlimit_free_all(s->rlimit);
 
         strv_free(s->network_interfaces);
         strv_free(s->network_macvlan);
index 731db87260689661dc753b6668de25c589565d9c..0e6ce6121718b4a36ae6138dbf79c266757cd780 100644 (file)
@@ -32,24 +32,26 @@ typedef enum UserNamespaceMode {
 } UserNamespaceMode;
 
 typedef enum SettingsMask {
-        SETTING_START_MODE        = 1 << 0,
-        SETTING_ENVIRONMENT       = 1 << 1,
-        SETTING_USER              = 1 << 2,
-        SETTING_CAPABILITY        = 1 << 3,
-        SETTING_KILL_SIGNAL       = 1 << 4,
-        SETTING_PERSONALITY       = 1 << 5,
-        SETTING_MACHINE_ID        = 1 << 6,
-        SETTING_NETWORK           = 1 << 7,
-        SETTING_EXPOSE_PORTS      = 1 << 8,
-        SETTING_READ_ONLY         = 1 << 9,
-        SETTING_VOLATILE_MODE     = 1 << 10,
-        SETTING_CUSTOM_MOUNTS     = 1 << 11,
-        SETTING_WORKING_DIRECTORY = 1 << 12,
-        SETTING_USERNS            = 1 << 13,
-        SETTING_NOTIFY_READY      = 1 << 14,
-        SETTING_PIVOT_ROOT        = 1 << 15,
-        SETTING_SYSCALL_FILTER    = 1 << 16,
-        _SETTINGS_MASK_ALL        = (1 << 17) -1
+        SETTING_START_MODE        = UINT64_C(1) << 0,
+        SETTING_ENVIRONMENT       = UINT64_C(1) << 1,
+        SETTING_USER              = UINT64_C(1) << 2,
+        SETTING_CAPABILITY        = UINT64_C(1) << 3,
+        SETTING_KILL_SIGNAL       = UINT64_C(1) << 4,
+        SETTING_PERSONALITY       = UINT64_C(1) << 5,
+        SETTING_MACHINE_ID        = UINT64_C(1) << 6,
+        SETTING_NETWORK           = UINT64_C(1) << 7,
+        SETTING_EXPOSE_PORTS      = UINT64_C(1) << 8,
+        SETTING_READ_ONLY         = UINT64_C(1) << 9,
+        SETTING_VOLATILE_MODE     = UINT64_C(1) << 10,
+        SETTING_CUSTOM_MOUNTS     = UINT64_C(1) << 11,
+        SETTING_WORKING_DIRECTORY = UINT64_C(1) << 12,
+        SETTING_USERNS            = UINT64_C(1) << 13,
+        SETTING_NOTIFY_READY      = UINT64_C(1) << 14,
+        SETTING_PIVOT_ROOT        = UINT64_C(1) << 15,
+        SETTING_SYSCALL_FILTER    = UINT64_C(1) << 16,
+        SETTING_RLIMIT_FIRST      = UINT64_C(1) << 17, /* we define one bit per resource limit here */
+        SETTING_RLIMIT_LAST       = UINT64_C(1) << (17 + _RLIMIT_MAX - 1),
+        _SETTINGS_MASK_ALL        = (UINT64_C(1) << (17 + _RLIMIT_MAX))
 } SettingsMask;
 
 typedef struct Settings {
@@ -71,6 +73,7 @@ typedef struct Settings {
         bool notify_ready;
         char **syscall_whitelist;
         char **syscall_blacklist;
+        struct rlimit *rlimit[_RLIMIT_MAX];
 
         /* [Image] */
         int read_only;
index 12eaa6c0d74bd5d894ad8a63f08e56e9866cd171..8ba8e73bf73e3c8a6dae935c2bbb0304fef6e873 100644 (file)
@@ -81,6 +81,7 @@
 #include "ptyfwd.h"
 #include "random-util.h"
 #include "raw-clone.h"
+#include "rlimit-util.h"
 #include "rm-rf.h"
 #include "selinux-util.h"
 #include "signal-util.h"
@@ -200,6 +201,7 @@ static void *arg_root_hash = NULL;
 static size_t arg_root_hash_size = 0;
 static char **arg_syscall_whitelist = NULL;
 static char **arg_syscall_blacklist = NULL;
+static struct rlimit *arg_rlimit[_RLIMIT_MAX] = {};
 
 static void help(void) {
         printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
@@ -264,6 +266,7 @@ static void help(void) {
                "     --drop-capability=CAP  Drop the specified capability from the default set\n"
                "     --system-call-filter=LIST|~LIST\n"
                "                            Permit/prohibit specific system calls\n"
+               "     --rlimit=NAME=LIMIT    Set a resource limit for the payload\n"
                "     --kill-signal=SIGNAL   Select signal to use for shutting down PID 1\n"
                "     --link-journal=MODE    Link up guest journal, one of no, auto, guest, \n"
                "                            host, try-guest, try-host\n"
@@ -439,6 +442,7 @@ static int parse_argv(int argc, char *argv[]) {
                 ARG_NOTIFY_READY,
                 ARG_ROOT_HASH,
                 ARG_SYSTEM_CALL_FILTER,
+                ARG_RLIMIT,
         };
 
         static const struct option options[] = {
@@ -492,6 +496,7 @@ static int parse_argv(int argc, char *argv[]) {
                 { "notify-ready",           required_argument, NULL, ARG_NOTIFY_READY           },
                 { "root-hash",              required_argument, NULL, ARG_ROOT_HASH              },
                 { "system-call-filter",     required_argument, NULL, ARG_SYSTEM_CALL_FILTER     },
+                { "rlimit",                 required_argument, NULL, ARG_RLIMIT                 },
                 {}
         };
 
@@ -1094,6 +1099,41 @@ static int parse_argv(int argc, char *argv[]) {
                         break;
                 }
 
+                case ARG_RLIMIT: {
+                        const char *eq;
+                        _cleanup_free_ char *name = NULL; /* released on every exit from this block, error returns included */
+                        int rl;
+
+                        eq = strchr(optarg, '='); /* argument form is LIMIT=VALUE or LIMIT=SOFT:HARD */
+                        if (!eq) {
+                                log_error("--rlimit= expects an '=' assignment.");
+                                return -EINVAL;
+                        }
+
+                        name = strndup(optarg, eq - optarg);
+                        if (!name)
+                                return log_oom();
+
+                        rl = rlimit_from_string_harder(name);
+                        if (rl < 0) {
+                                log_error("Unknown resource limit: %s", name);
+                                return -EINVAL;
+                        }
+
+                        if (!arg_rlimit[rl]) { /* a repeated switch for the same limit reuses the slot, so the last use wins */
+                                arg_rlimit[rl] = new0(struct rlimit, 1);
+                                if (!arg_rlimit[rl])
+                                        return log_oom();
+                        }
+
+                        r = rlimit_parse(rl, eq + 1, arg_rlimit[rl]);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to parse resource limit: %s", eq + 1);
+
+                        arg_settings_mask |= SETTING_RLIMIT_FIRST << rl; /* one mask bit per limit type */
+                        break;
+                }
+
                 case '?':
                         return -EINVAL;
 
@@ -2282,7 +2322,6 @@ static int inner_child(
                 NULL
         };
         const char *exec_target;
-
         _cleanup_strv_free_ char **env_use = NULL;
         int r;
 
@@ -2559,10 +2598,10 @@ static int outer_child(
                 FDSet *fds,
                 int netns_fd) {
 
+        _cleanup_close_ int fd = -1;
+        int r, which_failed;
         pid_t pid;
         ssize_t l;
-        int r;
-        _cleanup_close_ int fd = -1;
 
         assert(barrier);
         assert(directory);
@@ -2805,6 +2844,10 @@ static int outer_child(
         if (fd < 0)
                 return fd;
 
+        r = setrlimit_closest_all((const struct rlimit *const*) arg_rlimit, &which_failed);
+        if (r < 0)
+                return log_error_errno(r, "Failed to apply resource limit RLIMIT_%s: %m", rlimit_to_string(which_failed));
+
         pid = raw_clone(SIGCHLD|CLONE_NEWNS|
                         arg_clone_ns_flags |
                         (arg_userns_mode != USER_NAMESPACE_NO ? CLONE_NEWUSER : 0));
@@ -3046,7 +3089,7 @@ static int load_settings(void) {
         _cleanup_fclose_ FILE *f = NULL;
         _cleanup_free_ char *p = NULL;
         const char *fn, *i;
-        int r;
+        int r, rl;
 
         /* If all settings are masked, there's no point in looking for
          * the settings file */
@@ -3256,6 +3299,21 @@ static int load_settings(void) {
                 }
         }
 
+        for (rl = 0; rl < _RLIMIT_MAX; rl ++) {
+                if ((arg_settings_mask & (SETTING_RLIMIT_FIRST << rl)))
+                        continue;
+
+                if (!settings->rlimit[rl])
+                        continue;
+
+                if (!arg_settings_trusted) {
+                        log_warning("Ignoring Limit%s= setting, file '%s' is not trusted.", rlimit_to_string(rl), p);
+                        continue;
+                }
+
+                free_and_replace(arg_rlimit[rl], settings->rlimit[rl]);
+        }
+
         return 0;
 }
 
@@ -3767,6 +3825,71 @@ static int run(int master,
         return 1; /* loop again */
 }
 
+static int initialize_rlimits(void) {
+        /* Fills arg_rlimit[] with a default for each resource limit whose bit is not set in arg_settings_mask.
+         * The defaults are the resource limits the kernel passes to PID 1, as per kernel 4.16. Let's pass our container
+         * payload the same values as the kernel originally passed to PID 1, in order to minimize differences between
+         * host and container execution environments. Returns 0 on success, or the log_*() result on failure. */
+
+        static const struct rlimit kernel_defaults[_RLIMIT_MAX] = {
+                [RLIMIT_AS]       = { RLIM_INFINITY, RLIM_INFINITY },
+                [RLIMIT_CORE]     = { 0,             RLIM_INFINITY },
+                [RLIMIT_CPU]      = { RLIM_INFINITY, RLIM_INFINITY },
+                [RLIMIT_DATA]     = { RLIM_INFINITY, RLIM_INFINITY },
+                [RLIMIT_FSIZE]    = { RLIM_INFINITY, RLIM_INFINITY },
+                [RLIMIT_LOCKS]    = { RLIM_INFINITY, RLIM_INFINITY },
+                [RLIMIT_MEMLOCK]  = { 65536,         65536         },
+                [RLIMIT_MSGQUEUE] = { 819200,        819200        },
+                [RLIMIT_NICE]     = { 0,             0             },
+                [RLIMIT_NOFILE]   = { 1024,          4096          },
+                [RLIMIT_RSS]      = { RLIM_INFINITY, RLIM_INFINITY },
+                [RLIMIT_RTPRIO]   = { 0,             0             },
+                [RLIMIT_RTTIME]   = { RLIM_INFINITY, RLIM_INFINITY },
+                [RLIMIT_STACK]    = { 8388608,       RLIM_INFINITY },
+
+                /* The kernel scales the default for RLIMIT_NPROC and RLIMIT_SIGPENDING based on the system's amount of
+                 * RAM. To provide best compatibility we'll read these limits off PID 1 instead of hardcoding them
+                 * here. This is safe as we know that PID 1 doesn't change these two limits and thus the original
+                 * kernel's initialization should still be valid during runtime — at least if PID 1 is systemd. Note
+                 * that PID 1 changes a number of other resource limits during early initialization which is why we
+                 * don't read the other limits from PID 1 but prefer the static table above. */
+        };
+
+        int rl;
+
+        for (rl = 0; rl < _RLIMIT_MAX; rl++) {
+
+                /* Let's only fill in what the user hasn't explicitly configured anyway */
+                if ((arg_settings_mask & (SETTING_RLIMIT_FIRST << rl)) == 0) {
+                        const struct rlimit *v;
+                        struct rlimit buffer;
+
+                        if (IN_SET(rl, RLIMIT_NPROC, RLIMIT_SIGPENDING)) {
+                                /* For these two let's read the limits off PID 1. See above for an explanation. */
+
+                                if (prlimit(1, rl, NULL, &buffer) < 0)
+                                        return log_error_errno(errno, "Failed to read resource limit RLIMIT_%s of PID 1: %m", rlimit_to_string(rl));
+
+                                v = &buffer;
+                        } else
+                                v = kernel_defaults + rl;
+
+                        arg_rlimit[rl] = newdup(struct rlimit, v, 1);
+                        if (!arg_rlimit[rl])
+                                return log_oom();
+                }
+
+                if (DEBUG_LOGGING) {
+                        _cleanup_free_ char *k = NULL;
+
+                        (void) rlimit_format(arg_rlimit[rl], &k); /* result deliberately ignored: debug output only */
+                        log_debug("Setting RLIMIT_%s to %s.", rlimit_to_string(rl), k);
+                }
+        }
+
+        return 0;
+}
+
 int main(int argc, char *argv[]) {
 
         _cleanup_free_ char *console = NULL;
@@ -3799,6 +3922,10 @@ int main(int argc, char *argv[]) {
         if (r < 0)
                 goto finish;
 
+        r = initialize_rlimits();
+        if (r < 0)
+                goto finish;
+
         r = determine_names();
         if (r < 0)
                 goto finish;
@@ -4161,6 +4288,7 @@ finish:
         custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
         expose_port_free_all(arg_expose_ports);
         free(arg_root_hash);
+        rlimit_free_all(arg_rlimit);
 
         return r < 0 ? EXIT_FAILURE : ret;
 }