capabilities are passed using the <command>--capabilities=</command>.</para></listitem>
</varlistentry>
+ <varlistentry>
+ <term><option>--rlimit=</option></term>
+
+ <listitem><para>Sets the specified POSIX resource limit for the container payload. Expects an assignment of the
+ form
+ <literal><replaceable>LIMIT</replaceable>=<replaceable>SOFT</replaceable>:<replaceable>HARD</replaceable></literal>
+ or <literal><replaceable>LIMIT</replaceable>=<replaceable>VALUE</replaceable></literal>, where
+ <replaceable>LIMIT</replaceable> should refer to a resource limit type, such as
+ <constant>RLIMIT_NOFILE</constant> or <constant>RLIMIT_NICE</constant>. The <replaceable>SOFT</replaceable> and
+ <replaceable>HARD</replaceable> fields should refer to the numeric soft and hard resource limit values. If the
+ second form is used, <replaceable>VALUE</replaceable> may specify a value that is used both as soft and hard
+ limit. In place of a numeric value the special string <literal>infinity</literal> may be used to turn off
+ resource limiting for the specific type of resource. This command line option may be used multiple times to
+ control limits on multiple limit types. If used multiple times for the same limit type, the last use
+ wins. For details about resource limits see <citerefentry
+ project='man-pages'><refentrytitle>setrlimit</refentrytitle><manvolnum>2</manvolnum></citerefentry>. By default
+ resource limits for the container's init process (PID 1) are set to the same values the Linux kernel originally
+ passed to the host init system. Note that some resource limits are enforced on resources counted per user, in
+ particular <constant>RLIMIT_NPROC</constant>. This means that unless user namespacing is deployed
+ (i.e. <option>--private-users=</option> is used, see above), any limits set will be applied to the resource
+ usage of the same user on all local containers as well as the host. This means particular care needs to be
+ taken with these limits as they might be triggered by possibly less trusted code. Example:
+ <literal>--rlimit=RLIMIT_NOFILE=8192:16384</literal>.</para></listitem>
+ </varlistentry>
+
<varlistentry>
<term><option>--kill-signal=</option></term>
details.</para></listitem>
</varlistentry>
+ <varlistentry>
+ <term><varname>LimitCPU=</varname></term>
+ <term><varname>LimitFSIZE=</varname></term>
+ <term><varname>LimitDATA=</varname></term>
+ <term><varname>LimitSTACK=</varname></term>
+ <term><varname>LimitCORE=</varname></term>
+ <term><varname>LimitRSS=</varname></term>
+ <term><varname>LimitNOFILE=</varname></term>
+ <term><varname>LimitAS=</varname></term>
+ <term><varname>LimitNPROC=</varname></term>
+ <term><varname>LimitMEMLOCK=</varname></term>
+ <term><varname>LimitLOCKS=</varname></term>
+ <term><varname>LimitSIGPENDING=</varname></term>
+ <term><varname>LimitMSGQUEUE=</varname></term>
+ <term><varname>LimitNICE=</varname></term>
+ <term><varname>LimitRTPRIO=</varname></term>
+ <term><varname>LimitRTTIME=</varname></term>
+
+ <listitem><para>Configures various types of resource limits applied to containers. This is equivalent to the
+ <option>--rlimit=</option> command line switch, and takes the same arguments. See
+ <citerefentry><refentrytitle>systemd-nspawn</refentrytitle><manvolnum>1</manvolnum></citerefentry> for
+ details.</para></listitem>
+ </varlistentry>
+
</variablelist>
</refsect1>
%struct-type
%includes
%%
-Exec.Boot, config_parse_boot, 0, 0
-Exec.ProcessTwo, config_parse_pid2, 0, 0
-Exec.Parameters, config_parse_strv, 0, offsetof(Settings, parameters)
-Exec.Environment, config_parse_strv, 0, offsetof(Settings, environment)
-Exec.User, config_parse_string, 0, offsetof(Settings, user)
-Exec.Capability, config_parse_capability, 0, offsetof(Settings, capability)
-Exec.DropCapability, config_parse_capability, 0, offsetof(Settings, drop_capability)
-Exec.KillSignal, config_parse_signal, 0, offsetof(Settings, kill_signal)
-Exec.Personality, config_parse_personality, 0, offsetof(Settings, personality)
-Exec.MachineID, config_parse_id128, 0, offsetof(Settings, machine_id)
-Exec.WorkingDirectory, config_parse_path, 0, offsetof(Settings, working_directory)
-Exec.PivotRoot, config_parse_pivot_root, 0, 0
-Exec.PrivateUsers, config_parse_private_users, 0, 0
-Exec.NotifyReady, config_parse_bool, 0, offsetof(Settings, notify_ready)
-Exec.SystemCallFilter, config_parse_syscall_filter,0, 0,
-Files.ReadOnly, config_parse_tristate, 0, offsetof(Settings, read_only)
-Files.Volatile, config_parse_volatile_mode, 0, offsetof(Settings, volatile_mode)
-Files.Bind, config_parse_bind, 0, 0
-Files.BindReadOnly, config_parse_bind, 1, 0
-Files.TemporaryFileSystem, config_parse_tmpfs, 0, 0
-Files.Overlay, config_parse_overlay, 0, 0
-Files.OverlayReadOnly, config_parse_overlay, 1, 0
-Files.PrivateUsersChown, config_parse_tristate, 0, offsetof(Settings, userns_chown)
-Network.Private, config_parse_tristate, 0, offsetof(Settings, private_network)
-Network.Interface, config_parse_strv, 0, offsetof(Settings, network_interfaces)
-Network.MACVLAN, config_parse_strv, 0, offsetof(Settings, network_macvlan)
-Network.IPVLAN, config_parse_strv, 0, offsetof(Settings, network_ipvlan)
-Network.VirtualEthernet, config_parse_tristate, 0, offsetof(Settings, network_veth)
-Network.VirtualEthernetExtra, config_parse_veth_extra, 0, 0
-Network.Bridge, config_parse_ifname, 0, offsetof(Settings, network_bridge)
-Network.Zone, config_parse_network_zone, 0, 0
-Network.Port, config_parse_expose_port, 0, 0
+Exec.Boot, config_parse_boot, 0, 0
+Exec.ProcessTwo, config_parse_pid2, 0, 0
+Exec.Parameters, config_parse_strv, 0, offsetof(Settings, parameters)
+Exec.Environment, config_parse_strv, 0, offsetof(Settings, environment)
+Exec.User, config_parse_string, 0, offsetof(Settings, user)
+Exec.Capability, config_parse_capability, 0, offsetof(Settings, capability)
+Exec.DropCapability, config_parse_capability, 0, offsetof(Settings, drop_capability)
+Exec.KillSignal, config_parse_signal, 0, offsetof(Settings, kill_signal)
+Exec.Personality, config_parse_personality, 0, offsetof(Settings, personality)
+Exec.MachineID, config_parse_id128, 0, offsetof(Settings, machine_id)
+Exec.WorkingDirectory, config_parse_path, 0, offsetof(Settings, working_directory)
+Exec.PivotRoot, config_parse_pivot_root, 0, 0
+Exec.PrivateUsers, config_parse_private_users, 0, 0
+Exec.NotifyReady, config_parse_bool, 0, offsetof(Settings, notify_ready)
+Exec.SystemCallFilter, config_parse_syscall_filter, 0, 0,
+Exec.LimitCPU, config_parse_rlimit, RLIMIT_CPU, offsetof(Settings, rlimit)
+Exec.LimitFSIZE, config_parse_rlimit, RLIMIT_FSIZE, offsetof(Settings, rlimit)
+Exec.LimitDATA, config_parse_rlimit, RLIMIT_DATA, offsetof(Settings, rlimit)
+Exec.LimitSTACK, config_parse_rlimit, RLIMIT_STACK, offsetof(Settings, rlimit)
+Exec.LimitCORE, config_parse_rlimit, RLIMIT_CORE, offsetof(Settings, rlimit)
+Exec.LimitRSS, config_parse_rlimit, RLIMIT_RSS, offsetof(Settings, rlimit)
+Exec.LimitNOFILE, config_parse_rlimit, RLIMIT_NOFILE, offsetof(Settings, rlimit)
+Exec.LimitAS, config_parse_rlimit, RLIMIT_AS, offsetof(Settings, rlimit)
+Exec.LimitNPROC, config_parse_rlimit, RLIMIT_NPROC, offsetof(Settings, rlimit)
+Exec.LimitMEMLOCK, config_parse_rlimit, RLIMIT_MEMLOCK, offsetof(Settings, rlimit)
+Exec.LimitLOCKS, config_parse_rlimit, RLIMIT_LOCKS, offsetof(Settings, rlimit)
+Exec.LimitSIGPENDING, config_parse_rlimit, RLIMIT_SIGPENDING, offsetof(Settings, rlimit)
+Exec.LimitMSGQUEUE, config_parse_rlimit, RLIMIT_MSGQUEUE, offsetof(Settings, rlimit)
+Exec.LimitNICE, config_parse_rlimit, RLIMIT_NICE, offsetof(Settings, rlimit)
+Exec.LimitRTPRIO, config_parse_rlimit, RLIMIT_RTPRIO, offsetof(Settings, rlimit)
+Exec.LimitRTTIME, config_parse_rlimit, RLIMIT_RTTIME, offsetof(Settings, rlimit)
+Files.ReadOnly, config_parse_tristate, 0, offsetof(Settings, read_only)
+Files.Volatile, config_parse_volatile_mode, 0, offsetof(Settings, volatile_mode)
+Files.Bind, config_parse_bind, 0, 0
+Files.BindReadOnly, config_parse_bind, 1, 0
+Files.TemporaryFileSystem, config_parse_tmpfs, 0, 0
+Files.Overlay, config_parse_overlay, 0, 0
+Files.OverlayReadOnly, config_parse_overlay, 1, 0
+Files.PrivateUsersChown, config_parse_tristate, 0, offsetof(Settings, userns_chown)
+Network.Private, config_parse_tristate, 0, offsetof(Settings, private_network)
+Network.Interface, config_parse_strv, 0, offsetof(Settings, network_interfaces)
+Network.MACVLAN, config_parse_strv, 0, offsetof(Settings, network_macvlan)
+Network.IPVLAN, config_parse_strv, 0, offsetof(Settings, network_ipvlan)
+Network.VirtualEthernet, config_parse_tristate, 0, offsetof(Settings, network_veth)
+Network.VirtualEthernetExtra, config_parse_veth_extra, 0, 0
+Network.Bridge, config_parse_ifname, 0, offsetof(Settings, network_bridge)
+Network.Zone, config_parse_network_zone, 0, 0
+Network.Port, config_parse_expose_port, 0, 0
#include "nspawn-settings.h"
#include "parse-util.h"
#include "process-util.h"
+#include "rlimit-util.h"
#include "socket-util.h"
#include "string-util.h"
#include "strv.h"
free(s->working_directory);
strv_free(s->syscall_whitelist);
strv_free(s->syscall_blacklist);
+ rlimit_free_all(s->rlimit);
strv_free(s->network_interfaces);
strv_free(s->network_macvlan);
} UserNamespaceMode;
typedef enum SettingsMask {
- SETTING_START_MODE = 1 << 0,
- SETTING_ENVIRONMENT = 1 << 1,
- SETTING_USER = 1 << 2,
- SETTING_CAPABILITY = 1 << 3,
- SETTING_KILL_SIGNAL = 1 << 4,
- SETTING_PERSONALITY = 1 << 5,
- SETTING_MACHINE_ID = 1 << 6,
- SETTING_NETWORK = 1 << 7,
- SETTING_EXPOSE_PORTS = 1 << 8,
- SETTING_READ_ONLY = 1 << 9,
- SETTING_VOLATILE_MODE = 1 << 10,
- SETTING_CUSTOM_MOUNTS = 1 << 11,
- SETTING_WORKING_DIRECTORY = 1 << 12,
- SETTING_USERNS = 1 << 13,
- SETTING_NOTIFY_READY = 1 << 14,
- SETTING_PIVOT_ROOT = 1 << 15,
- SETTING_SYSCALL_FILTER = 1 << 16,
- _SETTINGS_MASK_ALL = (1 << 17) -1
+ /* Bit flags, one per setting. Bits 0…16 are individual settings; starting at bit 17 there is one bit per
+  * resource limit type (SETTING_RLIMIT_FIRST << rl). 64-bit constants are used since 17 + _RLIMIT_MAX bits
+  * are needed in total. */
+ SETTING_START_MODE = UINT64_C(1) << 0,
+ SETTING_ENVIRONMENT = UINT64_C(1) << 1,
+ SETTING_USER = UINT64_C(1) << 2,
+ SETTING_CAPABILITY = UINT64_C(1) << 3,
+ SETTING_KILL_SIGNAL = UINT64_C(1) << 4,
+ SETTING_PERSONALITY = UINT64_C(1) << 5,
+ SETTING_MACHINE_ID = UINT64_C(1) << 6,
+ SETTING_NETWORK = UINT64_C(1) << 7,
+ SETTING_EXPOSE_PORTS = UINT64_C(1) << 8,
+ SETTING_READ_ONLY = UINT64_C(1) << 9,
+ SETTING_VOLATILE_MODE = UINT64_C(1) << 10,
+ SETTING_CUSTOM_MOUNTS = UINT64_C(1) << 11,
+ SETTING_WORKING_DIRECTORY = UINT64_C(1) << 12,
+ SETTING_USERNS = UINT64_C(1) << 13,
+ SETTING_NOTIFY_READY = UINT64_C(1) << 14,
+ SETTING_PIVOT_ROOT = UINT64_C(1) << 15,
+ SETTING_SYSCALL_FILTER = UINT64_C(1) << 16,
+ SETTING_RLIMIT_FIRST = UINT64_C(1) << 17, /* we define one bit per resource limit here */
+ SETTING_RLIMIT_LAST = UINT64_C(1) << (17 + _RLIMIT_MAX - 1),
+ /* Mask of every bit defined above: take the first unused bit and subtract one to set all lower bits. */
+ _SETTINGS_MASK_ALL = (UINT64_C(1) << (17 + _RLIMIT_MAX)) - 1
} SettingsMask;
typedef struct Settings {
bool notify_ready;
char **syscall_whitelist;
char **syscall_blacklist;
+ struct rlimit *rlimit[_RLIMIT_MAX];
/* [Image] */
int read_only;
#include "ptyfwd.h"
#include "random-util.h"
#include "raw-clone.h"
+#include "rlimit-util.h"
#include "rm-rf.h"
#include "selinux-util.h"
#include "signal-util.h"
static size_t arg_root_hash_size = 0;
static char **arg_syscall_whitelist = NULL;
static char **arg_syscall_blacklist = NULL;
+static struct rlimit *arg_rlimit[_RLIMIT_MAX] = {};
static void help(void) {
printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
" --drop-capability=CAP Drop the specified capability from the default set\n"
" --system-call-filter=LIST|~LIST\n"
" Permit/prohibit specific system calls\n"
+ " --rlimit=NAME=LIMIT Set a resource limit for the payload\n"
" --kill-signal=SIGNAL Select signal to use for shutting down PID 1\n"
" --link-journal=MODE Link up guest journal, one of no, auto, guest, \n"
" host, try-guest, try-host\n"
ARG_NOTIFY_READY,
ARG_ROOT_HASH,
ARG_SYSTEM_CALL_FILTER,
+ ARG_RLIMIT,
};
static const struct option options[] = {
{ "notify-ready", required_argument, NULL, ARG_NOTIFY_READY },
{ "root-hash", required_argument, NULL, ARG_ROOT_HASH },
{ "system-call-filter", required_argument, NULL, ARG_SYSTEM_CALL_FILTER },
+ { "rlimit", required_argument, NULL, ARG_RLIMIT },
{}
};
break;
}
+ case ARG_RLIMIT: {
+         /* Handles --rlimit=LIMIT=VALUE: splits the argument at the first '=', resolves the limit name,
+          * parses the value into arg_rlimit[] and marks the limit as explicitly configured. */
+         _cleanup_free_ char *name = NULL; /* freed automatically on every exit path, including 'break' */
+         const char *eq;
+         int rl;
+
+         eq = strchr(optarg, '=');
+         if (!eq) {
+                 log_error("--rlimit= expects an '=' assignment.");
+                 return -EINVAL;
+         }
+
+         /* Copy the part before '=' so that it can be looked up as a NUL-terminated string */
+         name = strndup(optarg, eq - optarg);
+         if (!name)
+                 return log_oom();
+
+         rl = rlimit_from_string_harder(name);
+         if (rl < 0) {
+                 log_error("Unknown resource limit: %s", name);
+                 return -EINVAL;
+         }
+
+         /* Allocate the slot on first use; a repeated option for the same limit overwrites it in place */
+         if (!arg_rlimit[rl]) {
+                 arg_rlimit[rl] = new0(struct rlimit, 1);
+                 if (!arg_rlimit[rl])
+                         return log_oom();
+         }
+
+         r = rlimit_parse(rl, eq + 1, arg_rlimit[rl]);
+         if (r < 0)
+                 return log_error_errno(r, "Failed to parse resource limit: %s", eq + 1);
+
+         /* Remember that this limit was set on the command line */
+         arg_settings_mask |= SETTING_RLIMIT_FIRST << rl;
+         break;
+ }
+
case '?':
return -EINVAL;
NULL
};
const char *exec_target;
-
_cleanup_strv_free_ char **env_use = NULL;
int r;
FDSet *fds,
int netns_fd) {
+ _cleanup_close_ int fd = -1;
+ int r, which_failed;
pid_t pid;
ssize_t l;
- int r;
- _cleanup_close_ int fd = -1;
assert(barrier);
assert(directory);
if (fd < 0)
return fd;
+ r = setrlimit_closest_all((const struct rlimit *const*) arg_rlimit, &which_failed);
+ if (r < 0)
+ return log_error_errno(r, "Failed to apply resource limit RLIMIT_%s: %m", rlimit_to_string(which_failed));
+
pid = raw_clone(SIGCHLD|CLONE_NEWNS|
arg_clone_ns_flags |
(arg_userns_mode != USER_NAMESPACE_NO ? CLONE_NEWUSER : 0));
_cleanup_fclose_ FILE *f = NULL;
_cleanup_free_ char *p = NULL;
const char *fn, *i;
- int r;
+ int r, rl;
/* If all settings are masked, there's no point in looking for
* the settings file */
}
}
+ for (rl = 0; rl < _RLIMIT_MAX; rl ++) {
+ if ((arg_settings_mask & (SETTING_RLIMIT_FIRST << rl)))
+ continue;
+
+ if (!settings->rlimit[rl])
+ continue;
+
+ if (!arg_settings_trusted) {
+ log_warning("Ignoring Limit%s= setting, file '%s' is not trusted.", rlimit_to_string(rl), p);
+ continue;
+ }
+
+ free_and_replace(arg_rlimit[rl], settings->rlimit[rl]);
+ }
+
return 0;
}
return 1; /* loop again */
}
+ /* Fills in arg_rlimit[] for every resource limit whose SETTING_RLIMIT_FIRST << rl bit is not set in
+  * arg_settings_mask, and logs the resulting value of each limit when debug logging is enabled.
+  *
+  * Returns 0 on success, or a negative value if reading a limit from PID 1 or allocating memory fails. */
+ static int initialize_rlimits(void) {
+
+         /* The default resource limits the kernel passes to PID 1, as per kernel 4.16. Let's pass our container payload
+          * the same values as the kernel originally passed to PID 1, in order to minimize differences between host and
+          * container execution environments. */
+
+         static const struct rlimit kernel_defaults[_RLIMIT_MAX] = {
+                 [RLIMIT_AS] = { RLIM_INFINITY, RLIM_INFINITY },
+                 [RLIMIT_CORE] = { 0, RLIM_INFINITY },
+                 [RLIMIT_CPU] = { RLIM_INFINITY, RLIM_INFINITY },
+                 [RLIMIT_DATA] = { RLIM_INFINITY, RLIM_INFINITY },
+                 [RLIMIT_FSIZE] = { RLIM_INFINITY, RLIM_INFINITY },
+                 [RLIMIT_LOCKS] = { RLIM_INFINITY, RLIM_INFINITY },
+                 [RLIMIT_MEMLOCK] = { 65536, 65536 },
+                 [RLIMIT_MSGQUEUE] = { 819200, 819200 },
+                 [RLIMIT_NICE] = { 0, 0 },
+                 [RLIMIT_NOFILE] = { 1024, 4096 },
+                 [RLIMIT_RSS] = { RLIM_INFINITY, RLIM_INFINITY },
+                 [RLIMIT_RTPRIO] = { 0, 0 },
+                 [RLIMIT_RTTIME] = { RLIM_INFINITY, RLIM_INFINITY },
+                 [RLIMIT_STACK] = { 8388608, RLIM_INFINITY },
+
+                 /* The kernel scales the default for RLIMIT_NPROC and RLIMIT_SIGPENDING based on the system's amount of
+                  * RAM. To provide best compatibility we'll read these limits off PID 1 instead of hardcoding them
+                  * here. This is safe as we know that PID 1 doesn't change these two limits and thus the original
+                  * kernel's initialization should still be valid during runtime — at least if PID 1 is systemd. Note
+                  * that PID 1 changes a number of other resource limits during early initialization which is why we
+                  * don't read the other limits from PID 1 but prefer the static table above. */
+         };
+
+         int rl;
+
+         for (rl = 0; rl < _RLIMIT_MAX; rl++) {
+
+                 /* Let's only fill in what the user hasn't explicitly configured anyway */
+                 if ((arg_settings_mask & (SETTING_RLIMIT_FIRST << rl)) == 0) {
+                         const struct rlimit *v;
+                         struct rlimit buffer;
+
+                         if (IN_SET(rl, RLIMIT_NPROC, RLIMIT_SIGPENDING)) {
+                                 /* For these two let's read the limits off PID 1. See above for an explanation. */
+
+                                 if (prlimit(1, rl, NULL, &buffer) < 0)
+                                         return log_error_errno(errno, "Failed to read resource limit RLIMIT_%s of PID 1: %m", rlimit_to_string(rl));
+
+                                 v = &buffer;
+                         } else
+                                 v = kernel_defaults + rl;
+
+                         /* Store a heap copy, since 'buffer' lives on the stack and the table is const */
+                         arg_rlimit[rl] = newdup(struct rlimit, v, 1);
+                         if (!arg_rlimit[rl])
+                                 return log_oom();
+                 }
+
+                 if (DEBUG_LOGGING) {
+                         _cleanup_free_ char *k = NULL;
+
+                         /* Formatting is best-effort and only used for the debug message. If it fails 'k' stays
+                          * NULL, and NULL must not be passed to "%s", hence the fallback string. */
+                         (void) rlimit_format(arg_rlimit[rl], &k);
+                         log_debug("Setting RLIMIT_%s to %s.", rlimit_to_string(rl), k ? k : "n/a");
+                 }
+         }
+
+         return 0;
+ }
+
int main(int argc, char *argv[]) {
_cleanup_free_ char *console = NULL;
if (r < 0)
goto finish;
+ r = initialize_rlimits();
+ if (r < 0)
+ goto finish;
+
r = determine_names();
if (r < 0)
goto finish;
custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
expose_port_free_all(arg_expose_ports);
free(arg_root_hash);
+ rlimit_free_all(arg_rlimit);
return r < 0 ? EXIT_FAILURE : ret;
}