From: Lennart Poettering Date: Mon, 7 May 2018 15:59:18 +0000 (+0200) Subject: nspawn: add new --rlimit= switch, and always set resource limits explicitly for our... X-Git-Tag: v239~243^2~12 X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=bf428efb0776d45f12ac81dc67463663f92b552f;p=thirdparty%2Fsystemd.git nspawn: add new --rlimit= switch, and always set resource limits explicitly for our container payloads This ensures we set the various resource limits of our container explicitly on each invocation so that we inherit less from our callers into the payload. By default resource limits are now set to the same values Linux generally passes to the host PID 1, thus minimizing needless differences between host and container environments. The limits are now also configurable using a new --rlimit= switch. This is preparation for teaching nspawn native OCI runtime support as OCI permits setting resource limits for container payloads, and it hence probably makes sense if we do too. --- diff --git a/man/systemd-nspawn.xml b/man/systemd-nspawn.xml index 713782b8595..c6b027c58f7 100644 --- a/man/systemd-nspawn.xml +++ b/man/systemd-nspawn.xml @@ -734,6 +734,31 @@ capabilities are passed using the --capabilities=. + + + + Sets the specified POSIX resource limit for the container payload. Expects an assignment of the + form + LIMIT=SOFT:HARD + or LIMIT=VALUE, where + LIMIT should refer to a resource limit type, such as + RLIMIT_NOFILE or RLIMIT_NICE. The SOFT and + HARD fields should refer to the numeric soft and hard resource limit values. If the + second form is used, VALUE may specify a value that is used both as soft and hard + limit. In place of a numeric value the special string infinity may be used to turn off + resource limiting for the specific type of resource. This command line option may be used multiple times to + control limits on multiple limit types. If used multiple times for the same limit type, the last use + wins.
For details about resource limits see setrlimit(2). By default + resource limits for the container's init process (PID 1) are set to the same values the Linux kernel originally + passed to the host init system. Note that some resource limits are enforced on resources counted per user, in + particular RLIMIT_NPROC. This means that unless user namespacing is deployed + (i.e. --private-users= is used, see above), any limits set will be applied to the resource + usage of the same user on all local containers as well as the host. This means particular care needs to be + taken with these limits as they might be triggered by possibly less trusted code. Example: + --rlimit=RLIMIT_NOFILE=8192:16384. + + diff --git a/man/systemd.nspawn.xml b/man/systemd.nspawn.xml index b5c60a33e0b..6bd7b33b34b 100644 --- a/man/systemd.nspawn.xml +++ b/man/systemd.nspawn.xml @@ -278,6 +278,30 @@ details. + + LimitCPU= + LimitFSIZE= + LimitDATA= + LimitSTACK= + LimitCORE= + LimitRSS= + LimitNOFILE= + LimitAS= + LimitNPROC= + LimitMEMLOCK= + LimitLOCKS= + LimitSIGPENDING= + LimitMSGQUEUE= + LimitNICE= + LimitRTPRIO= + LimitRTTIME= + + Configures various types of resource limits applied to containers. This is equivalent to the + --rlimit= command line switch, and takes the same arguments. See + systemd-nspawn(1) for + details.
+ + diff --git a/src/nspawn/nspawn-gperf.gperf b/src/nspawn/nspawn-gperf.gperf index ea66971fac7..58184d412d1 100644 --- a/src/nspawn/nspawn-gperf.gperf +++ b/src/nspawn/nspawn-gperf.gperf @@ -18,35 +18,51 @@ struct ConfigPerfItem; %struct-type %includes %% -Exec.Boot, config_parse_boot, 0, 0 -Exec.ProcessTwo, config_parse_pid2, 0, 0 -Exec.Parameters, config_parse_strv, 0, offsetof(Settings, parameters) -Exec.Environment, config_parse_strv, 0, offsetof(Settings, environment) -Exec.User, config_parse_string, 0, offsetof(Settings, user) -Exec.Capability, config_parse_capability, 0, offsetof(Settings, capability) -Exec.DropCapability, config_parse_capability, 0, offsetof(Settings, drop_capability) -Exec.KillSignal, config_parse_signal, 0, offsetof(Settings, kill_signal) -Exec.Personality, config_parse_personality, 0, offsetof(Settings, personality) -Exec.MachineID, config_parse_id128, 0, offsetof(Settings, machine_id) -Exec.WorkingDirectory, config_parse_path, 0, offsetof(Settings, working_directory) -Exec.PivotRoot, config_parse_pivot_root, 0, 0 -Exec.PrivateUsers, config_parse_private_users, 0, 0 -Exec.NotifyReady, config_parse_bool, 0, offsetof(Settings, notify_ready) -Exec.SystemCallFilter, config_parse_syscall_filter,0, 0, -Files.ReadOnly, config_parse_tristate, 0, offsetof(Settings, read_only) -Files.Volatile, config_parse_volatile_mode, 0, offsetof(Settings, volatile_mode) -Files.Bind, config_parse_bind, 0, 0 -Files.BindReadOnly, config_parse_bind, 1, 0 -Files.TemporaryFileSystem, config_parse_tmpfs, 0, 0 -Files.Overlay, config_parse_overlay, 0, 0 -Files.OverlayReadOnly, config_parse_overlay, 1, 0 -Files.PrivateUsersChown, config_parse_tristate, 0, offsetof(Settings, userns_chown) -Network.Private, config_parse_tristate, 0, offsetof(Settings, private_network) -Network.Interface, config_parse_strv, 0, offsetof(Settings, network_interfaces) -Network.MACVLAN, config_parse_strv, 0, offsetof(Settings, network_macvlan) -Network.IPVLAN, config_parse_strv, 0, 
offsetof(Settings, network_ipvlan) -Network.VirtualEthernet, config_parse_tristate, 0, offsetof(Settings, network_veth) -Network.VirtualEthernetExtra, config_parse_veth_extra, 0, 0 -Network.Bridge, config_parse_ifname, 0, offsetof(Settings, network_bridge) -Network.Zone, config_parse_network_zone, 0, 0 -Network.Port, config_parse_expose_port, 0, 0 +Exec.Boot, config_parse_boot, 0, 0 +Exec.ProcessTwo, config_parse_pid2, 0, 0 +Exec.Parameters, config_parse_strv, 0, offsetof(Settings, parameters) +Exec.Environment, config_parse_strv, 0, offsetof(Settings, environment) +Exec.User, config_parse_string, 0, offsetof(Settings, user) +Exec.Capability, config_parse_capability, 0, offsetof(Settings, capability) +Exec.DropCapability, config_parse_capability, 0, offsetof(Settings, drop_capability) +Exec.KillSignal, config_parse_signal, 0, offsetof(Settings, kill_signal) +Exec.Personality, config_parse_personality, 0, offsetof(Settings, personality) +Exec.MachineID, config_parse_id128, 0, offsetof(Settings, machine_id) +Exec.WorkingDirectory, config_parse_path, 0, offsetof(Settings, working_directory) +Exec.PivotRoot, config_parse_pivot_root, 0, 0 +Exec.PrivateUsers, config_parse_private_users, 0, 0 +Exec.NotifyReady, config_parse_bool, 0, offsetof(Settings, notify_ready) +Exec.SystemCallFilter, config_parse_syscall_filter, 0, 0, +Exec.LimitCPU, config_parse_rlimit, RLIMIT_CPU, offsetof(Settings, rlimit) +Exec.LimitFSIZE, config_parse_rlimit, RLIMIT_FSIZE, offsetof(Settings, rlimit) +Exec.LimitDATA, config_parse_rlimit, RLIMIT_DATA, offsetof(Settings, rlimit) +Exec.LimitSTACK, config_parse_rlimit, RLIMIT_STACK, offsetof(Settings, rlimit) +Exec.LimitCORE, config_parse_rlimit, RLIMIT_CORE, offsetof(Settings, rlimit) +Exec.LimitRSS, config_parse_rlimit, RLIMIT_RSS, offsetof(Settings, rlimit) +Exec.LimitNOFILE, config_parse_rlimit, RLIMIT_NOFILE, offsetof(Settings, rlimit) +Exec.LimitAS, config_parse_rlimit, RLIMIT_AS, offsetof(Settings, rlimit) +Exec.LimitNPROC, 
config_parse_rlimit, RLIMIT_NPROC, offsetof(Settings, rlimit) +Exec.LimitMEMLOCK, config_parse_rlimit, RLIMIT_MEMLOCK, offsetof(Settings, rlimit) +Exec.LimitLOCKS, config_parse_rlimit, RLIMIT_LOCKS, offsetof(Settings, rlimit) +Exec.LimitSIGPENDING, config_parse_rlimit, RLIMIT_SIGPENDING, offsetof(Settings, rlimit) +Exec.LimitMSGQUEUE, config_parse_rlimit, RLIMIT_MSGQUEUE, offsetof(Settings, rlimit) +Exec.LimitNICE, config_parse_rlimit, RLIMIT_NICE, offsetof(Settings, rlimit) +Exec.LimitRTPRIO, config_parse_rlimit, RLIMIT_RTPRIO, offsetof(Settings, rlimit) +Exec.LimitRTTIME, config_parse_rlimit, RLIMIT_RTTIME, offsetof(Settings, rlimit) +Files.ReadOnly, config_parse_tristate, 0, offsetof(Settings, read_only) +Files.Volatile, config_parse_volatile_mode, 0, offsetof(Settings, volatile_mode) +Files.Bind, config_parse_bind, 0, 0 +Files.BindReadOnly, config_parse_bind, 1, 0 +Files.TemporaryFileSystem, config_parse_tmpfs, 0, 0 +Files.Overlay, config_parse_overlay, 0, 0 +Files.OverlayReadOnly, config_parse_overlay, 1, 0 +Files.PrivateUsersChown, config_parse_tristate, 0, offsetof(Settings, userns_chown) +Network.Private, config_parse_tristate, 0, offsetof(Settings, private_network) +Network.Interface, config_parse_strv, 0, offsetof(Settings, network_interfaces) +Network.MACVLAN, config_parse_strv, 0, offsetof(Settings, network_macvlan) +Network.IPVLAN, config_parse_strv, 0, offsetof(Settings, network_ipvlan) +Network.VirtualEthernet, config_parse_tristate, 0, offsetof(Settings, network_veth) +Network.VirtualEthernetExtra, config_parse_veth_extra, 0, 0 +Network.Bridge, config_parse_ifname, 0, offsetof(Settings, network_bridge) +Network.Zone, config_parse_network_zone, 0, 0 +Network.Port, config_parse_expose_port, 0, 0 diff --git a/src/nspawn/nspawn-settings.c b/src/nspawn/nspawn-settings.c index 487514d40a9..19bf1d4b941 100644 --- a/src/nspawn/nspawn-settings.c +++ b/src/nspawn/nspawn-settings.c @@ -12,6 +12,7 @@ #include "nspawn-settings.h" #include "parse-util.h" #include 
"process-util.h" +#include "rlimit-util.h" #include "socket-util.h" #include "string-util.h" #include "strv.h" @@ -80,6 +81,7 @@ Settings* settings_free(Settings *s) { free(s->working_directory); strv_free(s->syscall_whitelist); strv_free(s->syscall_blacklist); + rlimit_free_all(s->rlimit); strv_free(s->network_interfaces); strv_free(s->network_macvlan); diff --git a/src/nspawn/nspawn-settings.h b/src/nspawn/nspawn-settings.h index 731db872606..0e6ce612171 100644 --- a/src/nspawn/nspawn-settings.h +++ b/src/nspawn/nspawn-settings.h @@ -32,24 +32,26 @@ typedef enum UserNamespaceMode { } UserNamespaceMode; typedef enum SettingsMask { - SETTING_START_MODE = 1 << 0, - SETTING_ENVIRONMENT = 1 << 1, - SETTING_USER = 1 << 2, - SETTING_CAPABILITY = 1 << 3, - SETTING_KILL_SIGNAL = 1 << 4, - SETTING_PERSONALITY = 1 << 5, - SETTING_MACHINE_ID = 1 << 6, - SETTING_NETWORK = 1 << 7, - SETTING_EXPOSE_PORTS = 1 << 8, - SETTING_READ_ONLY = 1 << 9, - SETTING_VOLATILE_MODE = 1 << 10, - SETTING_CUSTOM_MOUNTS = 1 << 11, - SETTING_WORKING_DIRECTORY = 1 << 12, - SETTING_USERNS = 1 << 13, - SETTING_NOTIFY_READY = 1 << 14, - SETTING_PIVOT_ROOT = 1 << 15, - SETTING_SYSCALL_FILTER = 1 << 16, - _SETTINGS_MASK_ALL = (1 << 17) -1 + SETTING_START_MODE = UINT64_C(1) << 0, + SETTING_ENVIRONMENT = UINT64_C(1) << 1, + SETTING_USER = UINT64_C(1) << 2, + SETTING_CAPABILITY = UINT64_C(1) << 3, + SETTING_KILL_SIGNAL = UINT64_C(1) << 4, + SETTING_PERSONALITY = UINT64_C(1) << 5, + SETTING_MACHINE_ID = UINT64_C(1) << 6, + SETTING_NETWORK = UINT64_C(1) << 7, + SETTING_EXPOSE_PORTS = UINT64_C(1) << 8, + SETTING_READ_ONLY = UINT64_C(1) << 9, + SETTING_VOLATILE_MODE = UINT64_C(1) << 10, + SETTING_CUSTOM_MOUNTS = UINT64_C(1) << 11, + SETTING_WORKING_DIRECTORY = UINT64_C(1) << 12, + SETTING_USERNS = UINT64_C(1) << 13, + SETTING_NOTIFY_READY = UINT64_C(1) << 14, + SETTING_PIVOT_ROOT = UINT64_C(1) << 15, + SETTING_SYSCALL_FILTER = UINT64_C(1) << 16, + SETTING_RLIMIT_FIRST = UINT64_C(1) << 17, /* we define one bit 
per resource limit here */ + SETTING_RLIMIT_LAST = UINT64_C(1) << (17 + _RLIMIT_MAX - 1), + _SETTINGS_MASK_ALL = (UINT64_C(1) << (17 + _RLIMIT_MAX)) - 1 /* all bits from SETTING_START_MODE through SETTING_RLIMIT_LAST */ } SettingsMask; typedef struct Settings { @@ -71,6 +73,7 @@ typedef struct Settings { bool notify_ready; char **syscall_whitelist; char **syscall_blacklist; + struct rlimit *rlimit[_RLIMIT_MAX]; /* [Image] */ int read_only; diff --git a/src/nspawn/nspawn.c b/src/nspawn/nspawn.c index 12eaa6c0d74..8ba8e73bf73 100644 --- a/src/nspawn/nspawn.c +++ b/src/nspawn/nspawn.c @@ -81,6 +81,7 @@ #include "ptyfwd.h" #include "random-util.h" #include "raw-clone.h" +#include "rlimit-util.h" #include "rm-rf.h" #include "selinux-util.h" #include "signal-util.h" @@ -200,6 +201,7 @@ static void *arg_root_hash = NULL; static size_t arg_root_hash_size = 0; static char **arg_syscall_whitelist = NULL; static char **arg_syscall_blacklist = NULL; +static struct rlimit *arg_rlimit[_RLIMIT_MAX] = {}; static void help(void) { printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n" @@ -264,6 +266,7 @@ static void help(void) { " --drop-capability=CAP Drop the specified capability from the default set\n" " --system-call-filter=LIST|~LIST\n" " Permit/prohibit specific system calls\n" + " --rlimit=NAME=LIMIT Set a resource limit for the payload\n" " --kill-signal=SIGNAL Select signal to use for shutting down PID 1\n" " --link-journal=MODE Link up guest journal, one of no, auto, guest, \n" " host, try-guest, try-host\n" @@ -439,6 +442,7 @@ static int parse_argv(int argc, char *argv[]) { ARG_NOTIFY_READY, ARG_ROOT_HASH, ARG_SYSTEM_CALL_FILTER, + ARG_RLIMIT, }; static const struct option options[] = { @@ -492,6 +496,7 @@ static int parse_argv(int argc, char *argv[]) { { "notify-ready", required_argument, NULL, ARG_NOTIFY_READY }, { "root-hash", required_argument, NULL, ARG_ROOT_HASH }, { "system-call-filter", required_argument, NULL, ARG_SYSTEM_CALL_FILTER }, + { "rlimit", required_argument, NULL, ARG_RLIMIT }, {} }; @@ -1094,6 +1099,41 @@ static int
parse_argv(int argc, char *argv[]) { break; } + case ARG_RLIMIT: { + const char *eq; + _cleanup_free_ char *name = NULL; /* copy of the limit name taken from optarg; released automatically on every path out of this case */ + int rl; + + eq = strchr(optarg, '='); + if (!eq) { + log_error("--rlimit= expects an '=' assignment."); + return -EINVAL; + } + + name = strndup(optarg, eq - optarg); + if (!name) + return log_oom(); + + rl = rlimit_from_string_harder(name); + if (rl < 0) { + log_error("Unknown resource limit: %s", name); + return -EINVAL; + } + + if (!arg_rlimit[rl]) { + arg_rlimit[rl] = new0(struct rlimit, 1); + if (!arg_rlimit[rl]) + return log_oom(); + } + + r = rlimit_parse(rl, eq + 1, arg_rlimit[rl]); + if (r < 0) + return log_error_errno(r, "Failed to parse resource limit: %s", eq + 1); + + arg_settings_mask |= SETTING_RLIMIT_FIRST << rl; + break; + } + case '?': return -EINVAL; @@ -2282,7 +2322,6 @@ static int inner_child( NULL }; const char *exec_target; - _cleanup_strv_free_ char **env_use = NULL; int r; @@ -2559,10 +2598,10 @@ static int outer_child( FDSet *fds, int netns_fd) { + _cleanup_close_ int fd = -1; + int r, which_failed; pid_t pid; ssize_t l; - int r; - _cleanup_close_ int fd = -1; assert(barrier); assert(directory); @@ -2805,6 +2844,10 @@ static int outer_child( if (fd < 0) return fd; + r = setrlimit_closest_all((const struct rlimit *const*) arg_rlimit, &which_failed); + if (r < 0) + return log_error_errno(r, "Failed to apply resource limit RLIMIT_%s: %m", rlimit_to_string(which_failed)); + pid = raw_clone(SIGCHLD|CLONE_NEWNS| arg_clone_ns_flags | (arg_userns_mode != USER_NAMESPACE_NO ?
CLONE_NEWUSER : 0)); @@ -3046,7 +3089,7 @@ static int load_settings(void) { _cleanup_fclose_ FILE *f = NULL; _cleanup_free_ char *p = NULL; const char *fn, *i; - int r; + int r, rl; /* If all settings are masked, there's no point in looking for * the settings file */ @@ -3256,6 +3299,21 @@ static int load_settings(void) { } } + for (rl = 0; rl < _RLIMIT_MAX; rl ++) { + if ((arg_settings_mask & (SETTING_RLIMIT_FIRST << rl))) + continue; + + if (!settings->rlimit[rl]) + continue; + + if (!arg_settings_trusted) { + log_warning("Ignoring Limit%s= setting, file '%s' is not trusted.", rlimit_to_string(rl), p); + continue; + } + + free_and_replace(arg_rlimit[rl], settings->rlimit[rl]); + } + return 0; } @@ -3767,6 +3825,71 @@ static int run(int master, return 1; /* loop again */ } +static int initialize_rlimits(void) { + /* Fills arg_rlimit[] with a default for every resource limit whose bit is not set in arg_settings_mask; entries whose bit is set are left untouched. Returns 0 on success, otherwise the value returned by log_error_errno() or log_oom(). */ + /* The default resource limits the kernel passes to PID 1, as per kernel 4.16. Let's pass our container payload + * the same values as the kernel originally passed to PID 1, in order to minimize differences between host and + * container execution environments. */ + + static const struct rlimit kernel_defaults[_RLIMIT_MAX] = { + [RLIMIT_AS] = { RLIM_INFINITY, RLIM_INFINITY }, + [RLIMIT_CORE] = { 0, RLIM_INFINITY }, + [RLIMIT_CPU] = { RLIM_INFINITY, RLIM_INFINITY }, + [RLIMIT_DATA] = { RLIM_INFINITY, RLIM_INFINITY }, + [RLIMIT_FSIZE] = { RLIM_INFINITY, RLIM_INFINITY }, + [RLIMIT_LOCKS] = { RLIM_INFINITY, RLIM_INFINITY }, + [RLIMIT_MEMLOCK] = { 65536, 65536 }, + [RLIMIT_MSGQUEUE] = { 819200, 819200 }, + [RLIMIT_NICE] = { 0, 0 }, + [RLIMIT_NOFILE] = { 1024, 4096 }, + [RLIMIT_RSS] = { RLIM_INFINITY, RLIM_INFINITY }, + [RLIMIT_RTPRIO] = { 0, 0 }, + [RLIMIT_RTTIME] = { RLIM_INFINITY, RLIM_INFINITY }, + [RLIMIT_STACK] = { 8388608, RLIM_INFINITY }, + + /* The kernel scales the default for RLIMIT_NPROC and RLIMIT_SIGPENDING based on the system's amount of + * RAM.
To provide best compatibility we'll read these limits off PID 1 instead of hardcoding them + * here. This is safe as we know that PID 1 doesn't change these two limits and thus the original + * kernel's initialization should still be valid during runtime — at least if PID 1 is systemd. Note + * that PID 1 changes a number of other resource limits during early initialization which is why we + * don't read the other limits from PID 1 but prefer the static table above. */ + }; + + int rl; + + for (rl = 0; rl < _RLIMIT_MAX; rl++) { + + /* Let's only fill in what the user hasn't explicitly configured anyway */ + if ((arg_settings_mask & (SETTING_RLIMIT_FIRST << rl)) == 0) { + const struct rlimit *v; + struct rlimit buffer; + + if (IN_SET(rl, RLIMIT_NPROC, RLIMIT_SIGPENDING)) { + /* For these two let's read the limits off PID 1. See above for an explanation. */ + + if (prlimit(1, rl, NULL, &buffer) < 0) + return log_error_errno(errno, "Failed to read resource limit RLIMIT_%s of PID 1: %m", rlimit_to_string(rl)); + + v = &buffer; + } else + v = kernel_defaults + rl; + + arg_rlimit[rl] = newdup(struct rlimit, v, 1); + if (!arg_rlimit[rl]) + return log_oom(); + } + + if (DEBUG_LOGGING) { + _cleanup_free_ char *k = NULL; + + (void) rlimit_format(arg_rlimit[rl], &k); + log_debug("Setting RLIMIT_%s to %s.", rlimit_to_string(rl), k); + } + } + + return 0; +} + int main(int argc, char *argv[]) { _cleanup_free_ char *console = NULL; @@ -3799,6 +3922,10 @@ int main(int argc, char *argv[]) { if (r < 0) goto finish; + r = initialize_rlimits(); + if (r < 0) + goto finish; + r = determine_names(); if (r < 0) goto finish; @@ -4161,6 +4288,7 @@ finish: custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts); expose_port_free_all(arg_expose_ports); free(arg_root_hash); + rlimit_free_all(arg_rlimit); return r < 0 ? EXIT_FAILURE : ret; }