above).</para></listitem>
</varlistentry>
+ <varlistentry>
+ <term><option>--system-call-filter=</option></term>
+
+ <listitem><para>Alter the system call filter applied to containers. Takes a space-separated list of system call
+ names or group names (the latter prefixed with <literal>@</literal>, as listed by the
+ <command>syscall-filter</command> command of <citerefentry
+ project='man-pages'><refentrytitle>systemd-analyze</refentrytitle><manvolnum>1</manvolnum></citerefentry>). Passed
+ system calls will be permitted. The list may optionally be prefixed by <literal>~</literal>, in which case all
+ listed system calls are prohibited. If this command line option is used multiple times the configured lists are
+ combined. If both a positive and a negative list (that is one system call list without and one with the
+ <literal>~</literal> prefix) are configured, the positive list takes precedence over the negative list. Note
+ that <command>systemd-nspawn</command> always implements a system call blacklist (as opposed to a whitelist),
+ and this command line option hence adds or removes entries from the default blacklist, depending on the
+ <literal>~</literal> prefix. Note that the applied system call filter is also altered implicitly if additional
+ capabilities are passed using the <command>--capabilities=</command>.</para></listitem>
+ </varlistentry>
+
<varlistentry>
<term><option>--kill-signal=</option></term>
<varlistentry>
<term><varname>NotifyReady=</varname></term>
- <listitem><para>Configures support for notifications from the container's init process.
- This is equivalent to use <option>--notify-ready=</option> command line switch,
- and takes the same options. See <citerefentry><refentrytitle>systemd-nspawn</refentrytitle><manvolnum>1</manvolnum></citerefentry>
- for details about the specific options supported.</para></listitem>
+ <listitem><para>Configures support for notifications from the container's init process. This is equivalent to
+ the <option>--notify-ready=</option> command line switch, and takes the same paramaters. See
+ <citerefentry><refentrytitle>systemd-nspawn</refentrytitle><manvolnum>1</manvolnum></citerefentry> for details
+ about the specific options supported.</para></listitem>
+ </varlistentry>
+
+ <varlistentry>
+ <term><varname>SystemCallFilter=</varname></term>
+
+ <listitem><para>Configures the system call filter applied to containers. This is equivalent to the
+ <option>--system-call-filter=</option> command line switch, and takes the same list parameter. See
+ <citerefentry><refentrytitle>systemd-nspawn</refentrytitle><manvolnum>1</manvolnum></citerefentry> for
+ details.</para></listitem>
</varlistentry>
+
</variablelist>
</refsect1>
Exec.PivotRoot, config_parse_pivot_root, 0, 0
Exec.PrivateUsers, config_parse_private_users, 0, 0
Exec.NotifyReady, config_parse_bool, 0, offsetof(Settings, notify_ready)
+Exec.SystemCallFilter, config_parse_syscall_filter,0, 0,
Files.ReadOnly, config_parse_tristate, 0, offsetof(Settings, read_only)
Files.Volatile, config_parse_volatile_mode, 0, offsetof(Settings, volatile_mode)
Files.Bind, config_parse_bind, 0, 0
#include "seccomp-util.h"
#endif
#include "string-util.h"
+#include "strv.h"
#ifdef HAVE_SECCOMP
static int seccomp_add_default_syscall_filter(
scmp_filter_ctx ctx,
uint32_t arch,
- uint64_t cap_list_retain) {
+ uint64_t cap_list_retain,
+ char **syscall_whitelist,
+ char **syscall_blacklist) {
static const struct {
uint64_t capability;
int r, c = 0;
size_t i;
+ char **p;
for (i = 0; i < ELEMENTSOF(blacklist); i++) {
if (blacklist[i].capability != 0 && (cap_list_retain & (1ULL << blacklist[i].capability)))
continue;
- r = seccomp_add_syscall_filter_item(ctx, blacklist[i].name, SCMP_ACT_ERRNO(EPERM));
+ r = seccomp_add_syscall_filter_item(ctx, blacklist[i].name, SCMP_ACT_ERRNO(EPERM), syscall_whitelist);
if (r < 0)
/* If the system call is not known on this architecture, then that's fine, let's ignore it */
log_debug_errno(r, "Failed to add rule for system call %s, ignoring: %m", blacklist[i].name);
c++;
}
+ STRV_FOREACH(p, syscall_blacklist) {
+ r = seccomp_add_syscall_filter_item(ctx, *p, SCMP_ACT_ERRNO(EPERM), syscall_whitelist);
+ if (r < 0)
+ log_debug_errno(r, "Failed to add rule for system call %s, ignoring: %m", *p);
+ else
+ c++;
+ }
+
return c;
}
-int setup_seccomp(uint64_t cap_list_retain) {
+int setup_seccomp(uint64_t cap_list_retain, char **syscall_whitelist, char **syscall_blacklist) {
uint32_t arch;
int r;
if (!is_seccomp_available()) {
- log_debug("SECCOMP features not detected in the kernel, disabling SECCOMP audit filter");
+ log_debug("SECCOMP features not detected in the kernel, disabling SECCOMP filterering");
return 0;
}
if (r < 0)
return log_error_errno(r, "Failed to allocate seccomp object: %m");
- n = seccomp_add_default_syscall_filter(seccomp, arch, cap_list_retain);
+ n = seccomp_add_default_syscall_filter(seccomp, arch, cap_list_retain, syscall_whitelist, syscall_blacklist);
if (n < 0)
return n;
#else
-int setup_seccomp(uint64_t cap_list_retain) {
+int setup_seccomp(uint64_t cap_list_retain, char **syscall_whitelist, char **syscall_blacklist) {
return 0;
}
#include <sys/types.h>
-int setup_seccomp(uint64_t cap_list_retain);
+int setup_seccomp(uint64_t cap_list_retain, char **syscall_whitelist, char **syscall_blacklist);
free(s->pivot_root_new);
free(s->pivot_root_old);
free(s->working_directory);
+ strv_free(s->syscall_whitelist);
+ strv_free(s->syscall_blacklist);
strv_free(s->network_interfaces);
strv_free(s->network_macvlan);
return 0;
}
+
+int config_parse_syscall_filter(
+ const char *unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+
+ Settings *settings = data;
+ bool negative;
+ const char *items;
+ int r;
+
+ assert(filename);
+ assert(lvalue);
+ assert(rvalue);
+
+ negative = rvalue[0] == '~';
+ items = negative ? rvalue + 1 : rvalue;
+
+ for (;;) {
+ _cleanup_free_ char *word = NULL;
+
+ r = extract_first_word(&items, &word, NULL, 0);
+ if (r == 0)
+ break;
+ if (r == -ENOMEM)
+ return log_oom();
+ if (r < 0) {
+ log_syntax(unit, LOG_ERR, filename, line, r, "Failed to parse SystemCallFilter= parameter %s, ignoring: %m", rvalue);
+ return 0;
+ }
+
+ if (negative)
+ r = strv_extend(&settings->syscall_blacklist, word);
+ else
+ r = strv_extend(&settings->syscall_whitelist, word);
+ if (r < 0)
+ return log_oom();
+ }
+
+ return 0;
+}
SETTING_USERNS = 1 << 13,
SETTING_NOTIFY_READY = 1 << 14,
SETTING_PIVOT_ROOT = 1 << 15,
- _SETTINGS_MASK_ALL = (1 << 16) -1
+ SETTING_SYSCALL_FILTER = 1 << 16,
+ _SETTINGS_MASK_ALL = (1 << 17) -1
} SettingsMask;
typedef struct Settings {
UserNamespaceMode userns_mode;
uid_t uid_shift, uid_range;
bool notify_ready;
+ char **syscall_whitelist;
+ char **syscall_blacklist;
/* [Image] */
int read_only;
int config_parse_boot(const char *unit, const char *filename, unsigned line, const char *section, unsigned section_line, const char *lvalue, int ltype, const char *rvalue, void *data, void *userdata);
int config_parse_pid2(const char *unit, const char *filename, unsigned line, const char *section, unsigned section_line, const char *lvalue, int ltype, const char *rvalue, void *data, void *userdata);
int config_parse_private_users(const char *unit, const char *filename, unsigned line, const char *section, unsigned section_line, const char *lvalue, int ltype, const char *rvalue, void *data, void *userdata);
+int config_parse_syscall_filter(const char *unit, const char *filename, unsigned line, const char *section, unsigned section_line, const char *lvalue, int ltype, const char *rvalue, void *data, void *userdata);
static MountSettingsMask arg_mount_settings = MOUNT_APPLY_APIVFS_RO;
static void *arg_root_hash = NULL;
static size_t arg_root_hash_size = 0;
+static char **arg_syscall_whitelist = NULL;
+static char **arg_syscall_blacklist = NULL;
static void help(void) {
printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
" --capability=CAP In addition to the default, retain specified\n"
" capability\n"
" --drop-capability=CAP Drop the specified capability from the default set\n"
+ " --system-call-filter=LIST|~LIST\n"
+ " Permit/prohibit specific system calls\n"
" --kill-signal=SIGNAL Select signal to use for shutting down PID 1\n"
" --link-journal=MODE Link up guest journal, one of no, auto, guest, \n"
" host, try-guest, try-host\n"
ARG_PRIVATE_USERS_CHOWN,
ARG_NOTIFY_READY,
ARG_ROOT_HASH,
+ ARG_SYSTEM_CALL_FILTER,
};
static const struct option options[] = {
{ "pivot-root", required_argument, NULL, ARG_PIVOT_ROOT },
{ "notify-ready", required_argument, NULL, ARG_NOTIFY_READY },
{ "root-hash", required_argument, NULL, ARG_ROOT_HASH },
+ { "system-call-filter", required_argument, NULL, ARG_SYSTEM_CALL_FILTER },
{}
};
break;
}
+ case ARG_SYSTEM_CALL_FILTER: {
+ bool negative;
+ const char *items;
+
+ negative = optarg[0] == '~';
+ items = negative ? optarg + 1 : optarg;
+
+ for (;;) {
+ _cleanup_free_ char *word = NULL;
+
+ r = extract_first_word(&items, &word, NULL, 0);
+ if (r == 0)
+ break;
+ if (r == -ENOMEM)
+ return log_oom();
+ if (r < 0)
+ return log_error_errno(r, "Failed to parse system call filter: %m");
+
+ if (negative)
+ r = strv_extend(&arg_syscall_blacklist, word);
+ else
+ r = strv_extend(&arg_syscall_whitelist, word);
+ if (r < 0)
+ return log_oom();
+ }
+
+ arg_settings_mask |= SETTING_SYSCALL_FILTER;
+ break;
+ }
+
case '?':
return -EINVAL;
if (r < 0)
return r;
- r = setup_seccomp(arg_caps_retain);
+ r = setup_seccomp(arg_caps_retain, arg_syscall_whitelist, arg_syscall_blacklist);
if (r < 0)
return r;
if ((arg_settings_mask & SETTING_NOTIFY_READY) == 0)
arg_notify_ready = settings->notify_ready;
+ if ((arg_settings_mask & SETTING_SYSCALL_FILTER) == 0) {
+
+ if (!arg_settings_trusted && !strv_isempty(arg_syscall_whitelist))
+ log_warning("Ignoring SystemCallFilter= settings, file %s is not trusted.", p);
+ else {
+ strv_free(arg_syscall_whitelist);
+ strv_free(arg_syscall_blacklist);
+
+ arg_syscall_whitelist = settings->syscall_whitelist;
+ arg_syscall_blacklist = settings->syscall_blacklist;
+
+ settings->syscall_whitelist = settings->syscall_blacklist = NULL;
+ }
+ }
+
return 0;
}
return NULL;
}
-static int seccomp_add_syscall_filter_set(scmp_filter_ctx seccomp, const SyscallFilterSet *set, uint32_t action);
+static int seccomp_add_syscall_filter_set(scmp_filter_ctx seccomp, const SyscallFilterSet *set, uint32_t action, char **exclude);
-int seccomp_add_syscall_filter_item(scmp_filter_ctx *seccomp, const char *name, uint32_t action) {
+int seccomp_add_syscall_filter_item(scmp_filter_ctx *seccomp, const char *name, uint32_t action, char **exclude) {
int r;
assert(seccomp);
assert(name);
+ if (strv_contains(exclude, name))
+ return 0;
+
if (name[0] == '@') {
const SyscallFilterSet *other;
if (!other)
return -EINVAL;
- r = seccomp_add_syscall_filter_set(seccomp, other, action);
+ r = seccomp_add_syscall_filter_set(seccomp, other, action, exclude);
if (r < 0)
return r;
} else {
static int seccomp_add_syscall_filter_set(
scmp_filter_ctx seccomp,
const SyscallFilterSet *set,
- uint32_t action) {
+ uint32_t action,
+ char **exclude) {
const char *sys;
int r;
assert(set);
NULSTR_FOREACH(sys, set->value) {
- r = seccomp_add_syscall_filter_item(seccomp, sys, action);
+ r = seccomp_add_syscall_filter_item(seccomp, sys, action, exclude);
if (r < 0)
return r;
}
if (r < 0)
return r;
- r = seccomp_add_syscall_filter_set(seccomp, set, action);
+ r = seccomp_add_syscall_filter_set(seccomp, set, action, NULL);
if (r < 0) {
log_debug_errno(r, "Failed to add filter set, ignoring: %m");
continue;
int seccomp_filter_set_add(Set *s, bool b, const SyscallFilterSet *set);
-int seccomp_add_syscall_filter_item(scmp_filter_ctx *ctx, const char *name, uint32_t action);
+int seccomp_add_syscall_filter_item(scmp_filter_ctx *ctx, const char *name, uint32_t action, char **exclude);
int seccomp_load_syscall_filter_set(uint32_t default_action, const SyscallFilterSet *set, uint32_t action);
int seccomp_load_syscall_filter_set_raw(uint32_t default_action, Set* set, uint32_t action);