git.ipfire.org Git - thirdparty/systemd.git/commitdiff
nspawn: implement configurable syscall whitelisting/blacklisting 6798/head
author: Lennart Poettering <lennart@poettering.net>
Mon, 11 Sep 2017 15:45:21 +0000 (17:45 +0200)
committer: Lennart Poettering <lennart@poettering.net>
Tue, 12 Sep 2017 12:06:21 +0000 (14:06 +0200)
Now that we have ported nspawn's seccomp code to the generic code in
seccomp-util, let's extend it to support whitelisting and blacklisting
of specific additional syscalls.

This uses a syntax similar to PID1's support for system call filtering,
but in contrast to that it always implements a blacklist (and not a
whitelist), as we prepopulate the filter with a blacklist, and the
unit's system call filter logic does not come with anything
prepopulated.

(Later on we might actually want to invert the logic here, and
whitelist rather than blacklist things, but at this point let's not do
that. In case we switch this over later, the syscall add/remove logic of
this commit should be compatible conceptually.)

Fixes: #5163
Replaces: #5944

man/systemd-nspawn.xml
man/systemd.nspawn.xml
src/nspawn/nspawn-gperf.gperf
src/nspawn/nspawn-seccomp.c
src/nspawn/nspawn-seccomp.h
src/nspawn/nspawn-settings.c
src/nspawn/nspawn-settings.h
src/nspawn/nspawn.c
src/shared/seccomp-util.c
src/shared/seccomp-util.h

index 5d3212dec7e60243899ffdd1fb871d23619702bc..c4db6a3adab9bce008601ff0a24d43140acc2157 100644 (file)
         above).</para></listitem>
       </varlistentry>
 
+      <varlistentry>
+        <term><option>--system-call-filter=</option></term>
+
+        <listitem><para>Alter the system call filter applied to containers. Takes a space-separated list of system call
+        names or group names (the latter prefixed with <literal>@</literal>, as listed by the
+        <command>syscall-filter</command> command of <citerefentry
+        project='man-pages'><refentrytitle>systemd-analyze</refentrytitle><manvolnum>1</manvolnum></citerefentry>). Passed
+        system calls will be permitted. The list may optionally be prefixed by <literal>~</literal>, in which case all
+        listed system calls are prohibited. If this command line option is used multiple times the configured lists are
+        combined. If both a positive and a negative list (that is one system call list without and one with the
+        <literal>~</literal> prefix) are configured, the positive list takes precedence over the negative list. Note
+        that <command>systemd-nspawn</command> always implements a system call blacklist (as opposed to a whitelist),
+        and this command line option hence adds or removes entries from the default blacklist, depending on the
+        <literal>~</literal> prefix. Note that the applied system call filter is also altered implicitly if additional
+        capabilities are passed using the <option>--capability=</option> switch.</para></listitem>
+      </varlistentry>
+
       <varlistentry>
         <term><option>--kill-signal=</option></term>
 
index 4f3f0529119fafc766696ab9afdb7307fe2765cc..58024a071d545f579083ad9cc8594997e1ff8f40 100644 (file)
       <varlistentry>
         <term><varname>NotifyReady=</varname></term>
 
-        <listitem><para>Configures support for notifications from the container's init process.
-        This is equivalent to use <option>--notify-ready=</option> command line switch,
-        and takes the same options. See <citerefentry><refentrytitle>systemd-nspawn</refentrytitle><manvolnum>1</manvolnum></citerefentry>
-        for details about the specific options supported.</para></listitem>
+        <listitem><para>Configures support for notifications from the container's init process.  This is equivalent to
+        the <option>--notify-ready=</option> command line switch, and takes the same parameters. See
+        <citerefentry><refentrytitle>systemd-nspawn</refentrytitle><manvolnum>1</manvolnum></citerefentry> for details
+        about the specific options supported.</para></listitem>
+      </varlistentry>
+
+      <varlistentry>
+        <term><varname>SystemCallFilter=</varname></term>
+
+        <listitem><para>Configures the system call filter applied to containers. This is equivalent to the
+        <option>--system-call-filter=</option> command line switch, and takes the same list parameter. See
+        <citerefentry><refentrytitle>systemd-nspawn</refentrytitle><manvolnum>1</manvolnum></citerefentry> for
+        details.</para></listitem>
       </varlistentry>
+
     </variablelist>
   </refsect1>
 
index e5fdf63162f25951f39ac2d123d80e1429840f24..b61b347ee7c514fec1d92e1eaf6a325864935549 100644 (file)
@@ -29,6 +29,7 @@ Exec.WorkingDirectory,        config_parse_path,          0, offsetof(Settings,
 Exec.PivotRoot,               config_parse_pivot_root,    0, 0
 Exec.PrivateUsers,            config_parse_private_users, 0, 0
 Exec.NotifyReady,             config_parse_bool,          0, offsetof(Settings, notify_ready)
+Exec.SystemCallFilter,        config_parse_syscall_filter,0, 0,
 Files.ReadOnly,               config_parse_tristate,      0, offsetof(Settings, read_only)
 Files.Volatile,               config_parse_volatile_mode, 0, offsetof(Settings, volatile_mode)
 Files.Bind,                   config_parse_bind,          0, 0
index 25851401f330fff7027160fcc4f15e2020439b1a..a6f7a7dabc58b1e3666617d9746b67773115474b 100644 (file)
 #include "seccomp-util.h"
 #endif
 #include "string-util.h"
+#include "strv.h"
 
 #ifdef HAVE_SECCOMP
 
 static int seccomp_add_default_syscall_filter(
                 scmp_filter_ctx ctx,
                 uint32_t arch,
-                uint64_t cap_list_retain) {
+                uint64_t cap_list_retain,
+                char **syscall_whitelist,
+                char **syscall_blacklist) {
 
         static const struct {
                 uint64_t capability;
@@ -67,12 +70,13 @@ static int seccomp_add_default_syscall_filter(
 
         int r, c = 0;
         size_t i;
+        char **p;
 
         for (i = 0; i < ELEMENTSOF(blacklist); i++) {
                 if (blacklist[i].capability != 0 && (cap_list_retain & (1ULL << blacklist[i].capability)))
                         continue;
 
-                r = seccomp_add_syscall_filter_item(ctx, blacklist[i].name, SCMP_ACT_ERRNO(EPERM));
+                r = seccomp_add_syscall_filter_item(ctx, blacklist[i].name, SCMP_ACT_ERRNO(EPERM), syscall_whitelist);
                 if (r < 0)
                         /* If the system call is not known on this architecture, then that's fine, let's ignore it */
                         log_debug_errno(r, "Failed to add rule for system call %s, ignoring: %m", blacklist[i].name);
@@ -80,15 +84,23 @@ static int seccomp_add_default_syscall_filter(
                         c++;
         }
 
+        STRV_FOREACH(p, syscall_blacklist) {
+                r = seccomp_add_syscall_filter_item(ctx, *p, SCMP_ACT_ERRNO(EPERM), syscall_whitelist);
+                if (r < 0)
+                        log_debug_errno(r, "Failed to add rule for system call %s, ignoring: %m", *p);
+                else
+                        c++;
+        }
+
         return c;
 }
 
-int setup_seccomp(uint64_t cap_list_retain) {
+int setup_seccomp(uint64_t cap_list_retain, char **syscall_whitelist, char **syscall_blacklist) {
         uint32_t arch;
         int r;
 
         if (!is_seccomp_available()) {
-                log_debug("SECCOMP features not detected in the kernel, disabling SECCOMP audit filter");
+                log_debug("SECCOMP features not detected in the kernel, disabling SECCOMP filterering");
                 return 0;
         }
 
@@ -102,7 +114,7 @@ int setup_seccomp(uint64_t cap_list_retain) {
                 if (r < 0)
                         return log_error_errno(r, "Failed to allocate seccomp object: %m");
 
-                n = seccomp_add_default_syscall_filter(seccomp, arch, cap_list_retain);
+                n = seccomp_add_default_syscall_filter(seccomp, arch, cap_list_retain, syscall_whitelist, syscall_blacklist);
                 if (n < 0)
                         return n;
 
@@ -141,7 +153,7 @@ int setup_seccomp(uint64_t cap_list_retain) {
 
 #else
 
-int setup_seccomp(uint64_t cap_list_retain) {
+int setup_seccomp(uint64_t cap_list_retain, char **syscall_whitelist, char **syscall_blacklist) {
         return 0;
 }
 
index 5bde16faf978429a9a19f32be542bbfe65689de9..5cf5ad1e141adf082dce65115205dbad690dcba7 100644 (file)
@@ -21,4 +21,4 @@
 
 #include <sys/types.h>
 
-int setup_seccomp(uint64_t cap_list_retain);
+int setup_seccomp(uint64_t cap_list_retain, char **syscall_whitelist, char **syscall_blacklist);
index 5217d10665d7058c321a2de09d1db8c7d41531af..c02c1ea697d70c985fa904c99a60081d8335dda8 100644 (file)
@@ -93,6 +93,8 @@ Settings* settings_free(Settings *s) {
         free(s->pivot_root_new);
         free(s->pivot_root_old);
         free(s->working_directory);
+        strv_free(s->syscall_whitelist);
+        strv_free(s->syscall_blacklist);
 
         strv_free(s->network_interfaces);
         strv_free(s->network_macvlan);
@@ -568,3 +570,51 @@ int config_parse_private_users(
 
         return 0;
 }
+
+int config_parse_syscall_filter(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        Settings *settings = data;
+        bool negative;
+        const char *items;
+        int r;
+
+        assert(filename);
+        assert(lvalue);
+        assert(rvalue);
+
+        negative = rvalue[0] == '~';
+        items = negative ? rvalue + 1 : rvalue;
+
+        for (;;) {
+                _cleanup_free_ char *word = NULL;
+
+                r = extract_first_word(&items, &word, NULL, 0);
+                if (r == 0)
+                        break;
+                if (r == -ENOMEM)
+                        return log_oom();
+                if (r < 0) {
+                        log_syntax(unit, LOG_ERR, filename, line, r, "Failed to parse SystemCallFilter= parameter %s, ignoring: %m", rvalue);
+                        return 0;
+                }
+
+                if (negative)
+                        r = strv_extend(&settings->syscall_blacklist, word);
+                else
+                        r = strv_extend(&settings->syscall_whitelist, word);
+                if (r < 0)
+                        return log_oom();
+        }
+
+        return 0;
+}
index 021403258ff674b20016e93d19b00959e0d45cba..75d68ce4cfb1c2e36f215191a67ddea098cdeea8 100644 (file)
@@ -58,7 +58,8 @@ typedef enum SettingsMask {
         SETTING_USERNS            = 1 << 13,
         SETTING_NOTIFY_READY      = 1 << 14,
         SETTING_PIVOT_ROOT        = 1 << 15,
-        _SETTINGS_MASK_ALL        = (1 << 16) -1
+        SETTING_SYSCALL_FILTER    = 1 << 16,
+        _SETTINGS_MASK_ALL        = (1 << 17) -1
 } SettingsMask;
 
 typedef struct Settings {
@@ -78,6 +79,8 @@ typedef struct Settings {
         UserNamespaceMode userns_mode;
         uid_t uid_shift, uid_range;
         bool notify_ready;
+        char **syscall_whitelist;
+        char **syscall_blacklist;
 
         /* [Image] */
         int read_only;
@@ -121,3 +124,4 @@ int config_parse_network_zone(const char *unit, const char *filename, unsigned l
 int config_parse_boot(const char *unit, const char *filename, unsigned line, const char *section, unsigned section_line, const char *lvalue, int ltype, const char *rvalue, void *data, void *userdata);
 int config_parse_pid2(const char *unit, const char *filename, unsigned line, const char *section, unsigned section_line, const char *lvalue, int ltype, const char *rvalue, void *data, void *userdata);
 int config_parse_private_users(const char *unit, const char *filename, unsigned line, const char *section, unsigned section_line, const char *lvalue, int ltype, const char *rvalue, void *data, void *userdata);
+int config_parse_syscall_filter(const char *unit, const char *filename, unsigned line, const char *section, unsigned section_line, const char *lvalue, int ltype, const char *rvalue, void *data, void *userdata);
index 24a3da68ca3cb582028e2dcebe7a3c07f7eab91e..cf804ed1b356b9840cca4d457c12b438903be427 100644 (file)
@@ -208,6 +208,8 @@ static unsigned long arg_clone_ns_flags = CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS
 static MountSettingsMask arg_mount_settings = MOUNT_APPLY_APIVFS_RO;
 static void *arg_root_hash = NULL;
 static size_t arg_root_hash_size = 0;
+static char **arg_syscall_whitelist = NULL;
+static char **arg_syscall_blacklist = NULL;
 
 static void help(void) {
         printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
@@ -267,6 +269,8 @@ static void help(void) {
                "     --capability=CAP       In addition to the default, retain specified\n"
                "                            capability\n"
                "     --drop-capability=CAP  Drop the specified capability from the default set\n"
+               "     --system-call-filter=LIST|~LIST\n"
+               "                            Permit/prohibit specific system calls\n"
                "     --kill-signal=SIGNAL   Select signal to use for shutting down PID 1\n"
                "     --link-journal=MODE    Link up guest journal, one of no, auto, guest, \n"
                "                            host, try-guest, try-host\n"
@@ -431,6 +435,7 @@ static int parse_argv(int argc, char *argv[]) {
                 ARG_PRIVATE_USERS_CHOWN,
                 ARG_NOTIFY_READY,
                 ARG_ROOT_HASH,
+                ARG_SYSTEM_CALL_FILTER,
         };
 
         static const struct option options[] = {
@@ -482,6 +487,7 @@ static int parse_argv(int argc, char *argv[]) {
                 { "pivot-root",            required_argument, NULL, ARG_PIVOT_ROOT          },
                 { "notify-ready",          required_argument, NULL, ARG_NOTIFY_READY        },
                 { "root-hash",             required_argument, NULL, ARG_ROOT_HASH           },
+                { "system-call-filter",    required_argument, NULL, ARG_SYSTEM_CALL_FILTER  },
                 {}
         };
 
@@ -1051,6 +1057,36 @@ static int parse_argv(int argc, char *argv[]) {
                         break;
                 }
 
+                case ARG_SYSTEM_CALL_FILTER: {
+                        bool negative;
+                        const char *items;
+
+                        negative = optarg[0] == '~';
+                        items = negative ? optarg + 1 : optarg;
+
+                        for (;;) {
+                                _cleanup_free_ char *word = NULL;
+
+                                r = extract_first_word(&items, &word, NULL, 0);
+                                if (r == 0)
+                                        break;
+                                if (r == -ENOMEM)
+                                        return log_oom();
+                                if (r < 0)
+                                        return log_error_errno(r, "Failed to parse system call filter: %m");
+
+                                if (negative)
+                                        r = strv_extend(&arg_syscall_blacklist, word);
+                                else
+                                        r = strv_extend(&arg_syscall_whitelist, word);
+                                if (r < 0)
+                                        return log_oom();
+                        }
+
+                        arg_settings_mask |= SETTING_SYSCALL_FILTER;
+                        break;
+                }
+
                 case '?':
                         return -EINVAL;
 
@@ -2606,7 +2642,7 @@ static int outer_child(
         if (r < 0)
                 return r;
 
-        r = setup_seccomp(arg_caps_retain);
+        r = setup_seccomp(arg_caps_retain, arg_syscall_whitelist, arg_syscall_blacklist);
         if (r < 0)
                 return r;
 
@@ -3111,6 +3147,21 @@ static int load_settings(void) {
         if ((arg_settings_mask & SETTING_NOTIFY_READY) == 0)
                 arg_notify_ready = settings->notify_ready;
 
+        if ((arg_settings_mask & SETTING_SYSCALL_FILTER) == 0) {
+
+                if (!arg_settings_trusted && !strv_isempty(arg_syscall_whitelist))
+                        log_warning("Ignoring SystemCallFilter= settings, file %s is not trusted.", p);
+                else {
+                        strv_free(arg_syscall_whitelist);
+                        strv_free(arg_syscall_blacklist);
+
+                        arg_syscall_whitelist = settings->syscall_whitelist;
+                        arg_syscall_blacklist = settings->syscall_blacklist;
+
+                        settings->syscall_whitelist = settings->syscall_blacklist = NULL;
+                }
+        }
+
         return 0;
 }
 
index 1215f714f1ff22e2526af33f0c9793f365c0a497..643dde6c4a75b3594c2b2a484319b9be66b473dd 100644 (file)
@@ -682,14 +682,17 @@ const SyscallFilterSet *syscall_filter_set_find(const char *name) {
         return NULL;
 }
 
-static int seccomp_add_syscall_filter_set(scmp_filter_ctx seccomp, const SyscallFilterSet *set, uint32_t action);
+static int seccomp_add_syscall_filter_set(scmp_filter_ctx seccomp, const SyscallFilterSet *set, uint32_t action, char **exclude);
 
-int seccomp_add_syscall_filter_item(scmp_filter_ctx *seccomp, const char *name, uint32_t action) {
+int seccomp_add_syscall_filter_item(scmp_filter_ctx *seccomp, const char *name, uint32_t action, char **exclude) {
         int r;
 
         assert(seccomp);
         assert(name);
 
+        if (strv_contains(exclude, name))
+                return 0;
+
         if (name[0] == '@') {
                 const SyscallFilterSet *other;
 
@@ -697,7 +700,7 @@ int seccomp_add_syscall_filter_item(scmp_filter_ctx *seccomp, const char *name,
                 if (!other)
                         return -EINVAL;
 
-                r = seccomp_add_syscall_filter_set(seccomp, other, action);
+                r = seccomp_add_syscall_filter_set(seccomp, other, action, exclude);
                 if (r < 0)
                         return r;
         } else {
@@ -719,7 +722,8 @@ int seccomp_add_syscall_filter_item(scmp_filter_ctx *seccomp, const char *name,
 static int seccomp_add_syscall_filter_set(
                 scmp_filter_ctx seccomp,
                 const SyscallFilterSet *set,
-                uint32_t action) {
+                uint32_t action,
+                char **exclude) {
 
         const char *sys;
         int r;
@@ -728,7 +732,7 @@ static int seccomp_add_syscall_filter_set(
         assert(set);
 
         NULSTR_FOREACH(sys, set->value) {
-                r = seccomp_add_syscall_filter_item(seccomp, sys, action);
+                r = seccomp_add_syscall_filter_item(seccomp, sys, action, exclude);
                 if (r < 0)
                         return r;
         }
@@ -754,7 +758,7 @@ int seccomp_load_syscall_filter_set(uint32_t default_action, const SyscallFilter
                 if (r < 0)
                         return r;
 
-                r = seccomp_add_syscall_filter_set(seccomp, set, action);
+                r = seccomp_add_syscall_filter_set(seccomp, set, action, NULL);
                 if (r < 0) {
                         log_debug_errno(r, "Failed to add filter set, ignoring: %m");
                         continue;
index 894c53e6fdc275b497a9a4e5333b64d6613ebf2f..c1612f58949f3ad7a92f436413191fd8fb143a70 100644 (file)
@@ -69,7 +69,7 @@ const SyscallFilterSet *syscall_filter_set_find(const char *name);
 
 int seccomp_filter_set_add(Set *s, bool b, const SyscallFilterSet *set);
 
-int seccomp_add_syscall_filter_item(scmp_filter_ctx *ctx, const char *name, uint32_t action);
+int seccomp_add_syscall_filter_item(scmp_filter_ctx *ctx, const char *name, uint32_t action, char **exclude);
 
 int seccomp_load_syscall_filter_set(uint32_t default_action, const SyscallFilterSet *set, uint32_t action);
 int seccomp_load_syscall_filter_set_raw(uint32_t default_action, Set* set, uint32_t action);