git.ipfire.org Git - thirdparty/systemd.git/blobdiff - src/nspawn/nspawn-seccomp.c
seccomp: add three more seccomp groups
[thirdparty/systemd.git] / src / nspawn / nspawn-seccomp.c
index 25851401f330fff7027160fcc4f15e2020439b1a..1890dd8e274ecd4de695ebe642ce6cbf8fc2b010 100644 (file)
 #include <sys/capability.h>
 #include <sys/types.h>
 
-#ifdef HAVE_SECCOMP
+#if HAVE_SECCOMP
 #include <seccomp.h>
 #endif
 
 #include "alloc-util.h"
 #include "log.h"
 #include "nspawn-seccomp.h"
-#ifdef HAVE_SECCOMP
+#if HAVE_SECCOMP
 #include "seccomp-util.h"
 #endif
 #include "string-util.h"
+#include "strv.h"
 
-#ifdef HAVE_SECCOMP
+#if HAVE_SECCOMP
 
 static int seccomp_add_default_syscall_filter(
                 scmp_filter_ctx ctx,
                 uint32_t arch,
-                uint64_t cap_list_retain) {
+                uint64_t cap_list_retain,
+                char **syscall_whitelist,
+                char **syscall_blacklist) {
 
         static const struct {
                 uint64_t capability;
                 const char* name;
-        } blacklist[] = {
-                { 0,              "@obsolete"           },
-                { 0,              "@keyring"            }, /* keyring is not namespaced */
-                { 0,              "bpf"                 },
-                { 0,              "kexec_file_load"     },
-                { 0,              "kexec_load"          },
-                { 0,              "lookup_dcookie"      },
-                { 0,              "open_by_handle_at"   },
-                { 0,              "perf_event_open"     },
-                { 0,              "quotactl"            },
-                { 0,              "@swap"               },
-                { CAP_SYSLOG,     "syslog"              },
-                { CAP_SYS_MODULE, "@module"             },
-                { CAP_SYS_PACCT,  "acct"                },
-                { CAP_SYS_PTRACE, "process_vm_readv"    },
-                { CAP_SYS_PTRACE, "process_vm_writev"   },
-                { CAP_SYS_PTRACE, "ptrace"              },
-                { CAP_SYS_RAWIO,  "@raw-io"             },
-                { CAP_SYS_TIME,   "@clock"              },
+        } whitelist[] = {
+                /* Let's use set names where we can */
+                { 0,                  "@aio"                   },
+                { 0,                  "@basic-io"              },
+                { 0,                  "@chown"                 },
+                { 0,                  "@default"               },
+                { 0,                  "@file-system"           },
+                { 0,                  "@io-event"              },
+                { 0,                  "@ipc"                   },
+                { 0,                  "@mount"                 },
+                { 0,                  "@network-io"            },
+                { 0,                  "@process"               },
+                { 0,                  "@resources"             },
+                { 0,                  "@setuid"                },
+                { 0,                  "@signal"                },
+                { 0,                  "@sync"                  },
+                { 0,                  "@timer"                 },
+
+                /* The following four are sets we optionally enable, in case the caps have been configured for it */
+                { CAP_SYS_TIME,       "@clock"                 },
+                { CAP_SYS_MODULE,     "@module"                },
+                { CAP_SYS_RAWIO,      "@raw-io"                },
+                { CAP_IPC_LOCK,       "@memlock"               },
+
+                /* Plus a good set of additional syscalls which are not part of any of the groups above */
+                { 0,                  "brk"                    },
+                { 0,                  "capget"                 },
+                { 0,                  "capset"                 },
+                { 0,                  "copy_file_range"        },
+                { 0,                  "fadvise64"              },
+                { 0,                  "fadvise64_64"           },
+                { 0,                  "flock"                  },
+                { 0,                  "get_mempolicy"          },
+                { 0,                  "getcpu"                 },
+                { 0,                  "getpriority"            },
+                { 0,                  "getrandom"              },
+                { 0,                  "ioctl"                  },
+                { 0,                  "ioprio_get"             },
+                { 0,                  "kcmp"                   },
+                { 0,                  "madvise"                },
+                { 0,                  "mincore"                },
+                { 0,                  "mprotect"               },
+                { 0,                  "mremap"                 },
+                { 0,                  "name_to_handle_at"      },
+                { 0,                  "oldolduname"            },
+                { 0,                  "olduname"               },
+                { 0,                  "personality"            },
+                { 0,                  "readahead"              },
+                { 0,                  "readdir"                },
+                { 0,                  "remap_file_pages"       },
+                { 0,                  "sched_get_priority_max" },
+                { 0,                  "sched_get_priority_min" },
+                { 0,                  "sched_getaffinity"      },
+                { 0,                  "sched_getattr"          },
+                { 0,                  "sched_getparam"         },
+                { 0,                  "sched_getscheduler"     },
+                { 0,                  "sched_rr_get_interval"  },
+                { 0,                  "sched_yield"            },
+                { 0,                  "seccomp"                },
+                { 0,                  "sendfile"               },
+                { 0,                  "sendfile64"             },
+                { 0,                  "setdomainname"          },
+                { 0,                  "setfsgid"               },
+                { 0,                  "setfsgid32"             },
+                { 0,                  "setfsuid"               },
+                { 0,                  "setfsuid32"             },
+                { 0,                  "sethostname"            },
+                { 0,                  "setpgid"                },
+                { 0,                  "setsid"                 },
+                { 0,                  "splice"                 },
+                { 0,                  "sysinfo"                },
+                { 0,                  "tee"                    },
+                { 0,                  "umask"                  },
+                { 0,                  "uname"                  },
+                { 0,                  "userfaultfd"            },
+                { 0,                  "vmsplice"               },
+
+                /* The following individual syscalls are added depending on specified caps */
+                { CAP_SYS_PACCT,      "acct"                   },
+                { CAP_SYS_PTRACE,     "process_vm_readv"       },
+                { CAP_SYS_PTRACE,     "process_vm_writev"      },
+                { CAP_SYS_PTRACE,     "ptrace"                 },
+                { CAP_SYS_BOOT,       "reboot"                 },
+                { CAP_SYSLOG,         "syslog"                 },
+                { CAP_SYS_TTY_CONFIG, "vhangup"                },
+
+                /*
+                 * The following syscalls and groups are knowingly excluded:
+                 *
+                 * @cpu-emulation
+                 * @keyring           (NB: keyring is not namespaced!)
+                 * @obsolete
+                 * @swap
+                 *
+                 * bpf                (NB: bpffs is not namespaced!)
+                 * fanotify_init
+                 * fanotify_mark
+                 * kexec_file_load
+                 * kexec_load
+                 * lookup_dcookie
+                 * nfsservctl
+                 * open_by_handle_at
+                 * perf_event_open
+                 * pkey_alloc
+                 * pkey_free
+                 * pkey_mprotect
+                 * quotactl
+                 */
         };
 
         int r, c = 0;
         size_t i;
+        char **p;
 
-        for (i = 0; i < ELEMENTSOF(blacklist); i++) {
-                if (blacklist[i].capability != 0 && (cap_list_retain & (1ULL << blacklist[i].capability)))
+        for (i = 0; i < ELEMENTSOF(whitelist); i++) {
+                if (whitelist[i].capability != 0 && (cap_list_retain & (1ULL << whitelist[i].capability)) == 0)
                         continue;
 
-                r = seccomp_add_syscall_filter_item(ctx, blacklist[i].name, SCMP_ACT_ERRNO(EPERM));
+                r = seccomp_add_syscall_filter_item(ctx, whitelist[i].name, SCMP_ACT_ALLOW, syscall_blacklist);
                 if (r < 0)
                         /* If the system call is not known on this architecture, then that's fine, let's ignore it */
-                        log_debug_errno(r, "Failed to add rule for system call %s, ignoring: %m", blacklist[i].name);
+                        log_debug_errno(r, "Failed to add rule for system call %s on %s, ignoring: %m", whitelist[i].name, seccomp_arch_to_string(arch));
+                else
+                        c++;
+        }
+
+        STRV_FOREACH(p, syscall_whitelist) {
+                r = seccomp_add_syscall_filter_item(ctx, *p, SCMP_ACT_ALLOW, syscall_blacklist);
+                if (r < 0)
+                        log_debug_errno(r, "Failed to add rule for system call %s on %s, ignoring: %m", *p, seccomp_arch_to_string(arch));
                 else
                         c++;
         }
@@ -83,28 +184,43 @@ static int seccomp_add_default_syscall_filter(
         return c;
 }
 
-int setup_seccomp(uint64_t cap_list_retain) {
+int setup_seccomp(uint64_t cap_list_retain, char **syscall_whitelist, char **syscall_blacklist) {
         uint32_t arch;
         int r;
 
         if (!is_seccomp_available()) {
-                log_debug("SECCOMP features not detected in the kernel, disabling SECCOMP audit filter");
+                log_debug("SECCOMP features not detected in the kernel, disabling SECCOMP filtering");
                 return 0;
         }
 
         SECCOMP_FOREACH_LOCAL_ARCH(arch) {
                 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
-                int n;
 
-                log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
+                log_debug("Applying whitelist on architecture: %s", seccomp_arch_to_string(arch));
 
-                r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
+                r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ERRNO(EPERM));
                 if (r < 0)
                         return log_error_errno(r, "Failed to allocate seccomp object: %m");
 
-                n = seccomp_add_default_syscall_filter(seccomp, arch, cap_list_retain);
-                if (n < 0)
-                        return n;
+                r = seccomp_add_default_syscall_filter(seccomp, arch, cap_list_retain, syscall_whitelist, syscall_blacklist);
+                if (r < 0)
+                        return r;
+
+                r = seccomp_load(seccomp);
+                if (IN_SET(r, -EPERM, -EACCES))
+                        return log_error_errno(r, "Failed to install seccomp filter: %m");
+                if (r < 0)
+                        log_debug_errno(r, "Failed to install filter set for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
+        }
+
+        SECCOMP_FOREACH_LOCAL_ARCH(arch) {
+                _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
+
+                log_debug("Applying NETLINK_AUDIT mask on architecture: %s", seccomp_arch_to_string(arch));
+
+                r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to allocate seccomp object: %m");
 
                 /*
                   Audit is broken in containers, much of the userspace audit hookup will fail if running inside a
@@ -121,13 +237,10 @@ int setup_seccomp(uint64_t cap_list_retain) {
                                 2,
                                 SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
                                 SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
-                if (r < 0)
+                if (r < 0) {
                         log_debug_errno(r, "Failed to add audit seccomp rule, ignoring: %m");
-                else
-                        n++;
-
-                if (n <= 0) /* no rule added? then skip this architecture */
                         continue;
+                }
 
                 r = seccomp_load(seccomp);
                 if (IN_SET(r, -EPERM, -EACCES))
@@ -141,7 +254,7 @@ int setup_seccomp(uint64_t cap_list_retain) {
 
 #else
 
-int setup_seccomp(uint64_t cap_list_retain) {
+int setup_seccomp(uint64_t cap_list_retain, char **syscall_whitelist, char **syscall_blacklist) {
         return 0;
 }