git.ipfire.org Git - thirdparty/systemd.git/blobdiff - src/nspawn/nspawn-seccomp.c
nspawn: log syscalls we cannot add at debug level
[thirdparty/systemd.git] / src / nspawn / nspawn-seccomp.c
index a6f7a7dabc58b1e3666617d9746b67773115474b..f94f131f22e29663a903346bebc4cf636d651765 100644 (file)
@@ -1,41 +1,25 @@
-/***
-  This file is part of systemd.
-
-  Copyright 2016 Lennart Poettering
-
-  systemd is free software; you can redistribute it and/or modify it
-  under the terms of the GNU Lesser General Public License as published by
-  the Free Software Foundation; either version 2.1 of the License, or
-  (at your option) any later version.
-
-  systemd is distributed in the hope that it will be useful, but
-  WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-  Lesser General Public License for more details.
-
-  You should have received a copy of the GNU Lesser General Public License
-  along with systemd; If not, see <http://www.gnu.org/licenses/>.
-***/
+/* SPDX-License-Identifier: LGPL-2.1+ */
 
 #include <errno.h>
 #include <linux/netlink.h>
 #include <sys/capability.h>
+#include <sys/socket.h>
 #include <sys/types.h>
 
-#ifdef HAVE_SECCOMP
+#if HAVE_SECCOMP
 #include <seccomp.h>
 #endif
 
 #include "alloc-util.h"
 #include "log.h"
 #include "nspawn-seccomp.h"
-#ifdef HAVE_SECCOMP
+#if HAVE_SECCOMP
 #include "seccomp-util.h"
 #endif
 #include "string-util.h"
 #include "strv.h"
 
-#ifdef HAVE_SECCOMP
+#if HAVE_SECCOMP
 
 static int seccomp_add_default_syscall_filter(
                 scmp_filter_ctx ctx,
@@ -47,52 +31,134 @@ static int seccomp_add_default_syscall_filter(
         static const struct {
                 uint64_t capability;
                 const char* name;
-        } blacklist[] = {
-                { 0,              "@obsolete"           },
-                { 0,              "@keyring"            }, /* keyring is not namespaced */
-                { 0,              "bpf"                 },
-                { 0,              "kexec_file_load"     },
-                { 0,              "kexec_load"          },
-                { 0,              "lookup_dcookie"      },
-                { 0,              "open_by_handle_at"   },
-                { 0,              "perf_event_open"     },
-                { 0,              "quotactl"            },
-                { 0,              "@swap"               },
-                { CAP_SYSLOG,     "syslog"              },
-                { CAP_SYS_MODULE, "@module"             },
-                { CAP_SYS_PACCT,  "acct"                },
-                { CAP_SYS_PTRACE, "process_vm_readv"    },
-                { CAP_SYS_PTRACE, "process_vm_writev"   },
-                { CAP_SYS_PTRACE, "ptrace"              },
-                { CAP_SYS_RAWIO,  "@raw-io"             },
-                { CAP_SYS_TIME,   "@clock"              },
+        } whitelist[] = {
+                /* Let's use set names where we can */
+                { 0,                  "@aio"                   },
+                { 0,                  "@basic-io"              },
+                { 0,                  "@chown"                 },
+                { 0,                  "@default"               },
+                { 0,                  "@file-system"           },
+                { 0,                  "@io-event"              },
+                { 0,                  "@ipc"                   },
+                { 0,                  "@mount"                 },
+                { 0,                  "@network-io"            },
+                { 0,                  "@process"               },
+                { 0,                  "@resources"             },
+                { 0,                  "@setuid"                },
+                { 0,                  "@signal"                },
+                { 0,                  "@sync"                  },
+                { 0,                  "@timer"                 },
+
+                /* The following four are sets we optionally enable, in case the caps have been configured for it */
+                { CAP_SYS_TIME,       "@clock"                 },
+                { CAP_SYS_MODULE,     "@module"                },
+                { CAP_SYS_RAWIO,      "@raw-io"                },
+                { CAP_IPC_LOCK,       "@memlock"               },
+
+                /* Plus a good set of additional syscalls which are not part of any of the groups above */
+                { 0,                  "brk"                    },
+                { 0,                  "capget"                 },
+                { 0,                  "capset"                 },
+                { 0,                  "copy_file_range"        },
+                { 0,                  "fadvise64"              },
+                { 0,                  "fadvise64_64"           },
+                { 0,                  "flock"                  },
+                { 0,                  "get_mempolicy"          },
+                { 0,                  "getcpu"                 },
+                { 0,                  "getpriority"            },
+                { 0,                  "getrandom"              },
+                { 0,                  "ioctl"                  },
+                { 0,                  "ioprio_get"             },
+                { 0,                  "kcmp"                   },
+                { 0,                  "madvise"                },
+                { 0,                  "mincore"                },
+                { 0,                  "mprotect"               },
+                { 0,                  "mremap"                 },
+                { 0,                  "name_to_handle_at"      },
+                { 0,                  "oldolduname"            },
+                { 0,                  "olduname"               },
+                { 0,                  "personality"            },
+                { 0,                  "readahead"              },
+                { 0,                  "readdir"                },
+                { 0,                  "remap_file_pages"       },
+                { 0,                  "sched_get_priority_max" },
+                { 0,                  "sched_get_priority_min" },
+                { 0,                  "sched_getaffinity"      },
+                { 0,                  "sched_getattr"          },
+                { 0,                  "sched_getparam"         },
+                { 0,                  "sched_getscheduler"     },
+                { 0,                  "sched_rr_get_interval"  },
+                { 0,                  "sched_yield"            },
+                { 0,                  "seccomp"                },
+                { 0,                  "sendfile"               },
+                { 0,                  "sendfile64"             },
+                { 0,                  "setdomainname"          },
+                { 0,                  "setfsgid"               },
+                { 0,                  "setfsgid32"             },
+                { 0,                  "setfsuid"               },
+                { 0,                  "setfsuid32"             },
+                { 0,                  "sethostname"            },
+                { 0,                  "setpgid"                },
+                { 0,                  "setsid"                 },
+                { 0,                  "splice"                 },
+                { 0,                  "sysinfo"                },
+                { 0,                  "tee"                    },
+                { 0,                  "umask"                  },
+                { 0,                  "uname"                  },
+                { 0,                  "userfaultfd"            },
+                { 0,                  "vmsplice"               },
+
+                /* The following individual syscalls are added depending on specified caps */
+                { CAP_SYS_PACCT,      "acct"                   },
+                { CAP_SYS_PTRACE,     "process_vm_readv"       },
+                { CAP_SYS_PTRACE,     "process_vm_writev"      },
+                { CAP_SYS_PTRACE,     "ptrace"                 },
+                { CAP_SYS_BOOT,       "reboot"                 },
+                { CAP_SYSLOG,         "syslog"                 },
+                { CAP_SYS_TTY_CONFIG, "vhangup"                },
+
+                /*
+                 * The following syscalls and groups are knowingly excluded:
+                 *
+                 * @cpu-emulation
+                 * @keyring           (NB: keyring is not namespaced!)
+                 * @obsolete
+                 * @pkey
+                 * @swap
+                 *
+                 * bpf                (NB: bpffs is not namespaced!)
+                 * fanotify_init
+                 * fanotify_mark
+                 * kexec_file_load
+                 * kexec_load
+                 * lookup_dcookie
+                 * nfsservctl
+                 * open_by_handle_at
+                 * perf_event_open
+                 * quotactl
+                 */
         };
 
-        int r, c = 0;
-        size_t i;
         char **p;
+        int r;
 
-        for (i = 0; i < ELEMENTSOF(blacklist); i++) {
-                if (blacklist[i].capability != 0 && (cap_list_retain & (1ULL << blacklist[i].capability)))
+        for (size_t i = 0; i < ELEMENTSOF(whitelist); i++) {
+                if (whitelist[i].capability != 0 && (cap_list_retain & (1ULL << whitelist[i].capability)) == 0)
                         continue;
 
-                r = seccomp_add_syscall_filter_item(ctx, blacklist[i].name, SCMP_ACT_ERRNO(EPERM), syscall_whitelist);
+                r = seccomp_add_syscall_filter_item(ctx, whitelist[i].name, SCMP_ACT_ALLOW, syscall_blacklist, false);
                 if (r < 0)
-                        /* If the system call is not known on this architecture, then that's fine, let's ignore it */
-                        log_debug_errno(r, "Failed to add rule for system call %s, ignoring: %m", blacklist[i].name);
-                else
-                        c++;
+                        return log_error_errno(r, "Failed to add syscall filter item %s: %m", whitelist[i].name);
         }
 
-        STRV_FOREACH(p, syscall_blacklist) {
-                r = seccomp_add_syscall_filter_item(ctx, *p, SCMP_ACT_ERRNO(EPERM), syscall_whitelist);
+        STRV_FOREACH(p, syscall_whitelist) {
+                r = seccomp_add_syscall_filter_item(ctx, *p, SCMP_ACT_ALLOW, syscall_blacklist, true);
                 if (r < 0)
-                        log_debug_errno(r, "Failed to add rule for system call %s, ignoring: %m", *p);
-                else
-                        c++;
+                        log_warning_errno(r, "Failed to add rule for system call %s on %s, ignoring: %m",
+                                          *p, seccomp_arch_to_string(arch));
         }
 
-        return c;
+        return 0;
 }
 
 int setup_seccomp(uint64_t cap_list_retain, char **syscall_whitelist, char **syscall_blacklist) {
@@ -106,17 +172,32 @@ int setup_seccomp(uint64_t cap_list_retain, char **syscall_whitelist, char **sys
 
         SECCOMP_FOREACH_LOCAL_ARCH(arch) {
                 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
-                int n;
 
-                log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
+                log_debug("Applying whitelist on architecture: %s", seccomp_arch_to_string(arch));
 
-                r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
+                r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ERRNO(EPERM));
                 if (r < 0)
                         return log_error_errno(r, "Failed to allocate seccomp object: %m");
 
-                n = seccomp_add_default_syscall_filter(seccomp, arch, cap_list_retain, syscall_whitelist, syscall_blacklist);
-                if (n < 0)
-                        return n;
+                r = seccomp_add_default_syscall_filter(seccomp, arch, cap_list_retain, syscall_whitelist, syscall_blacklist);
+                if (r < 0)
+                        return r;
+
+                r = seccomp_load(seccomp);
+                if (ERRNO_IS_SECCOMP_FATAL(r))
+                        return log_error_errno(r, "Failed to install seccomp filter: %m");
+                if (r < 0)
+                        log_debug_errno(r, "Failed to install filter set for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
+        }
+
+        SECCOMP_FOREACH_LOCAL_ARCH(arch) {
+                _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
+
+                log_debug("Applying NETLINK_AUDIT mask on architecture: %s", seccomp_arch_to_string(arch));
+
+                r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to allocate seccomp object: %m");
 
                 /*
                   Audit is broken in containers, much of the userspace audit hookup will fail if running inside a
@@ -133,16 +214,13 @@ int setup_seccomp(uint64_t cap_list_retain, char **syscall_whitelist, char **sys
                                 2,
                                 SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
                                 SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
-                if (r < 0)
+                if (r < 0) {
                         log_debug_errno(r, "Failed to add audit seccomp rule, ignoring: %m");
-                else
-                        n++;
-
-                if (n <= 0) /* no rule added? then skip this architecture */
                         continue;
+                }
 
                 r = seccomp_load(seccomp);
-                if (IN_SET(r, -EPERM, -EACCES))
+                if (ERRNO_IS_SECCOMP_FATAL(r))
                         return log_error_errno(r, "Failed to install seccomp audit filter: %m");
                 if (r < 0)
                         log_debug_errno(r, "Failed to install filter set for architecture %s, skipping: %m", seccomp_arch_to_string(arch));