]>
git.ipfire.org Git - thirdparty/systemd.git/blob - src/nspawn/nspawn-seccomp.c
1 /* SPDX-License-Identifier: LGPL-2.1+ */
3 This file is part of systemd.
5 Copyright 2016 Lennart Poettering
7 systemd is free software; you can redistribute it and/or modify it
8 under the terms of the GNU Lesser General Public License as published by
9 the Free Software Foundation; either version 2.1 of the License, or
10 (at your option) any later version.
12 systemd is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 Lesser General Public License for more details.
17 You should have received a copy of the GNU Lesser General Public License
18 along with systemd; If not, see <http://www.gnu.org/licenses/>.
22 #include <linux/netlink.h>
23 #include <sys/capability.h>
24 #include <sys/types.h>
30 #include "alloc-util.h"
32 #include "nspawn-seccomp.h"
34 #include "seccomp-util.h"
36 #include "string-util.h"
/*
 * Adds nspawn's built-in system call allow-list, plus any caller-supplied additions, to the
 * seccomp filter context referred to below as "ctx".
 *
 * cap_list_retain   - bit mask of retained capabilities; bit N is tested for capability number N.
 * syscall_whitelist - string list of extra system call or "@group" names to allow.
 * syscall_blacklist - passed unchanged to every seccomp_add_syscall_filter_item() call.
 *
 * Every table entry pairs a capability number with a system call or "@group" name; a capability
 * of 0 marks an entry that is not tied to any capability.
 *
 * NOTE(review): this listing is an incomplete extract. The embedded line numbers jump, and the
 * "ctx"/"arch" parameters, the table declaration, most table entries, the local declarations and
 * the return statement are not visible here. Confirm details against the complete file.
 */
41 static int seccomp_add_default_syscall_filter(
44 uint64_t cap_list_retain
,
45 char **syscall_whitelist
,
46 char **syscall_blacklist
) {
52 /* Let's use set names where we can */
57 { 0, "@file-system" },
69 /* The following four are sets we optionally enable, in case the caps have been configured for it */
70 { CAP_SYS_TIME
, "@clock" },
71 { CAP_SYS_MODULE
, "@module" },
72 { CAP_SYS_RAWIO
, "@raw-io" },
73 { CAP_IPC_LOCK
, "@memlock" },
75 /* Plus a good set of additional syscalls which are not part of any of the groups above */
79 { 0, "copy_file_range" },
81 { 0, "fadvise64_64" },
83 { 0, "get_mempolicy" },
94 { 0, "name_to_handle_at" },
100 { 0, "remap_file_pages" },
101 { 0, "sched_get_priority_max" },
102 { 0, "sched_get_priority_min" },
103 { 0, "sched_getaffinity" },
104 { 0, "sched_getattr" },
105 { 0, "sched_getparam" },
106 { 0, "sched_getscheduler" },
107 { 0, "sched_rr_get_interval" },
108 { 0, "sched_yield" },
112 { 0, "setdomainname" },
117 { 0, "sethostname" },
125 { 0, "userfaultfd" },
128 /* The following individual syscalls are added depending on specified caps */
129 { CAP_SYS_PACCT
, "acct" },
130 { CAP_SYS_PTRACE
, "process_vm_readv" },
131 { CAP_SYS_PTRACE
, "process_vm_writev" },
132 { CAP_SYS_PTRACE
, "ptrace" },
133 { CAP_SYS_BOOT
, "reboot" },
134 { CAP_SYSLOG
, "syslog" },
135 { CAP_SYS_TTY_CONFIG
, "vhangup" },
138 * The following syscalls and groups are knowingly excluded:
141 * @keyring (NB: keyring is not namespaced!)
145 * bpf (NB: bpffs is not namespaced!)
/* Walk the built-in table. The condition below is true for an entry that names a capability
 * (non-zero) whose bit is not set in cap_list_retain. The statement it guards is not part of
 * this extract; presumably such entries are skipped (TODO confirm). */
165 for (i
= 0; i
< ELEMENTSOF(whitelist
); i
++) {
166 if (whitelist
[i
].capability
!= 0 && (cap_list_retain
& (1ULL << whitelist
[i
].capability
)) == 0)
/* Request SCMP_ACT_ALLOW for this entry; the blacklist is handed to the helper as well. */
169 r
= seccomp_add_syscall_filter_item(ctx
, whitelist
[i
].name
, SCMP_ACT_ALLOW
, syscall_blacklist
);
171 /* If the system call is not known on this architecture, then that's fine, let's ignore it */
172 log_debug_errno(r
, "Failed to add rule for system call %s on %s, ignoring: %m", whitelist
[i
].name
, seccomp_arch_to_string(arch
));
/* Same treatment for each caller-supplied entry; per the message text, failures here are
 * logged at debug level and ignored as well. */
177 STRV_FOREACH(p
, syscall_whitelist
) {
178 r
= seccomp_add_syscall_filter_item(ctx
, *p
, SCMP_ACT_ALLOW
, syscall_blacklist
);
180 log_debug_errno(r
, "Failed to add rule for system call %s on %s, ignoring: %m", *p
, seccomp_arch_to_string(arch
));
/*
 * Builds and loads the seccomp filters used by nspawn. As far as the visible lines show:
 *
 *  - If is_seccomp_available() reports false, a debug message is logged.
 *  - First pass, once per local architecture: a filter with default action
 *    SCMP_ACT_ERRNO(EPERM) is created, populated through seccomp_add_default_syscall_filter()
 *    with cap_list_retain, syscall_whitelist and syscall_blacklist, and handed to seccomp_load().
 *  - Second pass, once per local architecture: a filter with default action SCMP_ACT_ALLOW is
 *    created and a single rule with action SCMP_ACT_ERRNO(EAFNOSUPPORT) is added, matching
 *    argument 0 == AF_NETLINK and argument 2 == NETLINK_AUDIT, before seccomp_load() is called.
 *  - When seccomp_load() yields -EPERM or -EACCES the error is logged via log_error_errno() and
 *    its result returned; the other load failure message is debug-level and says the
 *    architecture is skipped.
 *
 * NOTE(review): this listing is an incomplete extract. The embedded line numbers jump, and the
 * local declarations, the conditions guarding most error paths, the leading arguments of
 * seccomp_rule_add_exact() and the final return are not visible. Confirm details against the
 * complete file.
 */
188 int setup_seccomp(uint64_t cap_list_retain
, char **syscall_whitelist
, char **syscall_blacklist
) {
/* NOTE(review): "filterering" in the message below looks like a typo for "filtering". It is
 * runtime text, so it is deliberately left unchanged in this documentation-only edit. */
192 if (!is_seccomp_available()) {
193 log_debug("SECCOMP features not detected in the kernel, disabling SECCOMP filterering");
/* First pass: deny-by-default filter (EPERM) carrying the allow-list, one per architecture. */
197 SECCOMP_FOREACH_LOCAL_ARCH(arch
) {
198 _cleanup_(seccomp_releasep
) scmp_filter_ctx seccomp
= NULL
;
200 log_debug("Applying whitelist on architecture: %s", seccomp_arch_to_string(arch
));
202 r
= seccomp_init_for_arch(&seccomp
, arch
, SCMP_ACT_ERRNO(EPERM
));
204 return log_error_errno(r
, "Failed to allocate seccomp object: %m");
206 r
= seccomp_add_default_syscall_filter(seccomp
, arch
, cap_list_retain
, syscall_whitelist
, syscall_blacklist
);
210 r
= seccomp_load(seccomp
);
211 if (IN_SET(r
, -EPERM
, -EACCES
))
212 return log_error_errno(r
, "Failed to install seccomp filter: %m");
214 log_debug_errno(r
, "Failed to install filter set for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
/* Second pass: allow-by-default filter that only masks NETLINK_AUDIT socket creation. */
217 SECCOMP_FOREACH_LOCAL_ARCH(arch
) {
218 _cleanup_(seccomp_releasep
) scmp_filter_ctx seccomp
= NULL
;
220 log_debug("Applying NETLINK_AUDIT mask on architecture: %s", seccomp_arch_to_string(arch
));
222 r
= seccomp_init_for_arch(&seccomp
, arch
, SCMP_ACT_ALLOW
);
224 return log_error_errno(r
, "Failed to allocate seccomp object: %m");
227 Audit is broken in containers, much of the userspace audit hookup will fail if running inside a
228 container. We don't care and just turn off creation of audit sockets.
230 This will make socket(AF_NETLINK, *, NETLINK_AUDIT) fail with EAFNOSUPPORT which audit userspace uses
231 as indication that audit is disabled in the kernel.
234 r
= seccomp_rule_add_exact(
236 SCMP_ACT_ERRNO(EAFNOSUPPORT
),
239 SCMP_A0(SCMP_CMP_EQ
, AF_NETLINK
),
240 SCMP_A2(SCMP_CMP_EQ
, NETLINK_AUDIT
));
242 log_debug_errno(r
, "Failed to add audit seccomp rule, ignoring: %m");
246 r
= seccomp_load(seccomp
);
247 if (IN_SET(r
, -EPERM
, -EACCES
))
248 return log_error_errno(r
, "Failed to install seccomp audit filter: %m");
250 log_debug_errno(r
, "Failed to install filter set for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
258 int setup_seccomp(uint64_t cap_list_retain
, char **syscall_whitelist
, char **syscall_blacklist
) {