]>
git.ipfire.org Git - thirdparty/systemd.git/blob - src/nspawn/nspawn-seccomp.c
2 This file is part of systemd.
4 Copyright 2016 Lennart Poettering
6 systemd is free software; you can redistribute it and/or modify it
7 under the terms of the GNU Lesser General Public License as published by
8 the Free Software Foundation; either version 2.1 of the License, or
9 (at your option) any later version.
11 systemd is distributed in the hope that it will be useful, but
12 WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public License
17 along with systemd; If not, see <http://www.gnu.org/licenses/>.
21 #include <linux/netlink.h>
22 #include <sys/capability.h>
23 #include <sys/types.h>
29 #include "alloc-util.h"
31 #include "nspawn-seccomp.h"
33 #include "seccomp-util.h"
35 #include "string-util.h"
40 static int seccomp_add_default_syscall_filter(
43 uint64_t cap_list_retain
,
44 char **syscall_whitelist
,
45 char **syscall_blacklist
) {
51 /* Let's use set names where we can */
56 { 0, "@file-system" },
68 /* The following four are sets we optionally enable, in case the caps have been configured for it */
69 { CAP_SYS_TIME
, "@clock" },
70 { CAP_SYS_MODULE
, "@module" },
71 { CAP_SYS_RAWIO
, "@raw-io" },
72 { CAP_IPC_LOCK
, "@memlock" },
74 /* Plus a good set of additional syscalls which are not part of any of the groups above */
78 { 0, "copy_file_range" },
80 { 0, "fadvise64_64" },
82 { 0, "get_mempolicy" },
93 { 0, "name_to_handle_at" },
99 { 0, "remap_file_pages" },
100 { 0, "sched_get_priority_max" },
101 { 0, "sched_get_priority_min" },
102 { 0, "sched_getaffinity" },
103 { 0, "sched_getattr" },
104 { 0, "sched_getparam" },
105 { 0, "sched_getscheduler" },
106 { 0, "sched_rr_get_interval" },
107 { 0, "sched_yield" },
111 { 0, "setdomainname" },
116 { 0, "sethostname" },
124 { 0, "userfaultfd" },
127 /* The following individual syscalls are added depending on specified caps */
128 { CAP_SYS_PACCT
, "acct" },
129 { CAP_SYS_PTRACE
, "process_vm_readv" },
130 { CAP_SYS_PTRACE
, "process_vm_writev" },
131 { CAP_SYS_PTRACE
, "ptrace" },
132 { CAP_SYS_BOOT
, "reboot" },
133 { CAP_SYSLOG
, "syslog" },
134 { CAP_SYS_TTY_CONFIG
, "vhangup" },
137 * The following syscalls and groups are knowingly excluded:
140 * @keyring (NB: keyring is not namespaced!)
144 * bpf (NB: bpffs is not namespaced!)
164 for (i
= 0; i
< ELEMENTSOF(whitelist
); i
++) {
165 if (whitelist
[i
].capability
!= 0 && (cap_list_retain
& (1ULL << whitelist
[i
].capability
)) == 0)
168 r
= seccomp_add_syscall_filter_item(ctx
, whitelist
[i
].name
, SCMP_ACT_ALLOW
, syscall_blacklist
);
170 /* If the system call is not known on this architecture, then that's fine, let's ignore it */
171 log_debug_errno(r
, "Failed to add rule for system call %s on %s, ignoring: %m", whitelist
[i
].name
, seccomp_arch_to_string(arch
));
176 STRV_FOREACH(p
, syscall_whitelist
) {
177 r
= seccomp_add_syscall_filter_item(ctx
, *p
, SCMP_ACT_ALLOW
, syscall_blacklist
);
179 log_debug_errno(r
, "Failed to add rule for system call %s on %s, ignoring: %m", *p
, seccomp_arch_to_string(arch
));
187 int setup_seccomp(uint64_t cap_list_retain
, char **syscall_whitelist
, char **syscall_blacklist
) {
191 if (!is_seccomp_available()) {
192 log_debug("SECCOMP features not detected in the kernel, disabling SECCOMP filterering");
196 SECCOMP_FOREACH_LOCAL_ARCH(arch
) {
197 _cleanup_(seccomp_releasep
) scmp_filter_ctx seccomp
= NULL
;
199 log_debug("Applying whitelist on architecture: %s", seccomp_arch_to_string(arch
));
201 r
= seccomp_init_for_arch(&seccomp
, arch
, SCMP_ACT_ERRNO(EPERM
));
203 return log_error_errno(r
, "Failed to allocate seccomp object: %m");
205 r
= seccomp_add_default_syscall_filter(seccomp
, arch
, cap_list_retain
, syscall_whitelist
, syscall_blacklist
);
209 r
= seccomp_load(seccomp
);
210 if (IN_SET(r
, -EPERM
, -EACCES
))
211 return log_error_errno(r
, "Failed to install seccomp filter: %m");
213 log_debug_errno(r
, "Failed to install filter set for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
216 SECCOMP_FOREACH_LOCAL_ARCH(arch
) {
217 _cleanup_(seccomp_releasep
) scmp_filter_ctx seccomp
= NULL
;
219 log_debug("Applying NETLINK_AUDIT mask on architecture: %s", seccomp_arch_to_string(arch
));
221 r
= seccomp_init_for_arch(&seccomp
, arch
, SCMP_ACT_ALLOW
);
223 return log_error_errno(r
, "Failed to allocate seccomp object: %m");
226 Audit is broken in containers, much of the userspace audit hookup will fail if running inside a
227 container. We don't care and just turn off creation of audit sockets.
229 This will make socket(AF_NETLINK, *, NETLINK_AUDIT) fail with EAFNOSUPPORT which audit userspace uses
230 as indication that audit is disabled in the kernel.
233 r
= seccomp_rule_add_exact(
235 SCMP_ACT_ERRNO(EAFNOSUPPORT
),
238 SCMP_A0(SCMP_CMP_EQ
, AF_NETLINK
),
239 SCMP_A2(SCMP_CMP_EQ
, NETLINK_AUDIT
));
241 log_debug_errno(r
, "Failed to add audit seccomp rule, ignoring: %m");
245 r
= seccomp_load(seccomp
);
246 if (IN_SET(r
, -EPERM
, -EACCES
))
247 return log_error_errno(r
, "Failed to install seccomp audit filter: %m");
249 log_debug_errno(r
, "Failed to install filter set for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
257 int setup_seccomp(uint64_t cap_list_retain
, char **syscall_whitelist
, char **syscall_blacklist
) {