]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/nspawn/nspawn-seccomp.c
Add SPDX license identifiers to source files under the LGPL
[thirdparty/systemd.git] / src / nspawn / nspawn-seccomp.c
1 /* SPDX-License-Identifier: LGPL-2.1+ */
2 /***
3 This file is part of systemd.
4
5 Copyright 2016 Lennart Poettering
6
7 systemd is free software; you can redistribute it and/or modify it
8 under the terms of the GNU Lesser General Public License as published by
9 the Free Software Foundation; either version 2.1 of the License, or
10 (at your option) any later version.
11
12 systemd is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 Lesser General Public License for more details.
16
17 You should have received a copy of the GNU Lesser General Public License
18 along with systemd; If not, see <http://www.gnu.org/licenses/>.
19 ***/
20
21 #include <errno.h>
22 #include <linux/netlink.h>
23 #include <sys/capability.h>
24 #include <sys/types.h>
25
26 #if HAVE_SECCOMP
27 #include <seccomp.h>
28 #endif
29
30 #include "alloc-util.h"
31 #include "log.h"
32 #include "nspawn-seccomp.h"
33 #if HAVE_SECCOMP
34 #include "seccomp-util.h"
35 #endif
36 #include "string-util.h"
37 #include "strv.h"
38
39 #if HAVE_SECCOMP
40
41 static int seccomp_add_default_syscall_filter(
42 scmp_filter_ctx ctx,
43 uint32_t arch,
44 uint64_t cap_list_retain,
45 char **syscall_whitelist,
46 char **syscall_blacklist) {
47
48 static const struct {
49 uint64_t capability;
50 const char* name;
51 } whitelist[] = {
52 /* Let's use set names where we can */
53 { 0, "@aio" },
54 { 0, "@basic-io" },
55 { 0, "@chown" },
56 { 0, "@default" },
57 { 0, "@file-system" },
58 { 0, "@io-event" },
59 { 0, "@ipc" },
60 { 0, "@mount" },
61 { 0, "@network-io" },
62 { 0, "@process" },
63 { 0, "@resources" },
64 { 0, "@setuid" },
65 { 0, "@signal" },
66 { 0, "@sync" },
67 { 0, "@timer" },
68
69 /* The following four are sets we optionally enable, in case the caps have been configured for it */
70 { CAP_SYS_TIME, "@clock" },
71 { CAP_SYS_MODULE, "@module" },
72 { CAP_SYS_RAWIO, "@raw-io" },
73 { CAP_IPC_LOCK, "@memlock" },
74
75 /* Plus a good set of additional syscalls which are not part of any of the groups above */
76 { 0, "brk" },
77 { 0, "capget" },
78 { 0, "capset" },
79 { 0, "copy_file_range" },
80 { 0, "fadvise64" },
81 { 0, "fadvise64_64" },
82 { 0, "flock" },
83 { 0, "get_mempolicy" },
84 { 0, "getcpu" },
85 { 0, "getpriority" },
86 { 0, "getrandom" },
87 { 0, "ioctl" },
88 { 0, "ioprio_get" },
89 { 0, "kcmp" },
90 { 0, "madvise" },
91 { 0, "mincore" },
92 { 0, "mprotect" },
93 { 0, "mremap" },
94 { 0, "name_to_handle_at" },
95 { 0, "oldolduname" },
96 { 0, "olduname" },
97 { 0, "personality" },
98 { 0, "readahead" },
99 { 0, "readdir" },
100 { 0, "remap_file_pages" },
101 { 0, "sched_get_priority_max" },
102 { 0, "sched_get_priority_min" },
103 { 0, "sched_getaffinity" },
104 { 0, "sched_getattr" },
105 { 0, "sched_getparam" },
106 { 0, "sched_getscheduler" },
107 { 0, "sched_rr_get_interval" },
108 { 0, "sched_yield" },
109 { 0, "seccomp" },
110 { 0, "sendfile" },
111 { 0, "sendfile64" },
112 { 0, "setdomainname" },
113 { 0, "setfsgid" },
114 { 0, "setfsgid32" },
115 { 0, "setfsuid" },
116 { 0, "setfsuid32" },
117 { 0, "sethostname" },
118 { 0, "setpgid" },
119 { 0, "setsid" },
120 { 0, "splice" },
121 { 0, "sysinfo" },
122 { 0, "tee" },
123 { 0, "umask" },
124 { 0, "uname" },
125 { 0, "userfaultfd" },
126 { 0, "vmsplice" },
127
128 /* The following individual syscalls are added depending on specified caps */
129 { CAP_SYS_PACCT, "acct" },
130 { CAP_SYS_PTRACE, "process_vm_readv" },
131 { CAP_SYS_PTRACE, "process_vm_writev" },
132 { CAP_SYS_PTRACE, "ptrace" },
133 { CAP_SYS_BOOT, "reboot" },
134 { CAP_SYSLOG, "syslog" },
135 { CAP_SYS_TTY_CONFIG, "vhangup" },
136
137 /*
138 * The following syscalls and groups are knowingly excluded:
139 *
140 * @cpu-emulation
141 * @keyring (NB: keyring is not namespaced!)
142 * @obsolete
143 * @swap
144 *
145 * bpf (NB: bpffs is not namespaced!)
146 * fanotify_init
147 * fanotify_mark
148 * kexec_file_load
149 * kexec_load
150 * lookup_dcookie
151 * nfsservctl
152 * open_by_handle_at
153 * perf_event_open
154 * pkey_alloc
155 * pkey_free
156 * pkey_mprotect
157 * quotactl
158 */
159 };
160
161 int r, c = 0;
162 size_t i;
163 char **p;
164
165 for (i = 0; i < ELEMENTSOF(whitelist); i++) {
166 if (whitelist[i].capability != 0 && (cap_list_retain & (1ULL << whitelist[i].capability)) == 0)
167 continue;
168
169 r = seccomp_add_syscall_filter_item(ctx, whitelist[i].name, SCMP_ACT_ALLOW, syscall_blacklist);
170 if (r < 0)
171 /* If the system call is not known on this architecture, then that's fine, let's ignore it */
172 log_debug_errno(r, "Failed to add rule for system call %s on %s, ignoring: %m", whitelist[i].name, seccomp_arch_to_string(arch));
173 else
174 c++;
175 }
176
177 STRV_FOREACH(p, syscall_whitelist) {
178 r = seccomp_add_syscall_filter_item(ctx, *p, SCMP_ACT_ALLOW, syscall_blacklist);
179 if (r < 0)
180 log_debug_errno(r, "Failed to add rule for system call %s on %s, ignoring: %m", *p, seccomp_arch_to_string(arch));
181 else
182 c++;
183 }
184
185 return c;
186 }
187
188 int setup_seccomp(uint64_t cap_list_retain, char **syscall_whitelist, char **syscall_blacklist) {
189 uint32_t arch;
190 int r;
191
192 if (!is_seccomp_available()) {
193 log_debug("SECCOMP features not detected in the kernel, disabling SECCOMP filterering");
194 return 0;
195 }
196
197 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
198 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
199
200 log_debug("Applying whitelist on architecture: %s", seccomp_arch_to_string(arch));
201
202 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ERRNO(EPERM));
203 if (r < 0)
204 return log_error_errno(r, "Failed to allocate seccomp object: %m");
205
206 r = seccomp_add_default_syscall_filter(seccomp, arch, cap_list_retain, syscall_whitelist, syscall_blacklist);
207 if (r < 0)
208 return r;
209
210 r = seccomp_load(seccomp);
211 if (IN_SET(r, -EPERM, -EACCES))
212 return log_error_errno(r, "Failed to install seccomp filter: %m");
213 if (r < 0)
214 log_debug_errno(r, "Failed to install filter set for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
215 }
216
217 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
218 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
219
220 log_debug("Applying NETLINK_AUDIT mask on architecture: %s", seccomp_arch_to_string(arch));
221
222 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
223 if (r < 0)
224 return log_error_errno(r, "Failed to allocate seccomp object: %m");
225
226 /*
227 Audit is broken in containers, much of the userspace audit hookup will fail if running inside a
228 container. We don't care and just turn off creation of audit sockets.
229
230 This will make socket(AF_NETLINK, *, NETLINK_AUDIT) fail with EAFNOSUPPORT which audit userspace uses
231 as indication that audit is disabled in the kernel.
232 */
233
234 r = seccomp_rule_add_exact(
235 seccomp,
236 SCMP_ACT_ERRNO(EAFNOSUPPORT),
237 SCMP_SYS(socket),
238 2,
239 SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
240 SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
241 if (r < 0) {
242 log_debug_errno(r, "Failed to add audit seccomp rule, ignoring: %m");
243 continue;
244 }
245
246 r = seccomp_load(seccomp);
247 if (IN_SET(r, -EPERM, -EACCES))
248 return log_error_errno(r, "Failed to install seccomp audit filter: %m");
249 if (r < 0)
250 log_debug_errno(r, "Failed to install filter set for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
251 }
252
253 return 0;
254 }
255
256 #else
257
/* Fallback used when HAVE_SECCOMP is not set: there is no filter to install, so all arguments are ignored and
 * success (0) is reported unconditionally. */
int setup_seccomp(uint64_t cap_list_retain, char **syscall_whitelist, char **syscall_blacklist) {
        (void) cap_list_retain;
        (void) syscall_whitelist;
        (void) syscall_blacklist;

        return 0;
}
261
262 #endif