]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/shared/seccomp-util.c
man: add missing link in sd-bus(3)
[thirdparty/systemd.git] / src / shared / seccomp-util.c
CommitLineData
53e1b683 1/* SPDX-License-Identifier: LGPL-2.1+ */
57183d11 2
a8fbdf54 3#include <errno.h>
469830d1 4#include <linux/seccomp.h>
57183d11 5#include <seccomp.h>
a8fbdf54 6#include <stddef.h>
469830d1 7#include <sys/mman.h>
d347d902 8#include <sys/prctl.h>
469830d1 9#include <sys/shm.h>
57183d11 10
469830d1 11#include "af-list.h"
add00535 12#include "alloc-util.h"
a8fbdf54 13#include "macro.h"
add00535 14#include "nsflags.h"
78e864e5 15#include "process-util.h"
cf0fbc49 16#include "seccomp-util.h"
b16bd535 17#include "set.h"
07630cea 18#include "string-util.h"
b16bd535 19#include "strv.h"
8130926d 20#include "util.h"
469830d1
LP
21#include "errno-list.h"
22
23const uint32_t seccomp_local_archs[] = {
24
f2d9751c
LP
25 /* Note: always list the native arch we are compiled as last, so that users can blacklist seccomp(), but our own calls to it still succeed */
26
27#if defined(__x86_64__) && defined(__ILP32__)
469830d1
LP
28 SCMP_ARCH_X86,
29 SCMP_ARCH_X86_64,
f2d9751c
LP
30 SCMP_ARCH_X32, /* native */
31#elif defined(__x86_64__) && !defined(__ILP32__)
32 SCMP_ARCH_X86,
469830d1 33 SCMP_ARCH_X32,
f2d9751c
LP
34 SCMP_ARCH_X86_64, /* native */
35#elif defined(__i386__)
36 SCMP_ARCH_X86,
37#elif defined(__aarch64__)
469830d1 38 SCMP_ARCH_ARM,
f2d9751c
LP
39 SCMP_ARCH_AARCH64, /* native */
40#elif defined(__arm__)
41 SCMP_ARCH_ARM,
42#elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI32
43 SCMP_ARCH_MIPSEL,
44 SCMP_ARCH_MIPS, /* native */
45#elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI32
469830d1 46 SCMP_ARCH_MIPS,
f2d9751c
LP
47 SCMP_ARCH_MIPSEL, /* native */
48#elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI64
49 SCMP_ARCH_MIPSEL,
50 SCMP_ARCH_MIPS,
51 SCMP_ARCH_MIPSEL64N32,
469830d1 52 SCMP_ARCH_MIPS64N32,
f2d9751c
LP
53 SCMP_ARCH_MIPSEL64,
54 SCMP_ARCH_MIPS64, /* native */
55#elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI64
56 SCMP_ARCH_MIPS,
469830d1 57 SCMP_ARCH_MIPSEL,
f2d9751c
LP
58 SCMP_ARCH_MIPS64N32,
59 SCMP_ARCH_MIPSEL64N32,
60 SCMP_ARCH_MIPS64,
61 SCMP_ARCH_MIPSEL64, /* native */
62#elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_NABI32
63 SCMP_ARCH_MIPSEL,
64 SCMP_ARCH_MIPS,
469830d1 65 SCMP_ARCH_MIPSEL64,
f2d9751c 66 SCMP_ARCH_MIPS64,
469830d1 67 SCMP_ARCH_MIPSEL64N32,
f2d9751c
LP
68 SCMP_ARCH_MIPS64N32, /* native */
69#elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_NABI32
70 SCMP_ARCH_MIPS,
71 SCMP_ARCH_MIPSEL,
72 SCMP_ARCH_MIPS64,
73 SCMP_ARCH_MIPSEL64,
74 SCMP_ARCH_MIPS64N32,
75 SCMP_ARCH_MIPSEL64N32, /* native */
76#elif defined(__powerpc64__) && __BYTE_ORDER == __BIG_ENDIAN
469830d1 77 SCMP_ARCH_PPC,
469830d1 78 SCMP_ARCH_PPC64LE,
f2d9751c
LP
79 SCMP_ARCH_PPC64, /* native */
80#elif defined(__powerpc64__) && __BYTE_ORDER == __LITTLE_ENDIAN
81 SCMP_ARCH_PPC,
82 SCMP_ARCH_PPC64,
83 SCMP_ARCH_PPC64LE, /* native */
84#elif defined(__powerpc__)
85 SCMP_ARCH_PPC,
86#elif defined(__s390x__)
87 SCMP_ARCH_S390,
88 SCMP_ARCH_S390X, /* native */
89#elif defined(__s390__)
469830d1 90 SCMP_ARCH_S390,
469830d1
LP
91#endif
92 (uint32_t) -1
93 };
57183d11
LP
94
95const char* seccomp_arch_to_string(uint32_t c) {
aa34055f
ZJS
96 /* Maintain order used in <seccomp.h>.
97 *
98 * Names used here should be the same as those used for ConditionArchitecture=,
99 * except for "subarchitectures" like x32. */
57183d11 100
aa34055f
ZJS
101 switch(c) {
102 case SCMP_ARCH_NATIVE:
57183d11 103 return "native";
aa34055f 104 case SCMP_ARCH_X86:
57183d11 105 return "x86";
aa34055f 106 case SCMP_ARCH_X86_64:
57183d11 107 return "x86-64";
aa34055f 108 case SCMP_ARCH_X32:
57183d11 109 return "x32";
aa34055f 110 case SCMP_ARCH_ARM:
57183d11 111 return "arm";
aa34055f
ZJS
112 case SCMP_ARCH_AARCH64:
113 return "arm64";
114 case SCMP_ARCH_MIPS:
115 return "mips";
116 case SCMP_ARCH_MIPS64:
117 return "mips64";
118 case SCMP_ARCH_MIPS64N32:
119 return "mips64-n32";
120 case SCMP_ARCH_MIPSEL:
121 return "mips-le";
122 case SCMP_ARCH_MIPSEL64:
123 return "mips64-le";
124 case SCMP_ARCH_MIPSEL64N32:
125 return "mips64-le-n32";
126 case SCMP_ARCH_PPC:
127 return "ppc";
128 case SCMP_ARCH_PPC64:
129 return "ppc64";
130 case SCMP_ARCH_PPC64LE:
131 return "ppc64-le";
132 case SCMP_ARCH_S390:
6abfd303 133 return "s390";
aa34055f 134 case SCMP_ARCH_S390X:
6abfd303 135 return "s390x";
aa34055f
ZJS
136 default:
137 return NULL;
138 }
57183d11
LP
139}
140
141int seccomp_arch_from_string(const char *n, uint32_t *ret) {
142 if (!n)
143 return -EINVAL;
144
145 assert(ret);
146
147 if (streq(n, "native"))
148 *ret = SCMP_ARCH_NATIVE;
149 else if (streq(n, "x86"))
150 *ret = SCMP_ARCH_X86;
151 else if (streq(n, "x86-64"))
152 *ret = SCMP_ARCH_X86_64;
153 else if (streq(n, "x32"))
154 *ret = SCMP_ARCH_X32;
155 else if (streq(n, "arm"))
156 *ret = SCMP_ARCH_ARM;
aa34055f
ZJS
157 else if (streq(n, "arm64"))
158 *ret = SCMP_ARCH_AARCH64;
159 else if (streq(n, "mips"))
160 *ret = SCMP_ARCH_MIPS;
161 else if (streq(n, "mips64"))
162 *ret = SCMP_ARCH_MIPS64;
163 else if (streq(n, "mips64-n32"))
164 *ret = SCMP_ARCH_MIPS64N32;
165 else if (streq(n, "mips-le"))
166 *ret = SCMP_ARCH_MIPSEL;
167 else if (streq(n, "mips64-le"))
168 *ret = SCMP_ARCH_MIPSEL64;
169 else if (streq(n, "mips64-le-n32"))
170 *ret = SCMP_ARCH_MIPSEL64N32;
171 else if (streq(n, "ppc"))
172 *ret = SCMP_ARCH_PPC;
173 else if (streq(n, "ppc64"))
174 *ret = SCMP_ARCH_PPC64;
175 else if (streq(n, "ppc64-le"))
176 *ret = SCMP_ARCH_PPC64LE;
6abfd303
HB
177 else if (streq(n, "s390"))
178 *ret = SCMP_ARCH_S390;
179 else if (streq(n, "s390x"))
180 *ret = SCMP_ARCH_S390X;
57183d11
LP
181 else
182 return -EINVAL;
183
184 return 0;
185}
e9642be2 186
469830d1 187int seccomp_init_for_arch(scmp_filter_ctx *ret, uint32_t arch, uint32_t default_action) {
8d7b0c8f
LP
188 scmp_filter_ctx seccomp;
189 int r;
190
469830d1
LP
191 /* Much like seccomp_init(), but initializes the filter for one specific architecture only, without affecting
192 * any others. Also, turns off the NNP fiddling. */
8d7b0c8f
LP
193
194 seccomp = seccomp_init(default_action);
195 if (!seccomp)
196 return -ENOMEM;
197
469830d1
LP
198 if (arch != SCMP_ARCH_NATIVE &&
199 arch != seccomp_arch_native()) {
200
1b52793d 201 r = seccomp_arch_remove(seccomp, seccomp_arch_native());
469830d1
LP
202 if (r < 0)
203 goto finish;
204
1b52793d 205 r = seccomp_arch_add(seccomp, arch);
469830d1
LP
206 if (r < 0)
207 goto finish;
208
209 assert(seccomp_arch_exist(seccomp, arch) >= 0);
210 assert(seccomp_arch_exist(seccomp, SCMP_ARCH_NATIVE) == -EEXIST);
211 assert(seccomp_arch_exist(seccomp, seccomp_arch_native()) == -EEXIST);
212 } else {
213 assert(seccomp_arch_exist(seccomp, SCMP_ARCH_NATIVE) >= 0);
214 assert(seccomp_arch_exist(seccomp, seccomp_arch_native()) >= 0);
215 }
216
217 r = seccomp_attr_set(seccomp, SCMP_FLTATR_ACT_BADARCH, SCMP_ACT_ALLOW);
8d7b0c8f
LP
218 if (r < 0)
219 goto finish;
220
221 r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
222 if (r < 0)
223 goto finish;
224
225 *ret = seccomp;
226 return 0;
227
228finish:
229 seccomp_release(seccomp);
230 return r;
231}
232
/* Probes for basic seccomp support: true if the PR_GET_SECCOMP query returns a non-negative value. */
static bool is_basic_seccomp_available(void) {
        int mode;

        mode = prctl(PR_GET_SECCOMP, 0, 0, 0, 0);
        return mode >= 0;
}

/* Probes for seccomp filter mode by trying to install a NULL filter program. The probe counts as a success
 * only if that call fails, and fails specifically with EFAULT. */
static bool is_seccomp_filter_available(void) {
        if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, NULL, 0, 0) >= 0)
                return false;

        return errno == EFAULT;
}

/* Returns whether both probes above succeed. The outcome is computed on the first call only and served from
 * a static variable afterwards. */
bool is_seccomp_available(void) {
        static int cached_result = -1; /* negative: not probed yet */

        if (cached_result >= 0)
                return cached_result;

        cached_result =
                is_basic_seccomp_available() &&
                is_seccomp_filter_available();

        return cached_result;
}
252
8130926d 253const SyscallFilterSet syscall_filter_sets[_SYSCALL_FILTER_SET_MAX] = {
40eb6a80 254 [SYSCALL_FILTER_SET_DEFAULT] = {
40eb6a80 255 .name = "@default",
d5efc18b 256 .help = "System calls that are always permitted",
40eb6a80
ZJS
257 .value =
258 "clock_getres\0"
259 "clock_gettime\0"
260 "clock_nanosleep\0"
261 "execve\0"
262 "exit\0"
263 "exit_group\0"
e41b0f42
LP
264 "futex\0"
265 "get_robust_list\0"
266 "get_thread_area\0"
09d3020b
DH
267 "getegid\0"
268 "getegid32\0"
269 "geteuid\0"
270 "geteuid32\0"
271 "getgid\0"
272 "getgid32\0"
273 "getgroups\0"
274 "getgroups32\0"
275 "getpgid\0"
276 "getpgrp\0"
277 "getpid\0"
278 "getppid\0"
279 "getresgid\0"
280 "getresgid32\0"
281 "getresuid\0"
282 "getresuid32\0"
40eb6a80 283 "getrlimit\0" /* make sure processes can query stack size and such */
09d3020b
DH
284 "getsid\0"
285 "gettid\0"
40eb6a80 286 "gettimeofday\0"
09d3020b
DH
287 "getuid\0"
288 "getuid32\0"
e41b0f42 289 "membarrier\0"
40eb6a80
ZJS
290 "nanosleep\0"
291 "pause\0"
4c3a9176 292 "prlimit64\0"
e41b0f42 293 "restart_syscall\0"
40eb6a80 294 "rt_sigreturn\0"
8f44de08 295 "sched_yield\0"
e41b0f42
LP
296 "set_robust_list\0"
297 "set_thread_area\0"
298 "set_tid_address\0"
ce5faeac 299 "set_tls\0"
40eb6a80
ZJS
300 "sigreturn\0"
301 "time\0"
4c3a9176 302 "ugetrlimit\0"
40eb6a80 303 },
44898c53
LP
304 [SYSCALL_FILTER_SET_AIO] = {
305 .name = "@aio",
306 .help = "Asynchronous IO",
307 .value =
308 "io_cancel\0"
309 "io_destroy\0"
310 "io_getevents\0"
311 "io_setup\0"
312 "io_submit\0"
313 },
133ddbbe 314 [SYSCALL_FILTER_SET_BASIC_IO] = {
133ddbbe 315 .name = "@basic-io",
d5efc18b 316 .help = "Basic IO",
133ddbbe 317 .value =
648a0ed0 318 "_llseek\0"
133ddbbe 319 "close\0"
648a0ed0 320 "dup\0"
133ddbbe
LP
321 "dup2\0"
322 "dup3\0"
133ddbbe
LP
323 "lseek\0"
324 "pread64\0"
325 "preadv\0"
44898c53 326 "preadv2\0"
133ddbbe
LP
327 "pwrite64\0"
328 "pwritev\0"
44898c53 329 "pwritev2\0"
133ddbbe
LP
330 "read\0"
331 "readv\0"
332 "write\0"
333 "writev\0"
334 },
44898c53
LP
335 [SYSCALL_FILTER_SET_CHOWN] = {
336 .name = "@chown",
337 .help = "Change ownership of files and directories",
338 .value =
339 "chown\0"
340 "chown32\0"
341 "fchown\0"
342 "fchown32\0"
343 "fchownat\0"
344 "lchown\0"
345 "lchown32\0"
346 },
8130926d 347 [SYSCALL_FILTER_SET_CLOCK] = {
8130926d 348 .name = "@clock",
d5efc18b 349 .help = "Change the system time",
201c1cc2
TM
350 .value =
351 "adjtimex\0"
1f9ac68b
LP
352 "clock_adjtime\0"
353 "clock_settime\0"
201c1cc2 354 "settimeofday\0"
1f9ac68b 355 "stime\0"
8130926d
LP
356 },
357 [SYSCALL_FILTER_SET_CPU_EMULATION] = {
8130926d 358 .name = "@cpu-emulation",
d5efc18b 359 .help = "System calls for CPU emulation functionality",
1f9ac68b
LP
360 .value =
361 "modify_ldt\0"
362 "subpage_prot\0"
363 "switch_endian\0"
364 "vm86\0"
365 "vm86old\0"
8130926d
LP
366 },
367 [SYSCALL_FILTER_SET_DEBUG] = {
8130926d 368 .name = "@debug",
d5efc18b 369 .help = "Debugging, performance monitoring and tracing functionality",
1f9ac68b
LP
370 .value =
371 "lookup_dcookie\0"
372 "perf_event_open\0"
373 "process_vm_readv\0"
374 "process_vm_writev\0"
375 "ptrace\0"
376 "rtas\0"
8130926d 377#ifdef __NR_s390_runtime_instr
1f9ac68b 378 "s390_runtime_instr\0"
8130926d 379#endif
1f9ac68b 380 "sys_debug_setcontext\0"
8130926d 381 },
1a1b13c9
LP
382 [SYSCALL_FILTER_SET_FILE_SYSTEM] = {
383 .name = "@file-system",
384 .help = "File system operations",
385 .value =
386 "access\0"
387 "chdir\0"
388 "chmod\0"
389 "close\0"
390 "creat\0"
391 "faccessat\0"
392 "fallocate\0"
393 "fchdir\0"
394 "fchmod\0"
395 "fchmodat\0"
1a1b13c9 396 "fcntl\0"
ceaa6aa7 397 "fcntl64\0"
1a1b13c9
LP
398 "fgetxattr\0"
399 "flistxattr\0"
ceaa6aa7 400 "fremovexattr\0"
1a1b13c9 401 "fsetxattr\0"
1a1b13c9 402 "fstat\0"
ceaa6aa7 403 "fstat64\0"
1a1b13c9 404 "fstatat64\0"
1a1b13c9 405 "fstatfs\0"
ceaa6aa7 406 "fstatfs64\0"
1a1b13c9 407 "ftruncate\0"
ceaa6aa7 408 "ftruncate64\0"
1a1b13c9
LP
409 "futimesat\0"
410 "getcwd\0"
1a1b13c9 411 "getdents\0"
ceaa6aa7 412 "getdents64\0"
1a1b13c9
LP
413 "getxattr\0"
414 "inotify_add_watch\0"
ceaa6aa7 415 "inotify_init\0"
1a1b13c9
LP
416 "inotify_init1\0"
417 "inotify_rm_watch\0"
418 "lgetxattr\0"
419 "link\0"
420 "linkat\0"
421 "listxattr\0"
422 "llistxattr\0"
423 "lremovexattr\0"
424 "lsetxattr\0"
1a1b13c9 425 "lstat\0"
ceaa6aa7 426 "lstat64\0"
1a1b13c9
LP
427 "mkdir\0"
428 "mkdirat\0"
429 "mknod\0"
430 "mknodat\0"
1a1b13c9 431 "mmap\0"
ceaa6aa7 432 "mmap2\0"
7961116e 433 "munmap\0"
1a1b13c9 434 "newfstatat\0"
ceaa6aa7
LP
435 "oldfstat\0"
436 "oldlstat\0"
437 "oldstat\0"
1a1b13c9
LP
438 "open\0"
439 "openat\0"
440 "readlink\0"
441 "readlinkat\0"
442 "removexattr\0"
443 "rename\0"
1a1b13c9 444 "renameat\0"
ceaa6aa7 445 "renameat2\0"
1a1b13c9
LP
446 "rmdir\0"
447 "setxattr\0"
1a1b13c9 448 "stat\0"
ceaa6aa7 449 "stat64\0"
1a1b13c9 450 "statfs\0"
ceaa6aa7 451 "statfs64\0"
8e6a7a8b 452#ifdef __NR_statx
a4135a74 453 "statx\0"
ceaa6aa7 454#endif
1a1b13c9
LP
455 "symlink\0"
456 "symlinkat\0"
1a1b13c9 457 "truncate\0"
ceaa6aa7 458 "truncate64\0"
1a1b13c9
LP
459 "unlink\0"
460 "unlinkat\0"
ceaa6aa7 461 "utime\0"
1a1b13c9
LP
462 "utimensat\0"
463 "utimes\0"
464 },
8130926d 465 [SYSCALL_FILTER_SET_IO_EVENT] = {
8130926d 466 .name = "@io-event",
d5efc18b 467 .help = "Event loop system calls",
201c1cc2
TM
468 .value =
469 "_newselect\0"
201c1cc2 470 "epoll_create\0"
215728ff 471 "epoll_create1\0"
201c1cc2
TM
472 "epoll_ctl\0"
473 "epoll_ctl_old\0"
474 "epoll_pwait\0"
475 "epoll_wait\0"
476 "epoll_wait_old\0"
201c1cc2 477 "eventfd\0"
215728ff 478 "eventfd2\0"
201c1cc2
TM
479 "poll\0"
480 "ppoll\0"
481 "pselect6\0"
482 "select\0"
8130926d
LP
483 },
484 [SYSCALL_FILTER_SET_IPC] = {
8130926d 485 .name = "@ipc",
d5efc18b
ZJS
486 .help = "SysV IPC, POSIX Message Queues or other IPC",
487 .value =
488 "ipc\0"
cd5bfd7e 489 "memfd_create\0"
201c1cc2
TM
490 "mq_getsetattr\0"
491 "mq_notify\0"
492 "mq_open\0"
493 "mq_timedreceive\0"
494 "mq_timedsend\0"
495 "mq_unlink\0"
496 "msgctl\0"
497 "msgget\0"
498 "msgrcv\0"
499 "msgsnd\0"
cd5bfd7e 500 "pipe\0"
215728ff 501 "pipe2\0"
201c1cc2
TM
502 "process_vm_readv\0"
503 "process_vm_writev\0"
504 "semctl\0"
505 "semget\0"
506 "semop\0"
507 "semtimedop\0"
508 "shmat\0"
509 "shmctl\0"
510 "shmdt\0"
511 "shmget\0"
8130926d
LP
512 },
513 [SYSCALL_FILTER_SET_KEYRING] = {
8130926d 514 .name = "@keyring",
d5efc18b 515 .help = "Kernel keyring access",
1f9ac68b
LP
516 .value =
517 "add_key\0"
518 "keyctl\0"
519 "request_key\0"
8130926d 520 },
cd0ddf6f
LP
521 [SYSCALL_FILTER_SET_MEMLOCK] = {
522 .name = "@memlock",
523 .help = "Memory locking control",
524 .value =
525 "mlock\0"
526 "mlock2\0"
527 "mlockall\0"
528 "munlock\0"
529 "munlockall\0"
530 },
8130926d 531 [SYSCALL_FILTER_SET_MODULE] = {
8130926d 532 .name = "@module",
d5efc18b 533 .help = "Loading and unloading of kernel modules",
201c1cc2 534 .value =
201c1cc2
TM
535 "delete_module\0"
536 "finit_module\0"
537 "init_module\0"
8130926d
LP
538 },
539 [SYSCALL_FILTER_SET_MOUNT] = {
8130926d 540 .name = "@mount",
d5efc18b 541 .help = "Mounting and unmounting of file systems",
201c1cc2
TM
542 .value =
543 "chroot\0"
544 "mount\0"
201c1cc2 545 "pivot_root\0"
201c1cc2 546 "umount\0"
215728ff 547 "umount2\0"
8130926d
LP
548 },
549 [SYSCALL_FILTER_SET_NETWORK_IO] = {
8130926d 550 .name = "@network-io",
d5efc18b 551 .help = "Network or Unix socket IO, should not be needed if not network facing",
201c1cc2 552 .value =
201c1cc2 553 "accept\0"
215728ff 554 "accept4\0"
201c1cc2
TM
555 "bind\0"
556 "connect\0"
557 "getpeername\0"
558 "getsockname\0"
559 "getsockopt\0"
560 "listen\0"
561 "recv\0"
562 "recvfrom\0"
563 "recvmmsg\0"
564 "recvmsg\0"
565 "send\0"
566 "sendmmsg\0"
567 "sendmsg\0"
568 "sendto\0"
569 "setsockopt\0"
570 "shutdown\0"
571 "socket\0"
572 "socketcall\0"
573 "socketpair\0"
8130926d
LP
574 },
575 [SYSCALL_FILTER_SET_OBSOLETE] = {
d5efc18b 576 /* some unknown even to libseccomp */
8130926d 577 .name = "@obsolete",
d5efc18b 578 .help = "Unusual, obsolete or unimplemented system calls",
201c1cc2
TM
579 .value =
580 "_sysctl\0"
581 "afs_syscall\0"
802fa07a 582 "bdflush\0"
201c1cc2 583 "break\0"
1f9ac68b 584 "create_module\0"
201c1cc2
TM
585 "ftime\0"
586 "get_kernel_syms\0"
201c1cc2
TM
587 "getpmsg\0"
588 "gtty\0"
7e0c3b8f 589 "idle\0"
201c1cc2 590 "lock\0"
201c1cc2 591 "mpx\0"
201c1cc2
TM
592 "prof\0"
593 "profil\0"
201c1cc2
TM
594 "putpmsg\0"
595 "query_module\0"
201c1cc2
TM
596 "security\0"
597 "sgetmask\0"
598 "ssetmask\0"
599 "stty\0"
1f9ac68b 600 "sysfs\0"
201c1cc2
TM
601 "tuxcall\0"
602 "ulimit\0"
603 "uselib\0"
1f9ac68b 604 "ustat\0"
201c1cc2 605 "vserver\0"
8130926d
LP
606 },
607 [SYSCALL_FILTER_SET_PRIVILEGED] = {
8130926d 608 .name = "@privileged",
d5efc18b 609 .help = "All system calls which need super-user capabilities",
201c1cc2 610 .value =
44898c53 611 "@chown\0"
201c1cc2
TM
612 "@clock\0"
613 "@module\0"
614 "@raw-io\0"
af0f047b
LP
615 "@reboot\0"
616 "@swap\0"
215728ff 617 "_sysctl\0"
201c1cc2 618 "acct\0"
201c1cc2 619 "bpf\0"
1f9ac68b 620 "capset\0"
201c1cc2 621 "chroot\0"
201c1cc2
TM
622 "nfsservctl\0"
623 "pivot_root\0"
624 "quotactl\0"
201c1cc2 625 "setdomainname\0"
201c1cc2 626 "setfsuid\0"
215728ff 627 "setfsuid32\0"
201c1cc2 628 "setgroups\0"
215728ff 629 "setgroups32\0"
201c1cc2 630 "sethostname\0"
201c1cc2 631 "setresuid\0"
215728ff 632 "setresuid32\0"
201c1cc2 633 "setreuid\0"
215728ff 634 "setreuid32\0"
e05ee49b 635 "setuid\0" /* We list the explicit system calls here, as @setuid also includes setgid() which is not necessarily privileged */
215728ff 636 "setuid32\0"
201c1cc2 637 "vhangup\0"
8130926d
LP
638 },
639 [SYSCALL_FILTER_SET_PROCESS] = {
8130926d 640 .name = "@process",
d5efc18b 641 .help = "Process control, execution, namespaceing operations",
201c1cc2
TM
642 .value =
643 "arch_prctl\0"
09d3020b 644 "capget\0" /* Able to query arbitrary processes */
201c1cc2 645 "clone\0"
201c1cc2
TM
646 "execveat\0"
647 "fork\0"
b887d2eb 648 "getrusage\0"
201c1cc2
TM
649 "kill\0"
650 "prctl\0"
b887d2eb
LP
651 "rt_sigqueueinfo\0"
652 "rt_tgsigqueueinfo\0"
201c1cc2 653 "setns\0"
a9518dc3 654 "swapcontext\0" /* Some archs e.g. powerpc32 are using it to do userspace context switches */
201c1cc2 655 "tgkill\0"
b887d2eb 656 "times\0"
201c1cc2
TM
657 "tkill\0"
658 "unshare\0"
659 "vfork\0"
b887d2eb
LP
660 "wait4\0"
661 "waitid\0"
662 "waitpid\0"
8130926d
LP
663 },
664 [SYSCALL_FILTER_SET_RAW_IO] = {
8130926d 665 .name = "@raw-io",
d5efc18b 666 .help = "Raw I/O port access",
201c1cc2
TM
667 .value =
668 "ioperm\0"
669 "iopl\0"
1f9ac68b 670 "pciconfig_iobase\0"
201c1cc2
TM
671 "pciconfig_read\0"
672 "pciconfig_write\0"
8130926d 673#ifdef __NR_s390_pci_mmio_read
201c1cc2 674 "s390_pci_mmio_read\0"
8130926d
LP
675#endif
676#ifdef __NR_s390_pci_mmio_write
201c1cc2 677 "s390_pci_mmio_write\0"
8130926d
LP
678#endif
679 },
bd2ab3f4
LP
680 [SYSCALL_FILTER_SET_REBOOT] = {
681 .name = "@reboot",
682 .help = "Reboot and reboot preparation/kexec",
683 .value =
bd2ab3f4 684 "kexec_file_load\0"
e59608fa 685 "kexec_load\0"
bd2ab3f4
LP
686 "reboot\0"
687 },
133ddbbe 688 [SYSCALL_FILTER_SET_RESOURCES] = {
133ddbbe 689 .name = "@resources",
58a8f68b 690 .help = "Alter resource settings",
133ddbbe 691 .value =
0963c053
LP
692 "ioprio_set\0"
693 "mbind\0"
694 "migrate_pages\0"
695 "move_pages\0"
696 "nice\0"
0963c053
LP
697 "sched_setaffinity\0"
698 "sched_setattr\0"
133ddbbe
LP
699 "sched_setparam\0"
700 "sched_setscheduler\0"
0963c053 701 "set_mempolicy\0"
133ddbbe
LP
702 "setpriority\0"
703 "setrlimit\0"
133ddbbe 704 },
6eaaeee9
LP
705 [SYSCALL_FILTER_SET_SETUID] = {
706 .name = "@setuid",
707 .help = "Operations for changing user/group credentials",
708 .value =
6eaaeee9 709 "setgid\0"
215728ff 710 "setgid32\0"
6eaaeee9 711 "setgroups\0"
215728ff 712 "setgroups32\0"
6eaaeee9 713 "setregid\0"
215728ff 714 "setregid32\0"
6eaaeee9 715 "setresgid\0"
215728ff 716 "setresgid32\0"
6eaaeee9 717 "setresuid\0"
215728ff 718 "setresuid32\0"
6eaaeee9 719 "setreuid\0"
215728ff 720 "setreuid32\0"
6eaaeee9 721 "setuid\0"
215728ff 722 "setuid32\0"
6eaaeee9 723 },
cd0ddf6f
LP
724 [SYSCALL_FILTER_SET_SIGNAL] = {
725 .name = "@signal",
726 .help = "Process signal handling",
727 .value =
728 "rt_sigaction\0"
729 "rt_sigpending\0"
730 "rt_sigprocmask\0"
731 "rt_sigsuspend\0"
732 "rt_sigtimedwait\0"
733 "sigaction\0"
734 "sigaltstack\0"
735 "signal\0"
736 "signalfd\0"
737 "signalfd4\0"
738 "sigpending\0"
739 "sigprocmask\0"
740 "sigsuspend\0"
741 },
bd2ab3f4
LP
742 [SYSCALL_FILTER_SET_SWAP] = {
743 .name = "@swap",
744 .help = "Enable/disable swap devices",
745 .value =
746 "swapoff\0"
747 "swapon\0"
748 },
44898c53
LP
749 [SYSCALL_FILTER_SET_SYNC] = {
750 .name = "@sync",
751 .help = "Synchronize files and memory to storage",
752 .value =
753 "fdatasync\0"
754 "fsync\0"
755 "msync\0"
756 "sync\0"
757 "sync_file_range\0"
758 "syncfs\0"
759 },
70526841
LP
760 [SYSCALL_FILTER_SET_SYSTEM_SERVICE] = {
761 .name = "@system-service",
762 .help = "General system service operations",
763 .value =
764 "@aio\0"
765 "@basic-io\0"
766 "@chown\0"
767 "@default\0"
768 "@file-system\0"
769 "@io-event\0"
770 "@ipc\0"
771 "@keyring\0"
772 "@memlock\0"
773 "@network-io\0"
774 "@process\0"
775 "@resources\0"
776 "@setuid\0"
777 "@signal\0"
778 "@sync\0"
779 "@timer\0"
780 "brk\0"
781 "capget\0"
782 "capset\0"
783 "copy_file_range\0"
784 "fadvise64\0"
785 "fadvise64_64\0"
786 "flock\0"
787 "get_mempolicy\0"
788 "getcpu\0"
789 "getpriority\0"
790 "getrandom\0"
791 "ioctl\0"
792 "ioprio_get\0"
793 "kcmp\0"
794 "madvise\0"
795 "mincore\0"
796 "mprotect\0"
797 "mremap\0"
798 "name_to_handle_at\0"
799 "oldolduname\0"
800 "olduname\0"
801 "personality\0"
802 "readahead\0"
803 "readdir\0"
804 "remap_file_pages\0"
805 "sched_get_priority_max\0"
806 "sched_get_priority_min\0"
807 "sched_getaffinity\0"
808 "sched_getattr\0"
809 "sched_getparam\0"
810 "sched_getscheduler\0"
811 "sched_rr_get_interval\0"
812 "sched_yield\0"
813 "sendfile\0"
814 "sendfile64\0"
815 "setfsgid\0"
816 "setfsgid32\0"
817 "setfsuid\0"
818 "setfsuid32\0"
819 "setpgid\0"
820 "setsid\0"
821 "splice\0"
822 "sysinfo\0"
823 "tee\0"
824 "umask\0"
825 "uname\0"
826 "userfaultfd\0"
827 "vmsplice\0"
828 },
cd0ddf6f
LP
829 [SYSCALL_FILTER_SET_TIMER] = {
830 .name = "@timer",
831 .help = "Schedule operations by time",
832 .value =
833 "alarm\0"
834 "getitimer\0"
835 "setitimer\0"
836 "timer_create\0"
837 "timer_delete\0"
838 "timer_getoverrun\0"
839 "timer_gettime\0"
840 "timer_settime\0"
841 "timerfd_create\0"
842 "timerfd_gettime\0"
843 "timerfd_settime\0"
844 "times\0"
845 },
201c1cc2 846};
8130926d
LP
847
848const SyscallFilterSet *syscall_filter_set_find(const char *name) {
849 unsigned i;
850
851 if (isempty(name) || name[0] != '@')
852 return NULL;
853
854 for (i = 0; i < _SYSCALL_FILTER_SET_MAX; i++)
855 if (streq(syscall_filter_sets[i].name, name))
856 return syscall_filter_sets + i;
857
858 return NULL;
859}
860
b54f36c6 861static int seccomp_add_syscall_filter_set(scmp_filter_ctx seccomp, const SyscallFilterSet *set, uint32_t action, char **exclude, bool log_missing);
69b1b241 862
b54f36c6 863int seccomp_add_syscall_filter_item(scmp_filter_ctx *seccomp, const char *name, uint32_t action, char **exclude, bool log_missing) {
69b1b241
LP
864 assert(seccomp);
865 assert(name);
866
960e4569
LP
867 if (strv_contains(exclude, name))
868 return 0;
869
69b1b241
LP
870 if (name[0] == '@') {
871 const SyscallFilterSet *other;
872
873 other = syscall_filter_set_find(name);
cff7bff8
LP
874 if (!other) {
875 log_debug("Filter set %s is not known!", name);
69b1b241 876 return -EINVAL;
cff7bff8 877 }
69b1b241 878
b54f36c6
ZJS
879 return seccomp_add_syscall_filter_set(seccomp, other, action, exclude, log_missing);
880
69b1b241 881 } else {
b54f36c6 882 int id, r;
69b1b241
LP
883
884 id = seccomp_syscall_resolve_name(name);
cff7bff8 885 if (id == __NR_SCMP_ERROR) {
b54f36c6
ZJS
886 if (log_missing)
887 log_debug("System call %s is not known, ignoring.", name);
ff217dc3 888 return 0;
cff7bff8 889 }
69b1b241
LP
890
891 r = seccomp_rule_add_exact(seccomp, action, id, 0);
b54f36c6 892 if (r < 0) {
69b1b241 893 /* If the system call is not known on this architecture, then that's fine, let's ignore it */
7e86bd73
ZJS
894 bool ignore = r == -EDOM;
895
896 if (!ignore || log_missing)
897 log_debug_errno(r, "Failed to add rule for system call %s() / %d%s: %m",
898 name, id, ignore ? ", ignoring" : "");
899 if (!ignore)
900 return r;
b54f36c6 901 }
69b1b241 902
b54f36c6
ZJS
903 return 0;
904 }
69b1b241
LP
905}
906
469830d1
LP
907static int seccomp_add_syscall_filter_set(
908 scmp_filter_ctx seccomp,
469830d1 909 const SyscallFilterSet *set,
960e4569 910 uint32_t action,
b54f36c6
ZJS
911 char **exclude,
912 bool log_missing) {
469830d1 913
8130926d
LP
914 const char *sys;
915 int r;
916
917 assert(seccomp);
918 assert(set);
919
920 NULSTR_FOREACH(sys, set->value) {
b54f36c6 921 r = seccomp_add_syscall_filter_item(seccomp, sys, action, exclude, log_missing);
69b1b241
LP
922 if (r < 0)
923 return r;
469830d1
LP
924 }
925
926 return 0;
927}
928
b54f36c6 929int seccomp_load_syscall_filter_set(uint32_t default_action, const SyscallFilterSet *set, uint32_t action, bool log_missing) {
469830d1
LP
930 uint32_t arch;
931 int r;
932
933 assert(set);
934
935 /* The one-stop solution: allocate a seccomp object, add the specified filter to it, and apply it. Once for
936 * earch local arch. */
937
938 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
939 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
940
941 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
942
943 r = seccomp_init_for_arch(&seccomp, arch, default_action);
8130926d
LP
944 if (r < 0)
945 return r;
469830d1 946
b54f36c6 947 r = seccomp_add_syscall_filter_set(seccomp, set, action, NULL, log_missing);
7e86bd73
ZJS
948 if (r < 0)
949 return log_debug_errno(r, "Failed to add filter set: %m");
469830d1
LP
950
951 r = seccomp_load(seccomp);
952 if (IN_SET(r, -EPERM, -EACCES))
953 return r;
954 if (r < 0)
955 log_debug_errno(r, "Failed to install filter set for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
8130926d
LP
956 }
957
958 return 0;
959}
a3be2849 960
b54f36c6 961int seccomp_load_syscall_filter_set_raw(uint32_t default_action, Hashmap* set, uint32_t action, bool log_missing) {
469830d1 962 uint32_t arch;
a3be2849
LP
963 int r;
964
469830d1
LP
965 /* Similar to seccomp_load_syscall_filter_set(), but takes a raw Set* of syscalls, instead of a
966 * SyscallFilterSet* table. */
a3be2849 967
8cfa775f 968 if (hashmap_isempty(set) && default_action == SCMP_ACT_ALLOW)
469830d1 969 return 0;
a3be2849 970
469830d1
LP
971 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
972 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
973 Iterator i;
b54f36c6 974 void *syscall_id, *val;
a3be2849 975
469830d1 976 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
a3be2849 977
469830d1
LP
978 r = seccomp_init_for_arch(&seccomp, arch, default_action);
979 if (r < 0)
980 return r;
a3be2849 981
b54f36c6 982 HASHMAP_FOREACH_KEY(val, syscall_id, set, i) {
8cfa775f 983 uint32_t a = action;
b54f36c6
ZJS
984 int id = PTR_TO_INT(syscall_id) - 1;
985 int error = PTR_TO_INT(val);
8cfa775f 986
b54f36c6
ZJS
987 if (action != SCMP_ACT_ALLOW && error >= 0)
988 a = SCMP_ACT_ERRNO(error);
8cfa775f 989
b54f36c6 990 r = seccomp_rule_add_exact(seccomp, a, id, 0);
469830d1
LP
991 if (r < 0) {
992 /* If the system call is not known on this architecture, then that's fine, let's ignore it */
993 _cleanup_free_ char *n = NULL;
7e86bd73 994 bool ignore;
469830d1 995
b54f36c6 996 n = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, id);
7e86bd73
ZJS
997 ignore = r == -EDOM;
998 if (!ignore || log_missing)
999 log_debug_errno(r, "Failed to add rule for system call %s() / %d%s: %m",
1000 strna(n), id, ignore ? ", ignoring" : "");
1001 if (!ignore)
1002 return r;
469830d1
LP
1003 }
1004 }
1005
1006 r = seccomp_load(seccomp);
1007 if (IN_SET(r, -EPERM, -EACCES))
1008 return r;
1009 if (r < 0)
1010 log_debug_errno(r, "Failed to install filter set for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1011 }
1012
1013 return 0;
add00535
LP
1014}
1015
13d92c63 1016int seccomp_parse_syscall_filter_full(
898748d8
YW
1017 const char *name,
1018 int errno_num,
1019 Hashmap *filter,
13d92c63 1020 SeccompParseFlags flags,
898748d8
YW
1021 const char *unit,
1022 const char *filename,
1023 unsigned line) {
1024
1025 int r;
1026
1027 assert(name);
1028 assert(filter);
1029
1030 if (name[0] == '@') {
1031 const SyscallFilterSet *set;
1032 const char *i;
1033
1034 set = syscall_filter_set_find(name);
1035 if (!set) {
13d92c63 1036 if (!(flags & SECCOMP_PARSE_PERMISSIVE))
898748d8 1037 return -EINVAL;
13d92c63
LP
1038
1039 log_syntax(unit, flags & SECCOMP_PARSE_LOG ? LOG_WARNING : LOG_DEBUG, filename, line, 0,
1040 "Unknown system call group, ignoring: %s", name);
1041 return 0;
898748d8
YW
1042 }
1043
1044 NULSTR_FOREACH(i, set->value) {
13d92c63
LP
1045 /* Call ourselves again, for the group to parse. Note that we downgrade logging here (i.e. take
1046 * away the SECCOMP_PARSE_LOG flag) since any issues in the group table are our own problem,
1047 * not a problem in user configuration data and we shouldn't pretend otherwise by complaining
1048 * about them. */
1049 r = seccomp_parse_syscall_filter_full(i, errno_num, filter, flags &~ SECCOMP_PARSE_LOG, unit, filename, line);
898748d8
YW
1050 if (r < 0)
1051 return r;
1052 }
1053 } else {
1054 int id;
1055
1056 id = seccomp_syscall_resolve_name(name);
1057 if (id == __NR_SCMP_ERROR) {
13d92c63 1058 if (!(flags & SECCOMP_PARSE_PERMISSIVE))
898748d8 1059 return -EINVAL;
13d92c63
LP
1060
1061 log_syntax(unit, flags & SECCOMP_PARSE_LOG ? LOG_WARNING : LOG_DEBUG, filename, line, 0,
1062 "Failed to parse system call, ignoring: %s", name);
1063 return 0;
898748d8
YW
1064 }
1065
1066 /* If we previously wanted to forbid a syscall and now
1067 * we want to allow it, then remove it from the list. */
13d92c63 1068 if (!(flags & SECCOMP_PARSE_INVERT) == !!(flags & SECCOMP_PARSE_WHITELIST)) {
898748d8
YW
1069 r = hashmap_put(filter, INT_TO_PTR(id + 1), INT_TO_PTR(errno_num));
1070 if (r < 0)
851ee70a
LW
1071 switch (r) {
1072 case -ENOMEM:
1073 return flags & SECCOMP_PARSE_LOG ? log_oom() : -ENOMEM;
1074 case -EEXIST:
9d7fe7c6
LW
1075 assert_se(hashmap_update(filter, INT_TO_PTR(id + 1), INT_TO_PTR(errno_num)) == 0);
1076 break;
851ee70a
LW
1077 default:
1078 return r;
1079 }
898748d8
YW
1080 } else
1081 (void) hashmap_remove(filter, INT_TO_PTR(id + 1));
1082 }
1083
1084 return 0;
1085}
1086
add00535 1087int seccomp_restrict_namespaces(unsigned long retain) {
469830d1 1088 uint32_t arch;
add00535
LP
1089 int r;
1090
f1d34068 1091 if (DEBUG_LOGGING) {
add00535
LP
1092 _cleanup_free_ char *s = NULL;
1093
86c2a9f1 1094 (void) namespace_flags_to_string(retain, &s);
add00535
LP
1095 log_debug("Restricting namespace to: %s.", strna(s));
1096 }
1097
1098 /* NOOP? */
1099 if ((retain & NAMESPACE_FLAGS_ALL) == NAMESPACE_FLAGS_ALL)
1100 return 0;
1101
469830d1
LP
1102 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1103 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1104 unsigned i;
add00535 1105
469830d1
LP
1106 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1107
1108 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1109 if (r < 0)
1110 return r;
1111
1112 if ((retain & NAMESPACE_FLAGS_ALL) == 0)
1113 /* If every single kind of namespace shall be prohibited, then let's block the whole setns() syscall
1114 * altogether. */
1115 r = seccomp_rule_add_exact(
1116 seccomp,
1117 SCMP_ACT_ERRNO(EPERM),
1118 SCMP_SYS(setns),
1119 0);
1120 else
1121 /* Otherwise, block only the invocations with the appropriate flags in the loop below, but also the
1122 * special invocation with a zero flags argument, right here. */
1123 r = seccomp_rule_add_exact(
1124 seccomp,
1125 SCMP_ACT_ERRNO(EPERM),
1126 SCMP_SYS(setns),
1127 1,
1128 SCMP_A1(SCMP_CMP_EQ, 0));
1129 if (r < 0) {
1130 log_debug_errno(r, "Failed to add setns() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1131 continue;
1132 }
1133
1134 for (i = 0; namespace_flag_map[i].name; i++) {
1135 unsigned long f;
1136
1137 f = namespace_flag_map[i].flag;
1138 if ((retain & f) == f) {
1139 log_debug("Permitting %s.", namespace_flag_map[i].name);
1140 continue;
1141 }
1142
1143 log_debug("Blocking %s.", namespace_flag_map[i].name);
1144
1145 r = seccomp_rule_add_exact(
1146 seccomp,
1147 SCMP_ACT_ERRNO(EPERM),
1148 SCMP_SYS(unshare),
1149 1,
1150 SCMP_A0(SCMP_CMP_MASKED_EQ, f, f));
1151 if (r < 0) {
1152 log_debug_errno(r, "Failed to add unshare() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1153 break;
1154 }
1155
511ceb1f
ZJS
1156 /* On s390/s390x the first two parameters to clone are switched */
1157 if (!IN_SET(arch, SCMP_ARCH_S390, SCMP_ARCH_S390X))
ae9d60ce
LP
1158 r = seccomp_rule_add_exact(
1159 seccomp,
1160 SCMP_ACT_ERRNO(EPERM),
1161 SCMP_SYS(clone),
1162 1,
1163 SCMP_A0(SCMP_CMP_MASKED_EQ, f, f));
1164 else
1165 r = seccomp_rule_add_exact(
1166 seccomp,
1167 SCMP_ACT_ERRNO(EPERM),
1168 SCMP_SYS(clone),
1169 1,
1170 SCMP_A1(SCMP_CMP_MASKED_EQ, f, f));
469830d1
LP
1171 if (r < 0) {
1172 log_debug_errno(r, "Failed to add clone() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1173 break;
1174 }
1175
1176 if ((retain & NAMESPACE_FLAGS_ALL) != 0) {
1177 r = seccomp_rule_add_exact(
1178 seccomp,
1179 SCMP_ACT_ERRNO(EPERM),
1180 SCMP_SYS(setns),
1181 1,
1182 SCMP_A1(SCMP_CMP_MASKED_EQ, f, f));
1183 if (r < 0) {
1184 log_debug_errno(r, "Failed to add setns() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1185 break;
1186 }
1187 }
1188 }
1189 if (r < 0)
1190 continue;
1191
1192 r = seccomp_load(seccomp);
1193 if (IN_SET(r, -EPERM, -EACCES))
1194 return r;
1195 if (r < 0)
1196 log_debug_errno(r, "Failed to install namespace restriction rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1197 }
1198
1199 return 0;
1200}
1201
1202int seccomp_protect_sysctl(void) {
1203 uint32_t arch;
1204 int r;
1205
1206 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1207 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1208
1209 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1210
2e64e8f4
ZJS
1211 if (IN_SET(arch, SCMP_ARCH_X32, SCMP_ARCH_AARCH64))
1212 /* No _sysctl syscall */
1213 continue;
1214
469830d1
LP
1215 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1216 if (r < 0)
1217 return r;
1218
1219 r = seccomp_rule_add_exact(
add00535
LP
1220 seccomp,
1221 SCMP_ACT_ERRNO(EPERM),
469830d1 1222 SCMP_SYS(_sysctl),
add00535 1223 0);
469830d1
LP
1224 if (r < 0) {
1225 log_debug_errno(r, "Failed to add _sysctl() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1226 continue;
1227 }
1228
1229 r = seccomp_load(seccomp);
1230 if (IN_SET(r, -EPERM, -EACCES))
1231 return r;
1232 if (r < 0)
1233 log_debug_errno(r, "Failed to install sysctl protection rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1234 }
1235
1236 return 0;
1237}
1238
1239int seccomp_restrict_address_families(Set *address_families, bool whitelist) {
1240 uint32_t arch;
1241 int r;
1242
1243 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1244 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
9606bc4b 1245 bool supported;
469830d1
LP
1246 Iterator i;
1247
1248 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1249
9606bc4b
LP
1250 switch (arch) {
1251
1252 case SCMP_ARCH_X86_64:
1253 case SCMP_ARCH_X32:
1254 case SCMP_ARCH_ARM:
1255 case SCMP_ARCH_AARCH64:
0d9fca76 1256 case SCMP_ARCH_PPC:
da1921a5
ZJS
1257 case SCMP_ARCH_PPC64:
1258 case SCMP_ARCH_PPC64LE:
f5aeac14
JC
1259 case SCMP_ARCH_MIPSEL64N32:
1260 case SCMP_ARCH_MIPS64N32:
1261 case SCMP_ARCH_MIPSEL64:
1262 case SCMP_ARCH_MIPS64:
9606bc4b
LP
1263 /* These we know we support (i.e. are the ones that do not use socketcall()) */
1264 supported = true;
1265 break;
1266
9606bc4b
LP
1267 case SCMP_ARCH_S390:
1268 case SCMP_ARCH_S390X:
da1921a5 1269 case SCMP_ARCH_X86:
f5aeac14
JC
1270 case SCMP_ARCH_MIPSEL:
1271 case SCMP_ARCH_MIPS:
9606bc4b
LP
1272 default:
1273 /* These we either know we don't support (i.e. are the ones that do use socketcall()), or we
1274 * don't know */
1275 supported = false;
1276 break;
1277 }
1278
1279 if (!supported)
1280 continue;
1281
469830d1
LP
1282 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1283 if (r < 0)
1284 return r;
1285
1286 if (whitelist) {
1287 int af, first = 0, last = 0;
1288 void *afp;
1289
1290 /* If this is a whitelist, we first block the address families that are out of range and then
1291 * everything that is not in the set. First, we find the lowest and highest address family in
1292 * the set. */
1293
1294 SET_FOREACH(afp, address_families, i) {
1295 af = PTR_TO_INT(afp);
1296
1297 if (af <= 0 || af >= af_max())
1298 continue;
1299
1300 if (first == 0 || af < first)
1301 first = af;
1302
1303 if (last == 0 || af > last)
1304 last = af;
1305 }
1306
1307 assert((first == 0) == (last == 0));
1308
1309 if (first == 0) {
1310
1311 /* No entries in the valid range, block everything */
1312 r = seccomp_rule_add_exact(
1313 seccomp,
1314 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1315 SCMP_SYS(socket),
1316 0);
1317 if (r < 0) {
1318 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1319 continue;
1320 }
1321
1322 } else {
1323
1324 /* Block everything below the first entry */
1325 r = seccomp_rule_add_exact(
1326 seccomp,
1327 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1328 SCMP_SYS(socket),
1329 1,
1330 SCMP_A0(SCMP_CMP_LT, first));
1331 if (r < 0) {
1332 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1333 continue;
1334 }
1335
1336 /* Block everything above the last entry */
1337 r = seccomp_rule_add_exact(
1338 seccomp,
1339 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1340 SCMP_SYS(socket),
1341 1,
1342 SCMP_A0(SCMP_CMP_GT, last));
1343 if (r < 0) {
1344 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1345 continue;
1346 }
1347
1348 /* Block everything between the first and last entry */
1349 for (af = 1; af < af_max(); af++) {
1350
1351 if (set_contains(address_families, INT_TO_PTR(af)))
1352 continue;
1353
1354 r = seccomp_rule_add_exact(
1355 seccomp,
1356 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1357 SCMP_SYS(socket),
1358 1,
1359 SCMP_A0(SCMP_CMP_EQ, af));
1360 if (r < 0)
1361 break;
1362 }
469830d1
LP
1363 if (r < 0) {
1364 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1365 continue;
1366 }
1367 }
1368
1369 } else {
1370 void *af;
1371
1372 /* If this is a blacklist, then generate one rule for
1373 * each address family that are then combined in OR
1374 * checks. */
1375
1376 SET_FOREACH(af, address_families, i) {
1377
1378 r = seccomp_rule_add_exact(
1379 seccomp,
1380 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1381 SCMP_SYS(socket),
1382 1,
1383 SCMP_A0(SCMP_CMP_EQ, PTR_TO_INT(af)));
1384 if (r < 0)
1385 break;
1386 }
469830d1
LP
1387 if (r < 0) {
1388 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1389 continue;
1390 }
1391 }
1392
1393 r = seccomp_load(seccomp);
1394 if (IN_SET(r, -EPERM, -EACCES))
1395 return r;
1396 if (r < 0)
1397 log_debug_errno(r, "Failed to install socket family rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1398 }
1399
1400 return 0;
1401}
1402
1403int seccomp_restrict_realtime(void) {
1404 static const int permitted_policies[] = {
1405 SCHED_OTHER,
1406 SCHED_BATCH,
1407 SCHED_IDLE,
1408 };
1409
1410 int r, max_policy = 0;
1411 uint32_t arch;
1412 unsigned i;
1413
1414 /* Determine the highest policy constant we want to allow */
1415 for (i = 0; i < ELEMENTSOF(permitted_policies); i++)
1416 if (permitted_policies[i] > max_policy)
1417 max_policy = permitted_policies[i];
1418
1419 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1420 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1421 int p;
1422
1423 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1424
1425 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1426 if (r < 0)
1427 return r;
1428
1429 /* Go through all policies with lower values than that, and block them -- unless they appear in the
1430 * whitelist. */
1431 for (p = 0; p < max_policy; p++) {
1432 bool good = false;
1433
1434 /* Check if this is in the whitelist. */
1435 for (i = 0; i < ELEMENTSOF(permitted_policies); i++)
1436 if (permitted_policies[i] == p) {
1437 good = true;
1438 break;
1439 }
1440
1441 if (good)
1442 continue;
1443
1444 /* Deny this policy */
1445 r = seccomp_rule_add_exact(
1446 seccomp,
1447 SCMP_ACT_ERRNO(EPERM),
1448 SCMP_SYS(sched_setscheduler),
1449 1,
1450 SCMP_A1(SCMP_CMP_EQ, p));
1451 if (r < 0) {
1452 log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1453 continue;
1454 }
1455 }
1456
1457 /* Blacklist all other policies, i.e. the ones with higher values. Note that all comparisons are
1458 * unsigned here, hence no need no check for < 0 values. */
1459 r = seccomp_rule_add_exact(
add00535
LP
1460 seccomp,
1461 SCMP_ACT_ERRNO(EPERM),
469830d1 1462 SCMP_SYS(sched_setscheduler),
add00535 1463 1,
469830d1
LP
1464 SCMP_A1(SCMP_CMP_GT, max_policy));
1465 if (r < 0) {
1466 log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1467 continue;
1468 }
add00535 1469
469830d1
LP
1470 r = seccomp_load(seccomp);
1471 if (IN_SET(r, -EPERM, -EACCES))
1472 return r;
1473 if (r < 0)
1474 log_debug_errno(r, "Failed to install realtime protection rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1475 }
1476
1477 return 0;
1478}
1479
6dc66688
ZJS
1480static int add_seccomp_syscall_filter(scmp_filter_ctx seccomp,
1481 uint32_t arch,
1482 int nr,
14cb109d 1483 unsigned arg_cnt,
6dc66688
ZJS
1484 const struct scmp_arg_cmp arg) {
1485 int r;
1486
1487 r = seccomp_rule_add_exact(seccomp, SCMP_ACT_ERRNO(EPERM), nr, arg_cnt, arg);
1488 if (r < 0) {
1489 _cleanup_free_ char *n = NULL;
1490
1491 n = seccomp_syscall_resolve_num_arch(arch, nr);
1492 log_debug_errno(r, "Failed to add %s() rule for architecture %s, skipping: %m",
1493 strna(n),
1494 seccomp_arch_to_string(arch));
1495 }
1496
1497 return r;
1498}
1499
2a8d6e63 1500/* For known architectures, check that syscalls are indeed defined or not. */
303d6b4c 1501#if defined(__x86_64__) || defined(__arm__) || defined(__aarch64__)
2a8d6e63
ZJS
1502assert_cc(SCMP_SYS(shmget) > 0);
1503assert_cc(SCMP_SYS(shmat) > 0);
1504assert_cc(SCMP_SYS(shmdt) > 0);
303d6b4c 1505#elif defined(__i386__) || defined(__powerpc64__)
2a8d6e63
ZJS
1506assert_cc(SCMP_SYS(shmget) < 0);
1507assert_cc(SCMP_SYS(shmat) < 0);
1508assert_cc(SCMP_SYS(shmdt) < 0);
1509#endif
6dc66688 1510
469830d1 1511int seccomp_memory_deny_write_execute(void) {
8a50cf69 1512
469830d1
LP
1513 uint32_t arch;
1514 int r;
1515
1516 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1517 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
8a50cf69 1518 int filter_syscall = 0, block_syscall = 0, shmat_syscall = 0;
add00535 1519
469830d1
LP
1520 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1521
8a50cf69
LP
1522 switch (arch) {
1523
1524 case SCMP_ARCH_X86:
1525 filter_syscall = SCMP_SYS(mmap2);
1526 block_syscall = SCMP_SYS(mmap);
2a8d6e63
ZJS
1527 break;
1528
63d00dfb 1529 case SCMP_ARCH_PPC:
2a8d6e63
ZJS
1530 case SCMP_ARCH_PPC64:
1531 case SCMP_ARCH_PPC64LE:
1532 filter_syscall = SCMP_SYS(mmap);
1533
1534 /* Note that shmat() isn't available, and the call is multiplexed through ipc().
1535 * We ignore that here, which means there's still a way to get writable/executable
1536 * memory, if an IPC key is mapped like this. That's a pity, but no total loss. */
8a50cf69 1537
8a50cf69
LP
1538 break;
1539
4278d1f5
ZJS
1540 case SCMP_ARCH_ARM:
1541 filter_syscall = SCMP_SYS(mmap2); /* arm has only mmap2 */
1542 shmat_syscall = SCMP_SYS(shmat);
1543 break;
1544
8a50cf69
LP
1545 case SCMP_ARCH_X86_64:
1546 case SCMP_ARCH_X32:
79873bc8 1547 case SCMP_ARCH_AARCH64:
303d6b4c 1548 filter_syscall = SCMP_SYS(mmap); /* amd64, x32, and arm64 have only mmap */
8a50cf69
LP
1549 shmat_syscall = SCMP_SYS(shmat);
1550 break;
1551
1552 /* Please add more definitions here, if you port systemd to other architectures! */
1553
303d6b4c 1554#if !defined(__i386__) && !defined(__x86_64__) && !defined(__powerpc__) && !defined(__powerpc64__) && !defined(__arm__) && !defined(__aarch64__)
8a50cf69
LP
1555#warning "Consider adding the right mmap() syscall definitions here!"
1556#endif
1557 }
1558
1559 /* Can't filter mmap() on this arch, then skip it */
1560 if (filter_syscall == 0)
1561 continue;
1562
469830d1
LP
1563 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1564 if (r < 0)
1565 return r;
1566
6dc66688
ZJS
1567 r = add_seccomp_syscall_filter(seccomp, arch, filter_syscall,
1568 1,
1569 SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC|PROT_WRITE, PROT_EXEC|PROT_WRITE));
1570 if (r < 0)
1571 continue;
8a50cf69
LP
1572
1573 if (block_syscall != 0) {
6dc66688
ZJS
1574 r = add_seccomp_syscall_filter(seccomp, arch, block_syscall, 0, (const struct scmp_arg_cmp){} );
1575 if (r < 0)
8a50cf69 1576 continue;
add00535 1577 }
a3be2849 1578
6dc66688
ZJS
1579 r = add_seccomp_syscall_filter(seccomp, arch, SCMP_SYS(mprotect),
1580 1,
b835eeb4
ZJS
1581 SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC, PROT_EXEC));
1582 if (r < 0)
1583 continue;
1584
91691f1d 1585#ifdef __NR_pkey_mprotect
b835eeb4
ZJS
1586 r = add_seccomp_syscall_filter(seccomp, arch, SCMP_SYS(pkey_mprotect),
1587 1,
6dc66688
ZJS
1588 SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC, PROT_EXEC));
1589 if (r < 0)
469830d1 1590 continue;
91691f1d 1591#endif
add00535 1592
8a50cf69 1593 if (shmat_syscall != 0) {
6dc66688
ZJS
1594 r = add_seccomp_syscall_filter(seccomp, arch, SCMP_SYS(shmat),
1595 1,
1596 SCMP_A2(SCMP_CMP_MASKED_EQ, SHM_EXEC, SHM_EXEC));
1597 if (r < 0)
8a50cf69 1598 continue;
469830d1
LP
1599 }
1600
1601 r = seccomp_load(seccomp);
1602 if (IN_SET(r, -EPERM, -EACCES))
1603 return r;
add00535 1604 if (r < 0)
469830d1
LP
1605 log_debug_errno(r, "Failed to install MemoryDenyWriteExecute= rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1606 }
add00535 1607
469830d1
LP
1608 return 0;
1609}
1610
1611int seccomp_restrict_archs(Set *archs) {
1612 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1613 Iterator i;
1614 void *id;
1615 int r;
1616
1617 /* This installs a filter with no rules, but that restricts the system call architectures to the specified
2428aaf8
AJ
1618 * list.
1619 *
1620 * There are some qualifications. However the most important use is to stop processes from bypassing
1621 * system call restrictions, in case they used a broader (multiplexing) syscall which is only available
1622 * in a non-native architecture. There are no holes in this use case, at least so far. */
469830d1 1623
2428aaf8
AJ
1624 /* Note libseccomp includes our "native" (current) architecture in the filter by default.
1625 * We do not remove it. For example, our callers expect to be able to call execve() afterwards
1626 * to run a program with the restrictions applied. */
469830d1
LP
1627 seccomp = seccomp_init(SCMP_ACT_ALLOW);
1628 if (!seccomp)
1629 return -ENOMEM;
1630
1631 SET_FOREACH(id, archs, i) {
1632 r = seccomp_arch_add(seccomp, PTR_TO_UINT32(id) - 1);
2428aaf8
AJ
1633 if (r < 0 && r != -EEXIST)
1634 return r;
1635 }
1636
1637 /* The vdso for x32 assumes that x86-64 syscalls are available. Let's allow them, since x32
1638 * x32 syscalls should basically match x86-64 for everything except the pointer type.
1639 * The important thing is that you can block the old 32-bit x86 syscalls.
1640 * https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=850047 */
1641
1642 if (seccomp_arch_native() == SCMP_ARCH_X32 ||
1643 set_contains(archs, UINT32_TO_PTR(SCMP_ARCH_X32 + 1))) {
1644
1645 r = seccomp_arch_add(seccomp, SCMP_ARCH_X86_64);
1646 if (r < 0 && r != -EEXIST)
469830d1 1647 return r;
add00535
LP
1648 }
1649
469830d1
LP
1650 r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
1651 if (r < 0)
1652 return r;
add00535 1653
1c6af69b
LP
1654 r = seccomp_load(seccomp);
1655 if (IN_SET(r, -EPERM, -EACCES))
1656 return r;
1657 if (r < 0)
1658 log_debug_errno(r, "Failed to restrict system call architectures, skipping: %m");
1659
1660 return 0;
a3be2849 1661}
b16bd535
YW
1662
1663int parse_syscall_archs(char **l, Set **archs) {
1664 _cleanup_set_free_ Set *_archs;
1665 char **s;
1666 int r;
1667
1668 assert(l);
1669 assert(archs);
1670
1671 r = set_ensure_allocated(&_archs, NULL);
1672 if (r < 0)
1673 return r;
1674
1675 STRV_FOREACH(s, l) {
1676 uint32_t a;
1677
1678 r = seccomp_arch_from_string(*s, &a);
1679 if (r < 0)
1680 return -EINVAL;
1681
1682 r = set_put(_archs, UINT32_TO_PTR(a + 1));
1683 if (r < 0)
1684 return -ENOMEM;
1685 }
1686
1cc6c93a 1687 *archs = TAKE_PTR(_archs);
b16bd535
YW
1688
1689 return 0;
1690}
165a31c0 1691
8cfa775f 1692int seccomp_filter_set_add(Hashmap *filter, bool add, const SyscallFilterSet *set) {
165a31c0
LP
1693 const char *i;
1694 int r;
1695
1696 assert(set);
1697
1698 NULSTR_FOREACH(i, set->value) {
1699
1700 if (i[0] == '@') {
1701 const SyscallFilterSet *more;
1702
1703 more = syscall_filter_set_find(i);
1704 if (!more)
1705 return -ENXIO;
1706
165a31c0
LP
1707 r = seccomp_filter_set_add(filter, add, more);
1708 if (r < 0)
1709 return r;
1710 } else {
1711 int id;
1712
1713 id = seccomp_syscall_resolve_name(i);
ff217dc3
LP
1714 if (id == __NR_SCMP_ERROR) {
1715 log_debug("Couldn't resolve system call, ignoring: %s", i);
1716 continue;
1717 }
165a31c0
LP
1718
1719 if (add) {
8cfa775f 1720 r = hashmap_put(filter, INT_TO_PTR(id + 1), INT_TO_PTR(-1));
165a31c0
LP
1721 if (r < 0)
1722 return r;
1723 } else
8cfa775f 1724 (void) hashmap_remove(filter, INT_TO_PTR(id + 1));
165a31c0
LP
1725 }
1726 }
1727
1728 return 0;
1729}
78e864e5
TM
1730
1731int seccomp_lock_personality(unsigned long personality) {
72eafe71 1732 uint32_t arch;
78e864e5
TM
1733 int r;
1734
72eafe71
LP
1735 if (personality >= PERSONALITY_INVALID)
1736 return -EINVAL;
78e864e5 1737
72eafe71
LP
1738 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1739 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
78e864e5 1740
72eafe71
LP
1741 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1742 if (r < 0)
1743 return r;
1744
1745 r = seccomp_rule_add_exact(
1746 seccomp,
1747 SCMP_ACT_ERRNO(EPERM),
1748 SCMP_SYS(personality),
1749 1,
1750 SCMP_A0(SCMP_CMP_NE, personality));
448ac526
LP
1751 if (r < 0) {
1752 log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1753 continue;
1754 }
72eafe71
LP
1755
1756 r = seccomp_load(seccomp);
1757 if (IN_SET(r, -EPERM, -EACCES))
1758 return r;
1759 if (r < 0)
1760 log_debug_errno(r, "Failed to enable personality lock for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1761 }
1762
1763 return 0;
78e864e5 1764}