]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/shared/seccomp-util.c
network: cleanup header inclusion
[thirdparty/systemd.git] / src / shared / seccomp-util.c
CommitLineData
53e1b683 1/* SPDX-License-Identifier: LGPL-2.1+ */
57183d11 2
a8fbdf54 3#include <errno.h>
3c27973b 4#include <fcntl.h>
469830d1 5#include <linux/seccomp.h>
57183d11 6#include <seccomp.h>
a8fbdf54 7#include <stddef.h>
469830d1 8#include <sys/mman.h>
d347d902 9#include <sys/prctl.h>
469830d1 10#include <sys/shm.h>
3c27973b 11#include <sys/stat.h>
57183d11 12
469830d1 13#include "af-list.h"
add00535 14#include "alloc-util.h"
d8b4d14d 15#include "errno-list.h"
a8fbdf54 16#include "macro.h"
add00535 17#include "nsflags.h"
d8b4d14d 18#include "nulstr-util.h"
78e864e5 19#include "process-util.h"
cf0fbc49 20#include "seccomp-util.h"
b16bd535 21#include "set.h"
07630cea 22#include "string-util.h"
b16bd535 23#include "strv.h"
469830d1
LP
24
25const uint32_t seccomp_local_archs[] = {
26
f2d9751c
LP
27 /* Note: always list the native arch we are compiled as last, so that users can blacklist seccomp(), but our own calls to it still succeed */
28
29#if defined(__x86_64__) && defined(__ILP32__)
469830d1
LP
30 SCMP_ARCH_X86,
31 SCMP_ARCH_X86_64,
f2d9751c
LP
32 SCMP_ARCH_X32, /* native */
33#elif defined(__x86_64__) && !defined(__ILP32__)
34 SCMP_ARCH_X86,
469830d1 35 SCMP_ARCH_X32,
f2d9751c
LP
36 SCMP_ARCH_X86_64, /* native */
37#elif defined(__i386__)
38 SCMP_ARCH_X86,
39#elif defined(__aarch64__)
469830d1 40 SCMP_ARCH_ARM,
f2d9751c
LP
41 SCMP_ARCH_AARCH64, /* native */
42#elif defined(__arm__)
43 SCMP_ARCH_ARM,
44#elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI32
45 SCMP_ARCH_MIPSEL,
46 SCMP_ARCH_MIPS, /* native */
47#elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI32
469830d1 48 SCMP_ARCH_MIPS,
f2d9751c
LP
49 SCMP_ARCH_MIPSEL, /* native */
50#elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI64
51 SCMP_ARCH_MIPSEL,
52 SCMP_ARCH_MIPS,
53 SCMP_ARCH_MIPSEL64N32,
469830d1 54 SCMP_ARCH_MIPS64N32,
f2d9751c
LP
55 SCMP_ARCH_MIPSEL64,
56 SCMP_ARCH_MIPS64, /* native */
57#elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI64
58 SCMP_ARCH_MIPS,
469830d1 59 SCMP_ARCH_MIPSEL,
f2d9751c
LP
60 SCMP_ARCH_MIPS64N32,
61 SCMP_ARCH_MIPSEL64N32,
62 SCMP_ARCH_MIPS64,
63 SCMP_ARCH_MIPSEL64, /* native */
64#elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_NABI32
65 SCMP_ARCH_MIPSEL,
66 SCMP_ARCH_MIPS,
469830d1 67 SCMP_ARCH_MIPSEL64,
f2d9751c 68 SCMP_ARCH_MIPS64,
469830d1 69 SCMP_ARCH_MIPSEL64N32,
f2d9751c
LP
70 SCMP_ARCH_MIPS64N32, /* native */
71#elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_NABI32
72 SCMP_ARCH_MIPS,
73 SCMP_ARCH_MIPSEL,
74 SCMP_ARCH_MIPS64,
75 SCMP_ARCH_MIPSEL64,
76 SCMP_ARCH_MIPS64N32,
77 SCMP_ARCH_MIPSEL64N32, /* native */
78#elif defined(__powerpc64__) && __BYTE_ORDER == __BIG_ENDIAN
469830d1 79 SCMP_ARCH_PPC,
469830d1 80 SCMP_ARCH_PPC64LE,
f2d9751c
LP
81 SCMP_ARCH_PPC64, /* native */
82#elif defined(__powerpc64__) && __BYTE_ORDER == __LITTLE_ENDIAN
83 SCMP_ARCH_PPC,
84 SCMP_ARCH_PPC64,
85 SCMP_ARCH_PPC64LE, /* native */
86#elif defined(__powerpc__)
87 SCMP_ARCH_PPC,
88#elif defined(__s390x__)
89 SCMP_ARCH_S390,
90 SCMP_ARCH_S390X, /* native */
91#elif defined(__s390__)
469830d1 92 SCMP_ARCH_S390,
469830d1
LP
93#endif
94 (uint32_t) -1
95 };
57183d11
LP
96
97const char* seccomp_arch_to_string(uint32_t c) {
aa34055f
ZJS
98 /* Maintain order used in <seccomp.h>.
99 *
100 * Names used here should be the same as those used for ConditionArchitecture=,
101 * except for "subarchitectures" like x32. */
57183d11 102
aa34055f
ZJS
103 switch(c) {
104 case SCMP_ARCH_NATIVE:
57183d11 105 return "native";
aa34055f 106 case SCMP_ARCH_X86:
57183d11 107 return "x86";
aa34055f 108 case SCMP_ARCH_X86_64:
57183d11 109 return "x86-64";
aa34055f 110 case SCMP_ARCH_X32:
57183d11 111 return "x32";
aa34055f 112 case SCMP_ARCH_ARM:
57183d11 113 return "arm";
aa34055f
ZJS
114 case SCMP_ARCH_AARCH64:
115 return "arm64";
116 case SCMP_ARCH_MIPS:
117 return "mips";
118 case SCMP_ARCH_MIPS64:
119 return "mips64";
120 case SCMP_ARCH_MIPS64N32:
121 return "mips64-n32";
122 case SCMP_ARCH_MIPSEL:
123 return "mips-le";
124 case SCMP_ARCH_MIPSEL64:
125 return "mips64-le";
126 case SCMP_ARCH_MIPSEL64N32:
127 return "mips64-le-n32";
128 case SCMP_ARCH_PPC:
129 return "ppc";
130 case SCMP_ARCH_PPC64:
131 return "ppc64";
132 case SCMP_ARCH_PPC64LE:
133 return "ppc64-le";
134 case SCMP_ARCH_S390:
6abfd303 135 return "s390";
aa34055f 136 case SCMP_ARCH_S390X:
6abfd303 137 return "s390x";
aa34055f
ZJS
138 default:
139 return NULL;
140 }
57183d11
LP
141}
142
143int seccomp_arch_from_string(const char *n, uint32_t *ret) {
144 if (!n)
145 return -EINVAL;
146
147 assert(ret);
148
149 if (streq(n, "native"))
150 *ret = SCMP_ARCH_NATIVE;
151 else if (streq(n, "x86"))
152 *ret = SCMP_ARCH_X86;
153 else if (streq(n, "x86-64"))
154 *ret = SCMP_ARCH_X86_64;
155 else if (streq(n, "x32"))
156 *ret = SCMP_ARCH_X32;
157 else if (streq(n, "arm"))
158 *ret = SCMP_ARCH_ARM;
aa34055f
ZJS
159 else if (streq(n, "arm64"))
160 *ret = SCMP_ARCH_AARCH64;
161 else if (streq(n, "mips"))
162 *ret = SCMP_ARCH_MIPS;
163 else if (streq(n, "mips64"))
164 *ret = SCMP_ARCH_MIPS64;
165 else if (streq(n, "mips64-n32"))
166 *ret = SCMP_ARCH_MIPS64N32;
167 else if (streq(n, "mips-le"))
168 *ret = SCMP_ARCH_MIPSEL;
169 else if (streq(n, "mips64-le"))
170 *ret = SCMP_ARCH_MIPSEL64;
171 else if (streq(n, "mips64-le-n32"))
172 *ret = SCMP_ARCH_MIPSEL64N32;
173 else if (streq(n, "ppc"))
174 *ret = SCMP_ARCH_PPC;
175 else if (streq(n, "ppc64"))
176 *ret = SCMP_ARCH_PPC64;
177 else if (streq(n, "ppc64-le"))
178 *ret = SCMP_ARCH_PPC64LE;
6abfd303
HB
179 else if (streq(n, "s390"))
180 *ret = SCMP_ARCH_S390;
181 else if (streq(n, "s390x"))
182 *ret = SCMP_ARCH_S390X;
57183d11
LP
183 else
184 return -EINVAL;
185
186 return 0;
187}
e9642be2 188
469830d1 189int seccomp_init_for_arch(scmp_filter_ctx *ret, uint32_t arch, uint32_t default_action) {
8d7b0c8f
LP
190 scmp_filter_ctx seccomp;
191 int r;
192
469830d1
LP
193 /* Much like seccomp_init(), but initializes the filter for one specific architecture only, without affecting
194 * any others. Also, turns off the NNP fiddling. */
8d7b0c8f
LP
195
196 seccomp = seccomp_init(default_action);
197 if (!seccomp)
198 return -ENOMEM;
199
469830d1
LP
200 if (arch != SCMP_ARCH_NATIVE &&
201 arch != seccomp_arch_native()) {
202
1b52793d 203 r = seccomp_arch_remove(seccomp, seccomp_arch_native());
469830d1
LP
204 if (r < 0)
205 goto finish;
206
1b52793d 207 r = seccomp_arch_add(seccomp, arch);
469830d1
LP
208 if (r < 0)
209 goto finish;
210
211 assert(seccomp_arch_exist(seccomp, arch) >= 0);
212 assert(seccomp_arch_exist(seccomp, SCMP_ARCH_NATIVE) == -EEXIST);
213 assert(seccomp_arch_exist(seccomp, seccomp_arch_native()) == -EEXIST);
214 } else {
215 assert(seccomp_arch_exist(seccomp, SCMP_ARCH_NATIVE) >= 0);
216 assert(seccomp_arch_exist(seccomp, seccomp_arch_native()) >= 0);
217 }
218
219 r = seccomp_attr_set(seccomp, SCMP_FLTATR_ACT_BADARCH, SCMP_ACT_ALLOW);
8d7b0c8f
LP
220 if (r < 0)
221 goto finish;
222
223 r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
224 if (r < 0)
225 goto finish;
226
227 *ret = seccomp;
228 return 0;
229
230finish:
231 seccomp_release(seccomp);
232 return r;
233}
234
/* Probes seccomp support by querying the current seccomp mode: true iff prctl(PR_GET_SECCOMP)
 * does not fail. */
static bool is_basic_seccomp_available(void) {
        int mode;

        mode = prctl(PR_GET_SECCOMP, 0, 0, 0, 0);
        return mode >= 0;
}
238
/* Probes for seccomp filter mode by requesting SECCOMP_MODE_FILTER with a NULL filter program.
 * Only a failure with errno == EFAULT is treated as "available"; success or any other error
 * yields false. */
static bool is_seccomp_filter_available(void) {
        if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, NULL, 0, 0) >= 0)
                return false;

        return errno == EFAULT;
}
243
/* Returns whether both seccomp probes (basic mode query and filter mode) succeed. The result is
 * computed on the first call and remembered in a function-local static for subsequent calls. */
bool is_seccomp_available(void) {
        static int cached_enabled = -1; /* -1 = not probed yet, otherwise 0/1 */

        if (cached_enabled >= 0)
                return cached_enabled;

        cached_enabled = is_basic_seccomp_available() && is_seccomp_filter_available();
        return cached_enabled;
}
254
8130926d 255const SyscallFilterSet syscall_filter_sets[_SYSCALL_FILTER_SET_MAX] = {
40eb6a80 256 [SYSCALL_FILTER_SET_DEFAULT] = {
40eb6a80 257 .name = "@default",
d5efc18b 258 .help = "System calls that are always permitted",
40eb6a80
ZJS
259 .value =
260 "clock_getres\0"
261 "clock_gettime\0"
262 "clock_nanosleep\0"
263 "execve\0"
264 "exit\0"
265 "exit_group\0"
e41b0f42
LP
266 "futex\0"
267 "get_robust_list\0"
268 "get_thread_area\0"
09d3020b
DH
269 "getegid\0"
270 "getegid32\0"
271 "geteuid\0"
272 "geteuid32\0"
273 "getgid\0"
274 "getgid32\0"
275 "getgroups\0"
276 "getgroups32\0"
277 "getpgid\0"
278 "getpgrp\0"
279 "getpid\0"
280 "getppid\0"
281 "getresgid\0"
282 "getresgid32\0"
283 "getresuid\0"
284 "getresuid32\0"
40eb6a80 285 "getrlimit\0" /* make sure processes can query stack size and such */
09d3020b
DH
286 "getsid\0"
287 "gettid\0"
40eb6a80 288 "gettimeofday\0"
09d3020b
DH
289 "getuid\0"
290 "getuid32\0"
e41b0f42 291 "membarrier\0"
40eb6a80
ZJS
292 "nanosleep\0"
293 "pause\0"
4c3a9176 294 "prlimit64\0"
e41b0f42 295 "restart_syscall\0"
6fee3be0 296 "rseq\0"
40eb6a80 297 "rt_sigreturn\0"
8f44de08 298 "sched_yield\0"
e41b0f42
LP
299 "set_robust_list\0"
300 "set_thread_area\0"
301 "set_tid_address\0"
ce5faeac 302 "set_tls\0"
40eb6a80
ZJS
303 "sigreturn\0"
304 "time\0"
4c3a9176 305 "ugetrlimit\0"
40eb6a80 306 },
44898c53
LP
307 [SYSCALL_FILTER_SET_AIO] = {
308 .name = "@aio",
309 .help = "Asynchronous IO",
310 .value =
311 "io_cancel\0"
312 "io_destroy\0"
313 "io_getevents\0"
a05cfe23 314 "io_pgetevents\0"
44898c53
LP
315 "io_setup\0"
316 "io_submit\0"
317 },
133ddbbe 318 [SYSCALL_FILTER_SET_BASIC_IO] = {
133ddbbe 319 .name = "@basic-io",
d5efc18b 320 .help = "Basic IO",
133ddbbe 321 .value =
648a0ed0 322 "_llseek\0"
133ddbbe 323 "close\0"
648a0ed0 324 "dup\0"
133ddbbe
LP
325 "dup2\0"
326 "dup3\0"
133ddbbe
LP
327 "lseek\0"
328 "pread64\0"
329 "preadv\0"
44898c53 330 "preadv2\0"
133ddbbe
LP
331 "pwrite64\0"
332 "pwritev\0"
44898c53 333 "pwritev2\0"
133ddbbe
LP
334 "read\0"
335 "readv\0"
336 "write\0"
337 "writev\0"
338 },
44898c53
LP
339 [SYSCALL_FILTER_SET_CHOWN] = {
340 .name = "@chown",
341 .help = "Change ownership of files and directories",
342 .value =
343 "chown\0"
344 "chown32\0"
345 "fchown\0"
346 "fchown32\0"
347 "fchownat\0"
348 "lchown\0"
349 "lchown32\0"
350 },
8130926d 351 [SYSCALL_FILTER_SET_CLOCK] = {
8130926d 352 .name = "@clock",
d5efc18b 353 .help = "Change the system time",
201c1cc2
TM
354 .value =
355 "adjtimex\0"
1f9ac68b
LP
356 "clock_adjtime\0"
357 "clock_settime\0"
201c1cc2 358 "settimeofday\0"
1f9ac68b 359 "stime\0"
8130926d
LP
360 },
361 [SYSCALL_FILTER_SET_CPU_EMULATION] = {
8130926d 362 .name = "@cpu-emulation",
d5efc18b 363 .help = "System calls for CPU emulation functionality",
1f9ac68b
LP
364 .value =
365 "modify_ldt\0"
366 "subpage_prot\0"
367 "switch_endian\0"
368 "vm86\0"
369 "vm86old\0"
8130926d
LP
370 },
371 [SYSCALL_FILTER_SET_DEBUG] = {
8130926d 372 .name = "@debug",
d5efc18b 373 .help = "Debugging, performance monitoring and tracing functionality",
1f9ac68b
LP
374 .value =
375 "lookup_dcookie\0"
376 "perf_event_open\0"
1f9ac68b
LP
377 "ptrace\0"
378 "rtas\0"
8130926d 379#ifdef __NR_s390_runtime_instr
1f9ac68b 380 "s390_runtime_instr\0"
8130926d 381#endif
1f9ac68b 382 "sys_debug_setcontext\0"
8130926d 383 },
1a1b13c9
LP
384 [SYSCALL_FILTER_SET_FILE_SYSTEM] = {
385 .name = "@file-system",
386 .help = "File system operations",
387 .value =
388 "access\0"
389 "chdir\0"
390 "chmod\0"
391 "close\0"
392 "creat\0"
393 "faccessat\0"
394 "fallocate\0"
395 "fchdir\0"
396 "fchmod\0"
397 "fchmodat\0"
1a1b13c9 398 "fcntl\0"
ceaa6aa7 399 "fcntl64\0"
1a1b13c9
LP
400 "fgetxattr\0"
401 "flistxattr\0"
ceaa6aa7 402 "fremovexattr\0"
1a1b13c9 403 "fsetxattr\0"
1a1b13c9 404 "fstat\0"
ceaa6aa7 405 "fstat64\0"
1a1b13c9 406 "fstatat64\0"
1a1b13c9 407 "fstatfs\0"
ceaa6aa7 408 "fstatfs64\0"
1a1b13c9 409 "ftruncate\0"
ceaa6aa7 410 "ftruncate64\0"
1a1b13c9
LP
411 "futimesat\0"
412 "getcwd\0"
1a1b13c9 413 "getdents\0"
ceaa6aa7 414 "getdents64\0"
1a1b13c9
LP
415 "getxattr\0"
416 "inotify_add_watch\0"
ceaa6aa7 417 "inotify_init\0"
1a1b13c9
LP
418 "inotify_init1\0"
419 "inotify_rm_watch\0"
420 "lgetxattr\0"
421 "link\0"
422 "linkat\0"
423 "listxattr\0"
424 "llistxattr\0"
425 "lremovexattr\0"
426 "lsetxattr\0"
1a1b13c9 427 "lstat\0"
ceaa6aa7 428 "lstat64\0"
1a1b13c9
LP
429 "mkdir\0"
430 "mkdirat\0"
431 "mknod\0"
432 "mknodat\0"
1a1b13c9 433 "mmap\0"
ceaa6aa7 434 "mmap2\0"
7961116e 435 "munmap\0"
1a1b13c9 436 "newfstatat\0"
ceaa6aa7
LP
437 "oldfstat\0"
438 "oldlstat\0"
439 "oldstat\0"
1a1b13c9
LP
440 "open\0"
441 "openat\0"
442 "readlink\0"
443 "readlinkat\0"
444 "removexattr\0"
445 "rename\0"
1a1b13c9 446 "renameat\0"
ceaa6aa7 447 "renameat2\0"
1a1b13c9
LP
448 "rmdir\0"
449 "setxattr\0"
1a1b13c9 450 "stat\0"
ceaa6aa7 451 "stat64\0"
1a1b13c9 452 "statfs\0"
ceaa6aa7 453 "statfs64\0"
8e6a7a8b 454#ifdef __NR_statx
a4135a74 455 "statx\0"
ceaa6aa7 456#endif
1a1b13c9
LP
457 "symlink\0"
458 "symlinkat\0"
1a1b13c9 459 "truncate\0"
ceaa6aa7 460 "truncate64\0"
1a1b13c9
LP
461 "unlink\0"
462 "unlinkat\0"
ceaa6aa7 463 "utime\0"
1a1b13c9
LP
464 "utimensat\0"
465 "utimes\0"
466 },
8130926d 467 [SYSCALL_FILTER_SET_IO_EVENT] = {
8130926d 468 .name = "@io-event",
d5efc18b 469 .help = "Event loop system calls",
201c1cc2
TM
470 .value =
471 "_newselect\0"
201c1cc2 472 "epoll_create\0"
215728ff 473 "epoll_create1\0"
201c1cc2
TM
474 "epoll_ctl\0"
475 "epoll_ctl_old\0"
476 "epoll_pwait\0"
477 "epoll_wait\0"
478 "epoll_wait_old\0"
201c1cc2 479 "eventfd\0"
215728ff 480 "eventfd2\0"
201c1cc2
TM
481 "poll\0"
482 "ppoll\0"
483 "pselect6\0"
484 "select\0"
8130926d
LP
485 },
486 [SYSCALL_FILTER_SET_IPC] = {
8130926d 487 .name = "@ipc",
d5efc18b
ZJS
488 .help = "SysV IPC, POSIX Message Queues or other IPC",
489 .value =
490 "ipc\0"
cd5bfd7e 491 "memfd_create\0"
201c1cc2
TM
492 "mq_getsetattr\0"
493 "mq_notify\0"
494 "mq_open\0"
495 "mq_timedreceive\0"
496 "mq_timedsend\0"
497 "mq_unlink\0"
498 "msgctl\0"
499 "msgget\0"
500 "msgrcv\0"
501 "msgsnd\0"
cd5bfd7e 502 "pipe\0"
215728ff 503 "pipe2\0"
201c1cc2
TM
504 "process_vm_readv\0"
505 "process_vm_writev\0"
506 "semctl\0"
507 "semget\0"
508 "semop\0"
509 "semtimedop\0"
510 "shmat\0"
511 "shmctl\0"
512 "shmdt\0"
513 "shmget\0"
8130926d
LP
514 },
515 [SYSCALL_FILTER_SET_KEYRING] = {
8130926d 516 .name = "@keyring",
d5efc18b 517 .help = "Kernel keyring access",
1f9ac68b
LP
518 .value =
519 "add_key\0"
520 "keyctl\0"
521 "request_key\0"
8130926d 522 },
cd0ddf6f
LP
523 [SYSCALL_FILTER_SET_MEMLOCK] = {
524 .name = "@memlock",
525 .help = "Memory locking control",
526 .value =
527 "mlock\0"
528 "mlock2\0"
529 "mlockall\0"
530 "munlock\0"
531 "munlockall\0"
532 },
8130926d 533 [SYSCALL_FILTER_SET_MODULE] = {
8130926d 534 .name = "@module",
d5efc18b 535 .help = "Loading and unloading of kernel modules",
201c1cc2 536 .value =
201c1cc2
TM
537 "delete_module\0"
538 "finit_module\0"
539 "init_module\0"
8130926d
LP
540 },
541 [SYSCALL_FILTER_SET_MOUNT] = {
8130926d 542 .name = "@mount",
d5efc18b 543 .help = "Mounting and unmounting of file systems",
201c1cc2
TM
544 .value =
545 "chroot\0"
546 "mount\0"
201c1cc2 547 "pivot_root\0"
201c1cc2 548 "umount\0"
215728ff 549 "umount2\0"
8130926d
LP
550 },
551 [SYSCALL_FILTER_SET_NETWORK_IO] = {
8130926d 552 .name = "@network-io",
d5efc18b 553 .help = "Network or Unix socket IO, should not be needed if not network facing",
201c1cc2 554 .value =
201c1cc2 555 "accept\0"
215728ff 556 "accept4\0"
201c1cc2
TM
557 "bind\0"
558 "connect\0"
559 "getpeername\0"
560 "getsockname\0"
561 "getsockopt\0"
562 "listen\0"
563 "recv\0"
564 "recvfrom\0"
565 "recvmmsg\0"
566 "recvmsg\0"
567 "send\0"
568 "sendmmsg\0"
569 "sendmsg\0"
570 "sendto\0"
571 "setsockopt\0"
572 "shutdown\0"
573 "socket\0"
574 "socketcall\0"
575 "socketpair\0"
8130926d
LP
576 },
577 [SYSCALL_FILTER_SET_OBSOLETE] = {
d5efc18b 578 /* some unknown even to libseccomp */
8130926d 579 .name = "@obsolete",
d5efc18b 580 .help = "Unusual, obsolete or unimplemented system calls",
201c1cc2
TM
581 .value =
582 "_sysctl\0"
583 "afs_syscall\0"
802fa07a 584 "bdflush\0"
201c1cc2 585 "break\0"
1f9ac68b 586 "create_module\0"
201c1cc2
TM
587 "ftime\0"
588 "get_kernel_syms\0"
201c1cc2
TM
589 "getpmsg\0"
590 "gtty\0"
7e0c3b8f 591 "idle\0"
201c1cc2 592 "lock\0"
201c1cc2 593 "mpx\0"
201c1cc2
TM
594 "prof\0"
595 "profil\0"
201c1cc2
TM
596 "putpmsg\0"
597 "query_module\0"
201c1cc2
TM
598 "security\0"
599 "sgetmask\0"
600 "ssetmask\0"
601 "stty\0"
1f9ac68b 602 "sysfs\0"
201c1cc2
TM
603 "tuxcall\0"
604 "ulimit\0"
605 "uselib\0"
1f9ac68b 606 "ustat\0"
201c1cc2 607 "vserver\0"
8130926d
LP
608 },
609 [SYSCALL_FILTER_SET_PRIVILEGED] = {
8130926d 610 .name = "@privileged",
d5efc18b 611 .help = "All system calls which need super-user capabilities",
201c1cc2 612 .value =
44898c53 613 "@chown\0"
201c1cc2
TM
614 "@clock\0"
615 "@module\0"
616 "@raw-io\0"
af0f047b
LP
617 "@reboot\0"
618 "@swap\0"
215728ff 619 "_sysctl\0"
201c1cc2 620 "acct\0"
201c1cc2 621 "bpf\0"
1f9ac68b 622 "capset\0"
201c1cc2 623 "chroot\0"
a05cfe23 624 "fanotify_init\0"
201c1cc2 625 "nfsservctl\0"
a05cfe23 626 "open_by_handle_at\0"
201c1cc2
TM
627 "pivot_root\0"
628 "quotactl\0"
201c1cc2 629 "setdomainname\0"
201c1cc2 630 "setfsuid\0"
215728ff 631 "setfsuid32\0"
201c1cc2 632 "setgroups\0"
215728ff 633 "setgroups32\0"
201c1cc2 634 "sethostname\0"
201c1cc2 635 "setresuid\0"
215728ff 636 "setresuid32\0"
201c1cc2 637 "setreuid\0"
215728ff 638 "setreuid32\0"
e05ee49b 639 "setuid\0" /* We list the explicit system calls here, as @setuid also includes setgid() which is not necessarily privileged */
215728ff 640 "setuid32\0"
201c1cc2 641 "vhangup\0"
8130926d
LP
642 },
643 [SYSCALL_FILTER_SET_PROCESS] = {
8130926d 644 .name = "@process",
d5efc18b 645 .help = "Process control, execution, namespaceing operations",
201c1cc2
TM
646 .value =
647 "arch_prctl\0"
09d3020b 648 "capget\0" /* Able to query arbitrary processes */
201c1cc2 649 "clone\0"
201c1cc2
TM
650 "execveat\0"
651 "fork\0"
b887d2eb 652 "getrusage\0"
201c1cc2 653 "kill\0"
46fcf95d 654 "pidfd_send_signal\0"
201c1cc2 655 "prctl\0"
b887d2eb
LP
656 "rt_sigqueueinfo\0"
657 "rt_tgsigqueueinfo\0"
201c1cc2 658 "setns\0"
a9518dc3 659 "swapcontext\0" /* Some archs e.g. powerpc32 are using it to do userspace context switches */
201c1cc2 660 "tgkill\0"
b887d2eb 661 "times\0"
201c1cc2
TM
662 "tkill\0"
663 "unshare\0"
664 "vfork\0"
b887d2eb
LP
665 "wait4\0"
666 "waitid\0"
667 "waitpid\0"
8130926d
LP
668 },
669 [SYSCALL_FILTER_SET_RAW_IO] = {
8130926d 670 .name = "@raw-io",
d5efc18b 671 .help = "Raw I/O port access",
201c1cc2
TM
672 .value =
673 "ioperm\0"
674 "iopl\0"
1f9ac68b 675 "pciconfig_iobase\0"
201c1cc2
TM
676 "pciconfig_read\0"
677 "pciconfig_write\0"
8130926d 678#ifdef __NR_s390_pci_mmio_read
201c1cc2 679 "s390_pci_mmio_read\0"
8130926d
LP
680#endif
681#ifdef __NR_s390_pci_mmio_write
201c1cc2 682 "s390_pci_mmio_write\0"
8130926d
LP
683#endif
684 },
bd2ab3f4
LP
685 [SYSCALL_FILTER_SET_REBOOT] = {
686 .name = "@reboot",
687 .help = "Reboot and reboot preparation/kexec",
688 .value =
bd2ab3f4 689 "kexec_file_load\0"
e59608fa 690 "kexec_load\0"
bd2ab3f4
LP
691 "reboot\0"
692 },
133ddbbe 693 [SYSCALL_FILTER_SET_RESOURCES] = {
133ddbbe 694 .name = "@resources",
58a8f68b 695 .help = "Alter resource settings",
133ddbbe 696 .value =
0963c053
LP
697 "ioprio_set\0"
698 "mbind\0"
699 "migrate_pages\0"
700 "move_pages\0"
701 "nice\0"
0963c053
LP
702 "sched_setaffinity\0"
703 "sched_setattr\0"
133ddbbe
LP
704 "sched_setparam\0"
705 "sched_setscheduler\0"
0963c053 706 "set_mempolicy\0"
133ddbbe
LP
707 "setpriority\0"
708 "setrlimit\0"
133ddbbe 709 },
6eaaeee9
LP
710 [SYSCALL_FILTER_SET_SETUID] = {
711 .name = "@setuid",
712 .help = "Operations for changing user/group credentials",
713 .value =
6eaaeee9 714 "setgid\0"
215728ff 715 "setgid32\0"
6eaaeee9 716 "setgroups\0"
215728ff 717 "setgroups32\0"
6eaaeee9 718 "setregid\0"
215728ff 719 "setregid32\0"
6eaaeee9 720 "setresgid\0"
215728ff 721 "setresgid32\0"
6eaaeee9 722 "setresuid\0"
215728ff 723 "setresuid32\0"
6eaaeee9 724 "setreuid\0"
215728ff 725 "setreuid32\0"
6eaaeee9 726 "setuid\0"
215728ff 727 "setuid32\0"
6eaaeee9 728 },
cd0ddf6f
LP
729 [SYSCALL_FILTER_SET_SIGNAL] = {
730 .name = "@signal",
731 .help = "Process signal handling",
732 .value =
733 "rt_sigaction\0"
734 "rt_sigpending\0"
735 "rt_sigprocmask\0"
736 "rt_sigsuspend\0"
737 "rt_sigtimedwait\0"
738 "sigaction\0"
739 "sigaltstack\0"
740 "signal\0"
741 "signalfd\0"
742 "signalfd4\0"
743 "sigpending\0"
744 "sigprocmask\0"
745 "sigsuspend\0"
746 },
bd2ab3f4
LP
747 [SYSCALL_FILTER_SET_SWAP] = {
748 .name = "@swap",
749 .help = "Enable/disable swap devices",
750 .value =
751 "swapoff\0"
752 "swapon\0"
753 },
44898c53
LP
754 [SYSCALL_FILTER_SET_SYNC] = {
755 .name = "@sync",
756 .help = "Synchronize files and memory to storage",
757 .value =
758 "fdatasync\0"
759 "fsync\0"
760 "msync\0"
761 "sync\0"
762 "sync_file_range\0"
a8fb09f5 763 "sync_file_range2\0"
44898c53
LP
764 "syncfs\0"
765 },
70526841
LP
766 [SYSCALL_FILTER_SET_SYSTEM_SERVICE] = {
767 .name = "@system-service",
768 .help = "General system service operations",
769 .value =
770 "@aio\0"
771 "@basic-io\0"
772 "@chown\0"
773 "@default\0"
774 "@file-system\0"
775 "@io-event\0"
776 "@ipc\0"
777 "@keyring\0"
778 "@memlock\0"
779 "@network-io\0"
780 "@process\0"
781 "@resources\0"
782 "@setuid\0"
783 "@signal\0"
784 "@sync\0"
785 "@timer\0"
786 "brk\0"
787 "capget\0"
788 "capset\0"
789 "copy_file_range\0"
790 "fadvise64\0"
791 "fadvise64_64\0"
792 "flock\0"
793 "get_mempolicy\0"
794 "getcpu\0"
795 "getpriority\0"
796 "getrandom\0"
797 "ioctl\0"
798 "ioprio_get\0"
799 "kcmp\0"
800 "madvise\0"
70526841
LP
801 "mprotect\0"
802 "mremap\0"
803 "name_to_handle_at\0"
804 "oldolduname\0"
805 "olduname\0"
806 "personality\0"
807 "readahead\0"
808 "readdir\0"
809 "remap_file_pages\0"
810 "sched_get_priority_max\0"
811 "sched_get_priority_min\0"
812 "sched_getaffinity\0"
813 "sched_getattr\0"
814 "sched_getparam\0"
815 "sched_getscheduler\0"
816 "sched_rr_get_interval\0"
817 "sched_yield\0"
818 "sendfile\0"
819 "sendfile64\0"
820 "setfsgid\0"
821 "setfsgid32\0"
822 "setfsuid\0"
823 "setfsuid32\0"
824 "setpgid\0"
825 "setsid\0"
826 "splice\0"
827 "sysinfo\0"
828 "tee\0"
829 "umask\0"
830 "uname\0"
831 "userfaultfd\0"
832 "vmsplice\0"
833 },
cd0ddf6f
LP
834 [SYSCALL_FILTER_SET_TIMER] = {
835 .name = "@timer",
836 .help = "Schedule operations by time",
837 .value =
838 "alarm\0"
839 "getitimer\0"
840 "setitimer\0"
841 "timer_create\0"
842 "timer_delete\0"
843 "timer_getoverrun\0"
844 "timer_gettime\0"
845 "timer_settime\0"
846 "timerfd_create\0"
847 "timerfd_gettime\0"
848 "timerfd_settime\0"
849 "times\0"
850 },
201c1cc2 851};
8130926d
LP
852
853const SyscallFilterSet *syscall_filter_set_find(const char *name) {
854 unsigned i;
855
856 if (isempty(name) || name[0] != '@')
857 return NULL;
858
859 for (i = 0; i < _SYSCALL_FILTER_SET_MAX; i++)
860 if (streq(syscall_filter_sets[i].name, name))
861 return syscall_filter_sets + i;
862
863 return NULL;
864}
865
b54f36c6 866static int seccomp_add_syscall_filter_set(scmp_filter_ctx seccomp, const SyscallFilterSet *set, uint32_t action, char **exclude, bool log_missing);
69b1b241 867
b54f36c6 868int seccomp_add_syscall_filter_item(scmp_filter_ctx *seccomp, const char *name, uint32_t action, char **exclude, bool log_missing) {
69b1b241
LP
869 assert(seccomp);
870 assert(name);
871
960e4569
LP
872 if (strv_contains(exclude, name))
873 return 0;
874
69b1b241
LP
875 if (name[0] == '@') {
876 const SyscallFilterSet *other;
877
878 other = syscall_filter_set_find(name);
baaa35ad
ZJS
879 if (!other)
880 return log_debug_errno(SYNTHETIC_ERRNO(EINVAL),
881 "Filter set %s is not known!",
882 name);
69b1b241 883
b54f36c6
ZJS
884 return seccomp_add_syscall_filter_set(seccomp, other, action, exclude, log_missing);
885
69b1b241 886 } else {
b54f36c6 887 int id, r;
69b1b241
LP
888
889 id = seccomp_syscall_resolve_name(name);
cff7bff8 890 if (id == __NR_SCMP_ERROR) {
b54f36c6
ZJS
891 if (log_missing)
892 log_debug("System call %s is not known, ignoring.", name);
ff217dc3 893 return 0;
cff7bff8 894 }
69b1b241
LP
895
896 r = seccomp_rule_add_exact(seccomp, action, id, 0);
b54f36c6 897 if (r < 0) {
69b1b241 898 /* If the system call is not known on this architecture, then that's fine, let's ignore it */
7e86bd73
ZJS
899 bool ignore = r == -EDOM;
900
901 if (!ignore || log_missing)
902 log_debug_errno(r, "Failed to add rule for system call %s() / %d%s: %m",
903 name, id, ignore ? ", ignoring" : "");
904 if (!ignore)
905 return r;
b54f36c6 906 }
69b1b241 907
b54f36c6
ZJS
908 return 0;
909 }
69b1b241
LP
910}
911
469830d1
LP
912static int seccomp_add_syscall_filter_set(
913 scmp_filter_ctx seccomp,
469830d1 914 const SyscallFilterSet *set,
960e4569 915 uint32_t action,
b54f36c6
ZJS
916 char **exclude,
917 bool log_missing) {
469830d1 918
8130926d
LP
919 const char *sys;
920 int r;
921
922 assert(seccomp);
923 assert(set);
924
925 NULSTR_FOREACH(sys, set->value) {
b54f36c6 926 r = seccomp_add_syscall_filter_item(seccomp, sys, action, exclude, log_missing);
69b1b241
LP
927 if (r < 0)
928 return r;
469830d1
LP
929 }
930
931 return 0;
932}
933
b54f36c6 934int seccomp_load_syscall_filter_set(uint32_t default_action, const SyscallFilterSet *set, uint32_t action, bool log_missing) {
469830d1
LP
935 uint32_t arch;
936 int r;
937
938 assert(set);
939
940 /* The one-stop solution: allocate a seccomp object, add the specified filter to it, and apply it. Once for
a90db619 941 * each local arch. */
469830d1
LP
942
943 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
944 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
945
946 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
947
948 r = seccomp_init_for_arch(&seccomp, arch, default_action);
8130926d
LP
949 if (r < 0)
950 return r;
469830d1 951
b54f36c6 952 r = seccomp_add_syscall_filter_set(seccomp, set, action, NULL, log_missing);
7e86bd73
ZJS
953 if (r < 0)
954 return log_debug_errno(r, "Failed to add filter set: %m");
469830d1
LP
955
956 r = seccomp_load(seccomp);
7bc5e0b1 957 if (ERRNO_IS_SECCOMP_FATAL(r))
469830d1
LP
958 return r;
959 if (r < 0)
960 log_debug_errno(r, "Failed to install filter set for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
8130926d
LP
961 }
962
963 return 0;
964}
a3be2849 965
b54f36c6 966int seccomp_load_syscall_filter_set_raw(uint32_t default_action, Hashmap* set, uint32_t action, bool log_missing) {
469830d1 967 uint32_t arch;
a3be2849
LP
968 int r;
969
469830d1
LP
970 /* Similar to seccomp_load_syscall_filter_set(), but takes a raw Set* of syscalls, instead of a
971 * SyscallFilterSet* table. */
a3be2849 972
8cfa775f 973 if (hashmap_isempty(set) && default_action == SCMP_ACT_ALLOW)
469830d1 974 return 0;
a3be2849 975
469830d1
LP
976 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
977 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
978 Iterator i;
b54f36c6 979 void *syscall_id, *val;
a3be2849 980
469830d1 981 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
a3be2849 982
469830d1
LP
983 r = seccomp_init_for_arch(&seccomp, arch, default_action);
984 if (r < 0)
985 return r;
a3be2849 986
b54f36c6 987 HASHMAP_FOREACH_KEY(val, syscall_id, set, i) {
8cfa775f 988 uint32_t a = action;
b54f36c6
ZJS
989 int id = PTR_TO_INT(syscall_id) - 1;
990 int error = PTR_TO_INT(val);
8cfa775f 991
b54f36c6
ZJS
992 if (action != SCMP_ACT_ALLOW && error >= 0)
993 a = SCMP_ACT_ERRNO(error);
8cfa775f 994
b54f36c6 995 r = seccomp_rule_add_exact(seccomp, a, id, 0);
469830d1
LP
996 if (r < 0) {
997 /* If the system call is not known on this architecture, then that's fine, let's ignore it */
998 _cleanup_free_ char *n = NULL;
7e86bd73 999 bool ignore;
469830d1 1000
b54f36c6 1001 n = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, id);
7e86bd73
ZJS
1002 ignore = r == -EDOM;
1003 if (!ignore || log_missing)
1004 log_debug_errno(r, "Failed to add rule for system call %s() / %d%s: %m",
1005 strna(n), id, ignore ? ", ignoring" : "");
1006 if (!ignore)
1007 return r;
469830d1
LP
1008 }
1009 }
1010
1011 r = seccomp_load(seccomp);
7bc5e0b1 1012 if (ERRNO_IS_SECCOMP_FATAL(r))
469830d1
LP
1013 return r;
1014 if (r < 0)
1015 log_debug_errno(r, "Failed to install filter set for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1016 }
1017
1018 return 0;
add00535
LP
1019}
1020
58f6ab44 1021int seccomp_parse_syscall_filter(
898748d8
YW
1022 const char *name,
1023 int errno_num,
1024 Hashmap *filter,
13d92c63 1025 SeccompParseFlags flags,
898748d8
YW
1026 const char *unit,
1027 const char *filename,
1028 unsigned line) {
1029
1030 int r;
1031
1032 assert(name);
1033 assert(filter);
1034
1035 if (name[0] == '@') {
1036 const SyscallFilterSet *set;
1037 const char *i;
1038
1039 set = syscall_filter_set_find(name);
1040 if (!set) {
13d92c63 1041 if (!(flags & SECCOMP_PARSE_PERMISSIVE))
898748d8 1042 return -EINVAL;
13d92c63
LP
1043
1044 log_syntax(unit, flags & SECCOMP_PARSE_LOG ? LOG_WARNING : LOG_DEBUG, filename, line, 0,
1045 "Unknown system call group, ignoring: %s", name);
1046 return 0;
898748d8
YW
1047 }
1048
1049 NULSTR_FOREACH(i, set->value) {
13d92c63
LP
1050 /* Call ourselves again, for the group to parse. Note that we downgrade logging here (i.e. take
1051 * away the SECCOMP_PARSE_LOG flag) since any issues in the group table are our own problem,
1052 * not a problem in user configuration data and we shouldn't pretend otherwise by complaining
1053 * about them. */
58f6ab44 1054 r = seccomp_parse_syscall_filter(i, errno_num, filter, flags &~ SECCOMP_PARSE_LOG, unit, filename, line);
898748d8
YW
1055 if (r < 0)
1056 return r;
1057 }
1058 } else {
1059 int id;
1060
1061 id = seccomp_syscall_resolve_name(name);
1062 if (id == __NR_SCMP_ERROR) {
13d92c63 1063 if (!(flags & SECCOMP_PARSE_PERMISSIVE))
898748d8 1064 return -EINVAL;
13d92c63
LP
1065
1066 log_syntax(unit, flags & SECCOMP_PARSE_LOG ? LOG_WARNING : LOG_DEBUG, filename, line, 0,
1067 "Failed to parse system call, ignoring: %s", name);
1068 return 0;
898748d8
YW
1069 }
1070
1071 /* If we previously wanted to forbid a syscall and now
1072 * we want to allow it, then remove it from the list. */
13d92c63 1073 if (!(flags & SECCOMP_PARSE_INVERT) == !!(flags & SECCOMP_PARSE_WHITELIST)) {
898748d8
YW
1074 r = hashmap_put(filter, INT_TO_PTR(id + 1), INT_TO_PTR(errno_num));
1075 if (r < 0)
851ee70a
LW
1076 switch (r) {
1077 case -ENOMEM:
1078 return flags & SECCOMP_PARSE_LOG ? log_oom() : -ENOMEM;
1079 case -EEXIST:
9d7fe7c6
LW
1080 assert_se(hashmap_update(filter, INT_TO_PTR(id + 1), INT_TO_PTR(errno_num)) == 0);
1081 break;
851ee70a
LW
1082 default:
1083 return r;
1084 }
898748d8
YW
1085 } else
1086 (void) hashmap_remove(filter, INT_TO_PTR(id + 1));
1087 }
1088
1089 return 0;
1090}
1091
add00535 1092int seccomp_restrict_namespaces(unsigned long retain) {
469830d1 1093 uint32_t arch;
add00535
LP
1094 int r;
1095
f1d34068 1096 if (DEBUG_LOGGING) {
add00535
LP
1097 _cleanup_free_ char *s = NULL;
1098
86c2a9f1 1099 (void) namespace_flags_to_string(retain, &s);
add00535
LP
1100 log_debug("Restricting namespace to: %s.", strna(s));
1101 }
1102
1103 /* NOOP? */
1104 if ((retain & NAMESPACE_FLAGS_ALL) == NAMESPACE_FLAGS_ALL)
1105 return 0;
1106
469830d1
LP
1107 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1108 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1109 unsigned i;
add00535 1110
469830d1
LP
1111 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1112
1113 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1114 if (r < 0)
1115 return r;
1116
1117 if ((retain & NAMESPACE_FLAGS_ALL) == 0)
1118 /* If every single kind of namespace shall be prohibited, then let's block the whole setns() syscall
1119 * altogether. */
1120 r = seccomp_rule_add_exact(
1121 seccomp,
1122 SCMP_ACT_ERRNO(EPERM),
1123 SCMP_SYS(setns),
1124 0);
1125 else
1126 /* Otherwise, block only the invocations with the appropriate flags in the loop below, but also the
1127 * special invocation with a zero flags argument, right here. */
1128 r = seccomp_rule_add_exact(
1129 seccomp,
1130 SCMP_ACT_ERRNO(EPERM),
1131 SCMP_SYS(setns),
1132 1,
1133 SCMP_A1(SCMP_CMP_EQ, 0));
1134 if (r < 0) {
1135 log_debug_errno(r, "Failed to add setns() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1136 continue;
1137 }
1138
1139 for (i = 0; namespace_flag_map[i].name; i++) {
1140 unsigned long f;
1141
1142 f = namespace_flag_map[i].flag;
1143 if ((retain & f) == f) {
1144 log_debug("Permitting %s.", namespace_flag_map[i].name);
1145 continue;
1146 }
1147
1148 log_debug("Blocking %s.", namespace_flag_map[i].name);
1149
1150 r = seccomp_rule_add_exact(
1151 seccomp,
1152 SCMP_ACT_ERRNO(EPERM),
1153 SCMP_SYS(unshare),
1154 1,
1155 SCMP_A0(SCMP_CMP_MASKED_EQ, f, f));
1156 if (r < 0) {
1157 log_debug_errno(r, "Failed to add unshare() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1158 break;
1159 }
1160
511ceb1f
ZJS
1161 /* On s390/s390x the first two parameters to clone are switched */
1162 if (!IN_SET(arch, SCMP_ARCH_S390, SCMP_ARCH_S390X))
ae9d60ce
LP
1163 r = seccomp_rule_add_exact(
1164 seccomp,
1165 SCMP_ACT_ERRNO(EPERM),
1166 SCMP_SYS(clone),
1167 1,
1168 SCMP_A0(SCMP_CMP_MASKED_EQ, f, f));
1169 else
1170 r = seccomp_rule_add_exact(
1171 seccomp,
1172 SCMP_ACT_ERRNO(EPERM),
1173 SCMP_SYS(clone),
1174 1,
1175 SCMP_A1(SCMP_CMP_MASKED_EQ, f, f));
469830d1
LP
1176 if (r < 0) {
1177 log_debug_errno(r, "Failed to add clone() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1178 break;
1179 }
1180
1181 if ((retain & NAMESPACE_FLAGS_ALL) != 0) {
1182 r = seccomp_rule_add_exact(
1183 seccomp,
1184 SCMP_ACT_ERRNO(EPERM),
1185 SCMP_SYS(setns),
1186 1,
1187 SCMP_A1(SCMP_CMP_MASKED_EQ, f, f));
1188 if (r < 0) {
1189 log_debug_errno(r, "Failed to add setns() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1190 break;
1191 }
1192 }
1193 }
1194 if (r < 0)
1195 continue;
1196
1197 r = seccomp_load(seccomp);
7bc5e0b1 1198 if (ERRNO_IS_SECCOMP_FATAL(r))
469830d1
LP
1199 return r;
1200 if (r < 0)
1201 log_debug_errno(r, "Failed to install namespace restriction rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1202 }
1203
1204 return 0;
1205}
1206
1207int seccomp_protect_sysctl(void) {
1208 uint32_t arch;
1209 int r;
1210
1211 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1212 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1213
1214 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1215
2e64e8f4
ZJS
1216 if (IN_SET(arch, SCMP_ARCH_X32, SCMP_ARCH_AARCH64))
1217 /* No _sysctl syscall */
1218 continue;
1219
469830d1
LP
1220 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1221 if (r < 0)
1222 return r;
1223
1224 r = seccomp_rule_add_exact(
add00535
LP
1225 seccomp,
1226 SCMP_ACT_ERRNO(EPERM),
469830d1 1227 SCMP_SYS(_sysctl),
add00535 1228 0);
469830d1
LP
1229 if (r < 0) {
1230 log_debug_errno(r, "Failed to add _sysctl() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1231 continue;
1232 }
1233
1234 r = seccomp_load(seccomp);
7bc5e0b1 1235 if (ERRNO_IS_SECCOMP_FATAL(r))
469830d1
LP
1236 return r;
1237 if (r < 0)
1238 log_debug_errno(r, "Failed to install sysctl protection rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1239 }
1240
1241 return 0;
1242}
1243
1244int seccomp_restrict_address_families(Set *address_families, bool whitelist) {
1245 uint32_t arch;
1246 int r;
1247
1248 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1249 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
9606bc4b 1250 bool supported;
469830d1
LP
1251 Iterator i;
1252
1253 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1254
9606bc4b
LP
1255 switch (arch) {
1256
1257 case SCMP_ARCH_X86_64:
1258 case SCMP_ARCH_X32:
1259 case SCMP_ARCH_ARM:
1260 case SCMP_ARCH_AARCH64:
0d9fca76 1261 case SCMP_ARCH_PPC:
da1921a5
ZJS
1262 case SCMP_ARCH_PPC64:
1263 case SCMP_ARCH_PPC64LE:
f5aeac14
JC
1264 case SCMP_ARCH_MIPSEL64N32:
1265 case SCMP_ARCH_MIPS64N32:
1266 case SCMP_ARCH_MIPSEL64:
1267 case SCMP_ARCH_MIPS64:
9606bc4b
LP
1268 /* These we know we support (i.e. are the ones that do not use socketcall()) */
1269 supported = true;
1270 break;
1271
9606bc4b
LP
1272 case SCMP_ARCH_S390:
1273 case SCMP_ARCH_S390X:
da1921a5 1274 case SCMP_ARCH_X86:
f5aeac14
JC
1275 case SCMP_ARCH_MIPSEL:
1276 case SCMP_ARCH_MIPS:
9606bc4b
LP
1277 default:
1278 /* These we either know we don't support (i.e. are the ones that do use socketcall()), or we
1279 * don't know */
1280 supported = false;
1281 break;
1282 }
1283
1284 if (!supported)
1285 continue;
1286
469830d1
LP
1287 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1288 if (r < 0)
1289 return r;
1290
1291 if (whitelist) {
1292 int af, first = 0, last = 0;
1293 void *afp;
1294
1295 /* If this is a whitelist, we first block the address families that are out of range and then
1296 * everything that is not in the set. First, we find the lowest and highest address family in
1297 * the set. */
1298
1299 SET_FOREACH(afp, address_families, i) {
1300 af = PTR_TO_INT(afp);
1301
1302 if (af <= 0 || af >= af_max())
1303 continue;
1304
1305 if (first == 0 || af < first)
1306 first = af;
1307
1308 if (last == 0 || af > last)
1309 last = af;
1310 }
1311
1312 assert((first == 0) == (last == 0));
1313
1314 if (first == 0) {
1315
1316 /* No entries in the valid range, block everything */
1317 r = seccomp_rule_add_exact(
1318 seccomp,
1319 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1320 SCMP_SYS(socket),
1321 0);
1322 if (r < 0) {
1323 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1324 continue;
1325 }
1326
1327 } else {
1328
1329 /* Block everything below the first entry */
1330 r = seccomp_rule_add_exact(
1331 seccomp,
1332 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1333 SCMP_SYS(socket),
1334 1,
1335 SCMP_A0(SCMP_CMP_LT, first));
1336 if (r < 0) {
1337 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1338 continue;
1339 }
1340
1341 /* Block everything above the last entry */
1342 r = seccomp_rule_add_exact(
1343 seccomp,
1344 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1345 SCMP_SYS(socket),
1346 1,
1347 SCMP_A0(SCMP_CMP_GT, last));
1348 if (r < 0) {
1349 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1350 continue;
1351 }
1352
1353 /* Block everything between the first and last entry */
1354 for (af = 1; af < af_max(); af++) {
1355
1356 if (set_contains(address_families, INT_TO_PTR(af)))
1357 continue;
1358
1359 r = seccomp_rule_add_exact(
1360 seccomp,
1361 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1362 SCMP_SYS(socket),
1363 1,
1364 SCMP_A0(SCMP_CMP_EQ, af));
1365 if (r < 0)
1366 break;
1367 }
469830d1
LP
1368 if (r < 0) {
1369 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1370 continue;
1371 }
1372 }
1373
1374 } else {
1375 void *af;
1376
1377 /* If this is a blacklist, then generate one rule for
1378 * each address family that are then combined in OR
1379 * checks. */
1380
1381 SET_FOREACH(af, address_families, i) {
1382
1383 r = seccomp_rule_add_exact(
1384 seccomp,
1385 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1386 SCMP_SYS(socket),
1387 1,
1388 SCMP_A0(SCMP_CMP_EQ, PTR_TO_INT(af)));
1389 if (r < 0)
1390 break;
1391 }
469830d1
LP
1392 if (r < 0) {
1393 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1394 continue;
1395 }
1396 }
1397
1398 r = seccomp_load(seccomp);
7bc5e0b1 1399 if (ERRNO_IS_SECCOMP_FATAL(r))
469830d1
LP
1400 return r;
1401 if (r < 0)
1402 log_debug_errno(r, "Failed to install socket family rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1403 }
1404
1405 return 0;
1406}
1407
1408int seccomp_restrict_realtime(void) {
1409 static const int permitted_policies[] = {
1410 SCHED_OTHER,
1411 SCHED_BATCH,
1412 SCHED_IDLE,
1413 };
1414
1415 int r, max_policy = 0;
1416 uint32_t arch;
1417 unsigned i;
1418
1419 /* Determine the highest policy constant we want to allow */
1420 for (i = 0; i < ELEMENTSOF(permitted_policies); i++)
1421 if (permitted_policies[i] > max_policy)
1422 max_policy = permitted_policies[i];
1423
1424 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1425 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1426 int p;
1427
1428 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1429
1430 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1431 if (r < 0)
1432 return r;
1433
1434 /* Go through all policies with lower values than that, and block them -- unless they appear in the
1435 * whitelist. */
1436 for (p = 0; p < max_policy; p++) {
1437 bool good = false;
1438
1439 /* Check if this is in the whitelist. */
1440 for (i = 0; i < ELEMENTSOF(permitted_policies); i++)
1441 if (permitted_policies[i] == p) {
1442 good = true;
1443 break;
1444 }
1445
1446 if (good)
1447 continue;
1448
1449 /* Deny this policy */
1450 r = seccomp_rule_add_exact(
1451 seccomp,
1452 SCMP_ACT_ERRNO(EPERM),
1453 SCMP_SYS(sched_setscheduler),
1454 1,
1455 SCMP_A1(SCMP_CMP_EQ, p));
1456 if (r < 0) {
1457 log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1458 continue;
1459 }
1460 }
1461
1462 /* Blacklist all other policies, i.e. the ones with higher values. Note that all comparisons are
1463 * unsigned here, hence no need no check for < 0 values. */
1464 r = seccomp_rule_add_exact(
add00535
LP
1465 seccomp,
1466 SCMP_ACT_ERRNO(EPERM),
469830d1 1467 SCMP_SYS(sched_setscheduler),
add00535 1468 1,
469830d1
LP
1469 SCMP_A1(SCMP_CMP_GT, max_policy));
1470 if (r < 0) {
1471 log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1472 continue;
1473 }
add00535 1474
469830d1 1475 r = seccomp_load(seccomp);
7bc5e0b1 1476 if (ERRNO_IS_SECCOMP_FATAL(r))
469830d1
LP
1477 return r;
1478 if (r < 0)
1479 log_debug_errno(r, "Failed to install realtime protection rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1480 }
1481
1482 return 0;
1483}
1484
6dc66688
ZJS
1485static int add_seccomp_syscall_filter(scmp_filter_ctx seccomp,
1486 uint32_t arch,
1487 int nr,
14cb109d 1488 unsigned arg_cnt,
6dc66688
ZJS
1489 const struct scmp_arg_cmp arg) {
1490 int r;
1491
1492 r = seccomp_rule_add_exact(seccomp, SCMP_ACT_ERRNO(EPERM), nr, arg_cnt, arg);
1493 if (r < 0) {
1494 _cleanup_free_ char *n = NULL;
1495
1496 n = seccomp_syscall_resolve_num_arch(arch, nr);
1497 log_debug_errno(r, "Failed to add %s() rule for architecture %s, skipping: %m",
1498 strna(n),
1499 seccomp_arch_to_string(arch));
1500 }
1501
1502 return r;
1503}
1504
2a8d6e63 1505/* For known architectures, check that syscalls are indeed defined or not. */
303d6b4c 1506#if defined(__x86_64__) || defined(__arm__) || defined(__aarch64__)
2a8d6e63
ZJS
1507assert_cc(SCMP_SYS(shmget) > 0);
1508assert_cc(SCMP_SYS(shmat) > 0);
1509assert_cc(SCMP_SYS(shmdt) > 0);
2a8d6e63 1510#endif
6dc66688 1511
469830d1
LP
1512int seccomp_memory_deny_write_execute(void) {
1513 uint32_t arch;
1514 int r;
1515
1516 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1517 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
8a50cf69 1518 int filter_syscall = 0, block_syscall = 0, shmat_syscall = 0;
add00535 1519
469830d1
LP
1520 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1521
8a50cf69
LP
1522 switch (arch) {
1523
1524 case SCMP_ARCH_X86:
57311925 1525 case SCMP_ARCH_S390:
8a50cf69
LP
1526 filter_syscall = SCMP_SYS(mmap2);
1527 block_syscall = SCMP_SYS(mmap);
67fb5f33 1528 shmat_syscall = SCMP_SYS(shmat);
2a8d6e63
ZJS
1529 break;
1530
63d00dfb 1531 case SCMP_ARCH_PPC:
2a8d6e63
ZJS
1532 case SCMP_ARCH_PPC64:
1533 case SCMP_ARCH_PPC64LE:
1534 filter_syscall = SCMP_SYS(mmap);
1535
1536 /* Note that shmat() isn't available, and the call is multiplexed through ipc().
1537 * We ignore that here, which means there's still a way to get writable/executable
1538 * memory, if an IPC key is mapped like this. That's a pity, but no total loss. */
8a50cf69 1539
8a50cf69
LP
1540 break;
1541
4278d1f5
ZJS
1542 case SCMP_ARCH_ARM:
1543 filter_syscall = SCMP_SYS(mmap2); /* arm has only mmap2 */
1544 shmat_syscall = SCMP_SYS(shmat);
1545 break;
1546
8a50cf69
LP
1547 case SCMP_ARCH_X86_64:
1548 case SCMP_ARCH_X32:
79873bc8 1549 case SCMP_ARCH_AARCH64:
57311925
DS
1550 case SCMP_ARCH_S390X:
1551 filter_syscall = SCMP_SYS(mmap); /* amd64, x32, s390x, and arm64 have only mmap */
8a50cf69
LP
1552 shmat_syscall = SCMP_SYS(shmat);
1553 break;
1554
1555 /* Please add more definitions here, if you port systemd to other architectures! */
1556
57311925 1557#if !defined(__i386__) && !defined(__x86_64__) && !defined(__powerpc__) && !defined(__powerpc64__) && !defined(__arm__) && !defined(__aarch64__) && !defined(__s390__) && !defined(__s390x__)
8a50cf69
LP
1558#warning "Consider adding the right mmap() syscall definitions here!"
1559#endif
1560 }
1561
1562 /* Can't filter mmap() on this arch, then skip it */
1563 if (filter_syscall == 0)
1564 continue;
1565
469830d1
LP
1566 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1567 if (r < 0)
1568 return r;
1569
6dc66688
ZJS
1570 r = add_seccomp_syscall_filter(seccomp, arch, filter_syscall,
1571 1,
1572 SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC|PROT_WRITE, PROT_EXEC|PROT_WRITE));
1573 if (r < 0)
1574 continue;
8a50cf69
LP
1575
1576 if (block_syscall != 0) {
6dc66688
ZJS
1577 r = add_seccomp_syscall_filter(seccomp, arch, block_syscall, 0, (const struct scmp_arg_cmp){} );
1578 if (r < 0)
8a50cf69 1579 continue;
add00535 1580 }
a3be2849 1581
6dc66688
ZJS
1582 r = add_seccomp_syscall_filter(seccomp, arch, SCMP_SYS(mprotect),
1583 1,
b835eeb4
ZJS
1584 SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC, PROT_EXEC));
1585 if (r < 0)
1586 continue;
1587
91691f1d 1588#ifdef __NR_pkey_mprotect
b835eeb4
ZJS
1589 r = add_seccomp_syscall_filter(seccomp, arch, SCMP_SYS(pkey_mprotect),
1590 1,
6dc66688
ZJS
1591 SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC, PROT_EXEC));
1592 if (r < 0)
469830d1 1593 continue;
91691f1d 1594#endif
add00535 1595
67fb5f33 1596 if (shmat_syscall > 0) {
6dc66688
ZJS
1597 r = add_seccomp_syscall_filter(seccomp, arch, SCMP_SYS(shmat),
1598 1,
1599 SCMP_A2(SCMP_CMP_MASKED_EQ, SHM_EXEC, SHM_EXEC));
1600 if (r < 0)
8a50cf69 1601 continue;
469830d1
LP
1602 }
1603
1604 r = seccomp_load(seccomp);
7bc5e0b1 1605 if (ERRNO_IS_SECCOMP_FATAL(r))
469830d1 1606 return r;
add00535 1607 if (r < 0)
469830d1
LP
1608 log_debug_errno(r, "Failed to install MemoryDenyWriteExecute= rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1609 }
add00535 1610
469830d1
LP
1611 return 0;
1612}
1613
1614int seccomp_restrict_archs(Set *archs) {
1615 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1616 Iterator i;
1617 void *id;
1618 int r;
1619
1620 /* This installs a filter with no rules, but that restricts the system call architectures to the specified
2428aaf8
AJ
1621 * list.
1622 *
1623 * There are some qualifications. However the most important use is to stop processes from bypassing
1624 * system call restrictions, in case they used a broader (multiplexing) syscall which is only available
1625 * in a non-native architecture. There are no holes in this use case, at least so far. */
469830d1 1626
2428aaf8
AJ
1627 /* Note libseccomp includes our "native" (current) architecture in the filter by default.
1628 * We do not remove it. For example, our callers expect to be able to call execve() afterwards
1629 * to run a program with the restrictions applied. */
469830d1
LP
1630 seccomp = seccomp_init(SCMP_ACT_ALLOW);
1631 if (!seccomp)
1632 return -ENOMEM;
1633
1634 SET_FOREACH(id, archs, i) {
1635 r = seccomp_arch_add(seccomp, PTR_TO_UINT32(id) - 1);
2428aaf8
AJ
1636 if (r < 0 && r != -EEXIST)
1637 return r;
1638 }
1639
1640 /* The vdso for x32 assumes that x86-64 syscalls are available. Let's allow them, since x32
1641 * x32 syscalls should basically match x86-64 for everything except the pointer type.
1642 * The important thing is that you can block the old 32-bit x86 syscalls.
1643 * https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=850047 */
1644
1645 if (seccomp_arch_native() == SCMP_ARCH_X32 ||
1646 set_contains(archs, UINT32_TO_PTR(SCMP_ARCH_X32 + 1))) {
1647
1648 r = seccomp_arch_add(seccomp, SCMP_ARCH_X86_64);
1649 if (r < 0 && r != -EEXIST)
469830d1 1650 return r;
add00535
LP
1651 }
1652
469830d1
LP
1653 r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
1654 if (r < 0)
1655 return r;
add00535 1656
1c6af69b 1657 r = seccomp_load(seccomp);
7bc5e0b1 1658 if (ERRNO_IS_SECCOMP_FATAL(r))
1c6af69b
LP
1659 return r;
1660 if (r < 0)
1661 log_debug_errno(r, "Failed to restrict system call architectures, skipping: %m");
1662
1663 return 0;
a3be2849 1664}
b16bd535
YW
1665
1666int parse_syscall_archs(char **l, Set **archs) {
1667 _cleanup_set_free_ Set *_archs;
1668 char **s;
1669 int r;
1670
1671 assert(l);
1672 assert(archs);
1673
1674 r = set_ensure_allocated(&_archs, NULL);
1675 if (r < 0)
1676 return r;
1677
1678 STRV_FOREACH(s, l) {
1679 uint32_t a;
1680
1681 r = seccomp_arch_from_string(*s, &a);
1682 if (r < 0)
1683 return -EINVAL;
1684
1685 r = set_put(_archs, UINT32_TO_PTR(a + 1));
1686 if (r < 0)
1687 return -ENOMEM;
1688 }
1689
1cc6c93a 1690 *archs = TAKE_PTR(_archs);
b16bd535
YW
1691
1692 return 0;
1693}
165a31c0 1694
8cfa775f 1695int seccomp_filter_set_add(Hashmap *filter, bool add, const SyscallFilterSet *set) {
165a31c0
LP
1696 const char *i;
1697 int r;
1698
1699 assert(set);
1700
1701 NULSTR_FOREACH(i, set->value) {
1702
1703 if (i[0] == '@') {
1704 const SyscallFilterSet *more;
1705
1706 more = syscall_filter_set_find(i);
1707 if (!more)
1708 return -ENXIO;
1709
165a31c0
LP
1710 r = seccomp_filter_set_add(filter, add, more);
1711 if (r < 0)
1712 return r;
1713 } else {
1714 int id;
1715
1716 id = seccomp_syscall_resolve_name(i);
ff217dc3
LP
1717 if (id == __NR_SCMP_ERROR) {
1718 log_debug("Couldn't resolve system call, ignoring: %s", i);
1719 continue;
1720 }
165a31c0
LP
1721
1722 if (add) {
8cfa775f 1723 r = hashmap_put(filter, INT_TO_PTR(id + 1), INT_TO_PTR(-1));
165a31c0
LP
1724 if (r < 0)
1725 return r;
1726 } else
8cfa775f 1727 (void) hashmap_remove(filter, INT_TO_PTR(id + 1));
165a31c0
LP
1728 }
1729 }
1730
1731 return 0;
1732}
78e864e5
TM
1733
1734int seccomp_lock_personality(unsigned long personality) {
72eafe71 1735 uint32_t arch;
78e864e5
TM
1736 int r;
1737
72eafe71
LP
1738 if (personality >= PERSONALITY_INVALID)
1739 return -EINVAL;
78e864e5 1740
72eafe71
LP
1741 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1742 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
78e864e5 1743
72eafe71
LP
1744 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1745 if (r < 0)
1746 return r;
1747
1748 r = seccomp_rule_add_exact(
1749 seccomp,
1750 SCMP_ACT_ERRNO(EPERM),
1751 SCMP_SYS(personality),
1752 1,
1753 SCMP_A0(SCMP_CMP_NE, personality));
448ac526
LP
1754 if (r < 0) {
1755 log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1756 continue;
1757 }
72eafe71
LP
1758
1759 r = seccomp_load(seccomp);
7bc5e0b1 1760 if (ERRNO_IS_SECCOMP_FATAL(r))
72eafe71
LP
1761 return r;
1762 if (r < 0)
1763 log_debug_errno(r, "Failed to enable personality lock for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1764 }
1765
1766 return 0;
78e864e5 1767}
aecd5ac6
TM
1768
1769int seccomp_protect_hostname(void) {
1770 uint32_t arch;
1771 int r;
1772
1773 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1774 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1775
1776 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1777 if (r < 0)
1778 return r;
1779
1780 r = seccomp_rule_add_exact(
1781 seccomp,
1782 SCMP_ACT_ERRNO(EPERM),
1783 SCMP_SYS(sethostname),
1784 0);
9e6e543c
LP
1785 if (r < 0) {
1786 log_debug_errno(r, "Failed to add sethostname() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
aecd5ac6 1787 continue;
9e6e543c 1788 }
aecd5ac6
TM
1789
1790 r = seccomp_rule_add_exact(
1791 seccomp,
1792 SCMP_ACT_ERRNO(EPERM),
1793 SCMP_SYS(setdomainname),
1794 0);
9e6e543c
LP
1795 if (r < 0) {
1796 log_debug_errno(r, "Failed to add setdomainname() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
aecd5ac6 1797 continue;
9e6e543c 1798 }
aecd5ac6
TM
1799
1800 r = seccomp_load(seccomp);
7bc5e0b1 1801 if (ERRNO_IS_SECCOMP_FATAL(r))
aecd5ac6
TM
1802 return r;
1803 if (r < 0)
1804 log_debug_errno(r, "Failed to apply hostname restrictions for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1805 }
1806
1807 return 0;
1808}
3c27973b 1809
da4dc9a6
ZJS
1810static int seccomp_restrict_sxid(scmp_filter_ctx seccomp, mode_t m) {
1811 /* Checks the mode_t parameter of the following system calls:
1812 *
1813 * → chmod() + fchmod() + fchmodat()
1814 * → open() + creat() + openat()
1815 * → mkdir() + mkdirat()
1816 * → mknod() + mknodat()
1817 *
1818 * Returns error if *everything* failed, and 0 otherwise.
1819 */
1820 int r = 0;
1821 bool any = false;
1822
1823 r = seccomp_rule_add_exact(
1824 seccomp,
1825 SCMP_ACT_ERRNO(EPERM),
1826 SCMP_SYS(chmod),
1827 1,
1828 SCMP_A1(SCMP_CMP_MASKED_EQ, m, m));
1829 if (r < 0)
1830 log_debug_errno(r, "Failed to add filter for chmod: %m");
1831 else
1832 any = true;
1833
1834 r = seccomp_rule_add_exact(
1835 seccomp,
1836 SCMP_ACT_ERRNO(EPERM),
1837 SCMP_SYS(fchmod),
1838 1,
1839 SCMP_A1(SCMP_CMP_MASKED_EQ, m, m));
1840 if (r < 0)
1841 log_debug_errno(r, "Failed to add filter for fchmod: %m");
1842 else
1843 any = true;
1844
1845 r = seccomp_rule_add_exact(
1846 seccomp,
1847 SCMP_ACT_ERRNO(EPERM),
1848 SCMP_SYS(fchmodat),
1849 1,
1850 SCMP_A2(SCMP_CMP_MASKED_EQ, m, m));
1851 if (r < 0)
1852 log_debug_errno(r, "Failed to add filter for fchmodat: %m");
1853 else
1854 any = true;
1855
1856 r = seccomp_rule_add_exact(
1857 seccomp,
1858 SCMP_ACT_ERRNO(EPERM),
1859 SCMP_SYS(mkdir),
1860 1,
1861 SCMP_A1(SCMP_CMP_MASKED_EQ, m, m));
1862 if (r < 0)
1863 log_debug_errno(r, "Failed to add filter for mkdir: %m");
1864 else
1865 any = true;
1866
1867 r = seccomp_rule_add_exact(
1868 seccomp,
1869 SCMP_ACT_ERRNO(EPERM),
1870 SCMP_SYS(mkdirat),
1871 1,
1872 SCMP_A2(SCMP_CMP_MASKED_EQ, m, m));
1873 if (r < 0)
1874 log_debug_errno(r, "Failed to add filter for mkdirat: %m");
1875 else
1876 any = true;
1877
1878 r = seccomp_rule_add_exact(
1879 seccomp,
1880 SCMP_ACT_ERRNO(EPERM),
1881 SCMP_SYS(mknod),
1882 1,
1883 SCMP_A1(SCMP_CMP_MASKED_EQ, m, m));
1884 if (r < 0)
1885 log_debug_errno(r, "Failed to add filter for mknod: %m");
1886 else
1887 any = true;
1888
1889 r = seccomp_rule_add_exact(
1890 seccomp,
1891 SCMP_ACT_ERRNO(EPERM),
1892 SCMP_SYS(mknodat),
1893 1,
1894 SCMP_A2(SCMP_CMP_MASKED_EQ, m, m));
1895 if (r < 0)
1896 log_debug_errno(r, "Failed to add filter for mknodat: %m");
1897 else
1898 any = true;
1899
1900#if SCMP_SYS(open) > 0
1901 r = seccomp_rule_add_exact(
1902 seccomp,
1903 SCMP_ACT_ERRNO(EPERM),
1904 SCMP_SYS(open),
1905 2,
1906 SCMP_A1(SCMP_CMP_MASKED_EQ, O_CREAT, O_CREAT),
1907 SCMP_A2(SCMP_CMP_MASKED_EQ, m, m));
1908 if (r < 0)
1909 log_debug_errno(r, "Failed to add filter for open: %m");
1910 else
1911 any = true;
1912#endif
1913
1914 r = seccomp_rule_add_exact(
1915 seccomp,
1916 SCMP_ACT_ERRNO(EPERM),
1917 SCMP_SYS(openat),
1918 2,
1919 SCMP_A2(SCMP_CMP_MASKED_EQ, O_CREAT, O_CREAT),
1920 SCMP_A3(SCMP_CMP_MASKED_EQ, m, m));
1921 if (r < 0)
1922 log_debug_errno(r, "Failed to add filter for openat: %m");
1923 else
1924 any = true;
1925
1926 r = seccomp_rule_add_exact(
1927 seccomp,
1928 SCMP_ACT_ERRNO(EPERM),
1929 SCMP_SYS(creat),
1930 1,
1931 SCMP_A1(SCMP_CMP_MASKED_EQ, m, m));
1932 if (r < 0)
1933 log_debug_errno(r, "Failed to add filter for creat: %m");
1934 else
1935 any = true;
1936
1937 return any ? 0 : r;
1938}
1939
3c27973b
LP
1940int seccomp_restrict_suid_sgid(void) {
1941 uint32_t arch;
da4dc9a6 1942 int r, k;
3c27973b
LP
1943
1944 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1945 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1946
1947 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1948 if (r < 0)
1949 return r;
1950
da4dc9a6
ZJS
1951 r = seccomp_restrict_sxid(seccomp, S_ISUID);
1952 if (r < 0)
1953 log_debug_errno(r, "Failed to add suid rule for architecture %s, ignoring: %m", seccomp_arch_to_string(arch));
3c27973b 1954
da4dc9a6
ZJS
1955 k = seccomp_restrict_sxid(seccomp, S_ISGID);
1956 if (k < 0)
1957 log_debug_errno(r, "Failed to add sgid rule for architecture %s, ignoring: %m", seccomp_arch_to_string(arch));
3c27973b 1958
da4dc9a6 1959 if (r < 0 && k < 0)
3c27973b 1960 continue;
3c27973b
LP
1961
1962 r = seccomp_load(seccomp);
7bc5e0b1 1963 if (ERRNO_IS_SECCOMP_FATAL(r))
3c27973b
LP
1964 return r;
1965 if (r < 0)
1966 log_debug_errno(r, "Failed to apply suid/sgid restrictions for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1967 }
1968
1969 return 0;
1970}
915fb324
LP
1971
1972uint32_t scmp_act_kill_process(void) {
1973
1974 /* Returns SCMP_ACT_KILL_PROCESS if it's supported, and SCMP_ACT_KILL_THREAD otherwise. We never
1975 * actually want to use SCMP_ACT_KILL_THREAD as its semantics are nuts (killing arbitrary threads of
1976 * a program is just a bad idea), but on old kernels/old libseccomp it is all we have, and at least
1977 * for single-threaded apps does the right thing. */
1978
1979#ifdef SCMP_ACT_KILL_PROCESS
1980 if (seccomp_api_get() >= 3)
1981 return SCMP_ACT_KILL_PROCESS;
1982#endif
1983
1984 return SCMP_ACT_KILL; /* same as SCMP_ACT_KILL_THREAD */
1985}