]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/shared/seccomp-util.c
core: remove support for ".include" stanza
[thirdparty/systemd.git] / src / shared / seccomp-util.c
CommitLineData
53e1b683 1/* SPDX-License-Identifier: LGPL-2.1+ */
57183d11 2
a8fbdf54 3#include <errno.h>
3c27973b 4#include <fcntl.h>
469830d1 5#include <linux/seccomp.h>
57183d11 6#include <seccomp.h>
a8fbdf54 7#include <stddef.h>
469830d1 8#include <sys/mman.h>
d347d902 9#include <sys/prctl.h>
469830d1 10#include <sys/shm.h>
3c27973b 11#include <sys/stat.h>
57183d11 12
469830d1 13#include "af-list.h"
add00535 14#include "alloc-util.h"
d8b4d14d 15#include "errno-list.h"
a8fbdf54 16#include "macro.h"
add00535 17#include "nsflags.h"
d8b4d14d 18#include "nulstr-util.h"
78e864e5 19#include "process-util.h"
cf0fbc49 20#include "seccomp-util.h"
b16bd535 21#include "set.h"
07630cea 22#include "string-util.h"
b16bd535 23#include "strv.h"
469830d1
LP
24
25const uint32_t seccomp_local_archs[] = {
26
f2d9751c
LP
27 /* Note: always list the native arch we are compiled as last, so that users can blacklist seccomp(), but our own calls to it still succeed */
28
29#if defined(__x86_64__) && defined(__ILP32__)
469830d1
LP
30 SCMP_ARCH_X86,
31 SCMP_ARCH_X86_64,
f2d9751c
LP
32 SCMP_ARCH_X32, /* native */
33#elif defined(__x86_64__) && !defined(__ILP32__)
34 SCMP_ARCH_X86,
469830d1 35 SCMP_ARCH_X32,
f2d9751c
LP
36 SCMP_ARCH_X86_64, /* native */
37#elif defined(__i386__)
38 SCMP_ARCH_X86,
39#elif defined(__aarch64__)
469830d1 40 SCMP_ARCH_ARM,
f2d9751c
LP
41 SCMP_ARCH_AARCH64, /* native */
42#elif defined(__arm__)
43 SCMP_ARCH_ARM,
44#elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI32
45 SCMP_ARCH_MIPSEL,
46 SCMP_ARCH_MIPS, /* native */
47#elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI32
469830d1 48 SCMP_ARCH_MIPS,
f2d9751c
LP
49 SCMP_ARCH_MIPSEL, /* native */
50#elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI64
51 SCMP_ARCH_MIPSEL,
52 SCMP_ARCH_MIPS,
53 SCMP_ARCH_MIPSEL64N32,
469830d1 54 SCMP_ARCH_MIPS64N32,
f2d9751c
LP
55 SCMP_ARCH_MIPSEL64,
56 SCMP_ARCH_MIPS64, /* native */
57#elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI64
58 SCMP_ARCH_MIPS,
469830d1 59 SCMP_ARCH_MIPSEL,
f2d9751c
LP
60 SCMP_ARCH_MIPS64N32,
61 SCMP_ARCH_MIPSEL64N32,
62 SCMP_ARCH_MIPS64,
63 SCMP_ARCH_MIPSEL64, /* native */
64#elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_NABI32
65 SCMP_ARCH_MIPSEL,
66 SCMP_ARCH_MIPS,
469830d1 67 SCMP_ARCH_MIPSEL64,
f2d9751c 68 SCMP_ARCH_MIPS64,
469830d1 69 SCMP_ARCH_MIPSEL64N32,
f2d9751c
LP
70 SCMP_ARCH_MIPS64N32, /* native */
71#elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_NABI32
72 SCMP_ARCH_MIPS,
73 SCMP_ARCH_MIPSEL,
74 SCMP_ARCH_MIPS64,
75 SCMP_ARCH_MIPSEL64,
76 SCMP_ARCH_MIPS64N32,
77 SCMP_ARCH_MIPSEL64N32, /* native */
78#elif defined(__powerpc64__) && __BYTE_ORDER == __BIG_ENDIAN
469830d1 79 SCMP_ARCH_PPC,
469830d1 80 SCMP_ARCH_PPC64LE,
f2d9751c
LP
81 SCMP_ARCH_PPC64, /* native */
82#elif defined(__powerpc64__) && __BYTE_ORDER == __LITTLE_ENDIAN
83 SCMP_ARCH_PPC,
84 SCMP_ARCH_PPC64,
85 SCMP_ARCH_PPC64LE, /* native */
86#elif defined(__powerpc__)
87 SCMP_ARCH_PPC,
88#elif defined(__s390x__)
89 SCMP_ARCH_S390,
90 SCMP_ARCH_S390X, /* native */
91#elif defined(__s390__)
469830d1 92 SCMP_ARCH_S390,
469830d1
LP
93#endif
94 (uint32_t) -1
95 };
57183d11
LP
96
97const char* seccomp_arch_to_string(uint32_t c) {
aa34055f
ZJS
98 /* Maintain order used in <seccomp.h>.
99 *
100 * Names used here should be the same as those used for ConditionArchitecture=,
101 * except for "subarchitectures" like x32. */
57183d11 102
aa34055f
ZJS
103 switch(c) {
104 case SCMP_ARCH_NATIVE:
57183d11 105 return "native";
aa34055f 106 case SCMP_ARCH_X86:
57183d11 107 return "x86";
aa34055f 108 case SCMP_ARCH_X86_64:
57183d11 109 return "x86-64";
aa34055f 110 case SCMP_ARCH_X32:
57183d11 111 return "x32";
aa34055f 112 case SCMP_ARCH_ARM:
57183d11 113 return "arm";
aa34055f
ZJS
114 case SCMP_ARCH_AARCH64:
115 return "arm64";
116 case SCMP_ARCH_MIPS:
117 return "mips";
118 case SCMP_ARCH_MIPS64:
119 return "mips64";
120 case SCMP_ARCH_MIPS64N32:
121 return "mips64-n32";
122 case SCMP_ARCH_MIPSEL:
123 return "mips-le";
124 case SCMP_ARCH_MIPSEL64:
125 return "mips64-le";
126 case SCMP_ARCH_MIPSEL64N32:
127 return "mips64-le-n32";
128 case SCMP_ARCH_PPC:
129 return "ppc";
130 case SCMP_ARCH_PPC64:
131 return "ppc64";
132 case SCMP_ARCH_PPC64LE:
133 return "ppc64-le";
134 case SCMP_ARCH_S390:
6abfd303 135 return "s390";
aa34055f 136 case SCMP_ARCH_S390X:
6abfd303 137 return "s390x";
aa34055f
ZJS
138 default:
139 return NULL;
140 }
57183d11
LP
141}
142
143int seccomp_arch_from_string(const char *n, uint32_t *ret) {
144 if (!n)
145 return -EINVAL;
146
147 assert(ret);
148
149 if (streq(n, "native"))
150 *ret = SCMP_ARCH_NATIVE;
151 else if (streq(n, "x86"))
152 *ret = SCMP_ARCH_X86;
153 else if (streq(n, "x86-64"))
154 *ret = SCMP_ARCH_X86_64;
155 else if (streq(n, "x32"))
156 *ret = SCMP_ARCH_X32;
157 else if (streq(n, "arm"))
158 *ret = SCMP_ARCH_ARM;
aa34055f
ZJS
159 else if (streq(n, "arm64"))
160 *ret = SCMP_ARCH_AARCH64;
161 else if (streq(n, "mips"))
162 *ret = SCMP_ARCH_MIPS;
163 else if (streq(n, "mips64"))
164 *ret = SCMP_ARCH_MIPS64;
165 else if (streq(n, "mips64-n32"))
166 *ret = SCMP_ARCH_MIPS64N32;
167 else if (streq(n, "mips-le"))
168 *ret = SCMP_ARCH_MIPSEL;
169 else if (streq(n, "mips64-le"))
170 *ret = SCMP_ARCH_MIPSEL64;
171 else if (streq(n, "mips64-le-n32"))
172 *ret = SCMP_ARCH_MIPSEL64N32;
173 else if (streq(n, "ppc"))
174 *ret = SCMP_ARCH_PPC;
175 else if (streq(n, "ppc64"))
176 *ret = SCMP_ARCH_PPC64;
177 else if (streq(n, "ppc64-le"))
178 *ret = SCMP_ARCH_PPC64LE;
6abfd303
HB
179 else if (streq(n, "s390"))
180 *ret = SCMP_ARCH_S390;
181 else if (streq(n, "s390x"))
182 *ret = SCMP_ARCH_S390X;
57183d11
LP
183 else
184 return -EINVAL;
185
186 return 0;
187}
e9642be2 188
469830d1 189int seccomp_init_for_arch(scmp_filter_ctx *ret, uint32_t arch, uint32_t default_action) {
8d7b0c8f
LP
190 scmp_filter_ctx seccomp;
191 int r;
192
469830d1
LP
193 /* Much like seccomp_init(), but initializes the filter for one specific architecture only, without affecting
194 * any others. Also, turns off the NNP fiddling. */
8d7b0c8f
LP
195
196 seccomp = seccomp_init(default_action);
197 if (!seccomp)
198 return -ENOMEM;
199
469830d1
LP
200 if (arch != SCMP_ARCH_NATIVE &&
201 arch != seccomp_arch_native()) {
202
1b52793d 203 r = seccomp_arch_remove(seccomp, seccomp_arch_native());
469830d1
LP
204 if (r < 0)
205 goto finish;
206
1b52793d 207 r = seccomp_arch_add(seccomp, arch);
469830d1
LP
208 if (r < 0)
209 goto finish;
210
211 assert(seccomp_arch_exist(seccomp, arch) >= 0);
212 assert(seccomp_arch_exist(seccomp, SCMP_ARCH_NATIVE) == -EEXIST);
213 assert(seccomp_arch_exist(seccomp, seccomp_arch_native()) == -EEXIST);
214 } else {
215 assert(seccomp_arch_exist(seccomp, SCMP_ARCH_NATIVE) >= 0);
216 assert(seccomp_arch_exist(seccomp, seccomp_arch_native()) >= 0);
217 }
218
219 r = seccomp_attr_set(seccomp, SCMP_FLTATR_ACT_BADARCH, SCMP_ACT_ALLOW);
8d7b0c8f
LP
220 if (r < 0)
221 goto finish;
222
223 r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
224 if (r < 0)
225 goto finish;
226
227 *ret = seccomp;
228 return 0;
229
230finish:
231 seccomp_release(seccomp);
232 return r;
233}
234
/* Reports whether the PR_GET_SECCOMP prctl() succeeds, i.e. returns a non-negative value. */
static bool is_basic_seccomp_available(void) {
        int mode;

        mode = prctl(PR_GET_SECCOMP, 0, 0, 0, 0);
        return mode >= 0;
}
238
/* Probes for seccomp filter mode by issuing PR_SET_SECCOMP/SECCOMP_MODE_FILTER with a NULL filter pointer.
 * Filter mode counts as available only if that call fails with EFAULT. */
static bool is_seccomp_filter_available(void) {
        if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, NULL, 0, 0) >= 0)
                return false;

        return errno == EFAULT;
}
243
/* Returns true if both the basic seccomp probe and the seccomp filter probe succeed. The outcome is computed
 * on the first call and remembered in a function-local static for all later calls. */
bool is_seccomp_available(void) {
        static int cached_enabled = -1; /* -1: not probed yet, otherwise the cached boolean result */

        if (cached_enabled >= 0)
                return cached_enabled;

        cached_enabled =
                is_basic_seccomp_available() &&
                is_seccomp_filter_available();

        return cached_enabled;
}
254
8130926d 255const SyscallFilterSet syscall_filter_sets[_SYSCALL_FILTER_SET_MAX] = {
40eb6a80 256 [SYSCALL_FILTER_SET_DEFAULT] = {
40eb6a80 257 .name = "@default",
d5efc18b 258 .help = "System calls that are always permitted",
40eb6a80
ZJS
259 .value =
260 "clock_getres\0"
6ca67710 261 "clock_getres_time64\0"
40eb6a80 262 "clock_gettime\0"
6ca67710 263 "clock_gettime64\0"
40eb6a80 264 "clock_nanosleep\0"
6ca67710 265 "clock_nanosleep_time64\0"
40eb6a80
ZJS
266 "execve\0"
267 "exit\0"
268 "exit_group\0"
e41b0f42 269 "futex\0"
6ca67710 270 "futex_time64\0"
e41b0f42
LP
271 "get_robust_list\0"
272 "get_thread_area\0"
09d3020b
DH
273 "getegid\0"
274 "getegid32\0"
275 "geteuid\0"
276 "geteuid32\0"
277 "getgid\0"
278 "getgid32\0"
279 "getgroups\0"
280 "getgroups32\0"
281 "getpgid\0"
282 "getpgrp\0"
283 "getpid\0"
284 "getppid\0"
285 "getresgid\0"
286 "getresgid32\0"
287 "getresuid\0"
288 "getresuid32\0"
40eb6a80 289 "getrlimit\0" /* make sure processes can query stack size and such */
09d3020b
DH
290 "getsid\0"
291 "gettid\0"
40eb6a80 292 "gettimeofday\0"
09d3020b
DH
293 "getuid\0"
294 "getuid32\0"
e41b0f42 295 "membarrier\0"
40eb6a80
ZJS
296 "nanosleep\0"
297 "pause\0"
4c3a9176 298 "prlimit64\0"
e41b0f42 299 "restart_syscall\0"
6fee3be0 300 "rseq\0"
40eb6a80 301 "rt_sigreturn\0"
8f44de08 302 "sched_yield\0"
e41b0f42
LP
303 "set_robust_list\0"
304 "set_thread_area\0"
305 "set_tid_address\0"
ce5faeac 306 "set_tls\0"
40eb6a80
ZJS
307 "sigreturn\0"
308 "time\0"
4c3a9176 309 "ugetrlimit\0"
40eb6a80 310 },
44898c53
LP
311 [SYSCALL_FILTER_SET_AIO] = {
312 .name = "@aio",
313 .help = "Asynchronous IO",
314 .value =
315 "io_cancel\0"
316 "io_destroy\0"
317 "io_getevents\0"
a05cfe23 318 "io_pgetevents\0"
6ca67710 319 "io_pgetevents_time64\0"
44898c53
LP
320 "io_setup\0"
321 "io_submit\0"
9e486265
LP
322 "io_uring_enter\0"
323 "io_uring_register\0"
324 "io_uring_setup\0"
44898c53 325 },
133ddbbe 326 [SYSCALL_FILTER_SET_BASIC_IO] = {
133ddbbe 327 .name = "@basic-io",
d5efc18b 328 .help = "Basic IO",
133ddbbe 329 .value =
648a0ed0 330 "_llseek\0"
133ddbbe 331 "close\0"
648a0ed0 332 "dup\0"
133ddbbe
LP
333 "dup2\0"
334 "dup3\0"
133ddbbe
LP
335 "lseek\0"
336 "pread64\0"
337 "preadv\0"
44898c53 338 "preadv2\0"
133ddbbe
LP
339 "pwrite64\0"
340 "pwritev\0"
44898c53 341 "pwritev2\0"
133ddbbe
LP
342 "read\0"
343 "readv\0"
344 "write\0"
345 "writev\0"
346 },
44898c53
LP
347 [SYSCALL_FILTER_SET_CHOWN] = {
348 .name = "@chown",
349 .help = "Change ownership of files and directories",
350 .value =
351 "chown\0"
352 "chown32\0"
353 "fchown\0"
354 "fchown32\0"
355 "fchownat\0"
356 "lchown\0"
357 "lchown32\0"
358 },
8130926d 359 [SYSCALL_FILTER_SET_CLOCK] = {
8130926d 360 .name = "@clock",
d5efc18b 361 .help = "Change the system time",
201c1cc2
TM
362 .value =
363 "adjtimex\0"
1f9ac68b 364 "clock_adjtime\0"
6ca67710 365 "clock_adjtime64\0"
1f9ac68b 366 "clock_settime\0"
6ca67710 367 "clock_settime64\0"
201c1cc2 368 "settimeofday\0"
1f9ac68b 369 "stime\0"
8130926d
LP
370 },
371 [SYSCALL_FILTER_SET_CPU_EMULATION] = {
8130926d 372 .name = "@cpu-emulation",
d5efc18b 373 .help = "System calls for CPU emulation functionality",
1f9ac68b
LP
374 .value =
375 "modify_ldt\0"
376 "subpage_prot\0"
377 "switch_endian\0"
378 "vm86\0"
379 "vm86old\0"
8130926d
LP
380 },
381 [SYSCALL_FILTER_SET_DEBUG] = {
8130926d 382 .name = "@debug",
d5efc18b 383 .help = "Debugging, performance monitoring and tracing functionality",
1f9ac68b
LP
384 .value =
385 "lookup_dcookie\0"
386 "perf_event_open\0"
8270e3d8 387 "pidfd_getfd\0"
1f9ac68b
LP
388 "ptrace\0"
389 "rtas\0"
8130926d 390#ifdef __NR_s390_runtime_instr
1f9ac68b 391 "s390_runtime_instr\0"
8130926d 392#endif
1f9ac68b 393 "sys_debug_setcontext\0"
8130926d 394 },
1a1b13c9
LP
395 [SYSCALL_FILTER_SET_FILE_SYSTEM] = {
396 .name = "@file-system",
397 .help = "File system operations",
398 .value =
399 "access\0"
400 "chdir\0"
401 "chmod\0"
402 "close\0"
403 "creat\0"
404 "faccessat\0"
405 "fallocate\0"
406 "fchdir\0"
407 "fchmod\0"
408 "fchmodat\0"
1a1b13c9 409 "fcntl\0"
ceaa6aa7 410 "fcntl64\0"
1a1b13c9
LP
411 "fgetxattr\0"
412 "flistxattr\0"
ceaa6aa7 413 "fremovexattr\0"
1a1b13c9 414 "fsetxattr\0"
1a1b13c9 415 "fstat\0"
ceaa6aa7 416 "fstat64\0"
1a1b13c9 417 "fstatat64\0"
1a1b13c9 418 "fstatfs\0"
ceaa6aa7 419 "fstatfs64\0"
1a1b13c9 420 "ftruncate\0"
ceaa6aa7 421 "ftruncate64\0"
1a1b13c9
LP
422 "futimesat\0"
423 "getcwd\0"
1a1b13c9 424 "getdents\0"
ceaa6aa7 425 "getdents64\0"
1a1b13c9
LP
426 "getxattr\0"
427 "inotify_add_watch\0"
ceaa6aa7 428 "inotify_init\0"
1a1b13c9
LP
429 "inotify_init1\0"
430 "inotify_rm_watch\0"
431 "lgetxattr\0"
432 "link\0"
433 "linkat\0"
434 "listxattr\0"
435 "llistxattr\0"
436 "lremovexattr\0"
437 "lsetxattr\0"
1a1b13c9 438 "lstat\0"
ceaa6aa7 439 "lstat64\0"
1a1b13c9
LP
440 "mkdir\0"
441 "mkdirat\0"
442 "mknod\0"
443 "mknodat\0"
1a1b13c9 444 "mmap\0"
ceaa6aa7 445 "mmap2\0"
7961116e 446 "munmap\0"
1a1b13c9 447 "newfstatat\0"
ceaa6aa7
LP
448 "oldfstat\0"
449 "oldlstat\0"
450 "oldstat\0"
1a1b13c9
LP
451 "open\0"
452 "openat\0"
8270e3d8 453 "openat2\0"
1a1b13c9
LP
454 "readlink\0"
455 "readlinkat\0"
456 "removexattr\0"
457 "rename\0"
1a1b13c9 458 "renameat\0"
ceaa6aa7 459 "renameat2\0"
1a1b13c9
LP
460 "rmdir\0"
461 "setxattr\0"
1a1b13c9 462 "stat\0"
ceaa6aa7 463 "stat64\0"
1a1b13c9 464 "statfs\0"
ceaa6aa7 465 "statfs64\0"
8e6a7a8b 466#ifdef __NR_statx
a4135a74 467 "statx\0"
ceaa6aa7 468#endif
1a1b13c9
LP
469 "symlink\0"
470 "symlinkat\0"
1a1b13c9 471 "truncate\0"
ceaa6aa7 472 "truncate64\0"
1a1b13c9
LP
473 "unlink\0"
474 "unlinkat\0"
ceaa6aa7 475 "utime\0"
1a1b13c9 476 "utimensat\0"
6ca67710 477 "utimensat_time64\0"
1a1b13c9
LP
478 "utimes\0"
479 },
8130926d 480 [SYSCALL_FILTER_SET_IO_EVENT] = {
8130926d 481 .name = "@io-event",
d5efc18b 482 .help = "Event loop system calls",
201c1cc2
TM
483 .value =
484 "_newselect\0"
201c1cc2 485 "epoll_create\0"
215728ff 486 "epoll_create1\0"
201c1cc2
TM
487 "epoll_ctl\0"
488 "epoll_ctl_old\0"
489 "epoll_pwait\0"
490 "epoll_wait\0"
491 "epoll_wait_old\0"
201c1cc2 492 "eventfd\0"
215728ff 493 "eventfd2\0"
201c1cc2
TM
494 "poll\0"
495 "ppoll\0"
6ca67710 496 "ppoll_time64\0"
201c1cc2 497 "pselect6\0"
6ca67710 498 "pselect6_time64\0"
201c1cc2 499 "select\0"
8130926d
LP
500 },
501 [SYSCALL_FILTER_SET_IPC] = {
8130926d 502 .name = "@ipc",
d5efc18b
ZJS
503 .help = "SysV IPC, POSIX Message Queues or other IPC",
504 .value =
505 "ipc\0"
cd5bfd7e 506 "memfd_create\0"
201c1cc2
TM
507 "mq_getsetattr\0"
508 "mq_notify\0"
509 "mq_open\0"
510 "mq_timedreceive\0"
6ca67710 511 "mq_timedreceive_time64\0"
201c1cc2 512 "mq_timedsend\0"
6ca67710 513 "mq_timedsend_time64\0"
201c1cc2
TM
514 "mq_unlink\0"
515 "msgctl\0"
516 "msgget\0"
517 "msgrcv\0"
518 "msgsnd\0"
cd5bfd7e 519 "pipe\0"
215728ff 520 "pipe2\0"
201c1cc2
TM
521 "process_vm_readv\0"
522 "process_vm_writev\0"
523 "semctl\0"
524 "semget\0"
525 "semop\0"
526 "semtimedop\0"
6ca67710 527 "semtimedop_time64\0"
201c1cc2
TM
528 "shmat\0"
529 "shmctl\0"
530 "shmdt\0"
531 "shmget\0"
8130926d
LP
532 },
533 [SYSCALL_FILTER_SET_KEYRING] = {
8130926d 534 .name = "@keyring",
d5efc18b 535 .help = "Kernel keyring access",
1f9ac68b
LP
536 .value =
537 "add_key\0"
538 "keyctl\0"
539 "request_key\0"
8130926d 540 },
cd0ddf6f
LP
541 [SYSCALL_FILTER_SET_MEMLOCK] = {
542 .name = "@memlock",
543 .help = "Memory locking control",
544 .value =
545 "mlock\0"
546 "mlock2\0"
547 "mlockall\0"
548 "munlock\0"
549 "munlockall\0"
550 },
8130926d 551 [SYSCALL_FILTER_SET_MODULE] = {
8130926d 552 .name = "@module",
d5efc18b 553 .help = "Loading and unloading of kernel modules",
201c1cc2 554 .value =
201c1cc2
TM
555 "delete_module\0"
556 "finit_module\0"
557 "init_module\0"
8130926d
LP
558 },
559 [SYSCALL_FILTER_SET_MOUNT] = {
8130926d 560 .name = "@mount",
d5efc18b 561 .help = "Mounting and unmounting of file systems",
201c1cc2
TM
562 .value =
563 "chroot\0"
9e486265
LP
564 "fsconfig\0"
565 "fsmount\0"
566 "fsopen\0"
567 "fspick\0"
201c1cc2 568 "mount\0"
9e486265
LP
569 "move_mount\0"
570 "open_tree\0"
201c1cc2 571 "pivot_root\0"
201c1cc2 572 "umount\0"
215728ff 573 "umount2\0"
8130926d
LP
574 },
575 [SYSCALL_FILTER_SET_NETWORK_IO] = {
8130926d 576 .name = "@network-io",
d5efc18b 577 .help = "Network or Unix socket IO, should not be needed if not network facing",
201c1cc2 578 .value =
201c1cc2 579 "accept\0"
215728ff 580 "accept4\0"
201c1cc2
TM
581 "bind\0"
582 "connect\0"
583 "getpeername\0"
584 "getsockname\0"
585 "getsockopt\0"
586 "listen\0"
587 "recv\0"
588 "recvfrom\0"
589 "recvmmsg\0"
6ca67710 590 "recvmmsg_time64\0"
201c1cc2
TM
591 "recvmsg\0"
592 "send\0"
593 "sendmmsg\0"
594 "sendmsg\0"
595 "sendto\0"
596 "setsockopt\0"
597 "shutdown\0"
598 "socket\0"
599 "socketcall\0"
600 "socketpair\0"
8130926d
LP
601 },
602 [SYSCALL_FILTER_SET_OBSOLETE] = {
d5efc18b 603 /* some unknown even to libseccomp */
8130926d 604 .name = "@obsolete",
d5efc18b 605 .help = "Unusual, obsolete or unimplemented system calls",
201c1cc2
TM
606 .value =
607 "_sysctl\0"
608 "afs_syscall\0"
802fa07a 609 "bdflush\0"
201c1cc2 610 "break\0"
1f9ac68b 611 "create_module\0"
201c1cc2
TM
612 "ftime\0"
613 "get_kernel_syms\0"
201c1cc2
TM
614 "getpmsg\0"
615 "gtty\0"
7e0c3b8f 616 "idle\0"
201c1cc2 617 "lock\0"
201c1cc2 618 "mpx\0"
201c1cc2
TM
619 "prof\0"
620 "profil\0"
201c1cc2
TM
621 "putpmsg\0"
622 "query_module\0"
201c1cc2
TM
623 "security\0"
624 "sgetmask\0"
625 "ssetmask\0"
626 "stty\0"
1f9ac68b 627 "sysfs\0"
201c1cc2
TM
628 "tuxcall\0"
629 "ulimit\0"
630 "uselib\0"
1f9ac68b 631 "ustat\0"
201c1cc2 632 "vserver\0"
8130926d 633 },
9493b168
ZJS
634 [SYSCALL_FILTER_SET_PKEY] = {
635 .name = "@pkey",
636 .help = "System calls used for memory protection keys",
637 .value =
638 "pkey_alloc\0"
639 "pkey_free\0"
640 "pkey_mprotect\0"
641 },
8130926d 642 [SYSCALL_FILTER_SET_PRIVILEGED] = {
8130926d 643 .name = "@privileged",
d5efc18b 644 .help = "All system calls which need super-user capabilities",
201c1cc2 645 .value =
44898c53 646 "@chown\0"
201c1cc2
TM
647 "@clock\0"
648 "@module\0"
649 "@raw-io\0"
af0f047b
LP
650 "@reboot\0"
651 "@swap\0"
215728ff 652 "_sysctl\0"
201c1cc2 653 "acct\0"
201c1cc2 654 "bpf\0"
1f9ac68b 655 "capset\0"
201c1cc2 656 "chroot\0"
a05cfe23 657 "fanotify_init\0"
9e486265 658 "fanotify_mark\0"
201c1cc2 659 "nfsservctl\0"
a05cfe23 660 "open_by_handle_at\0"
201c1cc2
TM
661 "pivot_root\0"
662 "quotactl\0"
201c1cc2 663 "setdomainname\0"
201c1cc2 664 "setfsuid\0"
215728ff 665 "setfsuid32\0"
201c1cc2 666 "setgroups\0"
215728ff 667 "setgroups32\0"
201c1cc2 668 "sethostname\0"
201c1cc2 669 "setresuid\0"
215728ff 670 "setresuid32\0"
201c1cc2 671 "setreuid\0"
215728ff 672 "setreuid32\0"
e05ee49b 673 "setuid\0" /* We list the explicit system calls here, as @setuid also includes setgid() which is not necessarily privileged */
215728ff 674 "setuid32\0"
201c1cc2 675 "vhangup\0"
8130926d
LP
676 },
677 [SYSCALL_FILTER_SET_PROCESS] = {
8130926d 678 .name = "@process",
d5efc18b 679 .help = "Process control, execution, namespaceing operations",
201c1cc2
TM
680 .value =
681 "arch_prctl\0"
09d3020b 682 "capget\0" /* Able to query arbitrary processes */
201c1cc2 683 "clone\0"
9e486265 684 "clone3\0"
201c1cc2
TM
685 "execveat\0"
686 "fork\0"
b887d2eb 687 "getrusage\0"
201c1cc2 688 "kill\0"
9e486265 689 "pidfd_open\0"
46fcf95d 690 "pidfd_send_signal\0"
201c1cc2 691 "prctl\0"
b887d2eb
LP
692 "rt_sigqueueinfo\0"
693 "rt_tgsigqueueinfo\0"
201c1cc2 694 "setns\0"
a9518dc3 695 "swapcontext\0" /* Some archs e.g. powerpc32 are using it to do userspace context switches */
201c1cc2 696 "tgkill\0"
b887d2eb 697 "times\0"
201c1cc2
TM
698 "tkill\0"
699 "unshare\0"
700 "vfork\0"
b887d2eb
LP
701 "wait4\0"
702 "waitid\0"
703 "waitpid\0"
8130926d
LP
704 },
705 [SYSCALL_FILTER_SET_RAW_IO] = {
8130926d 706 .name = "@raw-io",
d5efc18b 707 .help = "Raw I/O port access",
201c1cc2
TM
708 .value =
709 "ioperm\0"
710 "iopl\0"
1f9ac68b 711 "pciconfig_iobase\0"
201c1cc2
TM
712 "pciconfig_read\0"
713 "pciconfig_write\0"
8130926d 714#ifdef __NR_s390_pci_mmio_read
201c1cc2 715 "s390_pci_mmio_read\0"
8130926d
LP
716#endif
717#ifdef __NR_s390_pci_mmio_write
201c1cc2 718 "s390_pci_mmio_write\0"
8130926d
LP
719#endif
720 },
bd2ab3f4
LP
721 [SYSCALL_FILTER_SET_REBOOT] = {
722 .name = "@reboot",
723 .help = "Reboot and reboot preparation/kexec",
724 .value =
bd2ab3f4 725 "kexec_file_load\0"
e59608fa 726 "kexec_load\0"
bd2ab3f4
LP
727 "reboot\0"
728 },
133ddbbe 729 [SYSCALL_FILTER_SET_RESOURCES] = {
133ddbbe 730 .name = "@resources",
58a8f68b 731 .help = "Alter resource settings",
133ddbbe 732 .value =
0963c053
LP
733 "ioprio_set\0"
734 "mbind\0"
735 "migrate_pages\0"
736 "move_pages\0"
737 "nice\0"
0963c053
LP
738 "sched_setaffinity\0"
739 "sched_setattr\0"
133ddbbe
LP
740 "sched_setparam\0"
741 "sched_setscheduler\0"
0963c053 742 "set_mempolicy\0"
133ddbbe
LP
743 "setpriority\0"
744 "setrlimit\0"
133ddbbe 745 },
6eaaeee9
LP
746 [SYSCALL_FILTER_SET_SETUID] = {
747 .name = "@setuid",
748 .help = "Operations for changing user/group credentials",
749 .value =
6eaaeee9 750 "setgid\0"
215728ff 751 "setgid32\0"
6eaaeee9 752 "setgroups\0"
215728ff 753 "setgroups32\0"
6eaaeee9 754 "setregid\0"
215728ff 755 "setregid32\0"
6eaaeee9 756 "setresgid\0"
215728ff 757 "setresgid32\0"
6eaaeee9 758 "setresuid\0"
215728ff 759 "setresuid32\0"
6eaaeee9 760 "setreuid\0"
215728ff 761 "setreuid32\0"
6eaaeee9 762 "setuid\0"
215728ff 763 "setuid32\0"
6eaaeee9 764 },
cd0ddf6f
LP
765 [SYSCALL_FILTER_SET_SIGNAL] = {
766 .name = "@signal",
767 .help = "Process signal handling",
768 .value =
769 "rt_sigaction\0"
770 "rt_sigpending\0"
771 "rt_sigprocmask\0"
772 "rt_sigsuspend\0"
773 "rt_sigtimedwait\0"
6ca67710 774 "rt_sigtimedwait_time64\0"
cd0ddf6f
LP
775 "sigaction\0"
776 "sigaltstack\0"
777 "signal\0"
778 "signalfd\0"
779 "signalfd4\0"
780 "sigpending\0"
781 "sigprocmask\0"
782 "sigsuspend\0"
783 },
bd2ab3f4
LP
784 [SYSCALL_FILTER_SET_SWAP] = {
785 .name = "@swap",
786 .help = "Enable/disable swap devices",
787 .value =
788 "swapoff\0"
789 "swapon\0"
790 },
44898c53
LP
791 [SYSCALL_FILTER_SET_SYNC] = {
792 .name = "@sync",
793 .help = "Synchronize files and memory to storage",
794 .value =
795 "fdatasync\0"
796 "fsync\0"
797 "msync\0"
798 "sync\0"
799 "sync_file_range\0"
a8fb09f5 800 "sync_file_range2\0"
44898c53
LP
801 "syncfs\0"
802 },
70526841
LP
803 [SYSCALL_FILTER_SET_SYSTEM_SERVICE] = {
804 .name = "@system-service",
805 .help = "General system service operations",
806 .value =
807 "@aio\0"
808 "@basic-io\0"
809 "@chown\0"
810 "@default\0"
811 "@file-system\0"
812 "@io-event\0"
813 "@ipc\0"
814 "@keyring\0"
815 "@memlock\0"
816 "@network-io\0"
817 "@process\0"
818 "@resources\0"
819 "@setuid\0"
820 "@signal\0"
821 "@sync\0"
822 "@timer\0"
823 "brk\0"
824 "capget\0"
825 "capset\0"
826 "copy_file_range\0"
827 "fadvise64\0"
828 "fadvise64_64\0"
829 "flock\0"
830 "get_mempolicy\0"
831 "getcpu\0"
832 "getpriority\0"
833 "getrandom\0"
834 "ioctl\0"
835 "ioprio_get\0"
836 "kcmp\0"
837 "madvise\0"
70526841
LP
838 "mprotect\0"
839 "mremap\0"
840 "name_to_handle_at\0"
841 "oldolduname\0"
842 "olduname\0"
843 "personality\0"
844 "readahead\0"
845 "readdir\0"
846 "remap_file_pages\0"
847 "sched_get_priority_max\0"
848 "sched_get_priority_min\0"
849 "sched_getaffinity\0"
850 "sched_getattr\0"
851 "sched_getparam\0"
852 "sched_getscheduler\0"
853 "sched_rr_get_interval\0"
6ca67710 854 "sched_rr_get_interval_time64\0"
70526841
LP
855 "sched_yield\0"
856 "sendfile\0"
857 "sendfile64\0"
858 "setfsgid\0"
859 "setfsgid32\0"
860 "setfsuid\0"
861 "setfsuid32\0"
862 "setpgid\0"
863 "setsid\0"
864 "splice\0"
865 "sysinfo\0"
866 "tee\0"
867 "umask\0"
868 "uname\0"
869 "userfaultfd\0"
870 "vmsplice\0"
871 },
cd0ddf6f
LP
872 [SYSCALL_FILTER_SET_TIMER] = {
873 .name = "@timer",
874 .help = "Schedule operations by time",
875 .value =
876 "alarm\0"
877 "getitimer\0"
878 "setitimer\0"
879 "timer_create\0"
880 "timer_delete\0"
881 "timer_getoverrun\0"
882 "timer_gettime\0"
6ca67710 883 "timer_gettime64\0"
cd0ddf6f 884 "timer_settime\0"
6ca67710 885 "timer_settime64\0"
cd0ddf6f
LP
886 "timerfd_create\0"
887 "timerfd_gettime\0"
6ca67710 888 "timerfd_gettime64\0"
cd0ddf6f 889 "timerfd_settime\0"
6ca67710 890 "timerfd_settime64\0"
cd0ddf6f
LP
891 "times\0"
892 },
201c1cc2 893};
8130926d
LP
894
895const SyscallFilterSet *syscall_filter_set_find(const char *name) {
896 unsigned i;
897
898 if (isempty(name) || name[0] != '@')
899 return NULL;
900
901 for (i = 0; i < _SYSCALL_FILTER_SET_MAX; i++)
902 if (streq(syscall_filter_sets[i].name, name))
903 return syscall_filter_sets + i;
904
905 return NULL;
906}
907
b54f36c6 908static int seccomp_add_syscall_filter_set(scmp_filter_ctx seccomp, const SyscallFilterSet *set, uint32_t action, char **exclude, bool log_missing);
69b1b241 909
b54f36c6 910int seccomp_add_syscall_filter_item(scmp_filter_ctx *seccomp, const char *name, uint32_t action, char **exclude, bool log_missing) {
69b1b241
LP
911 assert(seccomp);
912 assert(name);
913
960e4569
LP
914 if (strv_contains(exclude, name))
915 return 0;
916
69b1b241
LP
917 if (name[0] == '@') {
918 const SyscallFilterSet *other;
919
920 other = syscall_filter_set_find(name);
baaa35ad
ZJS
921 if (!other)
922 return log_debug_errno(SYNTHETIC_ERRNO(EINVAL),
923 "Filter set %s is not known!",
924 name);
69b1b241 925
b54f36c6
ZJS
926 return seccomp_add_syscall_filter_set(seccomp, other, action, exclude, log_missing);
927
69b1b241 928 } else {
b54f36c6 929 int id, r;
69b1b241
LP
930
931 id = seccomp_syscall_resolve_name(name);
cff7bff8 932 if (id == __NR_SCMP_ERROR) {
b54f36c6
ZJS
933 if (log_missing)
934 log_debug("System call %s is not known, ignoring.", name);
ff217dc3 935 return 0;
cff7bff8 936 }
69b1b241
LP
937
938 r = seccomp_rule_add_exact(seccomp, action, id, 0);
b54f36c6 939 if (r < 0) {
69b1b241 940 /* If the system call is not known on this architecture, then that's fine, let's ignore it */
7e86bd73
ZJS
941 bool ignore = r == -EDOM;
942
943 if (!ignore || log_missing)
944 log_debug_errno(r, "Failed to add rule for system call %s() / %d%s: %m",
945 name, id, ignore ? ", ignoring" : "");
946 if (!ignore)
947 return r;
b54f36c6 948 }
69b1b241 949
b54f36c6
ZJS
950 return 0;
951 }
69b1b241
LP
952}
953
469830d1
LP
954static int seccomp_add_syscall_filter_set(
955 scmp_filter_ctx seccomp,
469830d1 956 const SyscallFilterSet *set,
960e4569 957 uint32_t action,
b54f36c6
ZJS
958 char **exclude,
959 bool log_missing) {
469830d1 960
8130926d
LP
961 const char *sys;
962 int r;
963
964 assert(seccomp);
965 assert(set);
966
967 NULSTR_FOREACH(sys, set->value) {
b54f36c6 968 r = seccomp_add_syscall_filter_item(seccomp, sys, action, exclude, log_missing);
69b1b241
LP
969 if (r < 0)
970 return r;
469830d1
LP
971 }
972
973 return 0;
974}
975
b54f36c6 976int seccomp_load_syscall_filter_set(uint32_t default_action, const SyscallFilterSet *set, uint32_t action, bool log_missing) {
469830d1
LP
977 uint32_t arch;
978 int r;
979
980 assert(set);
981
982 /* The one-stop solution: allocate a seccomp object, add the specified filter to it, and apply it. Once for
a90db619 983 * each local arch. */
469830d1
LP
984
985 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
986 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
987
988 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
989
990 r = seccomp_init_for_arch(&seccomp, arch, default_action);
8130926d
LP
991 if (r < 0)
992 return r;
469830d1 993
b54f36c6 994 r = seccomp_add_syscall_filter_set(seccomp, set, action, NULL, log_missing);
7e86bd73
ZJS
995 if (r < 0)
996 return log_debug_errno(r, "Failed to add filter set: %m");
469830d1
LP
997
998 r = seccomp_load(seccomp);
7bc5e0b1 999 if (ERRNO_IS_SECCOMP_FATAL(r))
469830d1
LP
1000 return r;
1001 if (r < 0)
1002 log_debug_errno(r, "Failed to install filter set for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
8130926d
LP
1003 }
1004
1005 return 0;
1006}
a3be2849 1007
b54f36c6 1008int seccomp_load_syscall_filter_set_raw(uint32_t default_action, Hashmap* set, uint32_t action, bool log_missing) {
469830d1 1009 uint32_t arch;
a3be2849
LP
1010 int r;
1011
469830d1
LP
1012 /* Similar to seccomp_load_syscall_filter_set(), but takes a raw Set* of syscalls, instead of a
1013 * SyscallFilterSet* table. */
a3be2849 1014
8cfa775f 1015 if (hashmap_isempty(set) && default_action == SCMP_ACT_ALLOW)
469830d1 1016 return 0;
a3be2849 1017
469830d1
LP
1018 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1019 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1020 Iterator i;
b54f36c6 1021 void *syscall_id, *val;
a3be2849 1022
469830d1 1023 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
a3be2849 1024
469830d1
LP
1025 r = seccomp_init_for_arch(&seccomp, arch, default_action);
1026 if (r < 0)
1027 return r;
a3be2849 1028
b54f36c6 1029 HASHMAP_FOREACH_KEY(val, syscall_id, set, i) {
8cfa775f 1030 uint32_t a = action;
b54f36c6
ZJS
1031 int id = PTR_TO_INT(syscall_id) - 1;
1032 int error = PTR_TO_INT(val);
8cfa775f 1033
b54f36c6
ZJS
1034 if (action != SCMP_ACT_ALLOW && error >= 0)
1035 a = SCMP_ACT_ERRNO(error);
8cfa775f 1036
b54f36c6 1037 r = seccomp_rule_add_exact(seccomp, a, id, 0);
469830d1
LP
1038 if (r < 0) {
1039 /* If the system call is not known on this architecture, then that's fine, let's ignore it */
1040 _cleanup_free_ char *n = NULL;
7e86bd73 1041 bool ignore;
469830d1 1042
b54f36c6 1043 n = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, id);
7e86bd73
ZJS
1044 ignore = r == -EDOM;
1045 if (!ignore || log_missing)
1046 log_debug_errno(r, "Failed to add rule for system call %s() / %d%s: %m",
1047 strna(n), id, ignore ? ", ignoring" : "");
1048 if (!ignore)
1049 return r;
469830d1
LP
1050 }
1051 }
1052
1053 r = seccomp_load(seccomp);
7bc5e0b1 1054 if (ERRNO_IS_SECCOMP_FATAL(r))
469830d1
LP
1055 return r;
1056 if (r < 0)
1057 log_debug_errno(r, "Failed to install filter set for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1058 }
1059
1060 return 0;
add00535
LP
1061}
1062
58f6ab44 1063int seccomp_parse_syscall_filter(
898748d8
YW
1064 const char *name,
1065 int errno_num,
1066 Hashmap *filter,
13d92c63 1067 SeccompParseFlags flags,
898748d8
YW
1068 const char *unit,
1069 const char *filename,
1070 unsigned line) {
1071
1072 int r;
1073
1074 assert(name);
1075 assert(filter);
1076
1077 if (name[0] == '@') {
1078 const SyscallFilterSet *set;
1079 const char *i;
1080
1081 set = syscall_filter_set_find(name);
1082 if (!set) {
13d92c63 1083 if (!(flags & SECCOMP_PARSE_PERMISSIVE))
898748d8 1084 return -EINVAL;
13d92c63
LP
1085
1086 log_syntax(unit, flags & SECCOMP_PARSE_LOG ? LOG_WARNING : LOG_DEBUG, filename, line, 0,
1087 "Unknown system call group, ignoring: %s", name);
1088 return 0;
898748d8
YW
1089 }
1090
1091 NULSTR_FOREACH(i, set->value) {
13d92c63
LP
1092 /* Call ourselves again, for the group to parse. Note that we downgrade logging here (i.e. take
1093 * away the SECCOMP_PARSE_LOG flag) since any issues in the group table are our own problem,
1094 * not a problem in user configuration data and we shouldn't pretend otherwise by complaining
1095 * about them. */
58f6ab44 1096 r = seccomp_parse_syscall_filter(i, errno_num, filter, flags &~ SECCOMP_PARSE_LOG, unit, filename, line);
898748d8
YW
1097 if (r < 0)
1098 return r;
1099 }
1100 } else {
1101 int id;
1102
1103 id = seccomp_syscall_resolve_name(name);
1104 if (id == __NR_SCMP_ERROR) {
13d92c63 1105 if (!(flags & SECCOMP_PARSE_PERMISSIVE))
898748d8 1106 return -EINVAL;
13d92c63
LP
1107
1108 log_syntax(unit, flags & SECCOMP_PARSE_LOG ? LOG_WARNING : LOG_DEBUG, filename, line, 0,
1109 "Failed to parse system call, ignoring: %s", name);
1110 return 0;
898748d8
YW
1111 }
1112
1113 /* If we previously wanted to forbid a syscall and now
1114 * we want to allow it, then remove it from the list. */
13d92c63 1115 if (!(flags & SECCOMP_PARSE_INVERT) == !!(flags & SECCOMP_PARSE_WHITELIST)) {
898748d8
YW
1116 r = hashmap_put(filter, INT_TO_PTR(id + 1), INT_TO_PTR(errno_num));
1117 if (r < 0)
851ee70a
LW
1118 switch (r) {
1119 case -ENOMEM:
1120 return flags & SECCOMP_PARSE_LOG ? log_oom() : -ENOMEM;
1121 case -EEXIST:
9d7fe7c6
LW
1122 assert_se(hashmap_update(filter, INT_TO_PTR(id + 1), INT_TO_PTR(errno_num)) == 0);
1123 break;
851ee70a
LW
1124 default:
1125 return r;
1126 }
898748d8
YW
1127 } else
1128 (void) hashmap_remove(filter, INT_TO_PTR(id + 1));
1129 }
1130
1131 return 0;
1132}
1133
add00535 1134int seccomp_restrict_namespaces(unsigned long retain) {
469830d1 1135 uint32_t arch;
add00535
LP
1136 int r;
1137
f1d34068 1138 if (DEBUG_LOGGING) {
add00535
LP
1139 _cleanup_free_ char *s = NULL;
1140
86c2a9f1 1141 (void) namespace_flags_to_string(retain, &s);
add00535
LP
1142 log_debug("Restricting namespace to: %s.", strna(s));
1143 }
1144
1145 /* NOOP? */
1146 if ((retain & NAMESPACE_FLAGS_ALL) == NAMESPACE_FLAGS_ALL)
1147 return 0;
1148
469830d1
LP
1149 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1150 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1151 unsigned i;
add00535 1152
469830d1
LP
1153 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1154
1155 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1156 if (r < 0)
1157 return r;
1158
1159 if ((retain & NAMESPACE_FLAGS_ALL) == 0)
1160 /* If every single kind of namespace shall be prohibited, then let's block the whole setns() syscall
1161 * altogether. */
1162 r = seccomp_rule_add_exact(
1163 seccomp,
1164 SCMP_ACT_ERRNO(EPERM),
1165 SCMP_SYS(setns),
1166 0);
1167 else
1168 /* Otherwise, block only the invocations with the appropriate flags in the loop below, but also the
1169 * special invocation with a zero flags argument, right here. */
1170 r = seccomp_rule_add_exact(
1171 seccomp,
1172 SCMP_ACT_ERRNO(EPERM),
1173 SCMP_SYS(setns),
1174 1,
1175 SCMP_A1(SCMP_CMP_EQ, 0));
1176 if (r < 0) {
1177 log_debug_errno(r, "Failed to add setns() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1178 continue;
1179 }
1180
1181 for (i = 0; namespace_flag_map[i].name; i++) {
1182 unsigned long f;
1183
1184 f = namespace_flag_map[i].flag;
1185 if ((retain & f) == f) {
1186 log_debug("Permitting %s.", namespace_flag_map[i].name);
1187 continue;
1188 }
1189
1190 log_debug("Blocking %s.", namespace_flag_map[i].name);
1191
1192 r = seccomp_rule_add_exact(
1193 seccomp,
1194 SCMP_ACT_ERRNO(EPERM),
1195 SCMP_SYS(unshare),
1196 1,
1197 SCMP_A0(SCMP_CMP_MASKED_EQ, f, f));
1198 if (r < 0) {
1199 log_debug_errno(r, "Failed to add unshare() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1200 break;
1201 }
1202
511ceb1f
ZJS
1203 /* On s390/s390x the first two parameters to clone are switched */
1204 if (!IN_SET(arch, SCMP_ARCH_S390, SCMP_ARCH_S390X))
ae9d60ce
LP
1205 r = seccomp_rule_add_exact(
1206 seccomp,
1207 SCMP_ACT_ERRNO(EPERM),
1208 SCMP_SYS(clone),
1209 1,
1210 SCMP_A0(SCMP_CMP_MASKED_EQ, f, f));
1211 else
1212 r = seccomp_rule_add_exact(
1213 seccomp,
1214 SCMP_ACT_ERRNO(EPERM),
1215 SCMP_SYS(clone),
1216 1,
1217 SCMP_A1(SCMP_CMP_MASKED_EQ, f, f));
469830d1
LP
1218 if (r < 0) {
1219 log_debug_errno(r, "Failed to add clone() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1220 break;
1221 }
1222
1223 if ((retain & NAMESPACE_FLAGS_ALL) != 0) {
1224 r = seccomp_rule_add_exact(
1225 seccomp,
1226 SCMP_ACT_ERRNO(EPERM),
1227 SCMP_SYS(setns),
1228 1,
1229 SCMP_A1(SCMP_CMP_MASKED_EQ, f, f));
1230 if (r < 0) {
1231 log_debug_errno(r, "Failed to add setns() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1232 break;
1233 }
1234 }
1235 }
1236 if (r < 0)
1237 continue;
1238
1239 r = seccomp_load(seccomp);
7bc5e0b1 1240 if (ERRNO_IS_SECCOMP_FATAL(r))
469830d1
LP
1241 return r;
1242 if (r < 0)
1243 log_debug_errno(r, "Failed to install namespace restriction rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1244 }
1245
1246 return 0;
1247}
1248
1249int seccomp_protect_sysctl(void) {
1250 uint32_t arch;
1251 int r;
1252
1253 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1254 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1255
1256 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1257
2e64e8f4
ZJS
1258 if (IN_SET(arch, SCMP_ARCH_X32, SCMP_ARCH_AARCH64))
1259 /* No _sysctl syscall */
1260 continue;
1261
469830d1
LP
1262 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1263 if (r < 0)
1264 return r;
1265
1266 r = seccomp_rule_add_exact(
add00535
LP
1267 seccomp,
1268 SCMP_ACT_ERRNO(EPERM),
469830d1 1269 SCMP_SYS(_sysctl),
add00535 1270 0);
469830d1
LP
1271 if (r < 0) {
1272 log_debug_errno(r, "Failed to add _sysctl() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1273 continue;
1274 }
1275
1276 r = seccomp_load(seccomp);
7bc5e0b1 1277 if (ERRNO_IS_SECCOMP_FATAL(r))
469830d1
LP
1278 return r;
1279 if (r < 0)
1280 log_debug_errno(r, "Failed to install sysctl protection rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1281 }
1282
1283 return 0;
1284}
1285
620dbdd2
KK
1286int seccomp_protect_syslog(void) {
1287 uint32_t arch;
1288 int r;
1289
1290 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1291 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1292
1293 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1294 if (r < 0)
1295 return r;
1296
1297 r = seccomp_rule_add_exact(
1298 seccomp,
1299 SCMP_ACT_ERRNO(EPERM),
1300 SCMP_SYS(syslog),
1301 0);
1302
1303 if (r < 0) {
1304 log_debug_errno(r, "Failed to add syslog() rule for architecture %s, skipping %m", seccomp_arch_to_string(arch));
1305 continue;
1306 }
1307
1308 r = seccomp_load(seccomp);
1309 if (ERRNO_IS_SECCOMP_FATAL(r))
1310 return r;
1311 if (r < 0)
1312 log_debug_errno(r, "Failed to install syslog protection rules for architecture %s, skipping %m", seccomp_arch_to_string(arch));
1313 }
1314
1315 return 0;
1316}
1317
469830d1
LP
1318int seccomp_restrict_address_families(Set *address_families, bool whitelist) {
1319 uint32_t arch;
1320 int r;
1321
1322 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1323 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
9606bc4b 1324 bool supported;
469830d1
LP
1325 Iterator i;
1326
1327 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1328
9606bc4b
LP
1329 switch (arch) {
1330
1331 case SCMP_ARCH_X86_64:
1332 case SCMP_ARCH_X32:
1333 case SCMP_ARCH_ARM:
1334 case SCMP_ARCH_AARCH64:
0d9fca76 1335 case SCMP_ARCH_PPC:
da1921a5
ZJS
1336 case SCMP_ARCH_PPC64:
1337 case SCMP_ARCH_PPC64LE:
f5aeac14
JC
1338 case SCMP_ARCH_MIPSEL64N32:
1339 case SCMP_ARCH_MIPS64N32:
1340 case SCMP_ARCH_MIPSEL64:
1341 case SCMP_ARCH_MIPS64:
9606bc4b
LP
1342 /* These we know we support (i.e. are the ones that do not use socketcall()) */
1343 supported = true;
1344 break;
1345
9606bc4b
LP
1346 case SCMP_ARCH_S390:
1347 case SCMP_ARCH_S390X:
da1921a5 1348 case SCMP_ARCH_X86:
f5aeac14
JC
1349 case SCMP_ARCH_MIPSEL:
1350 case SCMP_ARCH_MIPS:
9606bc4b
LP
1351 default:
1352 /* These we either know we don't support (i.e. are the ones that do use socketcall()), or we
1353 * don't know */
1354 supported = false;
1355 break;
1356 }
1357
1358 if (!supported)
1359 continue;
1360
469830d1
LP
1361 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1362 if (r < 0)
1363 return r;
1364
1365 if (whitelist) {
1366 int af, first = 0, last = 0;
1367 void *afp;
1368
1369 /* If this is a whitelist, we first block the address families that are out of range and then
1370 * everything that is not in the set. First, we find the lowest and highest address family in
1371 * the set. */
1372
1373 SET_FOREACH(afp, address_families, i) {
1374 af = PTR_TO_INT(afp);
1375
1376 if (af <= 0 || af >= af_max())
1377 continue;
1378
1379 if (first == 0 || af < first)
1380 first = af;
1381
1382 if (last == 0 || af > last)
1383 last = af;
1384 }
1385
1386 assert((first == 0) == (last == 0));
1387
1388 if (first == 0) {
1389
1390 /* No entries in the valid range, block everything */
1391 r = seccomp_rule_add_exact(
1392 seccomp,
1393 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1394 SCMP_SYS(socket),
1395 0);
1396 if (r < 0) {
1397 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1398 continue;
1399 }
1400
1401 } else {
1402
1403 /* Block everything below the first entry */
1404 r = seccomp_rule_add_exact(
1405 seccomp,
1406 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1407 SCMP_SYS(socket),
1408 1,
1409 SCMP_A0(SCMP_CMP_LT, first));
1410 if (r < 0) {
1411 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1412 continue;
1413 }
1414
1415 /* Block everything above the last entry */
1416 r = seccomp_rule_add_exact(
1417 seccomp,
1418 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1419 SCMP_SYS(socket),
1420 1,
1421 SCMP_A0(SCMP_CMP_GT, last));
1422 if (r < 0) {
1423 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1424 continue;
1425 }
1426
1427 /* Block everything between the first and last entry */
1428 for (af = 1; af < af_max(); af++) {
1429
1430 if (set_contains(address_families, INT_TO_PTR(af)))
1431 continue;
1432
1433 r = seccomp_rule_add_exact(
1434 seccomp,
1435 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1436 SCMP_SYS(socket),
1437 1,
1438 SCMP_A0(SCMP_CMP_EQ, af));
1439 if (r < 0)
1440 break;
1441 }
469830d1
LP
1442 if (r < 0) {
1443 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1444 continue;
1445 }
1446 }
1447
1448 } else {
1449 void *af;
1450
1451 /* If this is a blacklist, then generate one rule for
1452 * each address family that are then combined in OR
1453 * checks. */
1454
1455 SET_FOREACH(af, address_families, i) {
1456
1457 r = seccomp_rule_add_exact(
1458 seccomp,
1459 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1460 SCMP_SYS(socket),
1461 1,
1462 SCMP_A0(SCMP_CMP_EQ, PTR_TO_INT(af)));
1463 if (r < 0)
1464 break;
1465 }
469830d1
LP
1466 if (r < 0) {
1467 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1468 continue;
1469 }
1470 }
1471
1472 r = seccomp_load(seccomp);
7bc5e0b1 1473 if (ERRNO_IS_SECCOMP_FATAL(r))
469830d1
LP
1474 return r;
1475 if (r < 0)
1476 log_debug_errno(r, "Failed to install socket family rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1477 }
1478
1479 return 0;
1480}
1481
1482int seccomp_restrict_realtime(void) {
1483 static const int permitted_policies[] = {
1484 SCHED_OTHER,
1485 SCHED_BATCH,
1486 SCHED_IDLE,
1487 };
1488
1489 int r, max_policy = 0;
1490 uint32_t arch;
1491 unsigned i;
1492
1493 /* Determine the highest policy constant we want to allow */
1494 for (i = 0; i < ELEMENTSOF(permitted_policies); i++)
1495 if (permitted_policies[i] > max_policy)
1496 max_policy = permitted_policies[i];
1497
1498 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1499 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1500 int p;
1501
1502 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1503
1504 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1505 if (r < 0)
1506 return r;
1507
1508 /* Go through all policies with lower values than that, and block them -- unless they appear in the
1509 * whitelist. */
1510 for (p = 0; p < max_policy; p++) {
1511 bool good = false;
1512
1513 /* Check if this is in the whitelist. */
1514 for (i = 0; i < ELEMENTSOF(permitted_policies); i++)
1515 if (permitted_policies[i] == p) {
1516 good = true;
1517 break;
1518 }
1519
1520 if (good)
1521 continue;
1522
1523 /* Deny this policy */
1524 r = seccomp_rule_add_exact(
1525 seccomp,
1526 SCMP_ACT_ERRNO(EPERM),
1527 SCMP_SYS(sched_setscheduler),
1528 1,
1529 SCMP_A1(SCMP_CMP_EQ, p));
1530 if (r < 0) {
1531 log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1532 continue;
1533 }
1534 }
1535
1536 /* Blacklist all other policies, i.e. the ones with higher values. Note that all comparisons are
1537 * unsigned here, hence no need no check for < 0 values. */
1538 r = seccomp_rule_add_exact(
add00535
LP
1539 seccomp,
1540 SCMP_ACT_ERRNO(EPERM),
469830d1 1541 SCMP_SYS(sched_setscheduler),
add00535 1542 1,
469830d1
LP
1543 SCMP_A1(SCMP_CMP_GT, max_policy));
1544 if (r < 0) {
1545 log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1546 continue;
1547 }
add00535 1548
469830d1 1549 r = seccomp_load(seccomp);
7bc5e0b1 1550 if (ERRNO_IS_SECCOMP_FATAL(r))
469830d1
LP
1551 return r;
1552 if (r < 0)
1553 log_debug_errno(r, "Failed to install realtime protection rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1554 }
1555
1556 return 0;
1557}
1558
6dc66688
ZJS
1559static int add_seccomp_syscall_filter(scmp_filter_ctx seccomp,
1560 uint32_t arch,
1561 int nr,
14cb109d 1562 unsigned arg_cnt,
6dc66688
ZJS
1563 const struct scmp_arg_cmp arg) {
1564 int r;
1565
1566 r = seccomp_rule_add_exact(seccomp, SCMP_ACT_ERRNO(EPERM), nr, arg_cnt, arg);
1567 if (r < 0) {
1568 _cleanup_free_ char *n = NULL;
1569
1570 n = seccomp_syscall_resolve_num_arch(arch, nr);
1571 log_debug_errno(r, "Failed to add %s() rule for architecture %s, skipping: %m",
1572 strna(n),
1573 seccomp_arch_to_string(arch));
1574 }
1575
1576 return r;
1577}
1578
/* For known architectures, check that syscalls are indeed defined or not.
 *
 * These are compile-time assertions: when building for x86-64, arm or arm64 the build fails unless
 * SCMP_SYS() yields a positive number for shmget(), shmat() and shmdt(). The shmat() number is used by
 * seccomp_memory_deny_write_execute() below, which only installs a shmat() rule if the number is
 * positive. (Presumably libseccomp reports system calls that do not exist on the build architecture as
 * negative pseudo numbers — TODO confirm against the libseccomp headers.) */
#if defined(__x86_64__) || defined(__arm__) || defined(__aarch64__)
assert_cc(SCMP_SYS(shmget) > 0);
assert_cc(SCMP_SYS(shmat) > 0);
assert_cc(SCMP_SYS(shmdt) > 0);
#endif
6dc66688 1585
469830d1
LP
1586int seccomp_memory_deny_write_execute(void) {
1587 uint32_t arch;
b069c2a3 1588 unsigned loaded = 0;
469830d1
LP
1589
1590 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1591 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
b069c2a3 1592 int filter_syscall = 0, block_syscall = 0, shmat_syscall = 0, r;
add00535 1593
469830d1
LP
1594 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1595
8a50cf69
LP
1596 switch (arch) {
1597
bed4668d
CE
1598 /* Note that on some architectures shmat() isn't available, and the call is multiplexed through ipc().
1599 * We ignore that here, which means there's still a way to get writable/executable
1600 * memory, if an IPC key is mapped like this. That's a pity, but no total loss. */
1601
8a50cf69 1602 case SCMP_ARCH_X86:
57311925 1603 case SCMP_ARCH_S390:
8a50cf69
LP
1604 filter_syscall = SCMP_SYS(mmap2);
1605 block_syscall = SCMP_SYS(mmap);
bed4668d 1606 /* shmat multiplexed, see above */
2a8d6e63
ZJS
1607 break;
1608
63d00dfb 1609 case SCMP_ARCH_PPC:
2a8d6e63
ZJS
1610 case SCMP_ARCH_PPC64:
1611 case SCMP_ARCH_PPC64LE:
bed4668d 1612 case SCMP_ARCH_S390X:
2a8d6e63 1613 filter_syscall = SCMP_SYS(mmap);
bed4668d 1614 /* shmat multiplexed, see above */
8a50cf69
LP
1615 break;
1616
4278d1f5
ZJS
1617 case SCMP_ARCH_ARM:
1618 filter_syscall = SCMP_SYS(mmap2); /* arm has only mmap2 */
1619 shmat_syscall = SCMP_SYS(shmat);
1620 break;
1621
8a50cf69
LP
1622 case SCMP_ARCH_X86_64:
1623 case SCMP_ARCH_X32:
79873bc8 1624 case SCMP_ARCH_AARCH64:
bed4668d 1625 filter_syscall = SCMP_SYS(mmap); /* amd64, x32 and arm64 have only mmap */
8a50cf69
LP
1626 shmat_syscall = SCMP_SYS(shmat);
1627 break;
1628
1629 /* Please add more definitions here, if you port systemd to other architectures! */
1630
57311925 1631#if !defined(__i386__) && !defined(__x86_64__) && !defined(__powerpc__) && !defined(__powerpc64__) && !defined(__arm__) && !defined(__aarch64__) && !defined(__s390__) && !defined(__s390x__)
8a50cf69
LP
1632#warning "Consider adding the right mmap() syscall definitions here!"
1633#endif
1634 }
1635
1636 /* Can't filter mmap() on this arch, then skip it */
1637 if (filter_syscall == 0)
1638 continue;
1639
469830d1
LP
1640 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1641 if (r < 0)
1642 return r;
1643
6dc66688
ZJS
1644 r = add_seccomp_syscall_filter(seccomp, arch, filter_syscall,
1645 1,
1646 SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC|PROT_WRITE, PROT_EXEC|PROT_WRITE));
1647 if (r < 0)
1648 continue;
8a50cf69
LP
1649
1650 if (block_syscall != 0) {
6dc66688
ZJS
1651 r = add_seccomp_syscall_filter(seccomp, arch, block_syscall, 0, (const struct scmp_arg_cmp){} );
1652 if (r < 0)
8a50cf69 1653 continue;
add00535 1654 }
a3be2849 1655
6dc66688
ZJS
1656 r = add_seccomp_syscall_filter(seccomp, arch, SCMP_SYS(mprotect),
1657 1,
b835eeb4
ZJS
1658 SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC, PROT_EXEC));
1659 if (r < 0)
1660 continue;
1661
91691f1d 1662#ifdef __NR_pkey_mprotect
b835eeb4
ZJS
1663 r = add_seccomp_syscall_filter(seccomp, arch, SCMP_SYS(pkey_mprotect),
1664 1,
6dc66688
ZJS
1665 SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC, PROT_EXEC));
1666 if (r < 0)
469830d1 1667 continue;
91691f1d 1668#endif
add00535 1669
67fb5f33 1670 if (shmat_syscall > 0) {
5ef3ed97 1671 r = add_seccomp_syscall_filter(seccomp, arch, shmat_syscall,
6dc66688
ZJS
1672 1,
1673 SCMP_A2(SCMP_CMP_MASKED_EQ, SHM_EXEC, SHM_EXEC));
1674 if (r < 0)
8a50cf69 1675 continue;
469830d1
LP
1676 }
1677
1678 r = seccomp_load(seccomp);
7bc5e0b1 1679 if (ERRNO_IS_SECCOMP_FATAL(r))
469830d1 1680 return r;
add00535 1681 if (r < 0)
b069c2a3
ZJS
1682 log_debug_errno(r, "Failed to install MemoryDenyWriteExecute= rule for architecture %s, skipping: %m",
1683 seccomp_arch_to_string(arch));
903659e7 1684 loaded++;
469830d1 1685 }
add00535 1686
903659e7 1687 if (loaded == 0)
b069c2a3 1688 log_debug("Failed to install any seccomp rules for MemoryDenyWriteExecute=.");
903659e7
CE
1689
1690 return loaded;
469830d1
LP
1691}
1692
1693int seccomp_restrict_archs(Set *archs) {
1694 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1695 Iterator i;
1696 void *id;
1697 int r;
1698
1699 /* This installs a filter with no rules, but that restricts the system call architectures to the specified
2428aaf8
AJ
1700 * list.
1701 *
1702 * There are some qualifications. However the most important use is to stop processes from bypassing
1703 * system call restrictions, in case they used a broader (multiplexing) syscall which is only available
1704 * in a non-native architecture. There are no holes in this use case, at least so far. */
469830d1 1705
2428aaf8
AJ
1706 /* Note libseccomp includes our "native" (current) architecture in the filter by default.
1707 * We do not remove it. For example, our callers expect to be able to call execve() afterwards
1708 * to run a program with the restrictions applied. */
469830d1
LP
1709 seccomp = seccomp_init(SCMP_ACT_ALLOW);
1710 if (!seccomp)
1711 return -ENOMEM;
1712
1713 SET_FOREACH(id, archs, i) {
1714 r = seccomp_arch_add(seccomp, PTR_TO_UINT32(id) - 1);
2428aaf8
AJ
1715 if (r < 0 && r != -EEXIST)
1716 return r;
1717 }
1718
1719 /* The vdso for x32 assumes that x86-64 syscalls are available. Let's allow them, since x32
1720 * x32 syscalls should basically match x86-64 for everything except the pointer type.
1721 * The important thing is that you can block the old 32-bit x86 syscalls.
1722 * https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=850047 */
1723
1724 if (seccomp_arch_native() == SCMP_ARCH_X32 ||
1725 set_contains(archs, UINT32_TO_PTR(SCMP_ARCH_X32 + 1))) {
1726
1727 r = seccomp_arch_add(seccomp, SCMP_ARCH_X86_64);
1728 if (r < 0 && r != -EEXIST)
469830d1 1729 return r;
add00535
LP
1730 }
1731
469830d1
LP
1732 r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
1733 if (r < 0)
1734 return r;
add00535 1735
1c6af69b 1736 r = seccomp_load(seccomp);
7bc5e0b1 1737 if (ERRNO_IS_SECCOMP_FATAL(r))
1c6af69b
LP
1738 return r;
1739 if (r < 0)
1740 log_debug_errno(r, "Failed to restrict system call architectures, skipping: %m");
1741
1742 return 0;
a3be2849 1743}
b16bd535
YW
1744
1745int parse_syscall_archs(char **l, Set **archs) {
b9c54c46 1746 _cleanup_set_free_ Set *_archs = NULL;
b16bd535
YW
1747 char **s;
1748 int r;
1749
1750 assert(l);
1751 assert(archs);
1752
1753 r = set_ensure_allocated(&_archs, NULL);
1754 if (r < 0)
1755 return r;
1756
1757 STRV_FOREACH(s, l) {
1758 uint32_t a;
1759
1760 r = seccomp_arch_from_string(*s, &a);
1761 if (r < 0)
1762 return -EINVAL;
1763
1764 r = set_put(_archs, UINT32_TO_PTR(a + 1));
1765 if (r < 0)
1766 return -ENOMEM;
1767 }
1768
1cc6c93a 1769 *archs = TAKE_PTR(_archs);
b16bd535
YW
1770
1771 return 0;
1772}
165a31c0 1773
8cfa775f 1774int seccomp_filter_set_add(Hashmap *filter, bool add, const SyscallFilterSet *set) {
165a31c0
LP
1775 const char *i;
1776 int r;
1777
1778 assert(set);
1779
1780 NULSTR_FOREACH(i, set->value) {
1781
1782 if (i[0] == '@') {
1783 const SyscallFilterSet *more;
1784
1785 more = syscall_filter_set_find(i);
1786 if (!more)
1787 return -ENXIO;
1788
165a31c0
LP
1789 r = seccomp_filter_set_add(filter, add, more);
1790 if (r < 0)
1791 return r;
1792 } else {
1793 int id;
1794
1795 id = seccomp_syscall_resolve_name(i);
ff217dc3
LP
1796 if (id == __NR_SCMP_ERROR) {
1797 log_debug("Couldn't resolve system call, ignoring: %s", i);
1798 continue;
1799 }
165a31c0
LP
1800
1801 if (add) {
8cfa775f 1802 r = hashmap_put(filter, INT_TO_PTR(id + 1), INT_TO_PTR(-1));
165a31c0
LP
1803 if (r < 0)
1804 return r;
1805 } else
8cfa775f 1806 (void) hashmap_remove(filter, INT_TO_PTR(id + 1));
165a31c0
LP
1807 }
1808 }
1809
1810 return 0;
1811}
78e864e5
TM
1812
1813int seccomp_lock_personality(unsigned long personality) {
72eafe71 1814 uint32_t arch;
78e864e5
TM
1815 int r;
1816
72eafe71
LP
1817 if (personality >= PERSONALITY_INVALID)
1818 return -EINVAL;
78e864e5 1819
72eafe71
LP
1820 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1821 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
78e864e5 1822
72eafe71
LP
1823 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1824 if (r < 0)
1825 return r;
1826
1827 r = seccomp_rule_add_exact(
1828 seccomp,
1829 SCMP_ACT_ERRNO(EPERM),
1830 SCMP_SYS(personality),
1831 1,
1832 SCMP_A0(SCMP_CMP_NE, personality));
448ac526
LP
1833 if (r < 0) {
1834 log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1835 continue;
1836 }
72eafe71
LP
1837
1838 r = seccomp_load(seccomp);
7bc5e0b1 1839 if (ERRNO_IS_SECCOMP_FATAL(r))
72eafe71
LP
1840 return r;
1841 if (r < 0)
1842 log_debug_errno(r, "Failed to enable personality lock for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1843 }
1844
1845 return 0;
78e864e5 1846}
aecd5ac6
TM
1847
1848int seccomp_protect_hostname(void) {
1849 uint32_t arch;
1850 int r;
1851
1852 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1853 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1854
1855 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1856 if (r < 0)
1857 return r;
1858
1859 r = seccomp_rule_add_exact(
1860 seccomp,
1861 SCMP_ACT_ERRNO(EPERM),
1862 SCMP_SYS(sethostname),
1863 0);
9e6e543c
LP
1864 if (r < 0) {
1865 log_debug_errno(r, "Failed to add sethostname() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
aecd5ac6 1866 continue;
9e6e543c 1867 }
aecd5ac6
TM
1868
1869 r = seccomp_rule_add_exact(
1870 seccomp,
1871 SCMP_ACT_ERRNO(EPERM),
1872 SCMP_SYS(setdomainname),
1873 0);
9e6e543c
LP
1874 if (r < 0) {
1875 log_debug_errno(r, "Failed to add setdomainname() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
aecd5ac6 1876 continue;
9e6e543c 1877 }
aecd5ac6
TM
1878
1879 r = seccomp_load(seccomp);
7bc5e0b1 1880 if (ERRNO_IS_SECCOMP_FATAL(r))
aecd5ac6
TM
1881 return r;
1882 if (r < 0)
1883 log_debug_errno(r, "Failed to apply hostname restrictions for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1884 }
1885
1886 return 0;
1887}
3c27973b 1888
da4dc9a6
ZJS
1889static int seccomp_restrict_sxid(scmp_filter_ctx seccomp, mode_t m) {
1890 /* Checks the mode_t parameter of the following system calls:
1891 *
1892 * → chmod() + fchmod() + fchmodat()
1893 * → open() + creat() + openat()
1894 * → mkdir() + mkdirat()
1895 * → mknod() + mknodat()
1896 *
1897 * Returns error if *everything* failed, and 0 otherwise.
1898 */
1899 int r = 0;
1900 bool any = false;
1901
1902 r = seccomp_rule_add_exact(
1903 seccomp,
1904 SCMP_ACT_ERRNO(EPERM),
1905 SCMP_SYS(chmod),
1906 1,
1907 SCMP_A1(SCMP_CMP_MASKED_EQ, m, m));
1908 if (r < 0)
1909 log_debug_errno(r, "Failed to add filter for chmod: %m");
1910 else
1911 any = true;
1912
1913 r = seccomp_rule_add_exact(
1914 seccomp,
1915 SCMP_ACT_ERRNO(EPERM),
1916 SCMP_SYS(fchmod),
1917 1,
1918 SCMP_A1(SCMP_CMP_MASKED_EQ, m, m));
1919 if (r < 0)
1920 log_debug_errno(r, "Failed to add filter for fchmod: %m");
1921 else
1922 any = true;
1923
1924 r = seccomp_rule_add_exact(
1925 seccomp,
1926 SCMP_ACT_ERRNO(EPERM),
1927 SCMP_SYS(fchmodat),
1928 1,
1929 SCMP_A2(SCMP_CMP_MASKED_EQ, m, m));
1930 if (r < 0)
1931 log_debug_errno(r, "Failed to add filter for fchmodat: %m");
1932 else
1933 any = true;
1934
1935 r = seccomp_rule_add_exact(
1936 seccomp,
1937 SCMP_ACT_ERRNO(EPERM),
1938 SCMP_SYS(mkdir),
1939 1,
1940 SCMP_A1(SCMP_CMP_MASKED_EQ, m, m));
1941 if (r < 0)
1942 log_debug_errno(r, "Failed to add filter for mkdir: %m");
1943 else
1944 any = true;
1945
1946 r = seccomp_rule_add_exact(
1947 seccomp,
1948 SCMP_ACT_ERRNO(EPERM),
1949 SCMP_SYS(mkdirat),
1950 1,
1951 SCMP_A2(SCMP_CMP_MASKED_EQ, m, m));
1952 if (r < 0)
1953 log_debug_errno(r, "Failed to add filter for mkdirat: %m");
1954 else
1955 any = true;
1956
1957 r = seccomp_rule_add_exact(
1958 seccomp,
1959 SCMP_ACT_ERRNO(EPERM),
1960 SCMP_SYS(mknod),
1961 1,
1962 SCMP_A1(SCMP_CMP_MASKED_EQ, m, m));
1963 if (r < 0)
1964 log_debug_errno(r, "Failed to add filter for mknod: %m");
1965 else
1966 any = true;
1967
1968 r = seccomp_rule_add_exact(
1969 seccomp,
1970 SCMP_ACT_ERRNO(EPERM),
1971 SCMP_SYS(mknodat),
1972 1,
1973 SCMP_A2(SCMP_CMP_MASKED_EQ, m, m));
1974 if (r < 0)
1975 log_debug_errno(r, "Failed to add filter for mknodat: %m");
1976 else
1977 any = true;
1978
1979#if SCMP_SYS(open) > 0
1980 r = seccomp_rule_add_exact(
1981 seccomp,
1982 SCMP_ACT_ERRNO(EPERM),
1983 SCMP_SYS(open),
1984 2,
1985 SCMP_A1(SCMP_CMP_MASKED_EQ, O_CREAT, O_CREAT),
1986 SCMP_A2(SCMP_CMP_MASKED_EQ, m, m));
1987 if (r < 0)
1988 log_debug_errno(r, "Failed to add filter for open: %m");
1989 else
1990 any = true;
1991#endif
1992
1993 r = seccomp_rule_add_exact(
1994 seccomp,
1995 SCMP_ACT_ERRNO(EPERM),
1996 SCMP_SYS(openat),
1997 2,
1998 SCMP_A2(SCMP_CMP_MASKED_EQ, O_CREAT, O_CREAT),
1999 SCMP_A3(SCMP_CMP_MASKED_EQ, m, m));
2000 if (r < 0)
2001 log_debug_errno(r, "Failed to add filter for openat: %m");
2002 else
2003 any = true;
2004
2005 r = seccomp_rule_add_exact(
2006 seccomp,
2007 SCMP_ACT_ERRNO(EPERM),
2008 SCMP_SYS(creat),
2009 1,
2010 SCMP_A1(SCMP_CMP_MASKED_EQ, m, m));
2011 if (r < 0)
2012 log_debug_errno(r, "Failed to add filter for creat: %m");
2013 else
2014 any = true;
2015
2016 return any ? 0 : r;
2017}
2018
3c27973b
LP
2019int seccomp_restrict_suid_sgid(void) {
2020 uint32_t arch;
da4dc9a6 2021 int r, k;
3c27973b
LP
2022
2023 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
2024 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
2025
2026 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
2027 if (r < 0)
2028 return r;
2029
da4dc9a6
ZJS
2030 r = seccomp_restrict_sxid(seccomp, S_ISUID);
2031 if (r < 0)
2032 log_debug_errno(r, "Failed to add suid rule for architecture %s, ignoring: %m", seccomp_arch_to_string(arch));
3c27973b 2033
da4dc9a6
ZJS
2034 k = seccomp_restrict_sxid(seccomp, S_ISGID);
2035 if (k < 0)
2036 log_debug_errno(r, "Failed to add sgid rule for architecture %s, ignoring: %m", seccomp_arch_to_string(arch));
3c27973b 2037
da4dc9a6 2038 if (r < 0 && k < 0)
3c27973b 2039 continue;
3c27973b
LP
2040
2041 r = seccomp_load(seccomp);
7bc5e0b1 2042 if (ERRNO_IS_SECCOMP_FATAL(r))
3c27973b
LP
2043 return r;
2044 if (r < 0)
2045 log_debug_errno(r, "Failed to apply suid/sgid restrictions for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
2046 }
2047
2048 return 0;
2049}
915fb324
LP
2050
2051uint32_t scmp_act_kill_process(void) {
2052
2053 /* Returns SCMP_ACT_KILL_PROCESS if it's supported, and SCMP_ACT_KILL_THREAD otherwise. We never
2054 * actually want to use SCMP_ACT_KILL_THREAD as its semantics are nuts (killing arbitrary threads of
2055 * a program is just a bad idea), but on old kernels/old libseccomp it is all we have, and at least
2056 * for single-threaded apps does the right thing. */
2057
2058#ifdef SCMP_ACT_KILL_PROCESS
2059 if (seccomp_api_get() >= 3)
2060 return SCMP_ACT_KILL_PROCESS;
2061#endif
2062
2063 return SCMP_ACT_KILL; /* same as SCMP_ACT_KILL_THREAD */
2064}