]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/shared/seccomp-util.c
Merge pull request #16705 from bluca/verity_udev_wait
[thirdparty/systemd.git] / src / shared / seccomp-util.c
CommitLineData
53e1b683 1/* SPDX-License-Identifier: LGPL-2.1+ */
57183d11 2
a8fbdf54 3#include <errno.h>
3c27973b 4#include <fcntl.h>
469830d1 5#include <linux/seccomp.h>
57183d11 6#include <seccomp.h>
a8fbdf54 7#include <stddef.h>
469830d1 8#include <sys/mman.h>
d347d902 9#include <sys/prctl.h>
469830d1 10#include <sys/shm.h>
3c27973b 11#include <sys/stat.h>
57183d11 12
469830d1 13#include "af-list.h"
add00535 14#include "alloc-util.h"
d8b4d14d 15#include "errno-list.h"
a8fbdf54 16#include "macro.h"
add00535 17#include "nsflags.h"
d8b4d14d 18#include "nulstr-util.h"
78e864e5 19#include "process-util.h"
cf0fbc49 20#include "seccomp-util.h"
b16bd535 21#include "set.h"
07630cea 22#include "string-util.h"
b16bd535 23#include "strv.h"
469830d1
LP
24
25const uint32_t seccomp_local_archs[] = {
26
6b000af4 27 /* Note: always list the native arch we are compiled as last, so that users can deny-list seccomp(), but our own calls to it still succeed */
f2d9751c
LP
28
29#if defined(__x86_64__) && defined(__ILP32__)
469830d1
LP
30 SCMP_ARCH_X86,
31 SCMP_ARCH_X86_64,
f2d9751c
LP
32 SCMP_ARCH_X32, /* native */
33#elif defined(__x86_64__) && !defined(__ILP32__)
34 SCMP_ARCH_X86,
469830d1 35 SCMP_ARCH_X32,
f2d9751c
LP
36 SCMP_ARCH_X86_64, /* native */
37#elif defined(__i386__)
38 SCMP_ARCH_X86,
39#elif defined(__aarch64__)
469830d1 40 SCMP_ARCH_ARM,
f2d9751c
LP
41 SCMP_ARCH_AARCH64, /* native */
42#elif defined(__arm__)
43 SCMP_ARCH_ARM,
44#elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI32
45 SCMP_ARCH_MIPSEL,
46 SCMP_ARCH_MIPS, /* native */
47#elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI32
469830d1 48 SCMP_ARCH_MIPS,
f2d9751c
LP
49 SCMP_ARCH_MIPSEL, /* native */
50#elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI64
51 SCMP_ARCH_MIPSEL,
52 SCMP_ARCH_MIPS,
53 SCMP_ARCH_MIPSEL64N32,
469830d1 54 SCMP_ARCH_MIPS64N32,
f2d9751c
LP
55 SCMP_ARCH_MIPSEL64,
56 SCMP_ARCH_MIPS64, /* native */
57#elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI64
58 SCMP_ARCH_MIPS,
469830d1 59 SCMP_ARCH_MIPSEL,
f2d9751c
LP
60 SCMP_ARCH_MIPS64N32,
61 SCMP_ARCH_MIPSEL64N32,
62 SCMP_ARCH_MIPS64,
63 SCMP_ARCH_MIPSEL64, /* native */
64#elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_NABI32
65 SCMP_ARCH_MIPSEL,
66 SCMP_ARCH_MIPS,
469830d1 67 SCMP_ARCH_MIPSEL64,
f2d9751c 68 SCMP_ARCH_MIPS64,
469830d1 69 SCMP_ARCH_MIPSEL64N32,
f2d9751c
LP
70 SCMP_ARCH_MIPS64N32, /* native */
71#elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_NABI32
72 SCMP_ARCH_MIPS,
73 SCMP_ARCH_MIPSEL,
74 SCMP_ARCH_MIPS64,
75 SCMP_ARCH_MIPSEL64,
76 SCMP_ARCH_MIPS64N32,
77 SCMP_ARCH_MIPSEL64N32, /* native */
78#elif defined(__powerpc64__) && __BYTE_ORDER == __BIG_ENDIAN
469830d1 79 SCMP_ARCH_PPC,
469830d1 80 SCMP_ARCH_PPC64LE,
f2d9751c
LP
81 SCMP_ARCH_PPC64, /* native */
82#elif defined(__powerpc64__) && __BYTE_ORDER == __LITTLE_ENDIAN
83 SCMP_ARCH_PPC,
84 SCMP_ARCH_PPC64,
85 SCMP_ARCH_PPC64LE, /* native */
86#elif defined(__powerpc__)
87 SCMP_ARCH_PPC,
88#elif defined(__s390x__)
89 SCMP_ARCH_S390,
90 SCMP_ARCH_S390X, /* native */
91#elif defined(__s390__)
469830d1 92 SCMP_ARCH_S390,
469830d1
LP
93#endif
94 (uint32_t) -1
95 };
57183d11
LP
96
97const char* seccomp_arch_to_string(uint32_t c) {
aa34055f
ZJS
98 /* Maintain order used in <seccomp.h>.
99 *
100 * Names used here should be the same as those used for ConditionArchitecture=,
101 * except for "subarchitectures" like x32. */
57183d11 102
aa34055f
ZJS
103 switch(c) {
104 case SCMP_ARCH_NATIVE:
57183d11 105 return "native";
aa34055f 106 case SCMP_ARCH_X86:
57183d11 107 return "x86";
aa34055f 108 case SCMP_ARCH_X86_64:
57183d11 109 return "x86-64";
aa34055f 110 case SCMP_ARCH_X32:
57183d11 111 return "x32";
aa34055f 112 case SCMP_ARCH_ARM:
57183d11 113 return "arm";
aa34055f
ZJS
114 case SCMP_ARCH_AARCH64:
115 return "arm64";
116 case SCMP_ARCH_MIPS:
117 return "mips";
118 case SCMP_ARCH_MIPS64:
119 return "mips64";
120 case SCMP_ARCH_MIPS64N32:
121 return "mips64-n32";
122 case SCMP_ARCH_MIPSEL:
123 return "mips-le";
124 case SCMP_ARCH_MIPSEL64:
125 return "mips64-le";
126 case SCMP_ARCH_MIPSEL64N32:
127 return "mips64-le-n32";
128 case SCMP_ARCH_PPC:
129 return "ppc";
130 case SCMP_ARCH_PPC64:
131 return "ppc64";
132 case SCMP_ARCH_PPC64LE:
133 return "ppc64-le";
134 case SCMP_ARCH_S390:
6abfd303 135 return "s390";
aa34055f 136 case SCMP_ARCH_S390X:
6abfd303 137 return "s390x";
aa34055f
ZJS
138 default:
139 return NULL;
140 }
57183d11
LP
141}
142
143int seccomp_arch_from_string(const char *n, uint32_t *ret) {
144 if (!n)
145 return -EINVAL;
146
147 assert(ret);
148
149 if (streq(n, "native"))
150 *ret = SCMP_ARCH_NATIVE;
151 else if (streq(n, "x86"))
152 *ret = SCMP_ARCH_X86;
153 else if (streq(n, "x86-64"))
154 *ret = SCMP_ARCH_X86_64;
155 else if (streq(n, "x32"))
156 *ret = SCMP_ARCH_X32;
157 else if (streq(n, "arm"))
158 *ret = SCMP_ARCH_ARM;
aa34055f
ZJS
159 else if (streq(n, "arm64"))
160 *ret = SCMP_ARCH_AARCH64;
161 else if (streq(n, "mips"))
162 *ret = SCMP_ARCH_MIPS;
163 else if (streq(n, "mips64"))
164 *ret = SCMP_ARCH_MIPS64;
165 else if (streq(n, "mips64-n32"))
166 *ret = SCMP_ARCH_MIPS64N32;
167 else if (streq(n, "mips-le"))
168 *ret = SCMP_ARCH_MIPSEL;
169 else if (streq(n, "mips64-le"))
170 *ret = SCMP_ARCH_MIPSEL64;
171 else if (streq(n, "mips64-le-n32"))
172 *ret = SCMP_ARCH_MIPSEL64N32;
173 else if (streq(n, "ppc"))
174 *ret = SCMP_ARCH_PPC;
175 else if (streq(n, "ppc64"))
176 *ret = SCMP_ARCH_PPC64;
177 else if (streq(n, "ppc64-le"))
178 *ret = SCMP_ARCH_PPC64LE;
6abfd303
HB
179 else if (streq(n, "s390"))
180 *ret = SCMP_ARCH_S390;
181 else if (streq(n, "s390x"))
182 *ret = SCMP_ARCH_S390X;
57183d11
LP
183 else
184 return -EINVAL;
185
186 return 0;
187}
e9642be2 188
469830d1 189int seccomp_init_for_arch(scmp_filter_ctx *ret, uint32_t arch, uint32_t default_action) {
8d7b0c8f
LP
190 scmp_filter_ctx seccomp;
191 int r;
192
469830d1
LP
193 /* Much like seccomp_init(), but initializes the filter for one specific architecture only, without affecting
194 * any others. Also, turns off the NNP fiddling. */
8d7b0c8f
LP
195
196 seccomp = seccomp_init(default_action);
197 if (!seccomp)
198 return -ENOMEM;
199
469830d1
LP
200 if (arch != SCMP_ARCH_NATIVE &&
201 arch != seccomp_arch_native()) {
202
1b52793d 203 r = seccomp_arch_remove(seccomp, seccomp_arch_native());
469830d1
LP
204 if (r < 0)
205 goto finish;
206
1b52793d 207 r = seccomp_arch_add(seccomp, arch);
469830d1
LP
208 if (r < 0)
209 goto finish;
210
211 assert(seccomp_arch_exist(seccomp, arch) >= 0);
212 assert(seccomp_arch_exist(seccomp, SCMP_ARCH_NATIVE) == -EEXIST);
213 assert(seccomp_arch_exist(seccomp, seccomp_arch_native()) == -EEXIST);
214 } else {
215 assert(seccomp_arch_exist(seccomp, SCMP_ARCH_NATIVE) >= 0);
216 assert(seccomp_arch_exist(seccomp, seccomp_arch_native()) >= 0);
217 }
218
219 r = seccomp_attr_set(seccomp, SCMP_FLTATR_ACT_BADARCH, SCMP_ACT_ALLOW);
8d7b0c8f
LP
220 if (r < 0)
221 goto finish;
222
223 r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
224 if (r < 0)
225 goto finish;
226
227 *ret = seccomp;
228 return 0;
229
230finish:
231 seccomp_release(seccomp);
232 return r;
233}
234
/* Probes for basic seccomp support: true if the PR_GET_SECCOMP query via prctl() succeeds. */
static bool is_basic_seccomp_available(void) {
        int mode;

        mode = prctl(PR_GET_SECCOMP, 0, 0, 0, 0);
        return mode >= 0;
}
238
/* Probes for seccomp filter mode without installing anything: we request SECCOMP_MODE_FILTER with a
 * NULL filter program and treat a failure with EFAULT as "supported" (see prctl(2) for the error
 * semantics). Any other outcome counts as "not available". */
static bool is_seccomp_filter_available(void) {
        if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, NULL, 0, 0) >= 0)
                return false;

        return errno == EFAULT;
}
243
/* Returns true if both basic seccomp and seccomp filter mode are available. The probes are run
 * only on the first call; the result is kept in a function-local static and reused afterwards. */
bool is_seccomp_available(void) {
        static int cached_enabled = -1; /* -1: not probed yet, otherwise the boolean result */

        if (cached_enabled >= 0)
                return cached_enabled;

        cached_enabled = is_basic_seccomp_available() && is_seccomp_filter_available();
        return cached_enabled;
}
254
8130926d 255const SyscallFilterSet syscall_filter_sets[_SYSCALL_FILTER_SET_MAX] = {
40eb6a80 256 [SYSCALL_FILTER_SET_DEFAULT] = {
40eb6a80 257 .name = "@default",
d5efc18b 258 .help = "System calls that are always permitted",
40eb6a80
ZJS
259 .value =
260 "clock_getres\0"
6ca67710 261 "clock_getres_time64\0"
40eb6a80 262 "clock_gettime\0"
6ca67710 263 "clock_gettime64\0"
40eb6a80 264 "clock_nanosleep\0"
6ca67710 265 "clock_nanosleep_time64\0"
40eb6a80
ZJS
266 "execve\0"
267 "exit\0"
268 "exit_group\0"
e41b0f42 269 "futex\0"
6ca67710 270 "futex_time64\0"
e41b0f42
LP
271 "get_robust_list\0"
272 "get_thread_area\0"
09d3020b
DH
273 "getegid\0"
274 "getegid32\0"
275 "geteuid\0"
276 "geteuid32\0"
277 "getgid\0"
278 "getgid32\0"
279 "getgroups\0"
280 "getgroups32\0"
281 "getpgid\0"
282 "getpgrp\0"
283 "getpid\0"
284 "getppid\0"
285 "getresgid\0"
286 "getresgid32\0"
287 "getresuid\0"
288 "getresuid32\0"
40eb6a80 289 "getrlimit\0" /* make sure processes can query stack size and such */
09d3020b
DH
290 "getsid\0"
291 "gettid\0"
40eb6a80 292 "gettimeofday\0"
09d3020b
DH
293 "getuid\0"
294 "getuid32\0"
e41b0f42 295 "membarrier\0"
40eb6a80
ZJS
296 "nanosleep\0"
297 "pause\0"
4c3a9176 298 "prlimit64\0"
e41b0f42 299 "restart_syscall\0"
6fee3be0 300 "rseq\0"
40eb6a80 301 "rt_sigreturn\0"
8f44de08 302 "sched_yield\0"
e41b0f42
LP
303 "set_robust_list\0"
304 "set_thread_area\0"
305 "set_tid_address\0"
ce5faeac 306 "set_tls\0"
40eb6a80
ZJS
307 "sigreturn\0"
308 "time\0"
4c3a9176 309 "ugetrlimit\0"
40eb6a80 310 },
44898c53
LP
311 [SYSCALL_FILTER_SET_AIO] = {
312 .name = "@aio",
313 .help = "Asynchronous IO",
314 .value =
315 "io_cancel\0"
316 "io_destroy\0"
317 "io_getevents\0"
a05cfe23 318 "io_pgetevents\0"
6ca67710 319 "io_pgetevents_time64\0"
44898c53
LP
320 "io_setup\0"
321 "io_submit\0"
9e486265
LP
322 "io_uring_enter\0"
323 "io_uring_register\0"
324 "io_uring_setup\0"
44898c53 325 },
133ddbbe 326 [SYSCALL_FILTER_SET_BASIC_IO] = {
133ddbbe 327 .name = "@basic-io",
d5efc18b 328 .help = "Basic IO",
133ddbbe 329 .value =
648a0ed0 330 "_llseek\0"
133ddbbe 331 "close\0"
648a0ed0 332 "dup\0"
133ddbbe
LP
333 "dup2\0"
334 "dup3\0"
133ddbbe
LP
335 "lseek\0"
336 "pread64\0"
337 "preadv\0"
44898c53 338 "preadv2\0"
133ddbbe
LP
339 "pwrite64\0"
340 "pwritev\0"
44898c53 341 "pwritev2\0"
133ddbbe
LP
342 "read\0"
343 "readv\0"
344 "write\0"
345 "writev\0"
346 },
44898c53
LP
347 [SYSCALL_FILTER_SET_CHOWN] = {
348 .name = "@chown",
349 .help = "Change ownership of files and directories",
350 .value =
351 "chown\0"
352 "chown32\0"
353 "fchown\0"
354 "fchown32\0"
355 "fchownat\0"
356 "lchown\0"
357 "lchown32\0"
358 },
8130926d 359 [SYSCALL_FILTER_SET_CLOCK] = {
8130926d 360 .name = "@clock",
d5efc18b 361 .help = "Change the system time",
201c1cc2
TM
362 .value =
363 "adjtimex\0"
1f9ac68b 364 "clock_adjtime\0"
6ca67710 365 "clock_adjtime64\0"
1f9ac68b 366 "clock_settime\0"
6ca67710 367 "clock_settime64\0"
201c1cc2 368 "settimeofday\0"
1f9ac68b 369 "stime\0"
8130926d
LP
370 },
371 [SYSCALL_FILTER_SET_CPU_EMULATION] = {
8130926d 372 .name = "@cpu-emulation",
d5efc18b 373 .help = "System calls for CPU emulation functionality",
1f9ac68b
LP
374 .value =
375 "modify_ldt\0"
376 "subpage_prot\0"
377 "switch_endian\0"
378 "vm86\0"
379 "vm86old\0"
8130926d
LP
380 },
381 [SYSCALL_FILTER_SET_DEBUG] = {
8130926d 382 .name = "@debug",
d5efc18b 383 .help = "Debugging, performance monitoring and tracing functionality",
1f9ac68b
LP
384 .value =
385 "lookup_dcookie\0"
386 "perf_event_open\0"
8270e3d8 387 "pidfd_getfd\0"
1f9ac68b
LP
388 "ptrace\0"
389 "rtas\0"
8130926d 390#ifdef __NR_s390_runtime_instr
1f9ac68b 391 "s390_runtime_instr\0"
8130926d 392#endif
1f9ac68b 393 "sys_debug_setcontext\0"
8130926d 394 },
1a1b13c9
LP
395 [SYSCALL_FILTER_SET_FILE_SYSTEM] = {
396 .name = "@file-system",
397 .help = "File system operations",
398 .value =
399 "access\0"
400 "chdir\0"
401 "chmod\0"
402 "close\0"
403 "creat\0"
404 "faccessat\0"
405 "fallocate\0"
406 "fchdir\0"
407 "fchmod\0"
408 "fchmodat\0"
1a1b13c9 409 "fcntl\0"
ceaa6aa7 410 "fcntl64\0"
1a1b13c9
LP
411 "fgetxattr\0"
412 "flistxattr\0"
ceaa6aa7 413 "fremovexattr\0"
1a1b13c9 414 "fsetxattr\0"
1a1b13c9 415 "fstat\0"
ceaa6aa7 416 "fstat64\0"
1a1b13c9 417 "fstatat64\0"
1a1b13c9 418 "fstatfs\0"
ceaa6aa7 419 "fstatfs64\0"
1a1b13c9 420 "ftruncate\0"
ceaa6aa7 421 "ftruncate64\0"
1a1b13c9
LP
422 "futimesat\0"
423 "getcwd\0"
1a1b13c9 424 "getdents\0"
ceaa6aa7 425 "getdents64\0"
1a1b13c9
LP
426 "getxattr\0"
427 "inotify_add_watch\0"
ceaa6aa7 428 "inotify_init\0"
1a1b13c9
LP
429 "inotify_init1\0"
430 "inotify_rm_watch\0"
431 "lgetxattr\0"
432 "link\0"
433 "linkat\0"
434 "listxattr\0"
435 "llistxattr\0"
436 "lremovexattr\0"
437 "lsetxattr\0"
1a1b13c9 438 "lstat\0"
ceaa6aa7 439 "lstat64\0"
1a1b13c9
LP
440 "mkdir\0"
441 "mkdirat\0"
442 "mknod\0"
443 "mknodat\0"
1a1b13c9 444 "mmap\0"
ceaa6aa7 445 "mmap2\0"
7961116e 446 "munmap\0"
1a1b13c9 447 "newfstatat\0"
ceaa6aa7
LP
448 "oldfstat\0"
449 "oldlstat\0"
450 "oldstat\0"
1a1b13c9
LP
451 "open\0"
452 "openat\0"
8270e3d8 453 "openat2\0"
1a1b13c9
LP
454 "readlink\0"
455 "readlinkat\0"
456 "removexattr\0"
457 "rename\0"
1a1b13c9 458 "renameat\0"
ceaa6aa7 459 "renameat2\0"
1a1b13c9
LP
460 "rmdir\0"
461 "setxattr\0"
1a1b13c9 462 "stat\0"
ceaa6aa7 463 "stat64\0"
1a1b13c9 464 "statfs\0"
ceaa6aa7 465 "statfs64\0"
8e6a7a8b 466#ifdef __NR_statx
a4135a74 467 "statx\0"
ceaa6aa7 468#endif
1a1b13c9
LP
469 "symlink\0"
470 "symlinkat\0"
1a1b13c9 471 "truncate\0"
ceaa6aa7 472 "truncate64\0"
1a1b13c9
LP
473 "unlink\0"
474 "unlinkat\0"
ceaa6aa7 475 "utime\0"
1a1b13c9 476 "utimensat\0"
6ca67710 477 "utimensat_time64\0"
1a1b13c9
LP
478 "utimes\0"
479 },
8130926d 480 [SYSCALL_FILTER_SET_IO_EVENT] = {
8130926d 481 .name = "@io-event",
d5efc18b 482 .help = "Event loop system calls",
201c1cc2
TM
483 .value =
484 "_newselect\0"
201c1cc2 485 "epoll_create\0"
215728ff 486 "epoll_create1\0"
201c1cc2
TM
487 "epoll_ctl\0"
488 "epoll_ctl_old\0"
489 "epoll_pwait\0"
490 "epoll_wait\0"
491 "epoll_wait_old\0"
201c1cc2 492 "eventfd\0"
215728ff 493 "eventfd2\0"
201c1cc2
TM
494 "poll\0"
495 "ppoll\0"
6ca67710 496 "ppoll_time64\0"
201c1cc2 497 "pselect6\0"
6ca67710 498 "pselect6_time64\0"
201c1cc2 499 "select\0"
8130926d
LP
500 },
501 [SYSCALL_FILTER_SET_IPC] = {
8130926d 502 .name = "@ipc",
d5efc18b
ZJS
503 .help = "SysV IPC, POSIX Message Queues or other IPC",
504 .value =
505 "ipc\0"
cd5bfd7e 506 "memfd_create\0"
201c1cc2
TM
507 "mq_getsetattr\0"
508 "mq_notify\0"
509 "mq_open\0"
510 "mq_timedreceive\0"
6ca67710 511 "mq_timedreceive_time64\0"
201c1cc2 512 "mq_timedsend\0"
6ca67710 513 "mq_timedsend_time64\0"
201c1cc2
TM
514 "mq_unlink\0"
515 "msgctl\0"
516 "msgget\0"
517 "msgrcv\0"
518 "msgsnd\0"
cd5bfd7e 519 "pipe\0"
215728ff 520 "pipe2\0"
201c1cc2
TM
521 "process_vm_readv\0"
522 "process_vm_writev\0"
523 "semctl\0"
524 "semget\0"
525 "semop\0"
526 "semtimedop\0"
6ca67710 527 "semtimedop_time64\0"
201c1cc2
TM
528 "shmat\0"
529 "shmctl\0"
530 "shmdt\0"
531 "shmget\0"
8130926d
LP
532 },
533 [SYSCALL_FILTER_SET_KEYRING] = {
8130926d 534 .name = "@keyring",
d5efc18b 535 .help = "Kernel keyring access",
1f9ac68b
LP
536 .value =
537 "add_key\0"
538 "keyctl\0"
539 "request_key\0"
8130926d 540 },
cd0ddf6f
LP
541 [SYSCALL_FILTER_SET_MEMLOCK] = {
542 .name = "@memlock",
543 .help = "Memory locking control",
544 .value =
545 "mlock\0"
546 "mlock2\0"
547 "mlockall\0"
548 "munlock\0"
549 "munlockall\0"
550 },
8130926d 551 [SYSCALL_FILTER_SET_MODULE] = {
8130926d 552 .name = "@module",
d5efc18b 553 .help = "Loading and unloading of kernel modules",
201c1cc2 554 .value =
201c1cc2
TM
555 "delete_module\0"
556 "finit_module\0"
557 "init_module\0"
8130926d
LP
558 },
559 [SYSCALL_FILTER_SET_MOUNT] = {
8130926d 560 .name = "@mount",
d5efc18b 561 .help = "Mounting and unmounting of file systems",
201c1cc2
TM
562 .value =
563 "chroot\0"
9e486265
LP
564 "fsconfig\0"
565 "fsmount\0"
566 "fsopen\0"
567 "fspick\0"
201c1cc2 568 "mount\0"
9e486265
LP
569 "move_mount\0"
570 "open_tree\0"
201c1cc2 571 "pivot_root\0"
201c1cc2 572 "umount\0"
215728ff 573 "umount2\0"
8130926d
LP
574 },
575 [SYSCALL_FILTER_SET_NETWORK_IO] = {
8130926d 576 .name = "@network-io",
d5efc18b 577 .help = "Network or Unix socket IO, should not be needed if not network facing",
201c1cc2 578 .value =
201c1cc2 579 "accept\0"
215728ff 580 "accept4\0"
201c1cc2
TM
581 "bind\0"
582 "connect\0"
583 "getpeername\0"
584 "getsockname\0"
585 "getsockopt\0"
586 "listen\0"
587 "recv\0"
588 "recvfrom\0"
589 "recvmmsg\0"
6ca67710 590 "recvmmsg_time64\0"
201c1cc2
TM
591 "recvmsg\0"
592 "send\0"
593 "sendmmsg\0"
594 "sendmsg\0"
595 "sendto\0"
596 "setsockopt\0"
597 "shutdown\0"
598 "socket\0"
599 "socketcall\0"
600 "socketpair\0"
8130926d
LP
601 },
602 [SYSCALL_FILTER_SET_OBSOLETE] = {
d5efc18b 603 /* some unknown even to libseccomp */
8130926d 604 .name = "@obsolete",
d5efc18b 605 .help = "Unusual, obsolete or unimplemented system calls",
201c1cc2
TM
606 .value =
607 "_sysctl\0"
608 "afs_syscall\0"
802fa07a 609 "bdflush\0"
201c1cc2 610 "break\0"
1f9ac68b 611 "create_module\0"
201c1cc2
TM
612 "ftime\0"
613 "get_kernel_syms\0"
201c1cc2
TM
614 "getpmsg\0"
615 "gtty\0"
7e0c3b8f 616 "idle\0"
201c1cc2 617 "lock\0"
201c1cc2 618 "mpx\0"
201c1cc2
TM
619 "prof\0"
620 "profil\0"
201c1cc2
TM
621 "putpmsg\0"
622 "query_module\0"
201c1cc2
TM
623 "security\0"
624 "sgetmask\0"
625 "ssetmask\0"
626 "stty\0"
1f9ac68b 627 "sysfs\0"
201c1cc2
TM
628 "tuxcall\0"
629 "ulimit\0"
630 "uselib\0"
1f9ac68b 631 "ustat\0"
201c1cc2 632 "vserver\0"
8130926d 633 },
9493b168
ZJS
634 [SYSCALL_FILTER_SET_PKEY] = {
635 .name = "@pkey",
636 .help = "System calls used for memory protection keys",
637 .value =
638 "pkey_alloc\0"
639 "pkey_free\0"
640 "pkey_mprotect\0"
641 },
8130926d 642 [SYSCALL_FILTER_SET_PRIVILEGED] = {
8130926d 643 .name = "@privileged",
d5efc18b 644 .help = "All system calls which need super-user capabilities",
201c1cc2 645 .value =
44898c53 646 "@chown\0"
201c1cc2
TM
647 "@clock\0"
648 "@module\0"
649 "@raw-io\0"
af0f047b
LP
650 "@reboot\0"
651 "@swap\0"
215728ff 652 "_sysctl\0"
201c1cc2 653 "acct\0"
201c1cc2 654 "bpf\0"
1f9ac68b 655 "capset\0"
201c1cc2 656 "chroot\0"
a05cfe23 657 "fanotify_init\0"
9e486265 658 "fanotify_mark\0"
201c1cc2 659 "nfsservctl\0"
a05cfe23 660 "open_by_handle_at\0"
201c1cc2
TM
661 "pivot_root\0"
662 "quotactl\0"
201c1cc2 663 "setdomainname\0"
201c1cc2 664 "setfsuid\0"
215728ff 665 "setfsuid32\0"
201c1cc2 666 "setgroups\0"
215728ff 667 "setgroups32\0"
201c1cc2 668 "sethostname\0"
201c1cc2 669 "setresuid\0"
215728ff 670 "setresuid32\0"
201c1cc2 671 "setreuid\0"
215728ff 672 "setreuid32\0"
e05ee49b 673 "setuid\0" /* We list the explicit system calls here, as @setuid also includes setgid() which is not necessarily privileged */
215728ff 674 "setuid32\0"
201c1cc2 675 "vhangup\0"
8130926d
LP
676 },
677 [SYSCALL_FILTER_SET_PROCESS] = {
8130926d 678 .name = "@process",
d5efc18b 679 .help = "Process control, execution, namespaceing operations",
201c1cc2
TM
680 .value =
681 "arch_prctl\0"
09d3020b 682 "capget\0" /* Able to query arbitrary processes */
201c1cc2 683 "clone\0"
9e486265 684 "clone3\0"
201c1cc2
TM
685 "execveat\0"
686 "fork\0"
b887d2eb 687 "getrusage\0"
201c1cc2 688 "kill\0"
9e486265 689 "pidfd_open\0"
46fcf95d 690 "pidfd_send_signal\0"
201c1cc2 691 "prctl\0"
b887d2eb
LP
692 "rt_sigqueueinfo\0"
693 "rt_tgsigqueueinfo\0"
201c1cc2 694 "setns\0"
a9518dc3 695 "swapcontext\0" /* Some archs e.g. powerpc32 are using it to do userspace context switches */
201c1cc2 696 "tgkill\0"
b887d2eb 697 "times\0"
201c1cc2
TM
698 "tkill\0"
699 "unshare\0"
700 "vfork\0"
b887d2eb
LP
701 "wait4\0"
702 "waitid\0"
703 "waitpid\0"
8130926d
LP
704 },
705 [SYSCALL_FILTER_SET_RAW_IO] = {
8130926d 706 .name = "@raw-io",
d5efc18b 707 .help = "Raw I/O port access",
201c1cc2
TM
708 .value =
709 "ioperm\0"
710 "iopl\0"
1f9ac68b 711 "pciconfig_iobase\0"
201c1cc2
TM
712 "pciconfig_read\0"
713 "pciconfig_write\0"
8130926d 714#ifdef __NR_s390_pci_mmio_read
201c1cc2 715 "s390_pci_mmio_read\0"
8130926d
LP
716#endif
717#ifdef __NR_s390_pci_mmio_write
201c1cc2 718 "s390_pci_mmio_write\0"
8130926d
LP
719#endif
720 },
bd2ab3f4
LP
721 [SYSCALL_FILTER_SET_REBOOT] = {
722 .name = "@reboot",
723 .help = "Reboot and reboot preparation/kexec",
724 .value =
bd2ab3f4 725 "kexec_file_load\0"
e59608fa 726 "kexec_load\0"
bd2ab3f4
LP
727 "reboot\0"
728 },
133ddbbe 729 [SYSCALL_FILTER_SET_RESOURCES] = {
133ddbbe 730 .name = "@resources",
58a8f68b 731 .help = "Alter resource settings",
133ddbbe 732 .value =
0963c053
LP
733 "ioprio_set\0"
734 "mbind\0"
735 "migrate_pages\0"
736 "move_pages\0"
737 "nice\0"
0963c053
LP
738 "sched_setaffinity\0"
739 "sched_setattr\0"
133ddbbe
LP
740 "sched_setparam\0"
741 "sched_setscheduler\0"
0963c053 742 "set_mempolicy\0"
133ddbbe
LP
743 "setpriority\0"
744 "setrlimit\0"
133ddbbe 745 },
6eaaeee9
LP
746 [SYSCALL_FILTER_SET_SETUID] = {
747 .name = "@setuid",
748 .help = "Operations for changing user/group credentials",
749 .value =
6eaaeee9 750 "setgid\0"
215728ff 751 "setgid32\0"
6eaaeee9 752 "setgroups\0"
215728ff 753 "setgroups32\0"
6eaaeee9 754 "setregid\0"
215728ff 755 "setregid32\0"
6eaaeee9 756 "setresgid\0"
215728ff 757 "setresgid32\0"
6eaaeee9 758 "setresuid\0"
215728ff 759 "setresuid32\0"
6eaaeee9 760 "setreuid\0"
215728ff 761 "setreuid32\0"
6eaaeee9 762 "setuid\0"
215728ff 763 "setuid32\0"
6eaaeee9 764 },
cd0ddf6f
LP
765 [SYSCALL_FILTER_SET_SIGNAL] = {
766 .name = "@signal",
767 .help = "Process signal handling",
768 .value =
769 "rt_sigaction\0"
770 "rt_sigpending\0"
771 "rt_sigprocmask\0"
772 "rt_sigsuspend\0"
773 "rt_sigtimedwait\0"
6ca67710 774 "rt_sigtimedwait_time64\0"
cd0ddf6f
LP
775 "sigaction\0"
776 "sigaltstack\0"
777 "signal\0"
778 "signalfd\0"
779 "signalfd4\0"
780 "sigpending\0"
781 "sigprocmask\0"
782 "sigsuspend\0"
783 },
bd2ab3f4
LP
784 [SYSCALL_FILTER_SET_SWAP] = {
785 .name = "@swap",
786 .help = "Enable/disable swap devices",
787 .value =
788 "swapoff\0"
789 "swapon\0"
790 },
44898c53
LP
791 [SYSCALL_FILTER_SET_SYNC] = {
792 .name = "@sync",
793 .help = "Synchronize files and memory to storage",
794 .value =
795 "fdatasync\0"
796 "fsync\0"
797 "msync\0"
798 "sync\0"
799 "sync_file_range\0"
a8fb09f5 800 "sync_file_range2\0"
44898c53
LP
801 "syncfs\0"
802 },
70526841
LP
803 [SYSCALL_FILTER_SET_SYSTEM_SERVICE] = {
804 .name = "@system-service",
805 .help = "General system service operations",
806 .value =
807 "@aio\0"
808 "@basic-io\0"
809 "@chown\0"
810 "@default\0"
811 "@file-system\0"
812 "@io-event\0"
813 "@ipc\0"
814 "@keyring\0"
815 "@memlock\0"
816 "@network-io\0"
817 "@process\0"
818 "@resources\0"
819 "@setuid\0"
820 "@signal\0"
821 "@sync\0"
822 "@timer\0"
823 "brk\0"
824 "capget\0"
825 "capset\0"
826 "copy_file_range\0"
827 "fadvise64\0"
828 "fadvise64_64\0"
829 "flock\0"
830 "get_mempolicy\0"
831 "getcpu\0"
832 "getpriority\0"
833 "getrandom\0"
834 "ioctl\0"
835 "ioprio_get\0"
836 "kcmp\0"
837 "madvise\0"
70526841
LP
838 "mprotect\0"
839 "mremap\0"
840 "name_to_handle_at\0"
841 "oldolduname\0"
842 "olduname\0"
843 "personality\0"
844 "readahead\0"
845 "readdir\0"
846 "remap_file_pages\0"
847 "sched_get_priority_max\0"
848 "sched_get_priority_min\0"
849 "sched_getaffinity\0"
850 "sched_getattr\0"
851 "sched_getparam\0"
852 "sched_getscheduler\0"
853 "sched_rr_get_interval\0"
6ca67710 854 "sched_rr_get_interval_time64\0"
70526841
LP
855 "sched_yield\0"
856 "sendfile\0"
857 "sendfile64\0"
858 "setfsgid\0"
859 "setfsgid32\0"
860 "setfsuid\0"
861 "setfsuid32\0"
862 "setpgid\0"
863 "setsid\0"
864 "splice\0"
865 "sysinfo\0"
866 "tee\0"
867 "umask\0"
868 "uname\0"
869 "userfaultfd\0"
870 "vmsplice\0"
871 },
cd0ddf6f
LP
872 [SYSCALL_FILTER_SET_TIMER] = {
873 .name = "@timer",
874 .help = "Schedule operations by time",
875 .value =
876 "alarm\0"
877 "getitimer\0"
878 "setitimer\0"
879 "timer_create\0"
880 "timer_delete\0"
881 "timer_getoverrun\0"
882 "timer_gettime\0"
6ca67710 883 "timer_gettime64\0"
cd0ddf6f 884 "timer_settime\0"
6ca67710 885 "timer_settime64\0"
cd0ddf6f
LP
886 "timerfd_create\0"
887 "timerfd_gettime\0"
6ca67710 888 "timerfd_gettime64\0"
cd0ddf6f 889 "timerfd_settime\0"
6ca67710 890 "timerfd_settime64\0"
cd0ddf6f
LP
891 "times\0"
892 },
201c1cc2 893};
8130926d
LP
894
895const SyscallFilterSet *syscall_filter_set_find(const char *name) {
896 unsigned i;
897
898 if (isempty(name) || name[0] != '@')
899 return NULL;
900
901 for (i = 0; i < _SYSCALL_FILTER_SET_MAX; i++)
902 if (streq(syscall_filter_sets[i].name, name))
903 return syscall_filter_sets + i;
904
905 return NULL;
906}
907
b54f36c6 908static int seccomp_add_syscall_filter_set(scmp_filter_ctx seccomp, const SyscallFilterSet *set, uint32_t action, char **exclude, bool log_missing);
69b1b241 909
b54f36c6 910int seccomp_add_syscall_filter_item(scmp_filter_ctx *seccomp, const char *name, uint32_t action, char **exclude, bool log_missing) {
69b1b241
LP
911 assert(seccomp);
912 assert(name);
913
960e4569
LP
914 if (strv_contains(exclude, name))
915 return 0;
916
69b1b241
LP
917 if (name[0] == '@') {
918 const SyscallFilterSet *other;
919
920 other = syscall_filter_set_find(name);
baaa35ad
ZJS
921 if (!other)
922 return log_debug_errno(SYNTHETIC_ERRNO(EINVAL),
923 "Filter set %s is not known!",
924 name);
69b1b241 925
b54f36c6
ZJS
926 return seccomp_add_syscall_filter_set(seccomp, other, action, exclude, log_missing);
927
69b1b241 928 } else {
b54f36c6 929 int id, r;
69b1b241
LP
930
931 id = seccomp_syscall_resolve_name(name);
cff7bff8 932 if (id == __NR_SCMP_ERROR) {
b54f36c6
ZJS
933 if (log_missing)
934 log_debug("System call %s is not known, ignoring.", name);
ff217dc3 935 return 0;
cff7bff8 936 }
69b1b241
LP
937
938 r = seccomp_rule_add_exact(seccomp, action, id, 0);
b54f36c6 939 if (r < 0) {
69b1b241 940 /* If the system call is not known on this architecture, then that's fine, let's ignore it */
7e86bd73
ZJS
941 bool ignore = r == -EDOM;
942
943 if (!ignore || log_missing)
944 log_debug_errno(r, "Failed to add rule for system call %s() / %d%s: %m",
945 name, id, ignore ? ", ignoring" : "");
946 if (!ignore)
947 return r;
b54f36c6 948 }
69b1b241 949
b54f36c6
ZJS
950 return 0;
951 }
69b1b241
LP
952}
953
469830d1
LP
954static int seccomp_add_syscall_filter_set(
955 scmp_filter_ctx seccomp,
469830d1 956 const SyscallFilterSet *set,
960e4569 957 uint32_t action,
b54f36c6
ZJS
958 char **exclude,
959 bool log_missing) {
469830d1 960
8130926d
LP
961 const char *sys;
962 int r;
963
964 assert(seccomp);
965 assert(set);
966
967 NULSTR_FOREACH(sys, set->value) {
b54f36c6 968 r = seccomp_add_syscall_filter_item(seccomp, sys, action, exclude, log_missing);
69b1b241
LP
969 if (r < 0)
970 return r;
469830d1
LP
971 }
972
973 return 0;
974}
975
b54f36c6 976int seccomp_load_syscall_filter_set(uint32_t default_action, const SyscallFilterSet *set, uint32_t action, bool log_missing) {
469830d1
LP
977 uint32_t arch;
978 int r;
979
980 assert(set);
981
982 /* The one-stop solution: allocate a seccomp object, add the specified filter to it, and apply it. Once for
a90db619 983 * each local arch. */
469830d1
LP
984
985 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
986 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
987
988 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
989
990 r = seccomp_init_for_arch(&seccomp, arch, default_action);
8130926d
LP
991 if (r < 0)
992 return r;
469830d1 993
b54f36c6 994 r = seccomp_add_syscall_filter_set(seccomp, set, action, NULL, log_missing);
7e86bd73
ZJS
995 if (r < 0)
996 return log_debug_errno(r, "Failed to add filter set: %m");
469830d1
LP
997
998 r = seccomp_load(seccomp);
7bc5e0b1 999 if (ERRNO_IS_SECCOMP_FATAL(r))
469830d1
LP
1000 return r;
1001 if (r < 0)
1002 log_debug_errno(r, "Failed to install filter set for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
8130926d
LP
1003 }
1004
1005 return 0;
1006}
a3be2849 1007
b54f36c6 1008int seccomp_load_syscall_filter_set_raw(uint32_t default_action, Hashmap* set, uint32_t action, bool log_missing) {
469830d1 1009 uint32_t arch;
a3be2849
LP
1010 int r;
1011
469830d1
LP
1012 /* Similar to seccomp_load_syscall_filter_set(), but takes a raw Set* of syscalls, instead of a
1013 * SyscallFilterSet* table. */
a3be2849 1014
8cfa775f 1015 if (hashmap_isempty(set) && default_action == SCMP_ACT_ALLOW)
469830d1 1016 return 0;
a3be2849 1017
469830d1
LP
1018 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1019 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1020 Iterator i;
b54f36c6 1021 void *syscall_id, *val;
a3be2849 1022
469830d1 1023 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
a3be2849 1024
469830d1
LP
1025 r = seccomp_init_for_arch(&seccomp, arch, default_action);
1026 if (r < 0)
1027 return r;
a3be2849 1028
b54f36c6 1029 HASHMAP_FOREACH_KEY(val, syscall_id, set, i) {
8cfa775f 1030 uint32_t a = action;
b54f36c6
ZJS
1031 int id = PTR_TO_INT(syscall_id) - 1;
1032 int error = PTR_TO_INT(val);
8cfa775f 1033
b54f36c6
ZJS
1034 if (action != SCMP_ACT_ALLOW && error >= 0)
1035 a = SCMP_ACT_ERRNO(error);
8cfa775f 1036
b54f36c6 1037 r = seccomp_rule_add_exact(seccomp, a, id, 0);
469830d1
LP
1038 if (r < 0) {
1039 /* If the system call is not known on this architecture, then that's fine, let's ignore it */
1040 _cleanup_free_ char *n = NULL;
7e86bd73 1041 bool ignore;
469830d1 1042
b54f36c6 1043 n = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, id);
7e86bd73
ZJS
1044 ignore = r == -EDOM;
1045 if (!ignore || log_missing)
1046 log_debug_errno(r, "Failed to add rule for system call %s() / %d%s: %m",
1047 strna(n), id, ignore ? ", ignoring" : "");
1048 if (!ignore)
1049 return r;
469830d1
LP
1050 }
1051 }
1052
1053 r = seccomp_load(seccomp);
7bc5e0b1 1054 if (ERRNO_IS_SECCOMP_FATAL(r))
469830d1
LP
1055 return r;
1056 if (r < 0)
1057 log_debug_errno(r, "Failed to install filter set for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1058 }
1059
1060 return 0;
add00535
LP
1061}
1062
58f6ab44 1063int seccomp_parse_syscall_filter(
898748d8
YW
1064 const char *name,
1065 int errno_num,
1066 Hashmap *filter,
13d92c63 1067 SeccompParseFlags flags,
898748d8
YW
1068 const char *unit,
1069 const char *filename,
1070 unsigned line) {
1071
1072 int r;
1073
1074 assert(name);
1075 assert(filter);
1076
1077 if (name[0] == '@') {
1078 const SyscallFilterSet *set;
1079 const char *i;
1080
1081 set = syscall_filter_set_find(name);
1082 if (!set) {
13d92c63 1083 if (!(flags & SECCOMP_PARSE_PERMISSIVE))
898748d8 1084 return -EINVAL;
13d92c63
LP
1085
1086 log_syntax(unit, flags & SECCOMP_PARSE_LOG ? LOG_WARNING : LOG_DEBUG, filename, line, 0,
1087 "Unknown system call group, ignoring: %s", name);
1088 return 0;
898748d8
YW
1089 }
1090
1091 NULSTR_FOREACH(i, set->value) {
13d92c63
LP
1092 /* Call ourselves again, for the group to parse. Note that we downgrade logging here (i.e. take
1093 * away the SECCOMP_PARSE_LOG flag) since any issues in the group table are our own problem,
1094 * not a problem in user configuration data and we shouldn't pretend otherwise by complaining
1095 * about them. */
58f6ab44 1096 r = seccomp_parse_syscall_filter(i, errno_num, filter, flags &~ SECCOMP_PARSE_LOG, unit, filename, line);
898748d8
YW
1097 if (r < 0)
1098 return r;
1099 }
1100 } else {
1101 int id;
1102
1103 id = seccomp_syscall_resolve_name(name);
1104 if (id == __NR_SCMP_ERROR) {
13d92c63 1105 if (!(flags & SECCOMP_PARSE_PERMISSIVE))
898748d8 1106 return -EINVAL;
13d92c63
LP
1107
1108 log_syntax(unit, flags & SECCOMP_PARSE_LOG ? LOG_WARNING : LOG_DEBUG, filename, line, 0,
1109 "Failed to parse system call, ignoring: %s", name);
1110 return 0;
898748d8
YW
1111 }
1112
1113 /* If we previously wanted to forbid a syscall and now
1114 * we want to allow it, then remove it from the list. */
6b000af4 1115 if (!(flags & SECCOMP_PARSE_INVERT) == !!(flags & SECCOMP_PARSE_ALLOW_LIST)) {
898748d8
YW
1116 r = hashmap_put(filter, INT_TO_PTR(id + 1), INT_TO_PTR(errno_num));
1117 if (r < 0)
851ee70a
LW
1118 switch (r) {
1119 case -ENOMEM:
1120 return flags & SECCOMP_PARSE_LOG ? log_oom() : -ENOMEM;
1121 case -EEXIST:
9d7fe7c6
LW
1122 assert_se(hashmap_update(filter, INT_TO_PTR(id + 1), INT_TO_PTR(errno_num)) == 0);
1123 break;
851ee70a
LW
1124 default:
1125 return r;
1126 }
898748d8
YW
1127 } else
1128 (void) hashmap_remove(filter, INT_TO_PTR(id + 1));
1129 }
1130
1131 return 0;
1132}
1133
add00535 1134int seccomp_restrict_namespaces(unsigned long retain) {
469830d1 1135 uint32_t arch;
add00535
LP
1136 int r;
1137
f1d34068 1138 if (DEBUG_LOGGING) {
add00535
LP
1139 _cleanup_free_ char *s = NULL;
1140
86c2a9f1 1141 (void) namespace_flags_to_string(retain, &s);
add00535
LP
1142 log_debug("Restricting namespace to: %s.", strna(s));
1143 }
1144
1145 /* NOOP? */
1146 if ((retain & NAMESPACE_FLAGS_ALL) == NAMESPACE_FLAGS_ALL)
1147 return 0;
1148
469830d1
LP
1149 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1150 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1151 unsigned i;
add00535 1152
469830d1
LP
1153 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1154
1155 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1156 if (r < 0)
1157 return r;
1158
1159 if ((retain & NAMESPACE_FLAGS_ALL) == 0)
1160 /* If every single kind of namespace shall be prohibited, then let's block the whole setns() syscall
1161 * altogether. */
1162 r = seccomp_rule_add_exact(
1163 seccomp,
1164 SCMP_ACT_ERRNO(EPERM),
1165 SCMP_SYS(setns),
1166 0);
1167 else
1168 /* Otherwise, block only the invocations with the appropriate flags in the loop below, but also the
1169 * special invocation with a zero flags argument, right here. */
1170 r = seccomp_rule_add_exact(
1171 seccomp,
1172 SCMP_ACT_ERRNO(EPERM),
1173 SCMP_SYS(setns),
1174 1,
1175 SCMP_A1(SCMP_CMP_EQ, 0));
1176 if (r < 0) {
1177 log_debug_errno(r, "Failed to add setns() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1178 continue;
1179 }
1180
1181 for (i = 0; namespace_flag_map[i].name; i++) {
1182 unsigned long f;
1183
1184 f = namespace_flag_map[i].flag;
1185 if ((retain & f) == f) {
1186 log_debug("Permitting %s.", namespace_flag_map[i].name);
1187 continue;
1188 }
1189
1190 log_debug("Blocking %s.", namespace_flag_map[i].name);
1191
1192 r = seccomp_rule_add_exact(
1193 seccomp,
1194 SCMP_ACT_ERRNO(EPERM),
1195 SCMP_SYS(unshare),
1196 1,
1197 SCMP_A0(SCMP_CMP_MASKED_EQ, f, f));
1198 if (r < 0) {
1199 log_debug_errno(r, "Failed to add unshare() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1200 break;
1201 }
1202
511ceb1f
ZJS
1203 /* On s390/s390x the first two parameters to clone are switched */
1204 if (!IN_SET(arch, SCMP_ARCH_S390, SCMP_ARCH_S390X))
ae9d60ce
LP
1205 r = seccomp_rule_add_exact(
1206 seccomp,
1207 SCMP_ACT_ERRNO(EPERM),
1208 SCMP_SYS(clone),
1209 1,
1210 SCMP_A0(SCMP_CMP_MASKED_EQ, f, f));
1211 else
1212 r = seccomp_rule_add_exact(
1213 seccomp,
1214 SCMP_ACT_ERRNO(EPERM),
1215 SCMP_SYS(clone),
1216 1,
1217 SCMP_A1(SCMP_CMP_MASKED_EQ, f, f));
469830d1
LP
1218 if (r < 0) {
1219 log_debug_errno(r, "Failed to add clone() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1220 break;
1221 }
1222
1223 if ((retain & NAMESPACE_FLAGS_ALL) != 0) {
1224 r = seccomp_rule_add_exact(
1225 seccomp,
1226 SCMP_ACT_ERRNO(EPERM),
1227 SCMP_SYS(setns),
1228 1,
1229 SCMP_A1(SCMP_CMP_MASKED_EQ, f, f));
1230 if (r < 0) {
1231 log_debug_errno(r, "Failed to add setns() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1232 break;
1233 }
1234 }
1235 }
1236 if (r < 0)
1237 continue;
1238
1239 r = seccomp_load(seccomp);
7bc5e0b1 1240 if (ERRNO_IS_SECCOMP_FATAL(r))
469830d1
LP
1241 return r;
1242 if (r < 0)
1243 log_debug_errno(r, "Failed to install namespace restriction rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1244 }
1245
1246 return 0;
1247}
1248
1249int seccomp_protect_sysctl(void) {
1250 uint32_t arch;
1251 int r;
1252
1253 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1254 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1255
1256 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1257
2e64e8f4
ZJS
1258 if (IN_SET(arch, SCMP_ARCH_X32, SCMP_ARCH_AARCH64))
1259 /* No _sysctl syscall */
1260 continue;
1261
469830d1
LP
1262 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1263 if (r < 0)
1264 return r;
1265
1266 r = seccomp_rule_add_exact(
add00535
LP
1267 seccomp,
1268 SCMP_ACT_ERRNO(EPERM),
469830d1 1269 SCMP_SYS(_sysctl),
add00535 1270 0);
469830d1
LP
1271 if (r < 0) {
1272 log_debug_errno(r, "Failed to add _sysctl() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1273 continue;
1274 }
1275
1276 r = seccomp_load(seccomp);
7bc5e0b1 1277 if (ERRNO_IS_SECCOMP_FATAL(r))
469830d1
LP
1278 return r;
1279 if (r < 0)
1280 log_debug_errno(r, "Failed to install sysctl protection rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1281 }
1282
1283 return 0;
1284}
1285
620dbdd2
KK
1286int seccomp_protect_syslog(void) {
1287 uint32_t arch;
1288 int r;
1289
1290 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1291 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1292
1293 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1294 if (r < 0)
1295 return r;
1296
1297 r = seccomp_rule_add_exact(
1298 seccomp,
1299 SCMP_ACT_ERRNO(EPERM),
1300 SCMP_SYS(syslog),
1301 0);
1302
1303 if (r < 0) {
1304 log_debug_errno(r, "Failed to add syslog() rule for architecture %s, skipping %m", seccomp_arch_to_string(arch));
1305 continue;
1306 }
1307
1308 r = seccomp_load(seccomp);
1309 if (ERRNO_IS_SECCOMP_FATAL(r))
1310 return r;
1311 if (r < 0)
1312 log_debug_errno(r, "Failed to install syslog protection rules for architecture %s, skipping %m", seccomp_arch_to_string(arch));
1313 }
1314
1315 return 0;
1316}
1317
6b000af4 1318int seccomp_restrict_address_families(Set *address_families, bool allow_list) {
469830d1
LP
1319 uint32_t arch;
1320 int r;
1321
1322 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1323 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
9606bc4b 1324 bool supported;
469830d1
LP
1325 Iterator i;
1326
1327 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1328
9606bc4b
LP
1329 switch (arch) {
1330
1331 case SCMP_ARCH_X86_64:
1332 case SCMP_ARCH_X32:
1333 case SCMP_ARCH_ARM:
1334 case SCMP_ARCH_AARCH64:
0d9fca76 1335 case SCMP_ARCH_PPC:
da1921a5
ZJS
1336 case SCMP_ARCH_PPC64:
1337 case SCMP_ARCH_PPC64LE:
f5aeac14
JC
1338 case SCMP_ARCH_MIPSEL64N32:
1339 case SCMP_ARCH_MIPS64N32:
1340 case SCMP_ARCH_MIPSEL64:
1341 case SCMP_ARCH_MIPS64:
9606bc4b
LP
1342 /* These we know we support (i.e. are the ones that do not use socketcall()) */
1343 supported = true;
1344 break;
1345
9606bc4b
LP
1346 case SCMP_ARCH_S390:
1347 case SCMP_ARCH_S390X:
da1921a5 1348 case SCMP_ARCH_X86:
f5aeac14
JC
1349 case SCMP_ARCH_MIPSEL:
1350 case SCMP_ARCH_MIPS:
9606bc4b
LP
1351 default:
1352 /* These we either know we don't support (i.e. are the ones that do use socketcall()), or we
1353 * don't know */
1354 supported = false;
1355 break;
1356 }
1357
1358 if (!supported)
1359 continue;
1360
469830d1
LP
1361 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1362 if (r < 0)
1363 return r;
1364
6b000af4 1365 if (allow_list) {
469830d1
LP
1366 int af, first = 0, last = 0;
1367 void *afp;
1368
6b000af4
LP
1369 /* If this is an allow list, we first block the address families that are out of
1370 * range and then everything that is not in the set. First, we find the lowest and
1371 * highest address family in the set. */
469830d1
LP
1372
1373 SET_FOREACH(afp, address_families, i) {
1374 af = PTR_TO_INT(afp);
1375
1376 if (af <= 0 || af >= af_max())
1377 continue;
1378
1379 if (first == 0 || af < first)
1380 first = af;
1381
1382 if (last == 0 || af > last)
1383 last = af;
1384 }
1385
1386 assert((first == 0) == (last == 0));
1387
1388 if (first == 0) {
1389
1390 /* No entries in the valid range, block everything */
1391 r = seccomp_rule_add_exact(
1392 seccomp,
1393 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1394 SCMP_SYS(socket),
1395 0);
1396 if (r < 0) {
1397 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1398 continue;
1399 }
1400
1401 } else {
1402
1403 /* Block everything below the first entry */
1404 r = seccomp_rule_add_exact(
1405 seccomp,
1406 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1407 SCMP_SYS(socket),
1408 1,
1409 SCMP_A0(SCMP_CMP_LT, first));
1410 if (r < 0) {
1411 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1412 continue;
1413 }
1414
1415 /* Block everything above the last entry */
1416 r = seccomp_rule_add_exact(
1417 seccomp,
1418 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1419 SCMP_SYS(socket),
1420 1,
1421 SCMP_A0(SCMP_CMP_GT, last));
1422 if (r < 0) {
1423 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1424 continue;
1425 }
1426
1427 /* Block everything between the first and last entry */
1428 for (af = 1; af < af_max(); af++) {
1429
1430 if (set_contains(address_families, INT_TO_PTR(af)))
1431 continue;
1432
1433 r = seccomp_rule_add_exact(
1434 seccomp,
1435 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1436 SCMP_SYS(socket),
1437 1,
1438 SCMP_A0(SCMP_CMP_EQ, af));
1439 if (r < 0)
1440 break;
1441 }
469830d1
LP
1442 if (r < 0) {
1443 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1444 continue;
1445 }
1446 }
1447
1448 } else {
1449 void *af;
1450
6b000af4
LP
1451 /* If this is a deny list, then generate one rule for each address family that are
1452 * then combined in OR checks. */
469830d1
LP
1453
1454 SET_FOREACH(af, address_families, i) {
1455
1456 r = seccomp_rule_add_exact(
1457 seccomp,
1458 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1459 SCMP_SYS(socket),
1460 1,
1461 SCMP_A0(SCMP_CMP_EQ, PTR_TO_INT(af)));
1462 if (r < 0)
1463 break;
1464 }
469830d1
LP
1465 if (r < 0) {
1466 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1467 continue;
1468 }
1469 }
1470
1471 r = seccomp_load(seccomp);
7bc5e0b1 1472 if (ERRNO_IS_SECCOMP_FATAL(r))
469830d1
LP
1473 return r;
1474 if (r < 0)
1475 log_debug_errno(r, "Failed to install socket family rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1476 }
1477
1478 return 0;
1479}
1480
1481int seccomp_restrict_realtime(void) {
1482 static const int permitted_policies[] = {
1483 SCHED_OTHER,
1484 SCHED_BATCH,
1485 SCHED_IDLE,
1486 };
1487
1488 int r, max_policy = 0;
1489 uint32_t arch;
1490 unsigned i;
1491
1492 /* Determine the highest policy constant we want to allow */
1493 for (i = 0; i < ELEMENTSOF(permitted_policies); i++)
1494 if (permitted_policies[i] > max_policy)
1495 max_policy = permitted_policies[i];
1496
1497 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1498 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1499 int p;
1500
1501 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1502
1503 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1504 if (r < 0)
1505 return r;
1506
1507 /* Go through all policies with lower values than that, and block them -- unless they appear in the
6b000af4 1508 * allow list. */
469830d1
LP
1509 for (p = 0; p < max_policy; p++) {
1510 bool good = false;
1511
6b000af4 1512 /* Check if this is in the allow list. */
469830d1
LP
1513 for (i = 0; i < ELEMENTSOF(permitted_policies); i++)
1514 if (permitted_policies[i] == p) {
1515 good = true;
1516 break;
1517 }
1518
1519 if (good)
1520 continue;
1521
1522 /* Deny this policy */
1523 r = seccomp_rule_add_exact(
1524 seccomp,
1525 SCMP_ACT_ERRNO(EPERM),
1526 SCMP_SYS(sched_setscheduler),
1527 1,
1528 SCMP_A1(SCMP_CMP_EQ, p));
1529 if (r < 0) {
1530 log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1531 continue;
1532 }
1533 }
1534
6b000af4
LP
1535 /* Deny-list all other policies, i.e. the ones with higher values. Note that all comparisons
1536 * are unsigned here, hence no need no check for < 0 values. */
469830d1 1537 r = seccomp_rule_add_exact(
add00535
LP
1538 seccomp,
1539 SCMP_ACT_ERRNO(EPERM),
469830d1 1540 SCMP_SYS(sched_setscheduler),
add00535 1541 1,
469830d1
LP
1542 SCMP_A1(SCMP_CMP_GT, max_policy));
1543 if (r < 0) {
1544 log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1545 continue;
1546 }
add00535 1547
469830d1 1548 r = seccomp_load(seccomp);
7bc5e0b1 1549 if (ERRNO_IS_SECCOMP_FATAL(r))
469830d1
LP
1550 return r;
1551 if (r < 0)
1552 log_debug_errno(r, "Failed to install realtime protection rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1553 }
1554
1555 return 0;
1556}
1557
6dc66688
ZJS
1558static int add_seccomp_syscall_filter(scmp_filter_ctx seccomp,
1559 uint32_t arch,
1560 int nr,
14cb109d 1561 unsigned arg_cnt,
6dc66688
ZJS
1562 const struct scmp_arg_cmp arg) {
1563 int r;
1564
1565 r = seccomp_rule_add_exact(seccomp, SCMP_ACT_ERRNO(EPERM), nr, arg_cnt, arg);
1566 if (r < 0) {
1567 _cleanup_free_ char *n = NULL;
1568
1569 n = seccomp_syscall_resolve_num_arch(arch, nr);
1570 log_debug_errno(r, "Failed to add %s() rule for architecture %s, skipping: %m",
1571 strna(n),
1572 seccomp_arch_to_string(arch));
1573 }
1574
1575 return r;
1576}
1577
2a8d6e63 1578/* For known architectures, check that syscalls are indeed defined or not. */
303d6b4c 1579#if defined(__x86_64__) || defined(__arm__) || defined(__aarch64__)
2a8d6e63
ZJS
1580assert_cc(SCMP_SYS(shmget) > 0);
1581assert_cc(SCMP_SYS(shmat) > 0);
1582assert_cc(SCMP_SYS(shmdt) > 0);
2a8d6e63 1583#endif
6dc66688 1584
469830d1
LP
1585int seccomp_memory_deny_write_execute(void) {
1586 uint32_t arch;
b069c2a3 1587 unsigned loaded = 0;
469830d1
LP
1588
1589 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1590 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
b069c2a3 1591 int filter_syscall = 0, block_syscall = 0, shmat_syscall = 0, r;
add00535 1592
469830d1
LP
1593 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1594
8a50cf69
LP
1595 switch (arch) {
1596
bed4668d
CE
1597 /* Note that on some architectures shmat() isn't available, and the call is multiplexed through ipc().
1598 * We ignore that here, which means there's still a way to get writable/executable
1599 * memory, if an IPC key is mapped like this. That's a pity, but no total loss. */
1600
8a50cf69 1601 case SCMP_ARCH_X86:
57311925 1602 case SCMP_ARCH_S390:
8a50cf69
LP
1603 filter_syscall = SCMP_SYS(mmap2);
1604 block_syscall = SCMP_SYS(mmap);
bed4668d 1605 /* shmat multiplexed, see above */
2a8d6e63
ZJS
1606 break;
1607
63d00dfb 1608 case SCMP_ARCH_PPC:
2a8d6e63
ZJS
1609 case SCMP_ARCH_PPC64:
1610 case SCMP_ARCH_PPC64LE:
bed4668d 1611 case SCMP_ARCH_S390X:
2a8d6e63 1612 filter_syscall = SCMP_SYS(mmap);
bed4668d 1613 /* shmat multiplexed, see above */
8a50cf69
LP
1614 break;
1615
4278d1f5
ZJS
1616 case SCMP_ARCH_ARM:
1617 filter_syscall = SCMP_SYS(mmap2); /* arm has only mmap2 */
1618 shmat_syscall = SCMP_SYS(shmat);
1619 break;
1620
8a50cf69
LP
1621 case SCMP_ARCH_X86_64:
1622 case SCMP_ARCH_X32:
79873bc8 1623 case SCMP_ARCH_AARCH64:
bed4668d 1624 filter_syscall = SCMP_SYS(mmap); /* amd64, x32 and arm64 have only mmap */
8a50cf69
LP
1625 shmat_syscall = SCMP_SYS(shmat);
1626 break;
1627
1628 /* Please add more definitions here, if you port systemd to other architectures! */
1629
57311925 1630#if !defined(__i386__) && !defined(__x86_64__) && !defined(__powerpc__) && !defined(__powerpc64__) && !defined(__arm__) && !defined(__aarch64__) && !defined(__s390__) && !defined(__s390x__)
8a50cf69
LP
1631#warning "Consider adding the right mmap() syscall definitions here!"
1632#endif
1633 }
1634
1635 /* Can't filter mmap() on this arch, then skip it */
1636 if (filter_syscall == 0)
1637 continue;
1638
469830d1
LP
1639 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1640 if (r < 0)
1641 return r;
1642
6dc66688
ZJS
1643 r = add_seccomp_syscall_filter(seccomp, arch, filter_syscall,
1644 1,
1645 SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC|PROT_WRITE, PROT_EXEC|PROT_WRITE));
1646 if (r < 0)
1647 continue;
8a50cf69
LP
1648
1649 if (block_syscall != 0) {
6dc66688
ZJS
1650 r = add_seccomp_syscall_filter(seccomp, arch, block_syscall, 0, (const struct scmp_arg_cmp){} );
1651 if (r < 0)
8a50cf69 1652 continue;
add00535 1653 }
a3be2849 1654
6dc66688
ZJS
1655 r = add_seccomp_syscall_filter(seccomp, arch, SCMP_SYS(mprotect),
1656 1,
b835eeb4
ZJS
1657 SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC, PROT_EXEC));
1658 if (r < 0)
1659 continue;
1660
91691f1d 1661#ifdef __NR_pkey_mprotect
b835eeb4
ZJS
1662 r = add_seccomp_syscall_filter(seccomp, arch, SCMP_SYS(pkey_mprotect),
1663 1,
6dc66688
ZJS
1664 SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC, PROT_EXEC));
1665 if (r < 0)
469830d1 1666 continue;
91691f1d 1667#endif
add00535 1668
67fb5f33 1669 if (shmat_syscall > 0) {
5ef3ed97 1670 r = add_seccomp_syscall_filter(seccomp, arch, shmat_syscall,
6dc66688
ZJS
1671 1,
1672 SCMP_A2(SCMP_CMP_MASKED_EQ, SHM_EXEC, SHM_EXEC));
1673 if (r < 0)
8a50cf69 1674 continue;
469830d1
LP
1675 }
1676
1677 r = seccomp_load(seccomp);
7bc5e0b1 1678 if (ERRNO_IS_SECCOMP_FATAL(r))
469830d1 1679 return r;
add00535 1680 if (r < 0)
b069c2a3
ZJS
1681 log_debug_errno(r, "Failed to install MemoryDenyWriteExecute= rule for architecture %s, skipping: %m",
1682 seccomp_arch_to_string(arch));
903659e7 1683 loaded++;
469830d1 1684 }
add00535 1685
903659e7 1686 if (loaded == 0)
b069c2a3 1687 log_debug("Failed to install any seccomp rules for MemoryDenyWriteExecute=.");
903659e7
CE
1688
1689 return loaded;
469830d1
LP
1690}
1691
1692int seccomp_restrict_archs(Set *archs) {
1693 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1694 Iterator i;
1695 void *id;
1696 int r;
1697
1698 /* This installs a filter with no rules, but that restricts the system call architectures to the specified
2428aaf8
AJ
1699 * list.
1700 *
1701 * There are some qualifications. However the most important use is to stop processes from bypassing
1702 * system call restrictions, in case they used a broader (multiplexing) syscall which is only available
1703 * in a non-native architecture. There are no holes in this use case, at least so far. */
469830d1 1704
2428aaf8
AJ
1705 /* Note libseccomp includes our "native" (current) architecture in the filter by default.
1706 * We do not remove it. For example, our callers expect to be able to call execve() afterwards
1707 * to run a program with the restrictions applied. */
469830d1
LP
1708 seccomp = seccomp_init(SCMP_ACT_ALLOW);
1709 if (!seccomp)
1710 return -ENOMEM;
1711
1712 SET_FOREACH(id, archs, i) {
1713 r = seccomp_arch_add(seccomp, PTR_TO_UINT32(id) - 1);
2428aaf8
AJ
1714 if (r < 0 && r != -EEXIST)
1715 return r;
1716 }
1717
1718 /* The vdso for x32 assumes that x86-64 syscalls are available. Let's allow them, since x32
1719 * x32 syscalls should basically match x86-64 for everything except the pointer type.
1720 * The important thing is that you can block the old 32-bit x86 syscalls.
1721 * https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=850047 */
1722
1723 if (seccomp_arch_native() == SCMP_ARCH_X32 ||
1724 set_contains(archs, UINT32_TO_PTR(SCMP_ARCH_X32 + 1))) {
1725
1726 r = seccomp_arch_add(seccomp, SCMP_ARCH_X86_64);
1727 if (r < 0 && r != -EEXIST)
469830d1 1728 return r;
add00535
LP
1729 }
1730
469830d1
LP
1731 r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
1732 if (r < 0)
1733 return r;
add00535 1734
1c6af69b 1735 r = seccomp_load(seccomp);
7bc5e0b1 1736 if (ERRNO_IS_SECCOMP_FATAL(r))
1c6af69b
LP
1737 return r;
1738 if (r < 0)
1739 log_debug_errno(r, "Failed to restrict system call architectures, skipping: %m");
1740
1741 return 0;
a3be2849 1742}
b16bd535 1743
de7fef4b
ZJS
1744int parse_syscall_archs(char **l, Set **ret_archs) {
1745 _cleanup_set_free_ Set *archs = NULL;
b16bd535
YW
1746 char **s;
1747 int r;
1748
1749 assert(l);
de7fef4b 1750 assert(ret_archs);
b16bd535
YW
1751
1752 STRV_FOREACH(s, l) {
1753 uint32_t a;
1754
1755 r = seccomp_arch_from_string(*s, &a);
1756 if (r < 0)
1757 return -EINVAL;
1758
de7fef4b 1759 r = set_ensure_put(&archs, NULL, UINT32_TO_PTR(a + 1));
b16bd535
YW
1760 if (r < 0)
1761 return -ENOMEM;
1762 }
1763
de7fef4b 1764 *ret_archs = TAKE_PTR(archs);
b16bd535
YW
1765 return 0;
1766}
165a31c0 1767
8cfa775f 1768int seccomp_filter_set_add(Hashmap *filter, bool add, const SyscallFilterSet *set) {
165a31c0
LP
1769 const char *i;
1770 int r;
1771
1772 assert(set);
1773
1774 NULSTR_FOREACH(i, set->value) {
1775
1776 if (i[0] == '@') {
1777 const SyscallFilterSet *more;
1778
1779 more = syscall_filter_set_find(i);
1780 if (!more)
1781 return -ENXIO;
1782
165a31c0
LP
1783 r = seccomp_filter_set_add(filter, add, more);
1784 if (r < 0)
1785 return r;
1786 } else {
1787 int id;
1788
1789 id = seccomp_syscall_resolve_name(i);
ff217dc3
LP
1790 if (id == __NR_SCMP_ERROR) {
1791 log_debug("Couldn't resolve system call, ignoring: %s", i);
1792 continue;
1793 }
165a31c0
LP
1794
1795 if (add) {
8cfa775f 1796 r = hashmap_put(filter, INT_TO_PTR(id + 1), INT_TO_PTR(-1));
165a31c0
LP
1797 if (r < 0)
1798 return r;
1799 } else
8cfa775f 1800 (void) hashmap_remove(filter, INT_TO_PTR(id + 1));
165a31c0
LP
1801 }
1802 }
1803
1804 return 0;
1805}
78e864e5
TM
1806
1807int seccomp_lock_personality(unsigned long personality) {
72eafe71 1808 uint32_t arch;
78e864e5
TM
1809 int r;
1810
72eafe71
LP
1811 if (personality >= PERSONALITY_INVALID)
1812 return -EINVAL;
78e864e5 1813
72eafe71
LP
1814 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1815 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
78e864e5 1816
72eafe71
LP
1817 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1818 if (r < 0)
1819 return r;
1820
1821 r = seccomp_rule_add_exact(
1822 seccomp,
1823 SCMP_ACT_ERRNO(EPERM),
1824 SCMP_SYS(personality),
1825 1,
1826 SCMP_A0(SCMP_CMP_NE, personality));
448ac526
LP
1827 if (r < 0) {
1828 log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1829 continue;
1830 }
72eafe71
LP
1831
1832 r = seccomp_load(seccomp);
7bc5e0b1 1833 if (ERRNO_IS_SECCOMP_FATAL(r))
72eafe71
LP
1834 return r;
1835 if (r < 0)
1836 log_debug_errno(r, "Failed to enable personality lock for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1837 }
1838
1839 return 0;
78e864e5 1840}
aecd5ac6
TM
1841
1842int seccomp_protect_hostname(void) {
1843 uint32_t arch;
1844 int r;
1845
1846 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1847 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1848
1849 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1850 if (r < 0)
1851 return r;
1852
1853 r = seccomp_rule_add_exact(
1854 seccomp,
1855 SCMP_ACT_ERRNO(EPERM),
1856 SCMP_SYS(sethostname),
1857 0);
9e6e543c
LP
1858 if (r < 0) {
1859 log_debug_errno(r, "Failed to add sethostname() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
aecd5ac6 1860 continue;
9e6e543c 1861 }
aecd5ac6
TM
1862
1863 r = seccomp_rule_add_exact(
1864 seccomp,
1865 SCMP_ACT_ERRNO(EPERM),
1866 SCMP_SYS(setdomainname),
1867 0);
9e6e543c
LP
1868 if (r < 0) {
1869 log_debug_errno(r, "Failed to add setdomainname() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
aecd5ac6 1870 continue;
9e6e543c 1871 }
aecd5ac6
TM
1872
1873 r = seccomp_load(seccomp);
7bc5e0b1 1874 if (ERRNO_IS_SECCOMP_FATAL(r))
aecd5ac6
TM
1875 return r;
1876 if (r < 0)
1877 log_debug_errno(r, "Failed to apply hostname restrictions for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1878 }
1879
1880 return 0;
1881}
3c27973b 1882
da4dc9a6
ZJS
1883static int seccomp_restrict_sxid(scmp_filter_ctx seccomp, mode_t m) {
1884 /* Checks the mode_t parameter of the following system calls:
1885 *
1886 * → chmod() + fchmod() + fchmodat()
1887 * → open() + creat() + openat()
1888 * → mkdir() + mkdirat()
1889 * → mknod() + mknodat()
1890 *
1891 * Returns error if *everything* failed, and 0 otherwise.
1892 */
1893 int r = 0;
1894 bool any = false;
1895
1896 r = seccomp_rule_add_exact(
1897 seccomp,
1898 SCMP_ACT_ERRNO(EPERM),
1899 SCMP_SYS(chmod),
1900 1,
1901 SCMP_A1(SCMP_CMP_MASKED_EQ, m, m));
1902 if (r < 0)
1903 log_debug_errno(r, "Failed to add filter for chmod: %m");
1904 else
1905 any = true;
1906
1907 r = seccomp_rule_add_exact(
1908 seccomp,
1909 SCMP_ACT_ERRNO(EPERM),
1910 SCMP_SYS(fchmod),
1911 1,
1912 SCMP_A1(SCMP_CMP_MASKED_EQ, m, m));
1913 if (r < 0)
1914 log_debug_errno(r, "Failed to add filter for fchmod: %m");
1915 else
1916 any = true;
1917
1918 r = seccomp_rule_add_exact(
1919 seccomp,
1920 SCMP_ACT_ERRNO(EPERM),
1921 SCMP_SYS(fchmodat),
1922 1,
1923 SCMP_A2(SCMP_CMP_MASKED_EQ, m, m));
1924 if (r < 0)
1925 log_debug_errno(r, "Failed to add filter for fchmodat: %m");
1926 else
1927 any = true;
1928
1929 r = seccomp_rule_add_exact(
1930 seccomp,
1931 SCMP_ACT_ERRNO(EPERM),
1932 SCMP_SYS(mkdir),
1933 1,
1934 SCMP_A1(SCMP_CMP_MASKED_EQ, m, m));
1935 if (r < 0)
1936 log_debug_errno(r, "Failed to add filter for mkdir: %m");
1937 else
1938 any = true;
1939
1940 r = seccomp_rule_add_exact(
1941 seccomp,
1942 SCMP_ACT_ERRNO(EPERM),
1943 SCMP_SYS(mkdirat),
1944 1,
1945 SCMP_A2(SCMP_CMP_MASKED_EQ, m, m));
1946 if (r < 0)
1947 log_debug_errno(r, "Failed to add filter for mkdirat: %m");
1948 else
1949 any = true;
1950
1951 r = seccomp_rule_add_exact(
1952 seccomp,
1953 SCMP_ACT_ERRNO(EPERM),
1954 SCMP_SYS(mknod),
1955 1,
1956 SCMP_A1(SCMP_CMP_MASKED_EQ, m, m));
1957 if (r < 0)
1958 log_debug_errno(r, "Failed to add filter for mknod: %m");
1959 else
1960 any = true;
1961
1962 r = seccomp_rule_add_exact(
1963 seccomp,
1964 SCMP_ACT_ERRNO(EPERM),
1965 SCMP_SYS(mknodat),
1966 1,
1967 SCMP_A2(SCMP_CMP_MASKED_EQ, m, m));
1968 if (r < 0)
1969 log_debug_errno(r, "Failed to add filter for mknodat: %m");
1970 else
1971 any = true;
1972
1973#if SCMP_SYS(open) > 0
1974 r = seccomp_rule_add_exact(
1975 seccomp,
1976 SCMP_ACT_ERRNO(EPERM),
1977 SCMP_SYS(open),
1978 2,
1979 SCMP_A1(SCMP_CMP_MASKED_EQ, O_CREAT, O_CREAT),
1980 SCMP_A2(SCMP_CMP_MASKED_EQ, m, m));
1981 if (r < 0)
1982 log_debug_errno(r, "Failed to add filter for open: %m");
1983 else
1984 any = true;
1985#endif
1986
1987 r = seccomp_rule_add_exact(
1988 seccomp,
1989 SCMP_ACT_ERRNO(EPERM),
1990 SCMP_SYS(openat),
1991 2,
1992 SCMP_A2(SCMP_CMP_MASKED_EQ, O_CREAT, O_CREAT),
1993 SCMP_A3(SCMP_CMP_MASKED_EQ, m, m));
1994 if (r < 0)
1995 log_debug_errno(r, "Failed to add filter for openat: %m");
1996 else
1997 any = true;
1998
ecc04067
LP
1999#if defined(__SNR_openat2)
2000 /* The new openat2() system call can't be filtered sensibly, since it moves the flags parameter into
2001 * an indirect structure. Let's block it entirely for now. That should be a reasonably OK thing to do
2002 * for now, since openat2() is very new and code generally needs fallback logic anyway to be
2003 * compatible with kernels that are not absolutely recent. */
2004 r = seccomp_rule_add_exact(
2005 seccomp,
2006 SCMP_ACT_ERRNO(EPERM),
2007 SCMP_SYS(openat2),
2008 0);
2009 if (r < 0)
2010 log_debug_errno(r, "Failed to add filter for openat2: %m");
2011 else
2012 any = true;
2013#endif
2014
da4dc9a6
ZJS
2015 r = seccomp_rule_add_exact(
2016 seccomp,
2017 SCMP_ACT_ERRNO(EPERM),
2018 SCMP_SYS(creat),
2019 1,
2020 SCMP_A1(SCMP_CMP_MASKED_EQ, m, m));
2021 if (r < 0)
2022 log_debug_errno(r, "Failed to add filter for creat: %m");
2023 else
2024 any = true;
2025
2026 return any ? 0 : r;
2027}
2028
3c27973b
LP
2029int seccomp_restrict_suid_sgid(void) {
2030 uint32_t arch;
da4dc9a6 2031 int r, k;
3c27973b
LP
2032
2033 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
2034 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
2035
2036 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
2037 if (r < 0)
2038 return r;
2039
da4dc9a6
ZJS
2040 r = seccomp_restrict_sxid(seccomp, S_ISUID);
2041 if (r < 0)
2042 log_debug_errno(r, "Failed to add suid rule for architecture %s, ignoring: %m", seccomp_arch_to_string(arch));
3c27973b 2043
da4dc9a6
ZJS
2044 k = seccomp_restrict_sxid(seccomp, S_ISGID);
2045 if (k < 0)
2046 log_debug_errno(r, "Failed to add sgid rule for architecture %s, ignoring: %m", seccomp_arch_to_string(arch));
3c27973b 2047
da4dc9a6 2048 if (r < 0 && k < 0)
3c27973b 2049 continue;
3c27973b
LP
2050
2051 r = seccomp_load(seccomp);
7bc5e0b1 2052 if (ERRNO_IS_SECCOMP_FATAL(r))
3c27973b
LP
2053 return r;
2054 if (r < 0)
2055 log_debug_errno(r, "Failed to apply suid/sgid restrictions for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
2056 }
2057
2058 return 0;
2059}
915fb324
LP
2060
2061uint32_t scmp_act_kill_process(void) {
2062
2063 /* Returns SCMP_ACT_KILL_PROCESS if it's supported, and SCMP_ACT_KILL_THREAD otherwise. We never
2064 * actually want to use SCMP_ACT_KILL_THREAD as its semantics are nuts (killing arbitrary threads of
2065 * a program is just a bad idea), but on old kernels/old libseccomp it is all we have, and at least
2066 * for single-threaded apps does the right thing. */
2067
2068#ifdef SCMP_ACT_KILL_PROCESS
2069 if (seccomp_api_get() >= 3)
2070 return SCMP_ACT_KILL_PROCESS;
2071#endif
2072
2073 return SCMP_ACT_KILL; /* same as SCMP_ACT_KILL_THREAD */
2074}