]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/shared/seccomp-util.c
Merge pull request #16803 from poettering/analyze-condition-rework
[thirdparty/systemd.git] / src / shared / seccomp-util.c
CommitLineData
53e1b683 1/* SPDX-License-Identifier: LGPL-2.1+ */
57183d11 2
a8fbdf54 3#include <errno.h>
3c27973b 4#include <fcntl.h>
469830d1 5#include <linux/seccomp.h>
57183d11 6#include <seccomp.h>
a8fbdf54 7#include <stddef.h>
469830d1 8#include <sys/mman.h>
d347d902 9#include <sys/prctl.h>
469830d1 10#include <sys/shm.h>
3c27973b 11#include <sys/stat.h>
57183d11 12
469830d1 13#include "af-list.h"
add00535 14#include "alloc-util.h"
d8b4d14d 15#include "errno-list.h"
a8fbdf54 16#include "macro.h"
add00535 17#include "nsflags.h"
d8b4d14d 18#include "nulstr-util.h"
78e864e5 19#include "process-util.h"
cf0fbc49 20#include "seccomp-util.h"
b16bd535 21#include "set.h"
07630cea 22#include "string-util.h"
b16bd535 23#include "strv.h"
469830d1
LP
24
25const uint32_t seccomp_local_archs[] = {
26
6b000af4 27 /* Note: always list the native arch we are compiled as last, so that users can deny-list seccomp(), but our own calls to it still succeed */
f2d9751c
LP
28
29#if defined(__x86_64__) && defined(__ILP32__)
469830d1
LP
30 SCMP_ARCH_X86,
31 SCMP_ARCH_X86_64,
f2d9751c
LP
32 SCMP_ARCH_X32, /* native */
33#elif defined(__x86_64__) && !defined(__ILP32__)
34 SCMP_ARCH_X86,
469830d1 35 SCMP_ARCH_X32,
f2d9751c
LP
36 SCMP_ARCH_X86_64, /* native */
37#elif defined(__i386__)
38 SCMP_ARCH_X86,
39#elif defined(__aarch64__)
469830d1 40 SCMP_ARCH_ARM,
f2d9751c
LP
41 SCMP_ARCH_AARCH64, /* native */
42#elif defined(__arm__)
43 SCMP_ARCH_ARM,
44#elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI32
45 SCMP_ARCH_MIPSEL,
46 SCMP_ARCH_MIPS, /* native */
47#elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI32
469830d1 48 SCMP_ARCH_MIPS,
f2d9751c
LP
49 SCMP_ARCH_MIPSEL, /* native */
50#elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI64
51 SCMP_ARCH_MIPSEL,
52 SCMP_ARCH_MIPS,
53 SCMP_ARCH_MIPSEL64N32,
469830d1 54 SCMP_ARCH_MIPS64N32,
f2d9751c
LP
55 SCMP_ARCH_MIPSEL64,
56 SCMP_ARCH_MIPS64, /* native */
57#elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI64
58 SCMP_ARCH_MIPS,
469830d1 59 SCMP_ARCH_MIPSEL,
f2d9751c
LP
60 SCMP_ARCH_MIPS64N32,
61 SCMP_ARCH_MIPSEL64N32,
62 SCMP_ARCH_MIPS64,
63 SCMP_ARCH_MIPSEL64, /* native */
64#elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_NABI32
65 SCMP_ARCH_MIPSEL,
66 SCMP_ARCH_MIPS,
469830d1 67 SCMP_ARCH_MIPSEL64,
f2d9751c 68 SCMP_ARCH_MIPS64,
469830d1 69 SCMP_ARCH_MIPSEL64N32,
f2d9751c
LP
70 SCMP_ARCH_MIPS64N32, /* native */
71#elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_NABI32
72 SCMP_ARCH_MIPS,
73 SCMP_ARCH_MIPSEL,
74 SCMP_ARCH_MIPS64,
75 SCMP_ARCH_MIPSEL64,
76 SCMP_ARCH_MIPS64N32,
77 SCMP_ARCH_MIPSEL64N32, /* native */
78#elif defined(__powerpc64__) && __BYTE_ORDER == __BIG_ENDIAN
469830d1 79 SCMP_ARCH_PPC,
469830d1 80 SCMP_ARCH_PPC64LE,
f2d9751c
LP
81 SCMP_ARCH_PPC64, /* native */
82#elif defined(__powerpc64__) && __BYTE_ORDER == __LITTLE_ENDIAN
83 SCMP_ARCH_PPC,
84 SCMP_ARCH_PPC64,
85 SCMP_ARCH_PPC64LE, /* native */
86#elif defined(__powerpc__)
87 SCMP_ARCH_PPC,
88#elif defined(__s390x__)
89 SCMP_ARCH_S390,
90 SCMP_ARCH_S390X, /* native */
91#elif defined(__s390__)
469830d1 92 SCMP_ARCH_S390,
469830d1
LP
93#endif
94 (uint32_t) -1
95 };
57183d11
LP
96
97const char* seccomp_arch_to_string(uint32_t c) {
aa34055f
ZJS
98 /* Maintain order used in <seccomp.h>.
99 *
100 * Names used here should be the same as those used for ConditionArchitecture=,
101 * except for "subarchitectures" like x32. */
57183d11 102
aa34055f
ZJS
103 switch(c) {
104 case SCMP_ARCH_NATIVE:
57183d11 105 return "native";
aa34055f 106 case SCMP_ARCH_X86:
57183d11 107 return "x86";
aa34055f 108 case SCMP_ARCH_X86_64:
57183d11 109 return "x86-64";
aa34055f 110 case SCMP_ARCH_X32:
57183d11 111 return "x32";
aa34055f 112 case SCMP_ARCH_ARM:
57183d11 113 return "arm";
aa34055f
ZJS
114 case SCMP_ARCH_AARCH64:
115 return "arm64";
116 case SCMP_ARCH_MIPS:
117 return "mips";
118 case SCMP_ARCH_MIPS64:
119 return "mips64";
120 case SCMP_ARCH_MIPS64N32:
121 return "mips64-n32";
122 case SCMP_ARCH_MIPSEL:
123 return "mips-le";
124 case SCMP_ARCH_MIPSEL64:
125 return "mips64-le";
126 case SCMP_ARCH_MIPSEL64N32:
127 return "mips64-le-n32";
128 case SCMP_ARCH_PPC:
129 return "ppc";
130 case SCMP_ARCH_PPC64:
131 return "ppc64";
132 case SCMP_ARCH_PPC64LE:
133 return "ppc64-le";
134 case SCMP_ARCH_S390:
6abfd303 135 return "s390";
aa34055f 136 case SCMP_ARCH_S390X:
6abfd303 137 return "s390x";
aa34055f
ZJS
138 default:
139 return NULL;
140 }
57183d11
LP
141}
142
143int seccomp_arch_from_string(const char *n, uint32_t *ret) {
144 if (!n)
145 return -EINVAL;
146
147 assert(ret);
148
149 if (streq(n, "native"))
150 *ret = SCMP_ARCH_NATIVE;
151 else if (streq(n, "x86"))
152 *ret = SCMP_ARCH_X86;
153 else if (streq(n, "x86-64"))
154 *ret = SCMP_ARCH_X86_64;
155 else if (streq(n, "x32"))
156 *ret = SCMP_ARCH_X32;
157 else if (streq(n, "arm"))
158 *ret = SCMP_ARCH_ARM;
aa34055f
ZJS
159 else if (streq(n, "arm64"))
160 *ret = SCMP_ARCH_AARCH64;
161 else if (streq(n, "mips"))
162 *ret = SCMP_ARCH_MIPS;
163 else if (streq(n, "mips64"))
164 *ret = SCMP_ARCH_MIPS64;
165 else if (streq(n, "mips64-n32"))
166 *ret = SCMP_ARCH_MIPS64N32;
167 else if (streq(n, "mips-le"))
168 *ret = SCMP_ARCH_MIPSEL;
169 else if (streq(n, "mips64-le"))
170 *ret = SCMP_ARCH_MIPSEL64;
171 else if (streq(n, "mips64-le-n32"))
172 *ret = SCMP_ARCH_MIPSEL64N32;
173 else if (streq(n, "ppc"))
174 *ret = SCMP_ARCH_PPC;
175 else if (streq(n, "ppc64"))
176 *ret = SCMP_ARCH_PPC64;
177 else if (streq(n, "ppc64-le"))
178 *ret = SCMP_ARCH_PPC64LE;
6abfd303
HB
179 else if (streq(n, "s390"))
180 *ret = SCMP_ARCH_S390;
181 else if (streq(n, "s390x"))
182 *ret = SCMP_ARCH_S390X;
57183d11
LP
183 else
184 return -EINVAL;
185
186 return 0;
187}
e9642be2 188
469830d1 189int seccomp_init_for_arch(scmp_filter_ctx *ret, uint32_t arch, uint32_t default_action) {
b4eaa6cc 190 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
8d7b0c8f
LP
191 int r;
192
469830d1
LP
193 /* Much like seccomp_init(), but initializes the filter for one specific architecture only, without affecting
194 * any others. Also, turns off the NNP fiddling. */
8d7b0c8f
LP
195
196 seccomp = seccomp_init(default_action);
197 if (!seccomp)
198 return -ENOMEM;
199
469830d1
LP
200 if (arch != SCMP_ARCH_NATIVE &&
201 arch != seccomp_arch_native()) {
202
1b52793d 203 r = seccomp_arch_remove(seccomp, seccomp_arch_native());
469830d1 204 if (r < 0)
b4eaa6cc 205 return r;
469830d1 206
1b52793d 207 r = seccomp_arch_add(seccomp, arch);
469830d1 208 if (r < 0)
b4eaa6cc 209 return r;
469830d1
LP
210
211 assert(seccomp_arch_exist(seccomp, arch) >= 0);
212 assert(seccomp_arch_exist(seccomp, SCMP_ARCH_NATIVE) == -EEXIST);
213 assert(seccomp_arch_exist(seccomp, seccomp_arch_native()) == -EEXIST);
214 } else {
215 assert(seccomp_arch_exist(seccomp, SCMP_ARCH_NATIVE) >= 0);
216 assert(seccomp_arch_exist(seccomp, seccomp_arch_native()) >= 0);
217 }
218
219 r = seccomp_attr_set(seccomp, SCMP_FLTATR_ACT_BADARCH, SCMP_ACT_ALLOW);
8d7b0c8f 220 if (r < 0)
b4eaa6cc 221 return r;
8d7b0c8f
LP
222
223 r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
224 if (r < 0)
b4eaa6cc 225 return r;
8d7b0c8f 226
b4eaa6cc 227 *ret = TAKE_PTR(seccomp);
8d7b0c8f 228 return 0;
8d7b0c8f
LP
229}
230
static bool is_basic_seccomp_available(void) {
        /* Asks for the current seccomp mode of the calling process; if the request itself is refused we
         * take that as sign that seccomp is not available at all. */
        int mode;

        mode = prctl(PR_GET_SECCOMP, 0, 0, 0, 0);
        return mode >= 0;
}
234
235static bool is_seccomp_filter_available(void) {
4d5bd50a
LP
236 return prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, NULL, 0, 0) < 0 &&
237 errno == EFAULT;
d347d902
FS
238}
239
bool is_seccomp_available(void) {
        /* Returns true if both the basic seccomp interface and the seccomp filter mode can be used. The
         * probing is done on the first call only, the result is remembered for all later calls. */

        static int cached_enabled = -1; /* < 0 means: not probed yet */

        if (cached_enabled >= 0)
                return cached_enabled;

        cached_enabled = is_basic_seccomp_available() && is_seccomp_filter_available();
        return cached_enabled;
}
250
8130926d 251const SyscallFilterSet syscall_filter_sets[_SYSCALL_FILTER_SET_MAX] = {
40eb6a80 252 [SYSCALL_FILTER_SET_DEFAULT] = {
40eb6a80 253 .name = "@default",
d5efc18b 254 .help = "System calls that are always permitted",
40eb6a80
ZJS
255 .value =
256 "clock_getres\0"
6ca67710 257 "clock_getres_time64\0"
40eb6a80 258 "clock_gettime\0"
6ca67710 259 "clock_gettime64\0"
40eb6a80 260 "clock_nanosleep\0"
6ca67710 261 "clock_nanosleep_time64\0"
40eb6a80
ZJS
262 "execve\0"
263 "exit\0"
264 "exit_group\0"
e41b0f42 265 "futex\0"
6ca67710 266 "futex_time64\0"
e41b0f42
LP
267 "get_robust_list\0"
268 "get_thread_area\0"
09d3020b
DH
269 "getegid\0"
270 "getegid32\0"
271 "geteuid\0"
272 "geteuid32\0"
273 "getgid\0"
274 "getgid32\0"
275 "getgroups\0"
276 "getgroups32\0"
277 "getpgid\0"
278 "getpgrp\0"
279 "getpid\0"
280 "getppid\0"
281 "getresgid\0"
282 "getresgid32\0"
283 "getresuid\0"
284 "getresuid32\0"
40eb6a80 285 "getrlimit\0" /* make sure processes can query stack size and such */
09d3020b
DH
286 "getsid\0"
287 "gettid\0"
40eb6a80 288 "gettimeofday\0"
09d3020b
DH
289 "getuid\0"
290 "getuid32\0"
e41b0f42 291 "membarrier\0"
40eb6a80
ZJS
292 "nanosleep\0"
293 "pause\0"
4c3a9176 294 "prlimit64\0"
e41b0f42 295 "restart_syscall\0"
6fee3be0 296 "rseq\0"
40eb6a80 297 "rt_sigreturn\0"
8f44de08 298 "sched_yield\0"
e41b0f42
LP
299 "set_robust_list\0"
300 "set_thread_area\0"
301 "set_tid_address\0"
ce5faeac 302 "set_tls\0"
40eb6a80
ZJS
303 "sigreturn\0"
304 "time\0"
4c3a9176 305 "ugetrlimit\0"
40eb6a80 306 },
44898c53
LP
307 [SYSCALL_FILTER_SET_AIO] = {
308 .name = "@aio",
309 .help = "Asynchronous IO",
310 .value =
311 "io_cancel\0"
312 "io_destroy\0"
313 "io_getevents\0"
a05cfe23 314 "io_pgetevents\0"
6ca67710 315 "io_pgetevents_time64\0"
44898c53
LP
316 "io_setup\0"
317 "io_submit\0"
9e486265
LP
318 "io_uring_enter\0"
319 "io_uring_register\0"
320 "io_uring_setup\0"
44898c53 321 },
133ddbbe 322 [SYSCALL_FILTER_SET_BASIC_IO] = {
133ddbbe 323 .name = "@basic-io",
d5efc18b 324 .help = "Basic IO",
133ddbbe 325 .value =
648a0ed0 326 "_llseek\0"
133ddbbe 327 "close\0"
648a0ed0 328 "dup\0"
133ddbbe
LP
329 "dup2\0"
330 "dup3\0"
133ddbbe
LP
331 "lseek\0"
332 "pread64\0"
333 "preadv\0"
44898c53 334 "preadv2\0"
133ddbbe
LP
335 "pwrite64\0"
336 "pwritev\0"
44898c53 337 "pwritev2\0"
133ddbbe
LP
338 "read\0"
339 "readv\0"
340 "write\0"
341 "writev\0"
342 },
44898c53
LP
343 [SYSCALL_FILTER_SET_CHOWN] = {
344 .name = "@chown",
345 .help = "Change ownership of files and directories",
346 .value =
347 "chown\0"
348 "chown32\0"
349 "fchown\0"
350 "fchown32\0"
351 "fchownat\0"
352 "lchown\0"
353 "lchown32\0"
354 },
8130926d 355 [SYSCALL_FILTER_SET_CLOCK] = {
8130926d 356 .name = "@clock",
d5efc18b 357 .help = "Change the system time",
201c1cc2
TM
358 .value =
359 "adjtimex\0"
1f9ac68b 360 "clock_adjtime\0"
6ca67710 361 "clock_adjtime64\0"
1f9ac68b 362 "clock_settime\0"
6ca67710 363 "clock_settime64\0"
201c1cc2 364 "settimeofday\0"
1f9ac68b 365 "stime\0"
8130926d
LP
366 },
367 [SYSCALL_FILTER_SET_CPU_EMULATION] = {
8130926d 368 .name = "@cpu-emulation",
d5efc18b 369 .help = "System calls for CPU emulation functionality",
1f9ac68b
LP
370 .value =
371 "modify_ldt\0"
372 "subpage_prot\0"
373 "switch_endian\0"
374 "vm86\0"
375 "vm86old\0"
8130926d
LP
376 },
377 [SYSCALL_FILTER_SET_DEBUG] = {
8130926d 378 .name = "@debug",
d5efc18b 379 .help = "Debugging, performance monitoring and tracing functionality",
1f9ac68b
LP
380 .value =
381 "lookup_dcookie\0"
382 "perf_event_open\0"
8270e3d8 383 "pidfd_getfd\0"
1f9ac68b
LP
384 "ptrace\0"
385 "rtas\0"
6da432fd 386#if defined __s390__ || defined __s390x__
1f9ac68b 387 "s390_runtime_instr\0"
8130926d 388#endif
1f9ac68b 389 "sys_debug_setcontext\0"
8130926d 390 },
1a1b13c9
LP
391 [SYSCALL_FILTER_SET_FILE_SYSTEM] = {
392 .name = "@file-system",
393 .help = "File system operations",
394 .value =
395 "access\0"
396 "chdir\0"
397 "chmod\0"
398 "close\0"
399 "creat\0"
400 "faccessat\0"
bcf08acb 401 "faccessat2\0"
1a1b13c9
LP
402 "fallocate\0"
403 "fchdir\0"
404 "fchmod\0"
405 "fchmodat\0"
1a1b13c9 406 "fcntl\0"
ceaa6aa7 407 "fcntl64\0"
1a1b13c9
LP
408 "fgetxattr\0"
409 "flistxattr\0"
ceaa6aa7 410 "fremovexattr\0"
1a1b13c9 411 "fsetxattr\0"
1a1b13c9 412 "fstat\0"
ceaa6aa7 413 "fstat64\0"
1a1b13c9 414 "fstatat64\0"
1a1b13c9 415 "fstatfs\0"
ceaa6aa7 416 "fstatfs64\0"
1a1b13c9 417 "ftruncate\0"
ceaa6aa7 418 "ftruncate64\0"
1a1b13c9
LP
419 "futimesat\0"
420 "getcwd\0"
1a1b13c9 421 "getdents\0"
ceaa6aa7 422 "getdents64\0"
1a1b13c9
LP
423 "getxattr\0"
424 "inotify_add_watch\0"
ceaa6aa7 425 "inotify_init\0"
1a1b13c9
LP
426 "inotify_init1\0"
427 "inotify_rm_watch\0"
428 "lgetxattr\0"
429 "link\0"
430 "linkat\0"
431 "listxattr\0"
432 "llistxattr\0"
433 "lremovexattr\0"
434 "lsetxattr\0"
1a1b13c9 435 "lstat\0"
ceaa6aa7 436 "lstat64\0"
1a1b13c9
LP
437 "mkdir\0"
438 "mkdirat\0"
439 "mknod\0"
440 "mknodat\0"
1a1b13c9 441 "mmap\0"
ceaa6aa7 442 "mmap2\0"
7961116e 443 "munmap\0"
1a1b13c9 444 "newfstatat\0"
ceaa6aa7
LP
445 "oldfstat\0"
446 "oldlstat\0"
447 "oldstat\0"
1a1b13c9
LP
448 "open\0"
449 "openat\0"
8270e3d8 450 "openat2\0"
1a1b13c9
LP
451 "readlink\0"
452 "readlinkat\0"
453 "removexattr\0"
454 "rename\0"
1a1b13c9 455 "renameat\0"
ceaa6aa7 456 "renameat2\0"
1a1b13c9
LP
457 "rmdir\0"
458 "setxattr\0"
1a1b13c9 459 "stat\0"
ceaa6aa7 460 "stat64\0"
1a1b13c9 461 "statfs\0"
ceaa6aa7 462 "statfs64\0"
a4135a74 463 "statx\0"
1a1b13c9
LP
464 "symlink\0"
465 "symlinkat\0"
1a1b13c9 466 "truncate\0"
ceaa6aa7 467 "truncate64\0"
1a1b13c9
LP
468 "unlink\0"
469 "unlinkat\0"
ceaa6aa7 470 "utime\0"
1a1b13c9 471 "utimensat\0"
6ca67710 472 "utimensat_time64\0"
1a1b13c9
LP
473 "utimes\0"
474 },
8130926d 475 [SYSCALL_FILTER_SET_IO_EVENT] = {
8130926d 476 .name = "@io-event",
d5efc18b 477 .help = "Event loop system calls",
201c1cc2
TM
478 .value =
479 "_newselect\0"
201c1cc2 480 "epoll_create\0"
215728ff 481 "epoll_create1\0"
201c1cc2
TM
482 "epoll_ctl\0"
483 "epoll_ctl_old\0"
484 "epoll_pwait\0"
485 "epoll_wait\0"
486 "epoll_wait_old\0"
201c1cc2 487 "eventfd\0"
215728ff 488 "eventfd2\0"
201c1cc2
TM
489 "poll\0"
490 "ppoll\0"
6ca67710 491 "ppoll_time64\0"
201c1cc2 492 "pselect6\0"
6ca67710 493 "pselect6_time64\0"
201c1cc2 494 "select\0"
8130926d
LP
495 },
496 [SYSCALL_FILTER_SET_IPC] = {
8130926d 497 .name = "@ipc",
d5efc18b
ZJS
498 .help = "SysV IPC, POSIX Message Queues or other IPC",
499 .value =
500 "ipc\0"
cd5bfd7e 501 "memfd_create\0"
201c1cc2
TM
502 "mq_getsetattr\0"
503 "mq_notify\0"
504 "mq_open\0"
505 "mq_timedreceive\0"
6ca67710 506 "mq_timedreceive_time64\0"
201c1cc2 507 "mq_timedsend\0"
6ca67710 508 "mq_timedsend_time64\0"
201c1cc2
TM
509 "mq_unlink\0"
510 "msgctl\0"
511 "msgget\0"
512 "msgrcv\0"
513 "msgsnd\0"
cd5bfd7e 514 "pipe\0"
215728ff 515 "pipe2\0"
201c1cc2
TM
516 "process_vm_readv\0"
517 "process_vm_writev\0"
518 "semctl\0"
519 "semget\0"
520 "semop\0"
521 "semtimedop\0"
6ca67710 522 "semtimedop_time64\0"
201c1cc2
TM
523 "shmat\0"
524 "shmctl\0"
525 "shmdt\0"
526 "shmget\0"
8130926d
LP
527 },
528 [SYSCALL_FILTER_SET_KEYRING] = {
8130926d 529 .name = "@keyring",
d5efc18b 530 .help = "Kernel keyring access",
1f9ac68b
LP
531 .value =
532 "add_key\0"
533 "keyctl\0"
534 "request_key\0"
8130926d 535 },
cd0ddf6f
LP
536 [SYSCALL_FILTER_SET_MEMLOCK] = {
537 .name = "@memlock",
538 .help = "Memory locking control",
539 .value =
540 "mlock\0"
541 "mlock2\0"
542 "mlockall\0"
543 "munlock\0"
544 "munlockall\0"
545 },
8130926d 546 [SYSCALL_FILTER_SET_MODULE] = {
8130926d 547 .name = "@module",
d5efc18b 548 .help = "Loading and unloading of kernel modules",
201c1cc2 549 .value =
201c1cc2
TM
550 "delete_module\0"
551 "finit_module\0"
552 "init_module\0"
8130926d
LP
553 },
554 [SYSCALL_FILTER_SET_MOUNT] = {
8130926d 555 .name = "@mount",
d5efc18b 556 .help = "Mounting and unmounting of file systems",
201c1cc2
TM
557 .value =
558 "chroot\0"
9e486265
LP
559 "fsconfig\0"
560 "fsmount\0"
561 "fsopen\0"
562 "fspick\0"
201c1cc2 563 "mount\0"
9e486265
LP
564 "move_mount\0"
565 "open_tree\0"
201c1cc2 566 "pivot_root\0"
201c1cc2 567 "umount\0"
215728ff 568 "umount2\0"
8130926d
LP
569 },
570 [SYSCALL_FILTER_SET_NETWORK_IO] = {
8130926d 571 .name = "@network-io",
d5efc18b 572 .help = "Network or Unix socket IO, should not be needed if not network facing",
201c1cc2 573 .value =
201c1cc2 574 "accept\0"
215728ff 575 "accept4\0"
201c1cc2
TM
576 "bind\0"
577 "connect\0"
578 "getpeername\0"
579 "getsockname\0"
580 "getsockopt\0"
581 "listen\0"
582 "recv\0"
583 "recvfrom\0"
584 "recvmmsg\0"
6ca67710 585 "recvmmsg_time64\0"
201c1cc2
TM
586 "recvmsg\0"
587 "send\0"
588 "sendmmsg\0"
589 "sendmsg\0"
590 "sendto\0"
591 "setsockopt\0"
592 "shutdown\0"
593 "socket\0"
594 "socketcall\0"
595 "socketpair\0"
8130926d
LP
596 },
597 [SYSCALL_FILTER_SET_OBSOLETE] = {
d5efc18b 598 /* some unknown even to libseccomp */
8130926d 599 .name = "@obsolete",
d5efc18b 600 .help = "Unusual, obsolete or unimplemented system calls",
201c1cc2
TM
601 .value =
602 "_sysctl\0"
603 "afs_syscall\0"
802fa07a 604 "bdflush\0"
201c1cc2 605 "break\0"
1f9ac68b 606 "create_module\0"
201c1cc2
TM
607 "ftime\0"
608 "get_kernel_syms\0"
201c1cc2
TM
609 "getpmsg\0"
610 "gtty\0"
7e0c3b8f 611 "idle\0"
201c1cc2 612 "lock\0"
201c1cc2 613 "mpx\0"
201c1cc2
TM
614 "prof\0"
615 "profil\0"
201c1cc2
TM
616 "putpmsg\0"
617 "query_module\0"
201c1cc2
TM
618 "security\0"
619 "sgetmask\0"
620 "ssetmask\0"
621 "stty\0"
1f9ac68b 622 "sysfs\0"
201c1cc2
TM
623 "tuxcall\0"
624 "ulimit\0"
625 "uselib\0"
1f9ac68b 626 "ustat\0"
201c1cc2 627 "vserver\0"
8130926d 628 },
9493b168
ZJS
629 [SYSCALL_FILTER_SET_PKEY] = {
630 .name = "@pkey",
631 .help = "System calls used for memory protection keys",
632 .value =
633 "pkey_alloc\0"
634 "pkey_free\0"
635 "pkey_mprotect\0"
636 },
8130926d 637 [SYSCALL_FILTER_SET_PRIVILEGED] = {
8130926d 638 .name = "@privileged",
d5efc18b 639 .help = "All system calls which need super-user capabilities",
201c1cc2 640 .value =
44898c53 641 "@chown\0"
201c1cc2
TM
642 "@clock\0"
643 "@module\0"
644 "@raw-io\0"
af0f047b
LP
645 "@reboot\0"
646 "@swap\0"
215728ff 647 "_sysctl\0"
201c1cc2 648 "acct\0"
201c1cc2 649 "bpf\0"
1f9ac68b 650 "capset\0"
201c1cc2 651 "chroot\0"
a05cfe23 652 "fanotify_init\0"
9e486265 653 "fanotify_mark\0"
201c1cc2 654 "nfsservctl\0"
a05cfe23 655 "open_by_handle_at\0"
201c1cc2
TM
656 "pivot_root\0"
657 "quotactl\0"
201c1cc2 658 "setdomainname\0"
201c1cc2 659 "setfsuid\0"
215728ff 660 "setfsuid32\0"
201c1cc2 661 "setgroups\0"
215728ff 662 "setgroups32\0"
201c1cc2 663 "sethostname\0"
201c1cc2 664 "setresuid\0"
215728ff 665 "setresuid32\0"
201c1cc2 666 "setreuid\0"
215728ff 667 "setreuid32\0"
e05ee49b 668 "setuid\0" /* We list the explicit system calls here, as @setuid also includes setgid() which is not necessarily privileged */
215728ff 669 "setuid32\0"
201c1cc2 670 "vhangup\0"
8130926d
LP
671 },
672 [SYSCALL_FILTER_SET_PROCESS] = {
8130926d 673 .name = "@process",
d5efc18b 674 .help = "Process control, execution, namespaceing operations",
201c1cc2
TM
675 .value =
676 "arch_prctl\0"
09d3020b 677 "capget\0" /* Able to query arbitrary processes */
201c1cc2 678 "clone\0"
9e486265 679 "clone3\0"
201c1cc2
TM
680 "execveat\0"
681 "fork\0"
b887d2eb 682 "getrusage\0"
201c1cc2 683 "kill\0"
9e486265 684 "pidfd_open\0"
46fcf95d 685 "pidfd_send_signal\0"
201c1cc2 686 "prctl\0"
b887d2eb
LP
687 "rt_sigqueueinfo\0"
688 "rt_tgsigqueueinfo\0"
201c1cc2 689 "setns\0"
a9518dc3 690 "swapcontext\0" /* Some archs e.g. powerpc32 are using it to do userspace context switches */
201c1cc2 691 "tgkill\0"
b887d2eb 692 "times\0"
201c1cc2
TM
693 "tkill\0"
694 "unshare\0"
695 "vfork\0"
b887d2eb
LP
696 "wait4\0"
697 "waitid\0"
698 "waitpid\0"
8130926d
LP
699 },
700 [SYSCALL_FILTER_SET_RAW_IO] = {
8130926d 701 .name = "@raw-io",
d5efc18b 702 .help = "Raw I/O port access",
201c1cc2
TM
703 .value =
704 "ioperm\0"
705 "iopl\0"
1f9ac68b 706 "pciconfig_iobase\0"
201c1cc2
TM
707 "pciconfig_read\0"
708 "pciconfig_write\0"
6da432fd 709#if defined __s390__ || defined __s390x__
201c1cc2
TM
710 "s390_pci_mmio_read\0"
711 "s390_pci_mmio_write\0"
8130926d
LP
712#endif
713 },
bd2ab3f4
LP
714 [SYSCALL_FILTER_SET_REBOOT] = {
715 .name = "@reboot",
716 .help = "Reboot and reboot preparation/kexec",
717 .value =
bd2ab3f4 718 "kexec_file_load\0"
e59608fa 719 "kexec_load\0"
bd2ab3f4
LP
720 "reboot\0"
721 },
133ddbbe 722 [SYSCALL_FILTER_SET_RESOURCES] = {
133ddbbe 723 .name = "@resources",
58a8f68b 724 .help = "Alter resource settings",
133ddbbe 725 .value =
0963c053
LP
726 "ioprio_set\0"
727 "mbind\0"
728 "migrate_pages\0"
729 "move_pages\0"
730 "nice\0"
0963c053
LP
731 "sched_setaffinity\0"
732 "sched_setattr\0"
133ddbbe
LP
733 "sched_setparam\0"
734 "sched_setscheduler\0"
0963c053 735 "set_mempolicy\0"
133ddbbe
LP
736 "setpriority\0"
737 "setrlimit\0"
133ddbbe 738 },
6eaaeee9
LP
739 [SYSCALL_FILTER_SET_SETUID] = {
740 .name = "@setuid",
741 .help = "Operations for changing user/group credentials",
742 .value =
6eaaeee9 743 "setgid\0"
215728ff 744 "setgid32\0"
6eaaeee9 745 "setgroups\0"
215728ff 746 "setgroups32\0"
6eaaeee9 747 "setregid\0"
215728ff 748 "setregid32\0"
6eaaeee9 749 "setresgid\0"
215728ff 750 "setresgid32\0"
6eaaeee9 751 "setresuid\0"
215728ff 752 "setresuid32\0"
6eaaeee9 753 "setreuid\0"
215728ff 754 "setreuid32\0"
6eaaeee9 755 "setuid\0"
215728ff 756 "setuid32\0"
6eaaeee9 757 },
cd0ddf6f
LP
758 [SYSCALL_FILTER_SET_SIGNAL] = {
759 .name = "@signal",
760 .help = "Process signal handling",
761 .value =
762 "rt_sigaction\0"
763 "rt_sigpending\0"
764 "rt_sigprocmask\0"
765 "rt_sigsuspend\0"
766 "rt_sigtimedwait\0"
6ca67710 767 "rt_sigtimedwait_time64\0"
cd0ddf6f
LP
768 "sigaction\0"
769 "sigaltstack\0"
770 "signal\0"
771 "signalfd\0"
772 "signalfd4\0"
773 "sigpending\0"
774 "sigprocmask\0"
775 "sigsuspend\0"
776 },
bd2ab3f4
LP
777 [SYSCALL_FILTER_SET_SWAP] = {
778 .name = "@swap",
779 .help = "Enable/disable swap devices",
780 .value =
781 "swapoff\0"
782 "swapon\0"
783 },
44898c53
LP
784 [SYSCALL_FILTER_SET_SYNC] = {
785 .name = "@sync",
786 .help = "Synchronize files and memory to storage",
787 .value =
788 "fdatasync\0"
789 "fsync\0"
790 "msync\0"
791 "sync\0"
792 "sync_file_range\0"
a8fb09f5 793 "sync_file_range2\0"
44898c53
LP
794 "syncfs\0"
795 },
70526841
LP
796 [SYSCALL_FILTER_SET_SYSTEM_SERVICE] = {
797 .name = "@system-service",
798 .help = "General system service operations",
799 .value =
800 "@aio\0"
801 "@basic-io\0"
802 "@chown\0"
803 "@default\0"
804 "@file-system\0"
805 "@io-event\0"
806 "@ipc\0"
807 "@keyring\0"
808 "@memlock\0"
809 "@network-io\0"
810 "@process\0"
811 "@resources\0"
812 "@setuid\0"
813 "@signal\0"
814 "@sync\0"
815 "@timer\0"
816 "brk\0"
817 "capget\0"
818 "capset\0"
819 "copy_file_range\0"
820 "fadvise64\0"
821 "fadvise64_64\0"
822 "flock\0"
823 "get_mempolicy\0"
824 "getcpu\0"
825 "getpriority\0"
826 "getrandom\0"
827 "ioctl\0"
828 "ioprio_get\0"
829 "kcmp\0"
830 "madvise\0"
70526841
LP
831 "mprotect\0"
832 "mremap\0"
833 "name_to_handle_at\0"
834 "oldolduname\0"
835 "olduname\0"
836 "personality\0"
837 "readahead\0"
838 "readdir\0"
839 "remap_file_pages\0"
840 "sched_get_priority_max\0"
841 "sched_get_priority_min\0"
842 "sched_getaffinity\0"
843 "sched_getattr\0"
844 "sched_getparam\0"
845 "sched_getscheduler\0"
846 "sched_rr_get_interval\0"
6ca67710 847 "sched_rr_get_interval_time64\0"
70526841
LP
848 "sched_yield\0"
849 "sendfile\0"
850 "sendfile64\0"
851 "setfsgid\0"
852 "setfsgid32\0"
853 "setfsuid\0"
854 "setfsuid32\0"
855 "setpgid\0"
856 "setsid\0"
857 "splice\0"
858 "sysinfo\0"
859 "tee\0"
860 "umask\0"
861 "uname\0"
862 "userfaultfd\0"
863 "vmsplice\0"
864 },
cd0ddf6f
LP
865 [SYSCALL_FILTER_SET_TIMER] = {
866 .name = "@timer",
867 .help = "Schedule operations by time",
868 .value =
869 "alarm\0"
870 "getitimer\0"
871 "setitimer\0"
872 "timer_create\0"
873 "timer_delete\0"
874 "timer_getoverrun\0"
875 "timer_gettime\0"
6ca67710 876 "timer_gettime64\0"
cd0ddf6f 877 "timer_settime\0"
6ca67710 878 "timer_settime64\0"
cd0ddf6f
LP
879 "timerfd_create\0"
880 "timerfd_gettime\0"
6ca67710 881 "timerfd_gettime64\0"
cd0ddf6f 882 "timerfd_settime\0"
6ca67710 883 "timerfd_settime64\0"
cd0ddf6f
LP
884 "times\0"
885 },
201c1cc2 886};
8130926d
LP
887
888const SyscallFilterSet *syscall_filter_set_find(const char *name) {
889 unsigned i;
890
891 if (isempty(name) || name[0] != '@')
892 return NULL;
893
894 for (i = 0; i < _SYSCALL_FILTER_SET_MAX; i++)
895 if (streq(syscall_filter_sets[i].name, name))
896 return syscall_filter_sets + i;
897
898 return NULL;
899}
900
b54f36c6 901static int seccomp_add_syscall_filter_set(scmp_filter_ctx seccomp, const SyscallFilterSet *set, uint32_t action, char **exclude, bool log_missing);
69b1b241 902
b54f36c6 903int seccomp_add_syscall_filter_item(scmp_filter_ctx *seccomp, const char *name, uint32_t action, char **exclude, bool log_missing) {
69b1b241
LP
904 assert(seccomp);
905 assert(name);
906
960e4569
LP
907 if (strv_contains(exclude, name))
908 return 0;
909
69b1b241
LP
910 if (name[0] == '@') {
911 const SyscallFilterSet *other;
912
913 other = syscall_filter_set_find(name);
baaa35ad
ZJS
914 if (!other)
915 return log_debug_errno(SYNTHETIC_ERRNO(EINVAL),
916 "Filter set %s is not known!",
917 name);
69b1b241 918
b54f36c6
ZJS
919 return seccomp_add_syscall_filter_set(seccomp, other, action, exclude, log_missing);
920
69b1b241 921 } else {
b54f36c6 922 int id, r;
69b1b241
LP
923
924 id = seccomp_syscall_resolve_name(name);
cff7bff8 925 if (id == __NR_SCMP_ERROR) {
b54f36c6
ZJS
926 if (log_missing)
927 log_debug("System call %s is not known, ignoring.", name);
ff217dc3 928 return 0;
cff7bff8 929 }
69b1b241
LP
930
931 r = seccomp_rule_add_exact(seccomp, action, id, 0);
b54f36c6 932 if (r < 0) {
69b1b241 933 /* If the system call is not known on this architecture, then that's fine, let's ignore it */
7e86bd73
ZJS
934 bool ignore = r == -EDOM;
935
936 if (!ignore || log_missing)
937 log_debug_errno(r, "Failed to add rule for system call %s() / %d%s: %m",
938 name, id, ignore ? ", ignoring" : "");
939 if (!ignore)
940 return r;
b54f36c6 941 }
69b1b241 942
b54f36c6
ZJS
943 return 0;
944 }
69b1b241
LP
945}
946
469830d1
LP
947static int seccomp_add_syscall_filter_set(
948 scmp_filter_ctx seccomp,
469830d1 949 const SyscallFilterSet *set,
960e4569 950 uint32_t action,
b54f36c6
ZJS
951 char **exclude,
952 bool log_missing) {
469830d1 953
8130926d
LP
954 const char *sys;
955 int r;
956
957 assert(seccomp);
958 assert(set);
959
960 NULSTR_FOREACH(sys, set->value) {
b54f36c6 961 r = seccomp_add_syscall_filter_item(seccomp, sys, action, exclude, log_missing);
69b1b241
LP
962 if (r < 0)
963 return r;
469830d1
LP
964 }
965
966 return 0;
967}
968
b54f36c6 969int seccomp_load_syscall_filter_set(uint32_t default_action, const SyscallFilterSet *set, uint32_t action, bool log_missing) {
469830d1
LP
970 uint32_t arch;
971 int r;
972
973 assert(set);
974
975 /* The one-stop solution: allocate a seccomp object, add the specified filter to it, and apply it. Once for
a90db619 976 * each local arch. */
469830d1
LP
977
978 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
979 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
980
981 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
982
983 r = seccomp_init_for_arch(&seccomp, arch, default_action);
8130926d
LP
984 if (r < 0)
985 return r;
469830d1 986
b54f36c6 987 r = seccomp_add_syscall_filter_set(seccomp, set, action, NULL, log_missing);
7e86bd73
ZJS
988 if (r < 0)
989 return log_debug_errno(r, "Failed to add filter set: %m");
469830d1
LP
990
991 r = seccomp_load(seccomp);
7bc5e0b1 992 if (ERRNO_IS_SECCOMP_FATAL(r))
469830d1
LP
993 return r;
994 if (r < 0)
995 log_debug_errno(r, "Failed to install filter set for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
8130926d
LP
996 }
997
998 return 0;
999}
a3be2849 1000
b54f36c6 1001int seccomp_load_syscall_filter_set_raw(uint32_t default_action, Hashmap* set, uint32_t action, bool log_missing) {
469830d1 1002 uint32_t arch;
a3be2849
LP
1003 int r;
1004
469830d1
LP
1005 /* Similar to seccomp_load_syscall_filter_set(), but takes a raw Set* of syscalls, instead of a
1006 * SyscallFilterSet* table. */
a3be2849 1007
8cfa775f 1008 if (hashmap_isempty(set) && default_action == SCMP_ACT_ALLOW)
469830d1 1009 return 0;
a3be2849 1010
469830d1
LP
1011 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1012 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1013 Iterator i;
b54f36c6 1014 void *syscall_id, *val;
a3be2849 1015
469830d1 1016 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
a3be2849 1017
469830d1
LP
1018 r = seccomp_init_for_arch(&seccomp, arch, default_action);
1019 if (r < 0)
1020 return r;
a3be2849 1021
b54f36c6 1022 HASHMAP_FOREACH_KEY(val, syscall_id, set, i) {
8cfa775f 1023 uint32_t a = action;
b54f36c6
ZJS
1024 int id = PTR_TO_INT(syscall_id) - 1;
1025 int error = PTR_TO_INT(val);
8cfa775f 1026
b54f36c6
ZJS
1027 if (action != SCMP_ACT_ALLOW && error >= 0)
1028 a = SCMP_ACT_ERRNO(error);
8cfa775f 1029
b54f36c6 1030 r = seccomp_rule_add_exact(seccomp, a, id, 0);
469830d1
LP
1031 if (r < 0) {
1032 /* If the system call is not known on this architecture, then that's fine, let's ignore it */
1033 _cleanup_free_ char *n = NULL;
7e86bd73 1034 bool ignore;
469830d1 1035
b54f36c6 1036 n = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, id);
7e86bd73
ZJS
1037 ignore = r == -EDOM;
1038 if (!ignore || log_missing)
1039 log_debug_errno(r, "Failed to add rule for system call %s() / %d%s: %m",
1040 strna(n), id, ignore ? ", ignoring" : "");
1041 if (!ignore)
1042 return r;
469830d1
LP
1043 }
1044 }
1045
1046 r = seccomp_load(seccomp);
7bc5e0b1 1047 if (ERRNO_IS_SECCOMP_FATAL(r))
469830d1
LP
1048 return r;
1049 if (r < 0)
1050 log_debug_errno(r, "Failed to install filter set for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1051 }
1052
1053 return 0;
add00535
LP
1054}
1055
58f6ab44 1056int seccomp_parse_syscall_filter(
898748d8
YW
1057 const char *name,
1058 int errno_num,
1059 Hashmap *filter,
13d92c63 1060 SeccompParseFlags flags,
898748d8
YW
1061 const char *unit,
1062 const char *filename,
1063 unsigned line) {
1064
1065 int r;
1066
1067 assert(name);
1068 assert(filter);
1069
1070 if (name[0] == '@') {
1071 const SyscallFilterSet *set;
1072 const char *i;
1073
1074 set = syscall_filter_set_find(name);
1075 if (!set) {
13d92c63 1076 if (!(flags & SECCOMP_PARSE_PERMISSIVE))
898748d8 1077 return -EINVAL;
13d92c63
LP
1078
1079 log_syntax(unit, flags & SECCOMP_PARSE_LOG ? LOG_WARNING : LOG_DEBUG, filename, line, 0,
1080 "Unknown system call group, ignoring: %s", name);
1081 return 0;
898748d8
YW
1082 }
1083
1084 NULSTR_FOREACH(i, set->value) {
13d92c63
LP
1085 /* Call ourselves again, for the group to parse. Note that we downgrade logging here (i.e. take
1086 * away the SECCOMP_PARSE_LOG flag) since any issues in the group table are our own problem,
1087 * not a problem in user configuration data and we shouldn't pretend otherwise by complaining
1088 * about them. */
58f6ab44 1089 r = seccomp_parse_syscall_filter(i, errno_num, filter, flags &~ SECCOMP_PARSE_LOG, unit, filename, line);
898748d8
YW
1090 if (r < 0)
1091 return r;
1092 }
1093 } else {
1094 int id;
1095
1096 id = seccomp_syscall_resolve_name(name);
1097 if (id == __NR_SCMP_ERROR) {
13d92c63 1098 if (!(flags & SECCOMP_PARSE_PERMISSIVE))
898748d8 1099 return -EINVAL;
13d92c63
LP
1100
1101 log_syntax(unit, flags & SECCOMP_PARSE_LOG ? LOG_WARNING : LOG_DEBUG, filename, line, 0,
1102 "Failed to parse system call, ignoring: %s", name);
1103 return 0;
898748d8
YW
1104 }
1105
1106 /* If we previously wanted to forbid a syscall and now
1107 * we want to allow it, then remove it from the list. */
6b000af4 1108 if (!(flags & SECCOMP_PARSE_INVERT) == !!(flags & SECCOMP_PARSE_ALLOW_LIST)) {
898748d8
YW
1109 r = hashmap_put(filter, INT_TO_PTR(id + 1), INT_TO_PTR(errno_num));
1110 if (r < 0)
851ee70a
LW
1111 switch (r) {
1112 case -ENOMEM:
1113 return flags & SECCOMP_PARSE_LOG ? log_oom() : -ENOMEM;
1114 case -EEXIST:
9d7fe7c6
LW
1115 assert_se(hashmap_update(filter, INT_TO_PTR(id + 1), INT_TO_PTR(errno_num)) == 0);
1116 break;
851ee70a
LW
1117 default:
1118 return r;
1119 }
898748d8
YW
1120 } else
1121 (void) hashmap_remove(filter, INT_TO_PTR(id + 1));
1122 }
1123
1124 return 0;
1125}
1126
add00535 1127int seccomp_restrict_namespaces(unsigned long retain) {
469830d1 1128 uint32_t arch;
add00535
LP
1129 int r;
1130
f1d34068 1131 if (DEBUG_LOGGING) {
add00535
LP
1132 _cleanup_free_ char *s = NULL;
1133
86c2a9f1 1134 (void) namespace_flags_to_string(retain, &s);
add00535
LP
1135 log_debug("Restricting namespace to: %s.", strna(s));
1136 }
1137
1138 /* NOOP? */
1139 if ((retain & NAMESPACE_FLAGS_ALL) == NAMESPACE_FLAGS_ALL)
1140 return 0;
1141
469830d1
LP
1142 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1143 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1144 unsigned i;
add00535 1145
469830d1
LP
1146 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1147
1148 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1149 if (r < 0)
1150 return r;
1151
1152 if ((retain & NAMESPACE_FLAGS_ALL) == 0)
1153 /* If every single kind of namespace shall be prohibited, then let's block the whole setns() syscall
1154 * altogether. */
1155 r = seccomp_rule_add_exact(
1156 seccomp,
1157 SCMP_ACT_ERRNO(EPERM),
1158 SCMP_SYS(setns),
1159 0);
1160 else
1161 /* Otherwise, block only the invocations with the appropriate flags in the loop below, but also the
1162 * special invocation with a zero flags argument, right here. */
1163 r = seccomp_rule_add_exact(
1164 seccomp,
1165 SCMP_ACT_ERRNO(EPERM),
1166 SCMP_SYS(setns),
1167 1,
1168 SCMP_A1(SCMP_CMP_EQ, 0));
1169 if (r < 0) {
1170 log_debug_errno(r, "Failed to add setns() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1171 continue;
1172 }
1173
1174 for (i = 0; namespace_flag_map[i].name; i++) {
1175 unsigned long f;
1176
1177 f = namespace_flag_map[i].flag;
1178 if ((retain & f) == f) {
1179 log_debug("Permitting %s.", namespace_flag_map[i].name);
1180 continue;
1181 }
1182
1183 log_debug("Blocking %s.", namespace_flag_map[i].name);
1184
1185 r = seccomp_rule_add_exact(
1186 seccomp,
1187 SCMP_ACT_ERRNO(EPERM),
1188 SCMP_SYS(unshare),
1189 1,
1190 SCMP_A0(SCMP_CMP_MASKED_EQ, f, f));
1191 if (r < 0) {
1192 log_debug_errno(r, "Failed to add unshare() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1193 break;
1194 }
1195
511ceb1f
ZJS
1196 /* On s390/s390x the first two parameters to clone are switched */
1197 if (!IN_SET(arch, SCMP_ARCH_S390, SCMP_ARCH_S390X))
ae9d60ce
LP
1198 r = seccomp_rule_add_exact(
1199 seccomp,
1200 SCMP_ACT_ERRNO(EPERM),
1201 SCMP_SYS(clone),
1202 1,
1203 SCMP_A0(SCMP_CMP_MASKED_EQ, f, f));
1204 else
1205 r = seccomp_rule_add_exact(
1206 seccomp,
1207 SCMP_ACT_ERRNO(EPERM),
1208 SCMP_SYS(clone),
1209 1,
1210 SCMP_A1(SCMP_CMP_MASKED_EQ, f, f));
469830d1
LP
1211 if (r < 0) {
1212 log_debug_errno(r, "Failed to add clone() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1213 break;
1214 }
1215
1216 if ((retain & NAMESPACE_FLAGS_ALL) != 0) {
1217 r = seccomp_rule_add_exact(
1218 seccomp,
1219 SCMP_ACT_ERRNO(EPERM),
1220 SCMP_SYS(setns),
1221 1,
1222 SCMP_A1(SCMP_CMP_MASKED_EQ, f, f));
1223 if (r < 0) {
1224 log_debug_errno(r, "Failed to add setns() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1225 break;
1226 }
1227 }
1228 }
1229 if (r < 0)
1230 continue;
1231
1232 r = seccomp_load(seccomp);
7bc5e0b1 1233 if (ERRNO_IS_SECCOMP_FATAL(r))
469830d1
LP
1234 return r;
1235 if (r < 0)
1236 log_debug_errno(r, "Failed to install namespace restriction rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1237 }
1238
1239 return 0;
1240}
1241
1242int seccomp_protect_sysctl(void) {
1243 uint32_t arch;
1244 int r;
1245
1246 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1247 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1248
1249 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1250
2e64e8f4
ZJS
1251 if (IN_SET(arch, SCMP_ARCH_X32, SCMP_ARCH_AARCH64))
1252 /* No _sysctl syscall */
1253 continue;
1254
469830d1
LP
1255 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1256 if (r < 0)
1257 return r;
1258
1259 r = seccomp_rule_add_exact(
add00535
LP
1260 seccomp,
1261 SCMP_ACT_ERRNO(EPERM),
469830d1 1262 SCMP_SYS(_sysctl),
add00535 1263 0);
469830d1
LP
1264 if (r < 0) {
1265 log_debug_errno(r, "Failed to add _sysctl() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1266 continue;
1267 }
1268
1269 r = seccomp_load(seccomp);
7bc5e0b1 1270 if (ERRNO_IS_SECCOMP_FATAL(r))
469830d1
LP
1271 return r;
1272 if (r < 0)
1273 log_debug_errno(r, "Failed to install sysctl protection rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1274 }
1275
1276 return 0;
1277}
1278
620dbdd2
KK
1279int seccomp_protect_syslog(void) {
1280 uint32_t arch;
1281 int r;
1282
1283 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1284 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1285
1286 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1287 if (r < 0)
1288 return r;
1289
1290 r = seccomp_rule_add_exact(
1291 seccomp,
1292 SCMP_ACT_ERRNO(EPERM),
1293 SCMP_SYS(syslog),
1294 0);
1295
1296 if (r < 0) {
1297 log_debug_errno(r, "Failed to add syslog() rule for architecture %s, skipping %m", seccomp_arch_to_string(arch));
1298 continue;
1299 }
1300
1301 r = seccomp_load(seccomp);
1302 if (ERRNO_IS_SECCOMP_FATAL(r))
1303 return r;
1304 if (r < 0)
1305 log_debug_errno(r, "Failed to install syslog protection rules for architecture %s, skipping %m", seccomp_arch_to_string(arch));
1306 }
1307
1308 return 0;
1309}
1310
6b000af4 1311int seccomp_restrict_address_families(Set *address_families, bool allow_list) {
469830d1
LP
1312 uint32_t arch;
1313 int r;
1314
1315 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1316 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
9606bc4b 1317 bool supported;
469830d1
LP
1318 Iterator i;
1319
1320 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1321
9606bc4b
LP
1322 switch (arch) {
1323
1324 case SCMP_ARCH_X86_64:
1325 case SCMP_ARCH_X32:
1326 case SCMP_ARCH_ARM:
1327 case SCMP_ARCH_AARCH64:
0d9fca76 1328 case SCMP_ARCH_PPC:
da1921a5
ZJS
1329 case SCMP_ARCH_PPC64:
1330 case SCMP_ARCH_PPC64LE:
f5aeac14
JC
1331 case SCMP_ARCH_MIPSEL64N32:
1332 case SCMP_ARCH_MIPS64N32:
1333 case SCMP_ARCH_MIPSEL64:
1334 case SCMP_ARCH_MIPS64:
9606bc4b
LP
1335 /* These we know we support (i.e. are the ones that do not use socketcall()) */
1336 supported = true;
1337 break;
1338
9606bc4b
LP
1339 case SCMP_ARCH_S390:
1340 case SCMP_ARCH_S390X:
da1921a5 1341 case SCMP_ARCH_X86:
f5aeac14
JC
1342 case SCMP_ARCH_MIPSEL:
1343 case SCMP_ARCH_MIPS:
9606bc4b
LP
1344 default:
1345 /* These we either know we don't support (i.e. are the ones that do use socketcall()), or we
1346 * don't know */
1347 supported = false;
1348 break;
1349 }
1350
1351 if (!supported)
1352 continue;
1353
469830d1
LP
1354 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1355 if (r < 0)
1356 return r;
1357
6b000af4 1358 if (allow_list) {
469830d1
LP
1359 int af, first = 0, last = 0;
1360 void *afp;
1361
6b000af4
LP
1362 /* If this is an allow list, we first block the address families that are out of
1363 * range and then everything that is not in the set. First, we find the lowest and
1364 * highest address family in the set. */
469830d1
LP
1365
1366 SET_FOREACH(afp, address_families, i) {
1367 af = PTR_TO_INT(afp);
1368
1369 if (af <= 0 || af >= af_max())
1370 continue;
1371
1372 if (first == 0 || af < first)
1373 first = af;
1374
1375 if (last == 0 || af > last)
1376 last = af;
1377 }
1378
1379 assert((first == 0) == (last == 0));
1380
1381 if (first == 0) {
1382
1383 /* No entries in the valid range, block everything */
1384 r = seccomp_rule_add_exact(
1385 seccomp,
1386 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1387 SCMP_SYS(socket),
1388 0);
1389 if (r < 0) {
1390 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1391 continue;
1392 }
1393
1394 } else {
1395
1396 /* Block everything below the first entry */
1397 r = seccomp_rule_add_exact(
1398 seccomp,
1399 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1400 SCMP_SYS(socket),
1401 1,
1402 SCMP_A0(SCMP_CMP_LT, first));
1403 if (r < 0) {
1404 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1405 continue;
1406 }
1407
1408 /* Block everything above the last entry */
1409 r = seccomp_rule_add_exact(
1410 seccomp,
1411 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1412 SCMP_SYS(socket),
1413 1,
1414 SCMP_A0(SCMP_CMP_GT, last));
1415 if (r < 0) {
1416 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1417 continue;
1418 }
1419
1420 /* Block everything between the first and last entry */
1421 for (af = 1; af < af_max(); af++) {
1422
1423 if (set_contains(address_families, INT_TO_PTR(af)))
1424 continue;
1425
1426 r = seccomp_rule_add_exact(
1427 seccomp,
1428 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1429 SCMP_SYS(socket),
1430 1,
1431 SCMP_A0(SCMP_CMP_EQ, af));
1432 if (r < 0)
1433 break;
1434 }
469830d1
LP
1435 if (r < 0) {
1436 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1437 continue;
1438 }
1439 }
1440
1441 } else {
1442 void *af;
1443
6b000af4
LP
1444 /* If this is a deny list, then generate one rule for each address family that are
1445 * then combined in OR checks. */
469830d1
LP
1446
1447 SET_FOREACH(af, address_families, i) {
1448
1449 r = seccomp_rule_add_exact(
1450 seccomp,
1451 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1452 SCMP_SYS(socket),
1453 1,
1454 SCMP_A0(SCMP_CMP_EQ, PTR_TO_INT(af)));
1455 if (r < 0)
1456 break;
1457 }
469830d1
LP
1458 if (r < 0) {
1459 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1460 continue;
1461 }
1462 }
1463
1464 r = seccomp_load(seccomp);
7bc5e0b1 1465 if (ERRNO_IS_SECCOMP_FATAL(r))
469830d1
LP
1466 return r;
1467 if (r < 0)
1468 log_debug_errno(r, "Failed to install socket family rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1469 }
1470
1471 return 0;
1472}
1473
1474int seccomp_restrict_realtime(void) {
1475 static const int permitted_policies[] = {
1476 SCHED_OTHER,
1477 SCHED_BATCH,
1478 SCHED_IDLE,
1479 };
1480
1481 int r, max_policy = 0;
1482 uint32_t arch;
1483 unsigned i;
1484
1485 /* Determine the highest policy constant we want to allow */
1486 for (i = 0; i < ELEMENTSOF(permitted_policies); i++)
1487 if (permitted_policies[i] > max_policy)
1488 max_policy = permitted_policies[i];
1489
1490 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1491 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1492 int p;
1493
1494 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1495
1496 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1497 if (r < 0)
1498 return r;
1499
1500 /* Go through all policies with lower values than that, and block them -- unless they appear in the
6b000af4 1501 * allow list. */
469830d1
LP
1502 for (p = 0; p < max_policy; p++) {
1503 bool good = false;
1504
6b000af4 1505 /* Check if this is in the allow list. */
469830d1
LP
1506 for (i = 0; i < ELEMENTSOF(permitted_policies); i++)
1507 if (permitted_policies[i] == p) {
1508 good = true;
1509 break;
1510 }
1511
1512 if (good)
1513 continue;
1514
1515 /* Deny this policy */
1516 r = seccomp_rule_add_exact(
1517 seccomp,
1518 SCMP_ACT_ERRNO(EPERM),
1519 SCMP_SYS(sched_setscheduler),
1520 1,
1521 SCMP_A1(SCMP_CMP_EQ, p));
1522 if (r < 0) {
1523 log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1524 continue;
1525 }
1526 }
1527
6b000af4
LP
1528 /* Deny-list all other policies, i.e. the ones with higher values. Note that all comparisons
1529 * are unsigned here, hence no need no check for < 0 values. */
469830d1 1530 r = seccomp_rule_add_exact(
add00535
LP
1531 seccomp,
1532 SCMP_ACT_ERRNO(EPERM),
469830d1 1533 SCMP_SYS(sched_setscheduler),
add00535 1534 1,
469830d1
LP
1535 SCMP_A1(SCMP_CMP_GT, max_policy));
1536 if (r < 0) {
1537 log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1538 continue;
1539 }
add00535 1540
469830d1 1541 r = seccomp_load(seccomp);
7bc5e0b1 1542 if (ERRNO_IS_SECCOMP_FATAL(r))
469830d1
LP
1543 return r;
1544 if (r < 0)
1545 log_debug_errno(r, "Failed to install realtime protection rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1546 }
1547
1548 return 0;
1549}
1550
6dc66688
ZJS
1551static int add_seccomp_syscall_filter(scmp_filter_ctx seccomp,
1552 uint32_t arch,
1553 int nr,
14cb109d 1554 unsigned arg_cnt,
6dc66688
ZJS
1555 const struct scmp_arg_cmp arg) {
1556 int r;
1557
1558 r = seccomp_rule_add_exact(seccomp, SCMP_ACT_ERRNO(EPERM), nr, arg_cnt, arg);
1559 if (r < 0) {
1560 _cleanup_free_ char *n = NULL;
1561
1562 n = seccomp_syscall_resolve_num_arch(arch, nr);
1563 log_debug_errno(r, "Failed to add %s() rule for architecture %s, skipping: %m",
1564 strna(n),
1565 seccomp_arch_to_string(arch));
1566 }
1567
1568 return r;
1569}
1570
2a8d6e63 1571/* For known architectures, check that syscalls are indeed defined or not. */
303d6b4c 1572#if defined(__x86_64__) || defined(__arm__) || defined(__aarch64__)
2a8d6e63
ZJS
1573assert_cc(SCMP_SYS(shmget) > 0);
1574assert_cc(SCMP_SYS(shmat) > 0);
1575assert_cc(SCMP_SYS(shmdt) > 0);
2a8d6e63 1576#endif
6dc66688 1577
469830d1
LP
1578int seccomp_memory_deny_write_execute(void) {
1579 uint32_t arch;
b069c2a3 1580 unsigned loaded = 0;
469830d1
LP
1581
1582 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1583 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
b069c2a3 1584 int filter_syscall = 0, block_syscall = 0, shmat_syscall = 0, r;
add00535 1585
469830d1
LP
1586 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1587
8a50cf69
LP
1588 switch (arch) {
1589
bed4668d
CE
1590 /* Note that on some architectures shmat() isn't available, and the call is multiplexed through ipc().
1591 * We ignore that here, which means there's still a way to get writable/executable
1592 * memory, if an IPC key is mapped like this. That's a pity, but no total loss. */
1593
8a50cf69 1594 case SCMP_ARCH_X86:
57311925 1595 case SCMP_ARCH_S390:
8a50cf69
LP
1596 filter_syscall = SCMP_SYS(mmap2);
1597 block_syscall = SCMP_SYS(mmap);
bed4668d 1598 /* shmat multiplexed, see above */
2a8d6e63
ZJS
1599 break;
1600
63d00dfb 1601 case SCMP_ARCH_PPC:
2a8d6e63
ZJS
1602 case SCMP_ARCH_PPC64:
1603 case SCMP_ARCH_PPC64LE:
bed4668d 1604 case SCMP_ARCH_S390X:
2a8d6e63 1605 filter_syscall = SCMP_SYS(mmap);
bed4668d 1606 /* shmat multiplexed, see above */
8a50cf69
LP
1607 break;
1608
4278d1f5
ZJS
1609 case SCMP_ARCH_ARM:
1610 filter_syscall = SCMP_SYS(mmap2); /* arm has only mmap2 */
1611 shmat_syscall = SCMP_SYS(shmat);
1612 break;
1613
8a50cf69
LP
1614 case SCMP_ARCH_X86_64:
1615 case SCMP_ARCH_X32:
79873bc8 1616 case SCMP_ARCH_AARCH64:
bed4668d 1617 filter_syscall = SCMP_SYS(mmap); /* amd64, x32 and arm64 have only mmap */
8a50cf69
LP
1618 shmat_syscall = SCMP_SYS(shmat);
1619 break;
1620
1621 /* Please add more definitions here, if you port systemd to other architectures! */
1622
57311925 1623#if !defined(__i386__) && !defined(__x86_64__) && !defined(__powerpc__) && !defined(__powerpc64__) && !defined(__arm__) && !defined(__aarch64__) && !defined(__s390__) && !defined(__s390x__)
8a50cf69
LP
1624#warning "Consider adding the right mmap() syscall definitions here!"
1625#endif
1626 }
1627
1628 /* Can't filter mmap() on this arch, then skip it */
1629 if (filter_syscall == 0)
1630 continue;
1631
469830d1
LP
1632 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1633 if (r < 0)
1634 return r;
1635
6dc66688
ZJS
1636 r = add_seccomp_syscall_filter(seccomp, arch, filter_syscall,
1637 1,
1638 SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC|PROT_WRITE, PROT_EXEC|PROT_WRITE));
1639 if (r < 0)
1640 continue;
8a50cf69
LP
1641
1642 if (block_syscall != 0) {
6dc66688
ZJS
1643 r = add_seccomp_syscall_filter(seccomp, arch, block_syscall, 0, (const struct scmp_arg_cmp){} );
1644 if (r < 0)
8a50cf69 1645 continue;
add00535 1646 }
a3be2849 1647
6dc66688
ZJS
1648 r = add_seccomp_syscall_filter(seccomp, arch, SCMP_SYS(mprotect),
1649 1,
b835eeb4
ZJS
1650 SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC, PROT_EXEC));
1651 if (r < 0)
1652 continue;
1653
91691f1d 1654#ifdef __NR_pkey_mprotect
b835eeb4
ZJS
1655 r = add_seccomp_syscall_filter(seccomp, arch, SCMP_SYS(pkey_mprotect),
1656 1,
6dc66688
ZJS
1657 SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC, PROT_EXEC));
1658 if (r < 0)
469830d1 1659 continue;
91691f1d 1660#endif
add00535 1661
67fb5f33 1662 if (shmat_syscall > 0) {
5ef3ed97 1663 r = add_seccomp_syscall_filter(seccomp, arch, shmat_syscall,
6dc66688
ZJS
1664 1,
1665 SCMP_A2(SCMP_CMP_MASKED_EQ, SHM_EXEC, SHM_EXEC));
1666 if (r < 0)
8a50cf69 1667 continue;
469830d1
LP
1668 }
1669
1670 r = seccomp_load(seccomp);
7bc5e0b1 1671 if (ERRNO_IS_SECCOMP_FATAL(r))
469830d1 1672 return r;
add00535 1673 if (r < 0)
b069c2a3
ZJS
1674 log_debug_errno(r, "Failed to install MemoryDenyWriteExecute= rule for architecture %s, skipping: %m",
1675 seccomp_arch_to_string(arch));
903659e7 1676 loaded++;
469830d1 1677 }
add00535 1678
903659e7 1679 if (loaded == 0)
b069c2a3 1680 log_debug("Failed to install any seccomp rules for MemoryDenyWriteExecute=.");
903659e7
CE
1681
1682 return loaded;
469830d1
LP
1683}
1684
1685int seccomp_restrict_archs(Set *archs) {
1686 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1687 Iterator i;
1688 void *id;
1689 int r;
1690
1691 /* This installs a filter with no rules, but that restricts the system call architectures to the specified
2428aaf8
AJ
1692 * list.
1693 *
1694 * There are some qualifications. However the most important use is to stop processes from bypassing
1695 * system call restrictions, in case they used a broader (multiplexing) syscall which is only available
1696 * in a non-native architecture. There are no holes in this use case, at least so far. */
469830d1 1697
2428aaf8
AJ
1698 /* Note libseccomp includes our "native" (current) architecture in the filter by default.
1699 * We do not remove it. For example, our callers expect to be able to call execve() afterwards
1700 * to run a program with the restrictions applied. */
469830d1
LP
1701 seccomp = seccomp_init(SCMP_ACT_ALLOW);
1702 if (!seccomp)
1703 return -ENOMEM;
1704
1705 SET_FOREACH(id, archs, i) {
1706 r = seccomp_arch_add(seccomp, PTR_TO_UINT32(id) - 1);
2428aaf8
AJ
1707 if (r < 0 && r != -EEXIST)
1708 return r;
1709 }
1710
1711 /* The vdso for x32 assumes that x86-64 syscalls are available. Let's allow them, since x32
1712 * x32 syscalls should basically match x86-64 for everything except the pointer type.
1713 * The important thing is that you can block the old 32-bit x86 syscalls.
1714 * https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=850047 */
1715
1716 if (seccomp_arch_native() == SCMP_ARCH_X32 ||
1717 set_contains(archs, UINT32_TO_PTR(SCMP_ARCH_X32 + 1))) {
1718
1719 r = seccomp_arch_add(seccomp, SCMP_ARCH_X86_64);
1720 if (r < 0 && r != -EEXIST)
469830d1 1721 return r;
add00535
LP
1722 }
1723
469830d1
LP
1724 r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
1725 if (r < 0)
1726 return r;
add00535 1727
1c6af69b 1728 r = seccomp_load(seccomp);
7bc5e0b1 1729 if (ERRNO_IS_SECCOMP_FATAL(r))
1c6af69b
LP
1730 return r;
1731 if (r < 0)
1732 log_debug_errno(r, "Failed to restrict system call architectures, skipping: %m");
1733
1734 return 0;
a3be2849 1735}
b16bd535 1736
de7fef4b
ZJS
1737int parse_syscall_archs(char **l, Set **ret_archs) {
1738 _cleanup_set_free_ Set *archs = NULL;
b16bd535
YW
1739 char **s;
1740 int r;
1741
1742 assert(l);
de7fef4b 1743 assert(ret_archs);
b16bd535
YW
1744
1745 STRV_FOREACH(s, l) {
1746 uint32_t a;
1747
1748 r = seccomp_arch_from_string(*s, &a);
1749 if (r < 0)
1750 return -EINVAL;
1751
de7fef4b 1752 r = set_ensure_put(&archs, NULL, UINT32_TO_PTR(a + 1));
b16bd535
YW
1753 if (r < 0)
1754 return -ENOMEM;
1755 }
1756
de7fef4b 1757 *ret_archs = TAKE_PTR(archs);
b16bd535
YW
1758 return 0;
1759}
165a31c0 1760
8cfa775f 1761int seccomp_filter_set_add(Hashmap *filter, bool add, const SyscallFilterSet *set) {
165a31c0
LP
1762 const char *i;
1763 int r;
1764
1765 assert(set);
1766
1767 NULSTR_FOREACH(i, set->value) {
1768
1769 if (i[0] == '@') {
1770 const SyscallFilterSet *more;
1771
1772 more = syscall_filter_set_find(i);
1773 if (!more)
1774 return -ENXIO;
1775
165a31c0
LP
1776 r = seccomp_filter_set_add(filter, add, more);
1777 if (r < 0)
1778 return r;
1779 } else {
1780 int id;
1781
1782 id = seccomp_syscall_resolve_name(i);
ff217dc3
LP
1783 if (id == __NR_SCMP_ERROR) {
1784 log_debug("Couldn't resolve system call, ignoring: %s", i);
1785 continue;
1786 }
165a31c0
LP
1787
1788 if (add) {
8cfa775f 1789 r = hashmap_put(filter, INT_TO_PTR(id + 1), INT_TO_PTR(-1));
165a31c0
LP
1790 if (r < 0)
1791 return r;
1792 } else
8cfa775f 1793 (void) hashmap_remove(filter, INT_TO_PTR(id + 1));
165a31c0
LP
1794 }
1795 }
1796
1797 return 0;
1798}
78e864e5
TM
1799
1800int seccomp_lock_personality(unsigned long personality) {
72eafe71 1801 uint32_t arch;
78e864e5
TM
1802 int r;
1803
72eafe71
LP
1804 if (personality >= PERSONALITY_INVALID)
1805 return -EINVAL;
78e864e5 1806
72eafe71
LP
1807 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1808 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
78e864e5 1809
72eafe71
LP
1810 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1811 if (r < 0)
1812 return r;
1813
1814 r = seccomp_rule_add_exact(
1815 seccomp,
1816 SCMP_ACT_ERRNO(EPERM),
1817 SCMP_SYS(personality),
1818 1,
1819 SCMP_A0(SCMP_CMP_NE, personality));
448ac526
LP
1820 if (r < 0) {
1821 log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1822 continue;
1823 }
72eafe71
LP
1824
1825 r = seccomp_load(seccomp);
7bc5e0b1 1826 if (ERRNO_IS_SECCOMP_FATAL(r))
72eafe71
LP
1827 return r;
1828 if (r < 0)
1829 log_debug_errno(r, "Failed to enable personality lock for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1830 }
1831
1832 return 0;
78e864e5 1833}
aecd5ac6
TM
1834
1835int seccomp_protect_hostname(void) {
1836 uint32_t arch;
1837 int r;
1838
1839 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1840 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1841
1842 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1843 if (r < 0)
1844 return r;
1845
1846 r = seccomp_rule_add_exact(
1847 seccomp,
1848 SCMP_ACT_ERRNO(EPERM),
1849 SCMP_SYS(sethostname),
1850 0);
9e6e543c
LP
1851 if (r < 0) {
1852 log_debug_errno(r, "Failed to add sethostname() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
aecd5ac6 1853 continue;
9e6e543c 1854 }
aecd5ac6
TM
1855
1856 r = seccomp_rule_add_exact(
1857 seccomp,
1858 SCMP_ACT_ERRNO(EPERM),
1859 SCMP_SYS(setdomainname),
1860 0);
9e6e543c
LP
1861 if (r < 0) {
1862 log_debug_errno(r, "Failed to add setdomainname() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
aecd5ac6 1863 continue;
9e6e543c 1864 }
aecd5ac6
TM
1865
1866 r = seccomp_load(seccomp);
7bc5e0b1 1867 if (ERRNO_IS_SECCOMP_FATAL(r))
aecd5ac6
TM
1868 return r;
1869 if (r < 0)
1870 log_debug_errno(r, "Failed to apply hostname restrictions for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1871 }
1872
1873 return 0;
1874}
3c27973b 1875
da4dc9a6
ZJS
1876static int seccomp_restrict_sxid(scmp_filter_ctx seccomp, mode_t m) {
1877 /* Checks the mode_t parameter of the following system calls:
1878 *
1879 * → chmod() + fchmod() + fchmodat()
1880 * → open() + creat() + openat()
1881 * → mkdir() + mkdirat()
1882 * → mknod() + mknodat()
1883 *
1884 * Returns error if *everything* failed, and 0 otherwise.
1885 */
1886 int r = 0;
1887 bool any = false;
1888
1889 r = seccomp_rule_add_exact(
1890 seccomp,
1891 SCMP_ACT_ERRNO(EPERM),
1892 SCMP_SYS(chmod),
1893 1,
1894 SCMP_A1(SCMP_CMP_MASKED_EQ, m, m));
1895 if (r < 0)
1896 log_debug_errno(r, "Failed to add filter for chmod: %m");
1897 else
1898 any = true;
1899
1900 r = seccomp_rule_add_exact(
1901 seccomp,
1902 SCMP_ACT_ERRNO(EPERM),
1903 SCMP_SYS(fchmod),
1904 1,
1905 SCMP_A1(SCMP_CMP_MASKED_EQ, m, m));
1906 if (r < 0)
1907 log_debug_errno(r, "Failed to add filter for fchmod: %m");
1908 else
1909 any = true;
1910
1911 r = seccomp_rule_add_exact(
1912 seccomp,
1913 SCMP_ACT_ERRNO(EPERM),
1914 SCMP_SYS(fchmodat),
1915 1,
1916 SCMP_A2(SCMP_CMP_MASKED_EQ, m, m));
1917 if (r < 0)
1918 log_debug_errno(r, "Failed to add filter for fchmodat: %m");
1919 else
1920 any = true;
1921
1922 r = seccomp_rule_add_exact(
1923 seccomp,
1924 SCMP_ACT_ERRNO(EPERM),
1925 SCMP_SYS(mkdir),
1926 1,
1927 SCMP_A1(SCMP_CMP_MASKED_EQ, m, m));
1928 if (r < 0)
1929 log_debug_errno(r, "Failed to add filter for mkdir: %m");
1930 else
1931 any = true;
1932
1933 r = seccomp_rule_add_exact(
1934 seccomp,
1935 SCMP_ACT_ERRNO(EPERM),
1936 SCMP_SYS(mkdirat),
1937 1,
1938 SCMP_A2(SCMP_CMP_MASKED_EQ, m, m));
1939 if (r < 0)
1940 log_debug_errno(r, "Failed to add filter for mkdirat: %m");
1941 else
1942 any = true;
1943
1944 r = seccomp_rule_add_exact(
1945 seccomp,
1946 SCMP_ACT_ERRNO(EPERM),
1947 SCMP_SYS(mknod),
1948 1,
1949 SCMP_A1(SCMP_CMP_MASKED_EQ, m, m));
1950 if (r < 0)
1951 log_debug_errno(r, "Failed to add filter for mknod: %m");
1952 else
1953 any = true;
1954
1955 r = seccomp_rule_add_exact(
1956 seccomp,
1957 SCMP_ACT_ERRNO(EPERM),
1958 SCMP_SYS(mknodat),
1959 1,
1960 SCMP_A2(SCMP_CMP_MASKED_EQ, m, m));
1961 if (r < 0)
1962 log_debug_errno(r, "Failed to add filter for mknodat: %m");
1963 else
1964 any = true;
1965
1966#if SCMP_SYS(open) > 0
1967 r = seccomp_rule_add_exact(
1968 seccomp,
1969 SCMP_ACT_ERRNO(EPERM),
1970 SCMP_SYS(open),
1971 2,
1972 SCMP_A1(SCMP_CMP_MASKED_EQ, O_CREAT, O_CREAT),
1973 SCMP_A2(SCMP_CMP_MASKED_EQ, m, m));
1974 if (r < 0)
1975 log_debug_errno(r, "Failed to add filter for open: %m");
1976 else
1977 any = true;
1978#endif
1979
1980 r = seccomp_rule_add_exact(
1981 seccomp,
1982 SCMP_ACT_ERRNO(EPERM),
1983 SCMP_SYS(openat),
1984 2,
1985 SCMP_A2(SCMP_CMP_MASKED_EQ, O_CREAT, O_CREAT),
1986 SCMP_A3(SCMP_CMP_MASKED_EQ, m, m));
1987 if (r < 0)
1988 log_debug_errno(r, "Failed to add filter for openat: %m");
1989 else
1990 any = true;
1991
ecc04067
LP
1992#if defined(__SNR_openat2)
1993 /* The new openat2() system call can't be filtered sensibly, since it moves the flags parameter into
1994 * an indirect structure. Let's block it entirely for now. That should be a reasonably OK thing to do
1995 * for now, since openat2() is very new and code generally needs fallback logic anyway to be
1996 * compatible with kernels that are not absolutely recent. */
1997 r = seccomp_rule_add_exact(
1998 seccomp,
1999 SCMP_ACT_ERRNO(EPERM),
2000 SCMP_SYS(openat2),
2001 0);
2002 if (r < 0)
2003 log_debug_errno(r, "Failed to add filter for openat2: %m");
2004 else
2005 any = true;
2006#endif
2007
da4dc9a6
ZJS
2008 r = seccomp_rule_add_exact(
2009 seccomp,
2010 SCMP_ACT_ERRNO(EPERM),
2011 SCMP_SYS(creat),
2012 1,
2013 SCMP_A1(SCMP_CMP_MASKED_EQ, m, m));
2014 if (r < 0)
2015 log_debug_errno(r, "Failed to add filter for creat: %m");
2016 else
2017 any = true;
2018
2019 return any ? 0 : r;
2020}
2021
3c27973b
LP
2022int seccomp_restrict_suid_sgid(void) {
2023 uint32_t arch;
da4dc9a6 2024 int r, k;
3c27973b
LP
2025
2026 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
2027 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
2028
2029 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
2030 if (r < 0)
2031 return r;
2032
da4dc9a6
ZJS
2033 r = seccomp_restrict_sxid(seccomp, S_ISUID);
2034 if (r < 0)
2035 log_debug_errno(r, "Failed to add suid rule for architecture %s, ignoring: %m", seccomp_arch_to_string(arch));
3c27973b 2036
da4dc9a6
ZJS
2037 k = seccomp_restrict_sxid(seccomp, S_ISGID);
2038 if (k < 0)
2039 log_debug_errno(r, "Failed to add sgid rule for architecture %s, ignoring: %m", seccomp_arch_to_string(arch));
3c27973b 2040
da4dc9a6 2041 if (r < 0 && k < 0)
3c27973b 2042 continue;
3c27973b
LP
2043
2044 r = seccomp_load(seccomp);
7bc5e0b1 2045 if (ERRNO_IS_SECCOMP_FATAL(r))
3c27973b
LP
2046 return r;
2047 if (r < 0)
2048 log_debug_errno(r, "Failed to apply suid/sgid restrictions for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
2049 }
2050
2051 return 0;
2052}
915fb324
LP
2053
2054uint32_t scmp_act_kill_process(void) {
2055
2056 /* Returns SCMP_ACT_KILL_PROCESS if it's supported, and SCMP_ACT_KILL_THREAD otherwise. We never
2057 * actually want to use SCMP_ACT_KILL_THREAD as its semantics are nuts (killing arbitrary threads of
2058 * a program is just a bad idea), but on old kernels/old libseccomp it is all we have, and at least
2059 * for single-threaded apps does the right thing. */
2060
2061#ifdef SCMP_ACT_KILL_PROCESS
2062 if (seccomp_api_get() >= 3)
2063 return SCMP_ACT_KILL_PROCESS;
2064#endif
2065
2066 return SCMP_ACT_KILL; /* same as SCMP_ACT_KILL_THREAD */
2067}