]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/shared/seccomp-util.c
Merge pull request #12267 from keszybz/udev-settle-warning
[thirdparty/systemd.git] / src / shared / seccomp-util.c
CommitLineData
53e1b683 1/* SPDX-License-Identifier: LGPL-2.1+ */
57183d11 2
a8fbdf54 3#include <errno.h>
3c27973b 4#include <fcntl.h>
469830d1 5#include <linux/seccomp.h>
57183d11 6#include <seccomp.h>
a8fbdf54 7#include <stddef.h>
469830d1 8#include <sys/mman.h>
d347d902 9#include <sys/prctl.h>
469830d1 10#include <sys/shm.h>
3c27973b 11#include <sys/stat.h>
57183d11 12
469830d1 13#include "af-list.h"
add00535 14#include "alloc-util.h"
d8b4d14d 15#include "errno-list.h"
a8fbdf54 16#include "macro.h"
add00535 17#include "nsflags.h"
d8b4d14d 18#include "nulstr-util.h"
78e864e5 19#include "process-util.h"
cf0fbc49 20#include "seccomp-util.h"
b16bd535 21#include "set.h"
07630cea 22#include "string-util.h"
b16bd535 23#include "strv.h"
469830d1
LP
24
25const uint32_t seccomp_local_archs[] = {
26
f2d9751c
LP
27 /* Note: always list the native arch we are compiled as last, so that users can blacklist seccomp(), but our own calls to it still succeed */
28
29#if defined(__x86_64__) && defined(__ILP32__)
469830d1
LP
30 SCMP_ARCH_X86,
31 SCMP_ARCH_X86_64,
f2d9751c
LP
32 SCMP_ARCH_X32, /* native */
33#elif defined(__x86_64__) && !defined(__ILP32__)
34 SCMP_ARCH_X86,
469830d1 35 SCMP_ARCH_X32,
f2d9751c
LP
36 SCMP_ARCH_X86_64, /* native */
37#elif defined(__i386__)
38 SCMP_ARCH_X86,
39#elif defined(__aarch64__)
469830d1 40 SCMP_ARCH_ARM,
f2d9751c
LP
41 SCMP_ARCH_AARCH64, /* native */
42#elif defined(__arm__)
43 SCMP_ARCH_ARM,
44#elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI32
45 SCMP_ARCH_MIPSEL,
46 SCMP_ARCH_MIPS, /* native */
47#elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI32
469830d1 48 SCMP_ARCH_MIPS,
f2d9751c
LP
49 SCMP_ARCH_MIPSEL, /* native */
50#elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI64
51 SCMP_ARCH_MIPSEL,
52 SCMP_ARCH_MIPS,
53 SCMP_ARCH_MIPSEL64N32,
469830d1 54 SCMP_ARCH_MIPS64N32,
f2d9751c
LP
55 SCMP_ARCH_MIPSEL64,
56 SCMP_ARCH_MIPS64, /* native */
57#elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI64
58 SCMP_ARCH_MIPS,
469830d1 59 SCMP_ARCH_MIPSEL,
f2d9751c
LP
60 SCMP_ARCH_MIPS64N32,
61 SCMP_ARCH_MIPSEL64N32,
62 SCMP_ARCH_MIPS64,
63 SCMP_ARCH_MIPSEL64, /* native */
64#elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_NABI32
65 SCMP_ARCH_MIPSEL,
66 SCMP_ARCH_MIPS,
469830d1 67 SCMP_ARCH_MIPSEL64,
f2d9751c 68 SCMP_ARCH_MIPS64,
469830d1 69 SCMP_ARCH_MIPSEL64N32,
f2d9751c
LP
70 SCMP_ARCH_MIPS64N32, /* native */
71#elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_NABI32
72 SCMP_ARCH_MIPS,
73 SCMP_ARCH_MIPSEL,
74 SCMP_ARCH_MIPS64,
75 SCMP_ARCH_MIPSEL64,
76 SCMP_ARCH_MIPS64N32,
77 SCMP_ARCH_MIPSEL64N32, /* native */
78#elif defined(__powerpc64__) && __BYTE_ORDER == __BIG_ENDIAN
469830d1 79 SCMP_ARCH_PPC,
469830d1 80 SCMP_ARCH_PPC64LE,
f2d9751c
LP
81 SCMP_ARCH_PPC64, /* native */
82#elif defined(__powerpc64__) && __BYTE_ORDER == __LITTLE_ENDIAN
83 SCMP_ARCH_PPC,
84 SCMP_ARCH_PPC64,
85 SCMP_ARCH_PPC64LE, /* native */
86#elif defined(__powerpc__)
87 SCMP_ARCH_PPC,
88#elif defined(__s390x__)
89 SCMP_ARCH_S390,
90 SCMP_ARCH_S390X, /* native */
91#elif defined(__s390__)
469830d1 92 SCMP_ARCH_S390,
469830d1
LP
93#endif
94 (uint32_t) -1
95 };
57183d11
LP
96
97const char* seccomp_arch_to_string(uint32_t c) {
aa34055f
ZJS
98 /* Maintain order used in <seccomp.h>.
99 *
100 * Names used here should be the same as those used for ConditionArchitecture=,
101 * except for "subarchitectures" like x32. */
57183d11 102
aa34055f
ZJS
103 switch(c) {
104 case SCMP_ARCH_NATIVE:
57183d11 105 return "native";
aa34055f 106 case SCMP_ARCH_X86:
57183d11 107 return "x86";
aa34055f 108 case SCMP_ARCH_X86_64:
57183d11 109 return "x86-64";
aa34055f 110 case SCMP_ARCH_X32:
57183d11 111 return "x32";
aa34055f 112 case SCMP_ARCH_ARM:
57183d11 113 return "arm";
aa34055f
ZJS
114 case SCMP_ARCH_AARCH64:
115 return "arm64";
116 case SCMP_ARCH_MIPS:
117 return "mips";
118 case SCMP_ARCH_MIPS64:
119 return "mips64";
120 case SCMP_ARCH_MIPS64N32:
121 return "mips64-n32";
122 case SCMP_ARCH_MIPSEL:
123 return "mips-le";
124 case SCMP_ARCH_MIPSEL64:
125 return "mips64-le";
126 case SCMP_ARCH_MIPSEL64N32:
127 return "mips64-le-n32";
128 case SCMP_ARCH_PPC:
129 return "ppc";
130 case SCMP_ARCH_PPC64:
131 return "ppc64";
132 case SCMP_ARCH_PPC64LE:
133 return "ppc64-le";
134 case SCMP_ARCH_S390:
6abfd303 135 return "s390";
aa34055f 136 case SCMP_ARCH_S390X:
6abfd303 137 return "s390x";
aa34055f
ZJS
138 default:
139 return NULL;
140 }
57183d11
LP
141}
142
143int seccomp_arch_from_string(const char *n, uint32_t *ret) {
144 if (!n)
145 return -EINVAL;
146
147 assert(ret);
148
149 if (streq(n, "native"))
150 *ret = SCMP_ARCH_NATIVE;
151 else if (streq(n, "x86"))
152 *ret = SCMP_ARCH_X86;
153 else if (streq(n, "x86-64"))
154 *ret = SCMP_ARCH_X86_64;
155 else if (streq(n, "x32"))
156 *ret = SCMP_ARCH_X32;
157 else if (streq(n, "arm"))
158 *ret = SCMP_ARCH_ARM;
aa34055f
ZJS
159 else if (streq(n, "arm64"))
160 *ret = SCMP_ARCH_AARCH64;
161 else if (streq(n, "mips"))
162 *ret = SCMP_ARCH_MIPS;
163 else if (streq(n, "mips64"))
164 *ret = SCMP_ARCH_MIPS64;
165 else if (streq(n, "mips64-n32"))
166 *ret = SCMP_ARCH_MIPS64N32;
167 else if (streq(n, "mips-le"))
168 *ret = SCMP_ARCH_MIPSEL;
169 else if (streq(n, "mips64-le"))
170 *ret = SCMP_ARCH_MIPSEL64;
171 else if (streq(n, "mips64-le-n32"))
172 *ret = SCMP_ARCH_MIPSEL64N32;
173 else if (streq(n, "ppc"))
174 *ret = SCMP_ARCH_PPC;
175 else if (streq(n, "ppc64"))
176 *ret = SCMP_ARCH_PPC64;
177 else if (streq(n, "ppc64-le"))
178 *ret = SCMP_ARCH_PPC64LE;
6abfd303
HB
179 else if (streq(n, "s390"))
180 *ret = SCMP_ARCH_S390;
181 else if (streq(n, "s390x"))
182 *ret = SCMP_ARCH_S390X;
57183d11
LP
183 else
184 return -EINVAL;
185
186 return 0;
187}
e9642be2 188
469830d1 189int seccomp_init_for_arch(scmp_filter_ctx *ret, uint32_t arch, uint32_t default_action) {
8d7b0c8f
LP
190 scmp_filter_ctx seccomp;
191 int r;
192
469830d1
LP
193 /* Much like seccomp_init(), but initializes the filter for one specific architecture only, without affecting
194 * any others. Also, turns off the NNP fiddling. */
8d7b0c8f
LP
195
196 seccomp = seccomp_init(default_action);
197 if (!seccomp)
198 return -ENOMEM;
199
469830d1
LP
200 if (arch != SCMP_ARCH_NATIVE &&
201 arch != seccomp_arch_native()) {
202
1b52793d 203 r = seccomp_arch_remove(seccomp, seccomp_arch_native());
469830d1
LP
204 if (r < 0)
205 goto finish;
206
1b52793d 207 r = seccomp_arch_add(seccomp, arch);
469830d1
LP
208 if (r < 0)
209 goto finish;
210
211 assert(seccomp_arch_exist(seccomp, arch) >= 0);
212 assert(seccomp_arch_exist(seccomp, SCMP_ARCH_NATIVE) == -EEXIST);
213 assert(seccomp_arch_exist(seccomp, seccomp_arch_native()) == -EEXIST);
214 } else {
215 assert(seccomp_arch_exist(seccomp, SCMP_ARCH_NATIVE) >= 0);
216 assert(seccomp_arch_exist(seccomp, seccomp_arch_native()) >= 0);
217 }
218
219 r = seccomp_attr_set(seccomp, SCMP_FLTATR_ACT_BADARCH, SCMP_ACT_ALLOW);
8d7b0c8f
LP
220 if (r < 0)
221 goto finish;
222
223 r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
224 if (r < 0)
225 goto finish;
226
227 *ret = seccomp;
228 return 0;
229
230finish:
231 seccomp_release(seccomp);
232 return r;
233}
234
/* Returns true if querying the current seccomp mode with prctl(PR_GET_SECCOMP) succeeds, which is taken as
 * the sign that the kernel supports seccomp at all. */
static bool is_basic_seccomp_available(void) {
        int mode;

        mode = prctl(PR_GET_SECCOMP, 0, 0, 0, 0);

        return mode >= 0;
}
238
/* Probes for seccomp filter mode by asking the kernel to install a filter from a NULL pointer. The request
 * is expected to fail; only a failure with EFAULT is interpreted as "filter mode is available". Any other
 * outcome (including unexpected success) yields false. */
static bool is_seccomp_filter_available(void) {
        if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, NULL, 0, 0) >= 0)
                return false;

        return errno == EFAULT;
}
243
/* Reports whether both basic seccomp and seccomp filter mode are available. The kernel is probed on the
 * first call only; the answer is kept in a function-local static and returned directly afterwards. */
bool is_seccomp_available(void) {
        static int cached_enabled = -1; /* -1: not probed yet, otherwise the boolean result */

        if (cached_enabled >= 0)
                return cached_enabled;

        cached_enabled = is_basic_seccomp_available() && is_seccomp_filter_available();

        return cached_enabled;
}
254
8130926d 255const SyscallFilterSet syscall_filter_sets[_SYSCALL_FILTER_SET_MAX] = {
40eb6a80 256 [SYSCALL_FILTER_SET_DEFAULT] = {
40eb6a80 257 .name = "@default",
d5efc18b 258 .help = "System calls that are always permitted",
40eb6a80
ZJS
259 .value =
260 "clock_getres\0"
261 "clock_gettime\0"
262 "clock_nanosleep\0"
263 "execve\0"
264 "exit\0"
265 "exit_group\0"
e41b0f42
LP
266 "futex\0"
267 "get_robust_list\0"
268 "get_thread_area\0"
09d3020b
DH
269 "getegid\0"
270 "getegid32\0"
271 "geteuid\0"
272 "geteuid32\0"
273 "getgid\0"
274 "getgid32\0"
275 "getgroups\0"
276 "getgroups32\0"
277 "getpgid\0"
278 "getpgrp\0"
279 "getpid\0"
280 "getppid\0"
281 "getresgid\0"
282 "getresgid32\0"
283 "getresuid\0"
284 "getresuid32\0"
40eb6a80 285 "getrlimit\0" /* make sure processes can query stack size and such */
09d3020b
DH
286 "getsid\0"
287 "gettid\0"
40eb6a80 288 "gettimeofday\0"
09d3020b
DH
289 "getuid\0"
290 "getuid32\0"
e41b0f42 291 "membarrier\0"
40eb6a80
ZJS
292 "nanosleep\0"
293 "pause\0"
4c3a9176 294 "prlimit64\0"
e41b0f42 295 "restart_syscall\0"
6fee3be0 296 "rseq\0"
40eb6a80 297 "rt_sigreturn\0"
8f44de08 298 "sched_yield\0"
e41b0f42
LP
299 "set_robust_list\0"
300 "set_thread_area\0"
301 "set_tid_address\0"
ce5faeac 302 "set_tls\0"
40eb6a80
ZJS
303 "sigreturn\0"
304 "time\0"
4c3a9176 305 "ugetrlimit\0"
40eb6a80 306 },
44898c53
LP
307 [SYSCALL_FILTER_SET_AIO] = {
308 .name = "@aio",
309 .help = "Asynchronous IO",
310 .value =
311 "io_cancel\0"
312 "io_destroy\0"
313 "io_getevents\0"
a05cfe23 314 "io_pgetevents\0"
44898c53
LP
315 "io_setup\0"
316 "io_submit\0"
317 },
133ddbbe 318 [SYSCALL_FILTER_SET_BASIC_IO] = {
133ddbbe 319 .name = "@basic-io",
d5efc18b 320 .help = "Basic IO",
133ddbbe 321 .value =
648a0ed0 322 "_llseek\0"
133ddbbe 323 "close\0"
648a0ed0 324 "dup\0"
133ddbbe
LP
325 "dup2\0"
326 "dup3\0"
133ddbbe
LP
327 "lseek\0"
328 "pread64\0"
329 "preadv\0"
44898c53 330 "preadv2\0"
133ddbbe
LP
331 "pwrite64\0"
332 "pwritev\0"
44898c53 333 "pwritev2\0"
133ddbbe
LP
334 "read\0"
335 "readv\0"
336 "write\0"
337 "writev\0"
338 },
44898c53
LP
339 [SYSCALL_FILTER_SET_CHOWN] = {
340 .name = "@chown",
341 .help = "Change ownership of files and directories",
342 .value =
343 "chown\0"
344 "chown32\0"
345 "fchown\0"
346 "fchown32\0"
347 "fchownat\0"
348 "lchown\0"
349 "lchown32\0"
350 },
8130926d 351 [SYSCALL_FILTER_SET_CLOCK] = {
8130926d 352 .name = "@clock",
d5efc18b 353 .help = "Change the system time",
201c1cc2
TM
354 .value =
355 "adjtimex\0"
1f9ac68b
LP
356 "clock_adjtime\0"
357 "clock_settime\0"
201c1cc2 358 "settimeofday\0"
1f9ac68b 359 "stime\0"
8130926d
LP
360 },
361 [SYSCALL_FILTER_SET_CPU_EMULATION] = {
8130926d 362 .name = "@cpu-emulation",
d5efc18b 363 .help = "System calls for CPU emulation functionality",
1f9ac68b
LP
364 .value =
365 "modify_ldt\0"
366 "subpage_prot\0"
367 "switch_endian\0"
368 "vm86\0"
369 "vm86old\0"
8130926d
LP
370 },
371 [SYSCALL_FILTER_SET_DEBUG] = {
8130926d 372 .name = "@debug",
d5efc18b 373 .help = "Debugging, performance monitoring and tracing functionality",
1f9ac68b
LP
374 .value =
375 "lookup_dcookie\0"
376 "perf_event_open\0"
1f9ac68b
LP
377 "ptrace\0"
378 "rtas\0"
8130926d 379#ifdef __NR_s390_runtime_instr
1f9ac68b 380 "s390_runtime_instr\0"
8130926d 381#endif
1f9ac68b 382 "sys_debug_setcontext\0"
8130926d 383 },
1a1b13c9
LP
384 [SYSCALL_FILTER_SET_FILE_SYSTEM] = {
385 .name = "@file-system",
386 .help = "File system operations",
387 .value =
388 "access\0"
389 "chdir\0"
390 "chmod\0"
391 "close\0"
392 "creat\0"
393 "faccessat\0"
394 "fallocate\0"
395 "fchdir\0"
396 "fchmod\0"
397 "fchmodat\0"
1a1b13c9 398 "fcntl\0"
ceaa6aa7 399 "fcntl64\0"
1a1b13c9
LP
400 "fgetxattr\0"
401 "flistxattr\0"
ceaa6aa7 402 "fremovexattr\0"
1a1b13c9 403 "fsetxattr\0"
1a1b13c9 404 "fstat\0"
ceaa6aa7 405 "fstat64\0"
1a1b13c9 406 "fstatat64\0"
1a1b13c9 407 "fstatfs\0"
ceaa6aa7 408 "fstatfs64\0"
1a1b13c9 409 "ftruncate\0"
ceaa6aa7 410 "ftruncate64\0"
1a1b13c9
LP
411 "futimesat\0"
412 "getcwd\0"
1a1b13c9 413 "getdents\0"
ceaa6aa7 414 "getdents64\0"
1a1b13c9
LP
415 "getxattr\0"
416 "inotify_add_watch\0"
ceaa6aa7 417 "inotify_init\0"
1a1b13c9
LP
418 "inotify_init1\0"
419 "inotify_rm_watch\0"
420 "lgetxattr\0"
421 "link\0"
422 "linkat\0"
423 "listxattr\0"
424 "llistxattr\0"
425 "lremovexattr\0"
426 "lsetxattr\0"
1a1b13c9 427 "lstat\0"
ceaa6aa7 428 "lstat64\0"
1a1b13c9
LP
429 "mkdir\0"
430 "mkdirat\0"
431 "mknod\0"
432 "mknodat\0"
1a1b13c9 433 "mmap\0"
ceaa6aa7 434 "mmap2\0"
7961116e 435 "munmap\0"
1a1b13c9 436 "newfstatat\0"
ceaa6aa7
LP
437 "oldfstat\0"
438 "oldlstat\0"
439 "oldstat\0"
1a1b13c9
LP
440 "open\0"
441 "openat\0"
442 "readlink\0"
443 "readlinkat\0"
444 "removexattr\0"
445 "rename\0"
1a1b13c9 446 "renameat\0"
ceaa6aa7 447 "renameat2\0"
1a1b13c9
LP
448 "rmdir\0"
449 "setxattr\0"
1a1b13c9 450 "stat\0"
ceaa6aa7 451 "stat64\0"
1a1b13c9 452 "statfs\0"
ceaa6aa7 453 "statfs64\0"
8e6a7a8b 454#ifdef __NR_statx
a4135a74 455 "statx\0"
ceaa6aa7 456#endif
1a1b13c9
LP
457 "symlink\0"
458 "symlinkat\0"
1a1b13c9 459 "truncate\0"
ceaa6aa7 460 "truncate64\0"
1a1b13c9
LP
461 "unlink\0"
462 "unlinkat\0"
ceaa6aa7 463 "utime\0"
1a1b13c9
LP
464 "utimensat\0"
465 "utimes\0"
466 },
8130926d 467 [SYSCALL_FILTER_SET_IO_EVENT] = {
8130926d 468 .name = "@io-event",
d5efc18b 469 .help = "Event loop system calls",
201c1cc2
TM
470 .value =
471 "_newselect\0"
201c1cc2 472 "epoll_create\0"
215728ff 473 "epoll_create1\0"
201c1cc2
TM
474 "epoll_ctl\0"
475 "epoll_ctl_old\0"
476 "epoll_pwait\0"
477 "epoll_wait\0"
478 "epoll_wait_old\0"
201c1cc2 479 "eventfd\0"
215728ff 480 "eventfd2\0"
201c1cc2
TM
481 "poll\0"
482 "ppoll\0"
483 "pselect6\0"
484 "select\0"
8130926d
LP
485 },
486 [SYSCALL_FILTER_SET_IPC] = {
8130926d 487 .name = "@ipc",
d5efc18b
ZJS
488 .help = "SysV IPC, POSIX Message Queues or other IPC",
489 .value =
490 "ipc\0"
cd5bfd7e 491 "memfd_create\0"
201c1cc2
TM
492 "mq_getsetattr\0"
493 "mq_notify\0"
494 "mq_open\0"
495 "mq_timedreceive\0"
496 "mq_timedsend\0"
497 "mq_unlink\0"
498 "msgctl\0"
499 "msgget\0"
500 "msgrcv\0"
501 "msgsnd\0"
cd5bfd7e 502 "pipe\0"
215728ff 503 "pipe2\0"
201c1cc2
TM
504 "process_vm_readv\0"
505 "process_vm_writev\0"
506 "semctl\0"
507 "semget\0"
508 "semop\0"
509 "semtimedop\0"
510 "shmat\0"
511 "shmctl\0"
512 "shmdt\0"
513 "shmget\0"
8130926d
LP
514 },
515 [SYSCALL_FILTER_SET_KEYRING] = {
8130926d 516 .name = "@keyring",
d5efc18b 517 .help = "Kernel keyring access",
1f9ac68b
LP
518 .value =
519 "add_key\0"
520 "keyctl\0"
521 "request_key\0"
8130926d 522 },
cd0ddf6f
LP
523 [SYSCALL_FILTER_SET_MEMLOCK] = {
524 .name = "@memlock",
525 .help = "Memory locking control",
526 .value =
527 "mlock\0"
528 "mlock2\0"
529 "mlockall\0"
530 "munlock\0"
531 "munlockall\0"
532 },
8130926d 533 [SYSCALL_FILTER_SET_MODULE] = {
8130926d 534 .name = "@module",
d5efc18b 535 .help = "Loading and unloading of kernel modules",
201c1cc2 536 .value =
201c1cc2
TM
537 "delete_module\0"
538 "finit_module\0"
539 "init_module\0"
8130926d
LP
540 },
541 [SYSCALL_FILTER_SET_MOUNT] = {
8130926d 542 .name = "@mount",
d5efc18b 543 .help = "Mounting and unmounting of file systems",
201c1cc2
TM
544 .value =
545 "chroot\0"
546 "mount\0"
201c1cc2 547 "pivot_root\0"
201c1cc2 548 "umount\0"
215728ff 549 "umount2\0"
8130926d
LP
550 },
551 [SYSCALL_FILTER_SET_NETWORK_IO] = {
8130926d 552 .name = "@network-io",
d5efc18b 553 .help = "Network or Unix socket IO, should not be needed if not network facing",
201c1cc2 554 .value =
201c1cc2 555 "accept\0"
215728ff 556 "accept4\0"
201c1cc2
TM
557 "bind\0"
558 "connect\0"
559 "getpeername\0"
560 "getsockname\0"
561 "getsockopt\0"
562 "listen\0"
563 "recv\0"
564 "recvfrom\0"
565 "recvmmsg\0"
566 "recvmsg\0"
567 "send\0"
568 "sendmmsg\0"
569 "sendmsg\0"
570 "sendto\0"
571 "setsockopt\0"
572 "shutdown\0"
573 "socket\0"
574 "socketcall\0"
575 "socketpair\0"
8130926d
LP
576 },
577 [SYSCALL_FILTER_SET_OBSOLETE] = {
d5efc18b 578 /* some unknown even to libseccomp */
8130926d 579 .name = "@obsolete",
d5efc18b 580 .help = "Unusual, obsolete or unimplemented system calls",
201c1cc2
TM
581 .value =
582 "_sysctl\0"
583 "afs_syscall\0"
802fa07a 584 "bdflush\0"
201c1cc2 585 "break\0"
1f9ac68b 586 "create_module\0"
201c1cc2
TM
587 "ftime\0"
588 "get_kernel_syms\0"
201c1cc2
TM
589 "getpmsg\0"
590 "gtty\0"
7e0c3b8f 591 "idle\0"
201c1cc2 592 "lock\0"
201c1cc2 593 "mpx\0"
201c1cc2
TM
594 "prof\0"
595 "profil\0"
201c1cc2
TM
596 "putpmsg\0"
597 "query_module\0"
201c1cc2
TM
598 "security\0"
599 "sgetmask\0"
600 "ssetmask\0"
601 "stty\0"
1f9ac68b 602 "sysfs\0"
201c1cc2
TM
603 "tuxcall\0"
604 "ulimit\0"
605 "uselib\0"
1f9ac68b 606 "ustat\0"
201c1cc2 607 "vserver\0"
8130926d
LP
608 },
609 [SYSCALL_FILTER_SET_PRIVILEGED] = {
8130926d 610 .name = "@privileged",
d5efc18b 611 .help = "All system calls which need super-user capabilities",
201c1cc2 612 .value =
44898c53 613 "@chown\0"
201c1cc2
TM
614 "@clock\0"
615 "@module\0"
616 "@raw-io\0"
af0f047b
LP
617 "@reboot\0"
618 "@swap\0"
215728ff 619 "_sysctl\0"
201c1cc2 620 "acct\0"
201c1cc2 621 "bpf\0"
1f9ac68b 622 "capset\0"
201c1cc2 623 "chroot\0"
a05cfe23 624 "fanotify_init\0"
201c1cc2 625 "nfsservctl\0"
a05cfe23 626 "open_by_handle_at\0"
201c1cc2
TM
627 "pivot_root\0"
628 "quotactl\0"
201c1cc2 629 "setdomainname\0"
201c1cc2 630 "setfsuid\0"
215728ff 631 "setfsuid32\0"
201c1cc2 632 "setgroups\0"
215728ff 633 "setgroups32\0"
201c1cc2 634 "sethostname\0"
201c1cc2 635 "setresuid\0"
215728ff 636 "setresuid32\0"
201c1cc2 637 "setreuid\0"
215728ff 638 "setreuid32\0"
e05ee49b 639 "setuid\0" /* We list the explicit system calls here, as @setuid also includes setgid() which is not necessarily privileged */
215728ff 640 "setuid32\0"
201c1cc2 641 "vhangup\0"
8130926d
LP
642 },
643 [SYSCALL_FILTER_SET_PROCESS] = {
8130926d 644 .name = "@process",
d5efc18b 645 .help = "Process control, execution, namespaceing operations",
201c1cc2
TM
646 .value =
647 "arch_prctl\0"
09d3020b 648 "capget\0" /* Able to query arbitrary processes */
201c1cc2 649 "clone\0"
201c1cc2
TM
650 "execveat\0"
651 "fork\0"
b887d2eb 652 "getrusage\0"
201c1cc2
TM
653 "kill\0"
654 "prctl\0"
b887d2eb
LP
655 "rt_sigqueueinfo\0"
656 "rt_tgsigqueueinfo\0"
201c1cc2 657 "setns\0"
a9518dc3 658 "swapcontext\0" /* Some archs e.g. powerpc32 are using it to do userspace context switches */
201c1cc2 659 "tgkill\0"
b887d2eb 660 "times\0"
201c1cc2
TM
661 "tkill\0"
662 "unshare\0"
663 "vfork\0"
b887d2eb
LP
664 "wait4\0"
665 "waitid\0"
666 "waitpid\0"
8130926d
LP
667 },
668 [SYSCALL_FILTER_SET_RAW_IO] = {
8130926d 669 .name = "@raw-io",
d5efc18b 670 .help = "Raw I/O port access",
201c1cc2
TM
671 .value =
672 "ioperm\0"
673 "iopl\0"
1f9ac68b 674 "pciconfig_iobase\0"
201c1cc2
TM
675 "pciconfig_read\0"
676 "pciconfig_write\0"
8130926d 677#ifdef __NR_s390_pci_mmio_read
201c1cc2 678 "s390_pci_mmio_read\0"
8130926d
LP
679#endif
680#ifdef __NR_s390_pci_mmio_write
201c1cc2 681 "s390_pci_mmio_write\0"
8130926d
LP
682#endif
683 },
bd2ab3f4
LP
684 [SYSCALL_FILTER_SET_REBOOT] = {
685 .name = "@reboot",
686 .help = "Reboot and reboot preparation/kexec",
687 .value =
bd2ab3f4 688 "kexec_file_load\0"
e59608fa 689 "kexec_load\0"
bd2ab3f4
LP
690 "reboot\0"
691 },
133ddbbe 692 [SYSCALL_FILTER_SET_RESOURCES] = {
133ddbbe 693 .name = "@resources",
58a8f68b 694 .help = "Alter resource settings",
133ddbbe 695 .value =
0963c053
LP
696 "ioprio_set\0"
697 "mbind\0"
698 "migrate_pages\0"
699 "move_pages\0"
700 "nice\0"
0963c053
LP
701 "sched_setaffinity\0"
702 "sched_setattr\0"
133ddbbe
LP
703 "sched_setparam\0"
704 "sched_setscheduler\0"
0963c053 705 "set_mempolicy\0"
133ddbbe
LP
706 "setpriority\0"
707 "setrlimit\0"
133ddbbe 708 },
6eaaeee9
LP
709 [SYSCALL_FILTER_SET_SETUID] = {
710 .name = "@setuid",
711 .help = "Operations for changing user/group credentials",
712 .value =
6eaaeee9 713 "setgid\0"
215728ff 714 "setgid32\0"
6eaaeee9 715 "setgroups\0"
215728ff 716 "setgroups32\0"
6eaaeee9 717 "setregid\0"
215728ff 718 "setregid32\0"
6eaaeee9 719 "setresgid\0"
215728ff 720 "setresgid32\0"
6eaaeee9 721 "setresuid\0"
215728ff 722 "setresuid32\0"
6eaaeee9 723 "setreuid\0"
215728ff 724 "setreuid32\0"
6eaaeee9 725 "setuid\0"
215728ff 726 "setuid32\0"
6eaaeee9 727 },
cd0ddf6f
LP
728 [SYSCALL_FILTER_SET_SIGNAL] = {
729 .name = "@signal",
730 .help = "Process signal handling",
731 .value =
732 "rt_sigaction\0"
733 "rt_sigpending\0"
734 "rt_sigprocmask\0"
735 "rt_sigsuspend\0"
736 "rt_sigtimedwait\0"
737 "sigaction\0"
738 "sigaltstack\0"
739 "signal\0"
740 "signalfd\0"
741 "signalfd4\0"
742 "sigpending\0"
743 "sigprocmask\0"
744 "sigsuspend\0"
745 },
bd2ab3f4
LP
746 [SYSCALL_FILTER_SET_SWAP] = {
747 .name = "@swap",
748 .help = "Enable/disable swap devices",
749 .value =
750 "swapoff\0"
751 "swapon\0"
752 },
44898c53
LP
753 [SYSCALL_FILTER_SET_SYNC] = {
754 .name = "@sync",
755 .help = "Synchronize files and memory to storage",
756 .value =
757 "fdatasync\0"
758 "fsync\0"
759 "msync\0"
760 "sync\0"
761 "sync_file_range\0"
762 "syncfs\0"
763 },
70526841
LP
764 [SYSCALL_FILTER_SET_SYSTEM_SERVICE] = {
765 .name = "@system-service",
766 .help = "General system service operations",
767 .value =
768 "@aio\0"
769 "@basic-io\0"
770 "@chown\0"
771 "@default\0"
772 "@file-system\0"
773 "@io-event\0"
774 "@ipc\0"
775 "@keyring\0"
776 "@memlock\0"
777 "@network-io\0"
778 "@process\0"
779 "@resources\0"
780 "@setuid\0"
781 "@signal\0"
782 "@sync\0"
783 "@timer\0"
784 "brk\0"
785 "capget\0"
786 "capset\0"
787 "copy_file_range\0"
788 "fadvise64\0"
789 "fadvise64_64\0"
790 "flock\0"
791 "get_mempolicy\0"
792 "getcpu\0"
793 "getpriority\0"
794 "getrandom\0"
795 "ioctl\0"
796 "ioprio_get\0"
797 "kcmp\0"
798 "madvise\0"
70526841
LP
799 "mprotect\0"
800 "mremap\0"
801 "name_to_handle_at\0"
802 "oldolduname\0"
803 "olduname\0"
804 "personality\0"
805 "readahead\0"
806 "readdir\0"
807 "remap_file_pages\0"
808 "sched_get_priority_max\0"
809 "sched_get_priority_min\0"
810 "sched_getaffinity\0"
811 "sched_getattr\0"
812 "sched_getparam\0"
813 "sched_getscheduler\0"
814 "sched_rr_get_interval\0"
815 "sched_yield\0"
816 "sendfile\0"
817 "sendfile64\0"
818 "setfsgid\0"
819 "setfsgid32\0"
820 "setfsuid\0"
821 "setfsuid32\0"
822 "setpgid\0"
823 "setsid\0"
824 "splice\0"
825 "sysinfo\0"
826 "tee\0"
827 "umask\0"
828 "uname\0"
829 "userfaultfd\0"
830 "vmsplice\0"
831 },
cd0ddf6f
LP
832 [SYSCALL_FILTER_SET_TIMER] = {
833 .name = "@timer",
834 .help = "Schedule operations by time",
835 .value =
836 "alarm\0"
837 "getitimer\0"
838 "setitimer\0"
839 "timer_create\0"
840 "timer_delete\0"
841 "timer_getoverrun\0"
842 "timer_gettime\0"
843 "timer_settime\0"
844 "timerfd_create\0"
845 "timerfd_gettime\0"
846 "timerfd_settime\0"
847 "times\0"
848 },
201c1cc2 849};
8130926d
LP
850
851const SyscallFilterSet *syscall_filter_set_find(const char *name) {
852 unsigned i;
853
854 if (isempty(name) || name[0] != '@')
855 return NULL;
856
857 for (i = 0; i < _SYSCALL_FILTER_SET_MAX; i++)
858 if (streq(syscall_filter_sets[i].name, name))
859 return syscall_filter_sets + i;
860
861 return NULL;
862}
863
b54f36c6 864static int seccomp_add_syscall_filter_set(scmp_filter_ctx seccomp, const SyscallFilterSet *set, uint32_t action, char **exclude, bool log_missing);
69b1b241 865
b54f36c6 866int seccomp_add_syscall_filter_item(scmp_filter_ctx *seccomp, const char *name, uint32_t action, char **exclude, bool log_missing) {
69b1b241
LP
867 assert(seccomp);
868 assert(name);
869
960e4569
LP
870 if (strv_contains(exclude, name))
871 return 0;
872
69b1b241
LP
873 if (name[0] == '@') {
874 const SyscallFilterSet *other;
875
876 other = syscall_filter_set_find(name);
baaa35ad
ZJS
877 if (!other)
878 return log_debug_errno(SYNTHETIC_ERRNO(EINVAL),
879 "Filter set %s is not known!",
880 name);
69b1b241 881
b54f36c6
ZJS
882 return seccomp_add_syscall_filter_set(seccomp, other, action, exclude, log_missing);
883
69b1b241 884 } else {
b54f36c6 885 int id, r;
69b1b241
LP
886
887 id = seccomp_syscall_resolve_name(name);
cff7bff8 888 if (id == __NR_SCMP_ERROR) {
b54f36c6
ZJS
889 if (log_missing)
890 log_debug("System call %s is not known, ignoring.", name);
ff217dc3 891 return 0;
cff7bff8 892 }
69b1b241
LP
893
894 r = seccomp_rule_add_exact(seccomp, action, id, 0);
b54f36c6 895 if (r < 0) {
69b1b241 896 /* If the system call is not known on this architecture, then that's fine, let's ignore it */
7e86bd73
ZJS
897 bool ignore = r == -EDOM;
898
899 if (!ignore || log_missing)
900 log_debug_errno(r, "Failed to add rule for system call %s() / %d%s: %m",
901 name, id, ignore ? ", ignoring" : "");
902 if (!ignore)
903 return r;
b54f36c6 904 }
69b1b241 905
b54f36c6
ZJS
906 return 0;
907 }
69b1b241
LP
908}
909
469830d1
LP
910static int seccomp_add_syscall_filter_set(
911 scmp_filter_ctx seccomp,
469830d1 912 const SyscallFilterSet *set,
960e4569 913 uint32_t action,
b54f36c6
ZJS
914 char **exclude,
915 bool log_missing) {
469830d1 916
8130926d
LP
917 const char *sys;
918 int r;
919
920 assert(seccomp);
921 assert(set);
922
923 NULSTR_FOREACH(sys, set->value) {
b54f36c6 924 r = seccomp_add_syscall_filter_item(seccomp, sys, action, exclude, log_missing);
69b1b241
LP
925 if (r < 0)
926 return r;
469830d1
LP
927 }
928
929 return 0;
930}
931
b54f36c6 932int seccomp_load_syscall_filter_set(uint32_t default_action, const SyscallFilterSet *set, uint32_t action, bool log_missing) {
469830d1
LP
933 uint32_t arch;
934 int r;
935
936 assert(set);
937
938 /* The one-stop solution: allocate a seccomp object, add the specified filter to it, and apply it. Once for
a90db619 939 * each local arch. */
469830d1
LP
940
941 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
942 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
943
944 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
945
946 r = seccomp_init_for_arch(&seccomp, arch, default_action);
8130926d
LP
947 if (r < 0)
948 return r;
469830d1 949
b54f36c6 950 r = seccomp_add_syscall_filter_set(seccomp, set, action, NULL, log_missing);
7e86bd73
ZJS
951 if (r < 0)
952 return log_debug_errno(r, "Failed to add filter set: %m");
469830d1
LP
953
954 r = seccomp_load(seccomp);
955 if (IN_SET(r, -EPERM, -EACCES))
956 return r;
957 if (r < 0)
958 log_debug_errno(r, "Failed to install filter set for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
8130926d
LP
959 }
960
961 return 0;
962}
a3be2849 963
b54f36c6 964int seccomp_load_syscall_filter_set_raw(uint32_t default_action, Hashmap* set, uint32_t action, bool log_missing) {
469830d1 965 uint32_t arch;
a3be2849
LP
966 int r;
967
469830d1
LP
968 /* Similar to seccomp_load_syscall_filter_set(), but takes a raw Set* of syscalls, instead of a
969 * SyscallFilterSet* table. */
a3be2849 970
8cfa775f 971 if (hashmap_isempty(set) && default_action == SCMP_ACT_ALLOW)
469830d1 972 return 0;
a3be2849 973
469830d1
LP
974 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
975 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
976 Iterator i;
b54f36c6 977 void *syscall_id, *val;
a3be2849 978
469830d1 979 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
a3be2849 980
469830d1
LP
981 r = seccomp_init_for_arch(&seccomp, arch, default_action);
982 if (r < 0)
983 return r;
a3be2849 984
b54f36c6 985 HASHMAP_FOREACH_KEY(val, syscall_id, set, i) {
8cfa775f 986 uint32_t a = action;
b54f36c6
ZJS
987 int id = PTR_TO_INT(syscall_id) - 1;
988 int error = PTR_TO_INT(val);
8cfa775f 989
b54f36c6
ZJS
990 if (action != SCMP_ACT_ALLOW && error >= 0)
991 a = SCMP_ACT_ERRNO(error);
8cfa775f 992
b54f36c6 993 r = seccomp_rule_add_exact(seccomp, a, id, 0);
469830d1
LP
994 if (r < 0) {
995 /* If the system call is not known on this architecture, then that's fine, let's ignore it */
996 _cleanup_free_ char *n = NULL;
7e86bd73 997 bool ignore;
469830d1 998
b54f36c6 999 n = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, id);
7e86bd73
ZJS
1000 ignore = r == -EDOM;
1001 if (!ignore || log_missing)
1002 log_debug_errno(r, "Failed to add rule for system call %s() / %d%s: %m",
1003 strna(n), id, ignore ? ", ignoring" : "");
1004 if (!ignore)
1005 return r;
469830d1
LP
1006 }
1007 }
1008
1009 r = seccomp_load(seccomp);
1010 if (IN_SET(r, -EPERM, -EACCES))
1011 return r;
1012 if (r < 0)
1013 log_debug_errno(r, "Failed to install filter set for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1014 }
1015
1016 return 0;
add00535
LP
1017}
1018
58f6ab44 1019int seccomp_parse_syscall_filter(
898748d8
YW
1020 const char *name,
1021 int errno_num,
1022 Hashmap *filter,
13d92c63 1023 SeccompParseFlags flags,
898748d8
YW
1024 const char *unit,
1025 const char *filename,
1026 unsigned line) {
1027
1028 int r;
1029
1030 assert(name);
1031 assert(filter);
1032
1033 if (name[0] == '@') {
1034 const SyscallFilterSet *set;
1035 const char *i;
1036
1037 set = syscall_filter_set_find(name);
1038 if (!set) {
13d92c63 1039 if (!(flags & SECCOMP_PARSE_PERMISSIVE))
898748d8 1040 return -EINVAL;
13d92c63
LP
1041
1042 log_syntax(unit, flags & SECCOMP_PARSE_LOG ? LOG_WARNING : LOG_DEBUG, filename, line, 0,
1043 "Unknown system call group, ignoring: %s", name);
1044 return 0;
898748d8
YW
1045 }
1046
1047 NULSTR_FOREACH(i, set->value) {
13d92c63
LP
1048 /* Call ourselves again, for the group to parse. Note that we downgrade logging here (i.e. take
1049 * away the SECCOMP_PARSE_LOG flag) since any issues in the group table are our own problem,
1050 * not a problem in user configuration data and we shouldn't pretend otherwise by complaining
1051 * about them. */
58f6ab44 1052 r = seccomp_parse_syscall_filter(i, errno_num, filter, flags &~ SECCOMP_PARSE_LOG, unit, filename, line);
898748d8
YW
1053 if (r < 0)
1054 return r;
1055 }
1056 } else {
1057 int id;
1058
1059 id = seccomp_syscall_resolve_name(name);
1060 if (id == __NR_SCMP_ERROR) {
13d92c63 1061 if (!(flags & SECCOMP_PARSE_PERMISSIVE))
898748d8 1062 return -EINVAL;
13d92c63
LP
1063
1064 log_syntax(unit, flags & SECCOMP_PARSE_LOG ? LOG_WARNING : LOG_DEBUG, filename, line, 0,
1065 "Failed to parse system call, ignoring: %s", name);
1066 return 0;
898748d8
YW
1067 }
1068
1069 /* If we previously wanted to forbid a syscall and now
1070 * we want to allow it, then remove it from the list. */
13d92c63 1071 if (!(flags & SECCOMP_PARSE_INVERT) == !!(flags & SECCOMP_PARSE_WHITELIST)) {
898748d8
YW
1072 r = hashmap_put(filter, INT_TO_PTR(id + 1), INT_TO_PTR(errno_num));
1073 if (r < 0)
851ee70a
LW
1074 switch (r) {
1075 case -ENOMEM:
1076 return flags & SECCOMP_PARSE_LOG ? log_oom() : -ENOMEM;
1077 case -EEXIST:
9d7fe7c6
LW
1078 assert_se(hashmap_update(filter, INT_TO_PTR(id + 1), INT_TO_PTR(errno_num)) == 0);
1079 break;
851ee70a
LW
1080 default:
1081 return r;
1082 }
898748d8
YW
1083 } else
1084 (void) hashmap_remove(filter, INT_TO_PTR(id + 1));
1085 }
1086
1087 return 0;
1088}
1089
add00535 1090int seccomp_restrict_namespaces(unsigned long retain) {
469830d1 1091 uint32_t arch;
add00535
LP
1092 int r;
1093
f1d34068 1094 if (DEBUG_LOGGING) {
add00535
LP
1095 _cleanup_free_ char *s = NULL;
1096
86c2a9f1 1097 (void) namespace_flags_to_string(retain, &s);
add00535
LP
1098 log_debug("Restricting namespace to: %s.", strna(s));
1099 }
1100
1101 /* NOOP? */
1102 if ((retain & NAMESPACE_FLAGS_ALL) == NAMESPACE_FLAGS_ALL)
1103 return 0;
1104
469830d1
LP
1105 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1106 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1107 unsigned i;
add00535 1108
469830d1
LP
1109 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1110
1111 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1112 if (r < 0)
1113 return r;
1114
1115 if ((retain & NAMESPACE_FLAGS_ALL) == 0)
1116 /* If every single kind of namespace shall be prohibited, then let's block the whole setns() syscall
1117 * altogether. */
1118 r = seccomp_rule_add_exact(
1119 seccomp,
1120 SCMP_ACT_ERRNO(EPERM),
1121 SCMP_SYS(setns),
1122 0);
1123 else
1124 /* Otherwise, block only the invocations with the appropriate flags in the loop below, but also the
1125 * special invocation with a zero flags argument, right here. */
1126 r = seccomp_rule_add_exact(
1127 seccomp,
1128 SCMP_ACT_ERRNO(EPERM),
1129 SCMP_SYS(setns),
1130 1,
1131 SCMP_A1(SCMP_CMP_EQ, 0));
1132 if (r < 0) {
1133 log_debug_errno(r, "Failed to add setns() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1134 continue;
1135 }
1136
1137 for (i = 0; namespace_flag_map[i].name; i++) {
1138 unsigned long f;
1139
1140 f = namespace_flag_map[i].flag;
1141 if ((retain & f) == f) {
1142 log_debug("Permitting %s.", namespace_flag_map[i].name);
1143 continue;
1144 }
1145
1146 log_debug("Blocking %s.", namespace_flag_map[i].name);
1147
1148 r = seccomp_rule_add_exact(
1149 seccomp,
1150 SCMP_ACT_ERRNO(EPERM),
1151 SCMP_SYS(unshare),
1152 1,
1153 SCMP_A0(SCMP_CMP_MASKED_EQ, f, f));
1154 if (r < 0) {
1155 log_debug_errno(r, "Failed to add unshare() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1156 break;
1157 }
1158
511ceb1f
ZJS
1159 /* On s390/s390x the first two parameters to clone are switched */
1160 if (!IN_SET(arch, SCMP_ARCH_S390, SCMP_ARCH_S390X))
ae9d60ce
LP
1161 r = seccomp_rule_add_exact(
1162 seccomp,
1163 SCMP_ACT_ERRNO(EPERM),
1164 SCMP_SYS(clone),
1165 1,
1166 SCMP_A0(SCMP_CMP_MASKED_EQ, f, f));
1167 else
1168 r = seccomp_rule_add_exact(
1169 seccomp,
1170 SCMP_ACT_ERRNO(EPERM),
1171 SCMP_SYS(clone),
1172 1,
1173 SCMP_A1(SCMP_CMP_MASKED_EQ, f, f));
469830d1
LP
1174 if (r < 0) {
1175 log_debug_errno(r, "Failed to add clone() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1176 break;
1177 }
1178
1179 if ((retain & NAMESPACE_FLAGS_ALL) != 0) {
1180 r = seccomp_rule_add_exact(
1181 seccomp,
1182 SCMP_ACT_ERRNO(EPERM),
1183 SCMP_SYS(setns),
1184 1,
1185 SCMP_A1(SCMP_CMP_MASKED_EQ, f, f));
1186 if (r < 0) {
1187 log_debug_errno(r, "Failed to add setns() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1188 break;
1189 }
1190 }
1191 }
1192 if (r < 0)
1193 continue;
1194
1195 r = seccomp_load(seccomp);
1196 if (IN_SET(r, -EPERM, -EACCES))
1197 return r;
1198 if (r < 0)
1199 log_debug_errno(r, "Failed to install namespace restriction rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1200 }
1201
1202 return 0;
1203}
1204
1205int seccomp_protect_sysctl(void) {
1206 uint32_t arch;
1207 int r;
1208
1209 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1210 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1211
1212 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1213
2e64e8f4
ZJS
1214 if (IN_SET(arch, SCMP_ARCH_X32, SCMP_ARCH_AARCH64))
1215 /* No _sysctl syscall */
1216 continue;
1217
469830d1
LP
1218 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1219 if (r < 0)
1220 return r;
1221
1222 r = seccomp_rule_add_exact(
add00535
LP
1223 seccomp,
1224 SCMP_ACT_ERRNO(EPERM),
469830d1 1225 SCMP_SYS(_sysctl),
add00535 1226 0);
469830d1
LP
1227 if (r < 0) {
1228 log_debug_errno(r, "Failed to add _sysctl() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1229 continue;
1230 }
1231
1232 r = seccomp_load(seccomp);
1233 if (IN_SET(r, -EPERM, -EACCES))
1234 return r;
1235 if (r < 0)
1236 log_debug_errno(r, "Failed to install sysctl protection rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1237 }
1238
1239 return 0;
1240}
1241
1242int seccomp_restrict_address_families(Set *address_families, bool whitelist) {
1243 uint32_t arch;
1244 int r;
1245
1246 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1247 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
9606bc4b 1248 bool supported;
469830d1
LP
1249 Iterator i;
1250
1251 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1252
9606bc4b
LP
1253 switch (arch) {
1254
1255 case SCMP_ARCH_X86_64:
1256 case SCMP_ARCH_X32:
1257 case SCMP_ARCH_ARM:
1258 case SCMP_ARCH_AARCH64:
0d9fca76 1259 case SCMP_ARCH_PPC:
da1921a5
ZJS
1260 case SCMP_ARCH_PPC64:
1261 case SCMP_ARCH_PPC64LE:
f5aeac14
JC
1262 case SCMP_ARCH_MIPSEL64N32:
1263 case SCMP_ARCH_MIPS64N32:
1264 case SCMP_ARCH_MIPSEL64:
1265 case SCMP_ARCH_MIPS64:
9606bc4b
LP
1266 /* These we know we support (i.e. are the ones that do not use socketcall()) */
1267 supported = true;
1268 break;
1269
9606bc4b
LP
1270 case SCMP_ARCH_S390:
1271 case SCMP_ARCH_S390X:
da1921a5 1272 case SCMP_ARCH_X86:
f5aeac14
JC
1273 case SCMP_ARCH_MIPSEL:
1274 case SCMP_ARCH_MIPS:
9606bc4b
LP
1275 default:
1276 /* These we either know we don't support (i.e. are the ones that do use socketcall()), or we
1277 * don't know */
1278 supported = false;
1279 break;
1280 }
1281
1282 if (!supported)
1283 continue;
1284
469830d1
LP
1285 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1286 if (r < 0)
1287 return r;
1288
1289 if (whitelist) {
1290 int af, first = 0, last = 0;
1291 void *afp;
1292
1293 /* If this is a whitelist, we first block the address families that are out of range and then
1294 * everything that is not in the set. First, we find the lowest and highest address family in
1295 * the set. */
1296
1297 SET_FOREACH(afp, address_families, i) {
1298 af = PTR_TO_INT(afp);
1299
1300 if (af <= 0 || af >= af_max())
1301 continue;
1302
1303 if (first == 0 || af < first)
1304 first = af;
1305
1306 if (last == 0 || af > last)
1307 last = af;
1308 }
1309
1310 assert((first == 0) == (last == 0));
1311
1312 if (first == 0) {
1313
1314 /* No entries in the valid range, block everything */
1315 r = seccomp_rule_add_exact(
1316 seccomp,
1317 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1318 SCMP_SYS(socket),
1319 0);
1320 if (r < 0) {
1321 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1322 continue;
1323 }
1324
1325 } else {
1326
1327 /* Block everything below the first entry */
1328 r = seccomp_rule_add_exact(
1329 seccomp,
1330 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1331 SCMP_SYS(socket),
1332 1,
1333 SCMP_A0(SCMP_CMP_LT, first));
1334 if (r < 0) {
1335 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1336 continue;
1337 }
1338
1339 /* Block everything above the last entry */
1340 r = seccomp_rule_add_exact(
1341 seccomp,
1342 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1343 SCMP_SYS(socket),
1344 1,
1345 SCMP_A0(SCMP_CMP_GT, last));
1346 if (r < 0) {
1347 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1348 continue;
1349 }
1350
1351 /* Block everything between the first and last entry */
1352 for (af = 1; af < af_max(); af++) {
1353
1354 if (set_contains(address_families, INT_TO_PTR(af)))
1355 continue;
1356
1357 r = seccomp_rule_add_exact(
1358 seccomp,
1359 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1360 SCMP_SYS(socket),
1361 1,
1362 SCMP_A0(SCMP_CMP_EQ, af));
1363 if (r < 0)
1364 break;
1365 }
469830d1
LP
1366 if (r < 0) {
1367 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1368 continue;
1369 }
1370 }
1371
1372 } else {
1373 void *af;
1374
1375 /* If this is a blacklist, then generate one rule for
1376 * each address family that are then combined in OR
1377 * checks. */
1378
1379 SET_FOREACH(af, address_families, i) {
1380
1381 r = seccomp_rule_add_exact(
1382 seccomp,
1383 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1384 SCMP_SYS(socket),
1385 1,
1386 SCMP_A0(SCMP_CMP_EQ, PTR_TO_INT(af)));
1387 if (r < 0)
1388 break;
1389 }
469830d1
LP
1390 if (r < 0) {
1391 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1392 continue;
1393 }
1394 }
1395
1396 r = seccomp_load(seccomp);
1397 if (IN_SET(r, -EPERM, -EACCES))
1398 return r;
1399 if (r < 0)
1400 log_debug_errno(r, "Failed to install socket family rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1401 }
1402
1403 return 0;
1404}
1405
1406int seccomp_restrict_realtime(void) {
1407 static const int permitted_policies[] = {
1408 SCHED_OTHER,
1409 SCHED_BATCH,
1410 SCHED_IDLE,
1411 };
1412
1413 int r, max_policy = 0;
1414 uint32_t arch;
1415 unsigned i;
1416
1417 /* Determine the highest policy constant we want to allow */
1418 for (i = 0; i < ELEMENTSOF(permitted_policies); i++)
1419 if (permitted_policies[i] > max_policy)
1420 max_policy = permitted_policies[i];
1421
1422 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1423 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1424 int p;
1425
1426 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1427
1428 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1429 if (r < 0)
1430 return r;
1431
1432 /* Go through all policies with lower values than that, and block them -- unless they appear in the
1433 * whitelist. */
1434 for (p = 0; p < max_policy; p++) {
1435 bool good = false;
1436
1437 /* Check if this is in the whitelist. */
1438 for (i = 0; i < ELEMENTSOF(permitted_policies); i++)
1439 if (permitted_policies[i] == p) {
1440 good = true;
1441 break;
1442 }
1443
1444 if (good)
1445 continue;
1446
1447 /* Deny this policy */
1448 r = seccomp_rule_add_exact(
1449 seccomp,
1450 SCMP_ACT_ERRNO(EPERM),
1451 SCMP_SYS(sched_setscheduler),
1452 1,
1453 SCMP_A1(SCMP_CMP_EQ, p));
1454 if (r < 0) {
1455 log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1456 continue;
1457 }
1458 }
1459
1460 /* Blacklist all other policies, i.e. the ones with higher values. Note that all comparisons are
1461 * unsigned here, hence no need no check for < 0 values. */
1462 r = seccomp_rule_add_exact(
add00535
LP
1463 seccomp,
1464 SCMP_ACT_ERRNO(EPERM),
469830d1 1465 SCMP_SYS(sched_setscheduler),
add00535 1466 1,
469830d1
LP
1467 SCMP_A1(SCMP_CMP_GT, max_policy));
1468 if (r < 0) {
1469 log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1470 continue;
1471 }
add00535 1472
469830d1
LP
1473 r = seccomp_load(seccomp);
1474 if (IN_SET(r, -EPERM, -EACCES))
1475 return r;
1476 if (r < 0)
1477 log_debug_errno(r, "Failed to install realtime protection rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1478 }
1479
1480 return 0;
1481}
1482
6dc66688
ZJS
1483static int add_seccomp_syscall_filter(scmp_filter_ctx seccomp,
1484 uint32_t arch,
1485 int nr,
14cb109d 1486 unsigned arg_cnt,
6dc66688
ZJS
1487 const struct scmp_arg_cmp arg) {
1488 int r;
1489
1490 r = seccomp_rule_add_exact(seccomp, SCMP_ACT_ERRNO(EPERM), nr, arg_cnt, arg);
1491 if (r < 0) {
1492 _cleanup_free_ char *n = NULL;
1493
1494 n = seccomp_syscall_resolve_num_arch(arch, nr);
1495 log_debug_errno(r, "Failed to add %s() rule for architecture %s, skipping: %m",
1496 strna(n),
1497 seccomp_arch_to_string(arch));
1498 }
1499
1500 return r;
1501}
1502
2a8d6e63 1503/* For known architectures, check that syscalls are indeed defined or not. */
303d6b4c 1504#if defined(__x86_64__) || defined(__arm__) || defined(__aarch64__)
2a8d6e63
ZJS
1505assert_cc(SCMP_SYS(shmget) > 0);
1506assert_cc(SCMP_SYS(shmat) > 0);
1507assert_cc(SCMP_SYS(shmdt) > 0);
2a8d6e63 1508#endif
6dc66688 1509
469830d1
LP
1510int seccomp_memory_deny_write_execute(void) {
1511 uint32_t arch;
1512 int r;
1513
1514 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1515 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
8a50cf69 1516 int filter_syscall = 0, block_syscall = 0, shmat_syscall = 0;
add00535 1517
469830d1
LP
1518 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1519
8a50cf69
LP
1520 switch (arch) {
1521
1522 case SCMP_ARCH_X86:
1523 filter_syscall = SCMP_SYS(mmap2);
1524 block_syscall = SCMP_SYS(mmap);
67fb5f33 1525 shmat_syscall = SCMP_SYS(shmat);
2a8d6e63
ZJS
1526 break;
1527
63d00dfb 1528 case SCMP_ARCH_PPC:
2a8d6e63
ZJS
1529 case SCMP_ARCH_PPC64:
1530 case SCMP_ARCH_PPC64LE:
1531 filter_syscall = SCMP_SYS(mmap);
1532
1533 /* Note that shmat() isn't available, and the call is multiplexed through ipc().
1534 * We ignore that here, which means there's still a way to get writable/executable
1535 * memory, if an IPC key is mapped like this. That's a pity, but no total loss. */
8a50cf69 1536
8a50cf69
LP
1537 break;
1538
4278d1f5
ZJS
1539 case SCMP_ARCH_ARM:
1540 filter_syscall = SCMP_SYS(mmap2); /* arm has only mmap2 */
1541 shmat_syscall = SCMP_SYS(shmat);
1542 break;
1543
8a50cf69
LP
1544 case SCMP_ARCH_X86_64:
1545 case SCMP_ARCH_X32:
79873bc8 1546 case SCMP_ARCH_AARCH64:
303d6b4c 1547 filter_syscall = SCMP_SYS(mmap); /* amd64, x32, and arm64 have only mmap */
8a50cf69
LP
1548 shmat_syscall = SCMP_SYS(shmat);
1549 break;
1550
1551 /* Please add more definitions here, if you port systemd to other architectures! */
1552
303d6b4c 1553#if !defined(__i386__) && !defined(__x86_64__) && !defined(__powerpc__) && !defined(__powerpc64__) && !defined(__arm__) && !defined(__aarch64__)
8a50cf69
LP
1554#warning "Consider adding the right mmap() syscall definitions here!"
1555#endif
1556 }
1557
1558 /* Can't filter mmap() on this arch, then skip it */
1559 if (filter_syscall == 0)
1560 continue;
1561
469830d1
LP
1562 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1563 if (r < 0)
1564 return r;
1565
6dc66688
ZJS
1566 r = add_seccomp_syscall_filter(seccomp, arch, filter_syscall,
1567 1,
1568 SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC|PROT_WRITE, PROT_EXEC|PROT_WRITE));
1569 if (r < 0)
1570 continue;
8a50cf69
LP
1571
1572 if (block_syscall != 0) {
6dc66688
ZJS
1573 r = add_seccomp_syscall_filter(seccomp, arch, block_syscall, 0, (const struct scmp_arg_cmp){} );
1574 if (r < 0)
8a50cf69 1575 continue;
add00535 1576 }
a3be2849 1577
6dc66688
ZJS
1578 r = add_seccomp_syscall_filter(seccomp, arch, SCMP_SYS(mprotect),
1579 1,
b835eeb4
ZJS
1580 SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC, PROT_EXEC));
1581 if (r < 0)
1582 continue;
1583
91691f1d 1584#ifdef __NR_pkey_mprotect
b835eeb4
ZJS
1585 r = add_seccomp_syscall_filter(seccomp, arch, SCMP_SYS(pkey_mprotect),
1586 1,
6dc66688
ZJS
1587 SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC, PROT_EXEC));
1588 if (r < 0)
469830d1 1589 continue;
91691f1d 1590#endif
add00535 1591
67fb5f33 1592 if (shmat_syscall > 0) {
6dc66688
ZJS
1593 r = add_seccomp_syscall_filter(seccomp, arch, SCMP_SYS(shmat),
1594 1,
1595 SCMP_A2(SCMP_CMP_MASKED_EQ, SHM_EXEC, SHM_EXEC));
1596 if (r < 0)
8a50cf69 1597 continue;
469830d1
LP
1598 }
1599
1600 r = seccomp_load(seccomp);
1601 if (IN_SET(r, -EPERM, -EACCES))
1602 return r;
add00535 1603 if (r < 0)
469830d1
LP
1604 log_debug_errno(r, "Failed to install MemoryDenyWriteExecute= rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1605 }
add00535 1606
469830d1
LP
1607 return 0;
1608}
1609
1610int seccomp_restrict_archs(Set *archs) {
1611 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1612 Iterator i;
1613 void *id;
1614 int r;
1615
1616 /* This installs a filter with no rules, but that restricts the system call architectures to the specified
2428aaf8
AJ
1617 * list.
1618 *
1619 * There are some qualifications. However the most important use is to stop processes from bypassing
1620 * system call restrictions, in case they used a broader (multiplexing) syscall which is only available
1621 * in a non-native architecture. There are no holes in this use case, at least so far. */
469830d1 1622
2428aaf8
AJ
1623 /* Note libseccomp includes our "native" (current) architecture in the filter by default.
1624 * We do not remove it. For example, our callers expect to be able to call execve() afterwards
1625 * to run a program with the restrictions applied. */
469830d1
LP
1626 seccomp = seccomp_init(SCMP_ACT_ALLOW);
1627 if (!seccomp)
1628 return -ENOMEM;
1629
1630 SET_FOREACH(id, archs, i) {
1631 r = seccomp_arch_add(seccomp, PTR_TO_UINT32(id) - 1);
2428aaf8
AJ
1632 if (r < 0 && r != -EEXIST)
1633 return r;
1634 }
1635
1636 /* The vdso for x32 assumes that x86-64 syscalls are available. Let's allow them, since x32
1637 * x32 syscalls should basically match x86-64 for everything except the pointer type.
1638 * The important thing is that you can block the old 32-bit x86 syscalls.
1639 * https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=850047 */
1640
1641 if (seccomp_arch_native() == SCMP_ARCH_X32 ||
1642 set_contains(archs, UINT32_TO_PTR(SCMP_ARCH_X32 + 1))) {
1643
1644 r = seccomp_arch_add(seccomp, SCMP_ARCH_X86_64);
1645 if (r < 0 && r != -EEXIST)
469830d1 1646 return r;
add00535
LP
1647 }
1648
469830d1
LP
1649 r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
1650 if (r < 0)
1651 return r;
add00535 1652
1c6af69b
LP
1653 r = seccomp_load(seccomp);
1654 if (IN_SET(r, -EPERM, -EACCES))
1655 return r;
1656 if (r < 0)
1657 log_debug_errno(r, "Failed to restrict system call architectures, skipping: %m");
1658
1659 return 0;
a3be2849 1660}
b16bd535
YW
1661
1662int parse_syscall_archs(char **l, Set **archs) {
1663 _cleanup_set_free_ Set *_archs;
1664 char **s;
1665 int r;
1666
1667 assert(l);
1668 assert(archs);
1669
1670 r = set_ensure_allocated(&_archs, NULL);
1671 if (r < 0)
1672 return r;
1673
1674 STRV_FOREACH(s, l) {
1675 uint32_t a;
1676
1677 r = seccomp_arch_from_string(*s, &a);
1678 if (r < 0)
1679 return -EINVAL;
1680
1681 r = set_put(_archs, UINT32_TO_PTR(a + 1));
1682 if (r < 0)
1683 return -ENOMEM;
1684 }
1685
1cc6c93a 1686 *archs = TAKE_PTR(_archs);
b16bd535
YW
1687
1688 return 0;
1689}
165a31c0 1690
8cfa775f 1691int seccomp_filter_set_add(Hashmap *filter, bool add, const SyscallFilterSet *set) {
165a31c0
LP
1692 const char *i;
1693 int r;
1694
1695 assert(set);
1696
1697 NULSTR_FOREACH(i, set->value) {
1698
1699 if (i[0] == '@') {
1700 const SyscallFilterSet *more;
1701
1702 more = syscall_filter_set_find(i);
1703 if (!more)
1704 return -ENXIO;
1705
165a31c0
LP
1706 r = seccomp_filter_set_add(filter, add, more);
1707 if (r < 0)
1708 return r;
1709 } else {
1710 int id;
1711
1712 id = seccomp_syscall_resolve_name(i);
ff217dc3
LP
1713 if (id == __NR_SCMP_ERROR) {
1714 log_debug("Couldn't resolve system call, ignoring: %s", i);
1715 continue;
1716 }
165a31c0
LP
1717
1718 if (add) {
8cfa775f 1719 r = hashmap_put(filter, INT_TO_PTR(id + 1), INT_TO_PTR(-1));
165a31c0
LP
1720 if (r < 0)
1721 return r;
1722 } else
8cfa775f 1723 (void) hashmap_remove(filter, INT_TO_PTR(id + 1));
165a31c0
LP
1724 }
1725 }
1726
1727 return 0;
1728}
78e864e5
TM
1729
1730int seccomp_lock_personality(unsigned long personality) {
72eafe71 1731 uint32_t arch;
78e864e5
TM
1732 int r;
1733
72eafe71
LP
1734 if (personality >= PERSONALITY_INVALID)
1735 return -EINVAL;
78e864e5 1736
72eafe71
LP
1737 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1738 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
78e864e5 1739
72eafe71
LP
1740 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1741 if (r < 0)
1742 return r;
1743
1744 r = seccomp_rule_add_exact(
1745 seccomp,
1746 SCMP_ACT_ERRNO(EPERM),
1747 SCMP_SYS(personality),
1748 1,
1749 SCMP_A0(SCMP_CMP_NE, personality));
448ac526
LP
1750 if (r < 0) {
1751 log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1752 continue;
1753 }
72eafe71
LP
1754
1755 r = seccomp_load(seccomp);
1756 if (IN_SET(r, -EPERM, -EACCES))
1757 return r;
1758 if (r < 0)
1759 log_debug_errno(r, "Failed to enable personality lock for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1760 }
1761
1762 return 0;
78e864e5 1763}
aecd5ac6
TM
1764
1765int seccomp_protect_hostname(void) {
1766 uint32_t arch;
1767 int r;
1768
1769 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1770 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1771
1772 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1773 if (r < 0)
1774 return r;
1775
1776 r = seccomp_rule_add_exact(
1777 seccomp,
1778 SCMP_ACT_ERRNO(EPERM),
1779 SCMP_SYS(sethostname),
1780 0);
9e6e543c
LP
1781 if (r < 0) {
1782 log_debug_errno(r, "Failed to add sethostname() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
aecd5ac6 1783 continue;
9e6e543c 1784 }
aecd5ac6
TM
1785
1786 r = seccomp_rule_add_exact(
1787 seccomp,
1788 SCMP_ACT_ERRNO(EPERM),
1789 SCMP_SYS(setdomainname),
1790 0);
9e6e543c
LP
1791 if (r < 0) {
1792 log_debug_errno(r, "Failed to add setdomainname() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
aecd5ac6 1793 continue;
9e6e543c 1794 }
aecd5ac6
TM
1795
1796 r = seccomp_load(seccomp);
1797 if (IN_SET(r, -EPERM, -EACCES))
1798 return r;
1799 if (r < 0)
1800 log_debug_errno(r, "Failed to apply hostname restrictions for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1801 }
1802
1803 return 0;
1804}
3c27973b 1805
da4dc9a6
ZJS
1806static int seccomp_restrict_sxid(scmp_filter_ctx seccomp, mode_t m) {
1807 /* Checks the mode_t parameter of the following system calls:
1808 *
1809 * → chmod() + fchmod() + fchmodat()
1810 * → open() + creat() + openat()
1811 * → mkdir() + mkdirat()
1812 * → mknod() + mknodat()
1813 *
1814 * Returns error if *everything* failed, and 0 otherwise.
1815 */
1816 int r = 0;
1817 bool any = false;
1818
1819 r = seccomp_rule_add_exact(
1820 seccomp,
1821 SCMP_ACT_ERRNO(EPERM),
1822 SCMP_SYS(chmod),
1823 1,
1824 SCMP_A1(SCMP_CMP_MASKED_EQ, m, m));
1825 if (r < 0)
1826 log_debug_errno(r, "Failed to add filter for chmod: %m");
1827 else
1828 any = true;
1829
1830 r = seccomp_rule_add_exact(
1831 seccomp,
1832 SCMP_ACT_ERRNO(EPERM),
1833 SCMP_SYS(fchmod),
1834 1,
1835 SCMP_A1(SCMP_CMP_MASKED_EQ, m, m));
1836 if (r < 0)
1837 log_debug_errno(r, "Failed to add filter for fchmod: %m");
1838 else
1839 any = true;
1840
1841 r = seccomp_rule_add_exact(
1842 seccomp,
1843 SCMP_ACT_ERRNO(EPERM),
1844 SCMP_SYS(fchmodat),
1845 1,
1846 SCMP_A2(SCMP_CMP_MASKED_EQ, m, m));
1847 if (r < 0)
1848 log_debug_errno(r, "Failed to add filter for fchmodat: %m");
1849 else
1850 any = true;
1851
1852 r = seccomp_rule_add_exact(
1853 seccomp,
1854 SCMP_ACT_ERRNO(EPERM),
1855 SCMP_SYS(mkdir),
1856 1,
1857 SCMP_A1(SCMP_CMP_MASKED_EQ, m, m));
1858 if (r < 0)
1859 log_debug_errno(r, "Failed to add filter for mkdir: %m");
1860 else
1861 any = true;
1862
1863 r = seccomp_rule_add_exact(
1864 seccomp,
1865 SCMP_ACT_ERRNO(EPERM),
1866 SCMP_SYS(mkdirat),
1867 1,
1868 SCMP_A2(SCMP_CMP_MASKED_EQ, m, m));
1869 if (r < 0)
1870 log_debug_errno(r, "Failed to add filter for mkdirat: %m");
1871 else
1872 any = true;
1873
1874 r = seccomp_rule_add_exact(
1875 seccomp,
1876 SCMP_ACT_ERRNO(EPERM),
1877 SCMP_SYS(mknod),
1878 1,
1879 SCMP_A1(SCMP_CMP_MASKED_EQ, m, m));
1880 if (r < 0)
1881 log_debug_errno(r, "Failed to add filter for mknod: %m");
1882 else
1883 any = true;
1884
1885 r = seccomp_rule_add_exact(
1886 seccomp,
1887 SCMP_ACT_ERRNO(EPERM),
1888 SCMP_SYS(mknodat),
1889 1,
1890 SCMP_A2(SCMP_CMP_MASKED_EQ, m, m));
1891 if (r < 0)
1892 log_debug_errno(r, "Failed to add filter for mknodat: %m");
1893 else
1894 any = true;
1895
1896#if SCMP_SYS(open) > 0
1897 r = seccomp_rule_add_exact(
1898 seccomp,
1899 SCMP_ACT_ERRNO(EPERM),
1900 SCMP_SYS(open),
1901 2,
1902 SCMP_A1(SCMP_CMP_MASKED_EQ, O_CREAT, O_CREAT),
1903 SCMP_A2(SCMP_CMP_MASKED_EQ, m, m));
1904 if (r < 0)
1905 log_debug_errno(r, "Failed to add filter for open: %m");
1906 else
1907 any = true;
1908#endif
1909
1910 r = seccomp_rule_add_exact(
1911 seccomp,
1912 SCMP_ACT_ERRNO(EPERM),
1913 SCMP_SYS(openat),
1914 2,
1915 SCMP_A2(SCMP_CMP_MASKED_EQ, O_CREAT, O_CREAT),
1916 SCMP_A3(SCMP_CMP_MASKED_EQ, m, m));
1917 if (r < 0)
1918 log_debug_errno(r, "Failed to add filter for openat: %m");
1919 else
1920 any = true;
1921
1922 r = seccomp_rule_add_exact(
1923 seccomp,
1924 SCMP_ACT_ERRNO(EPERM),
1925 SCMP_SYS(creat),
1926 1,
1927 SCMP_A1(SCMP_CMP_MASKED_EQ, m, m));
1928 if (r < 0)
1929 log_debug_errno(r, "Failed to add filter for creat: %m");
1930 else
1931 any = true;
1932
1933 return any ? 0 : r;
1934}
1935
3c27973b
LP
1936int seccomp_restrict_suid_sgid(void) {
1937 uint32_t arch;
da4dc9a6 1938 int r, k;
3c27973b
LP
1939
1940 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1941 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1942
1943 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1944 if (r < 0)
1945 return r;
1946
da4dc9a6
ZJS
1947 r = seccomp_restrict_sxid(seccomp, S_ISUID);
1948 if (r < 0)
1949 log_debug_errno(r, "Failed to add suid rule for architecture %s, ignoring: %m", seccomp_arch_to_string(arch));
3c27973b 1950
da4dc9a6
ZJS
1951 k = seccomp_restrict_sxid(seccomp, S_ISGID);
1952 if (k < 0)
1953 log_debug_errno(r, "Failed to add sgid rule for architecture %s, ignoring: %m", seccomp_arch_to_string(arch));
3c27973b 1954
da4dc9a6 1955 if (r < 0 && k < 0)
3c27973b 1956 continue;
3c27973b
LP
1957
1958 r = seccomp_load(seccomp);
1959 if (IN_SET(r, -EPERM, -EACCES))
1960 return r;
1961 if (r < 0)
1962 log_debug_errno(r, "Failed to apply suid/sgid restrictions for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1963 }
1964
1965 return 0;
1966}