]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/shared/seccomp-util.c
seccomp: remove '@credentials' syscall set (#6958)
[thirdparty/systemd.git] / src / shared / seccomp-util.c
CommitLineData
57183d11
LP
1/***
2 This file is part of systemd.
3
4 Copyright 2014 Lennart Poettering
5
6 systemd is free software; you can redistribute it and/or modify it
7 under the terms of the GNU Lesser General Public License as published by
8 the Free Software Foundation; either version 2.1 of the License, or
9 (at your option) any later version.
10
11 systemd is distributed in the hope that it will be useful, but
12 WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public License
17 along with systemd; If not, see <http://www.gnu.org/licenses/>.
18***/
19
a8fbdf54 20#include <errno.h>
469830d1 21#include <linux/seccomp.h>
57183d11 22#include <seccomp.h>
a8fbdf54 23#include <stddef.h>
469830d1 24#include <sys/mman.h>
d347d902 25#include <sys/prctl.h>
469830d1 26#include <sys/shm.h>
57183d11 27
469830d1 28#include "af-list.h"
add00535 29#include "alloc-util.h"
a8fbdf54 30#include "macro.h"
add00535 31#include "nsflags.h"
78e864e5 32#include "process-util.h"
cf0fbc49 33#include "seccomp-util.h"
b16bd535 34#include "set.h"
07630cea 35#include "string-util.h"
b16bd535 36#include "strv.h"
8130926d 37#include "util.h"
469830d1
LP
38#include "errno-list.h"
39
40const uint32_t seccomp_local_archs[] = {
41
f2d9751c
LP
42 /* Note: always list the native arch we are compiled as last, so that users can blacklist seccomp(), but our own calls to it still succeed */
43
44#if defined(__x86_64__) && defined(__ILP32__)
469830d1
LP
45 SCMP_ARCH_X86,
46 SCMP_ARCH_X86_64,
f2d9751c
LP
47 SCMP_ARCH_X32, /* native */
48#elif defined(__x86_64__) && !defined(__ILP32__)
49 SCMP_ARCH_X86,
469830d1 50 SCMP_ARCH_X32,
f2d9751c
LP
51 SCMP_ARCH_X86_64, /* native */
52#elif defined(__i386__)
53 SCMP_ARCH_X86,
54#elif defined(__aarch64__)
469830d1 55 SCMP_ARCH_ARM,
f2d9751c
LP
56 SCMP_ARCH_AARCH64, /* native */
57#elif defined(__arm__)
58 SCMP_ARCH_ARM,
59#elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI32
60 SCMP_ARCH_MIPSEL,
61 SCMP_ARCH_MIPS, /* native */
62#elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI32
469830d1 63 SCMP_ARCH_MIPS,
f2d9751c
LP
64 SCMP_ARCH_MIPSEL, /* native */
65#elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI64
66 SCMP_ARCH_MIPSEL,
67 SCMP_ARCH_MIPS,
68 SCMP_ARCH_MIPSEL64N32,
469830d1 69 SCMP_ARCH_MIPS64N32,
f2d9751c
LP
70 SCMP_ARCH_MIPSEL64,
71 SCMP_ARCH_MIPS64, /* native */
72#elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI64
73 SCMP_ARCH_MIPS,
469830d1 74 SCMP_ARCH_MIPSEL,
f2d9751c
LP
75 SCMP_ARCH_MIPS64N32,
76 SCMP_ARCH_MIPSEL64N32,
77 SCMP_ARCH_MIPS64,
78 SCMP_ARCH_MIPSEL64, /* native */
79#elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_NABI32
80 SCMP_ARCH_MIPSEL,
81 SCMP_ARCH_MIPS,
469830d1 82 SCMP_ARCH_MIPSEL64,
f2d9751c 83 SCMP_ARCH_MIPS64,
469830d1 84 SCMP_ARCH_MIPSEL64N32,
f2d9751c
LP
85 SCMP_ARCH_MIPS64N32, /* native */
86#elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_NABI32
87 SCMP_ARCH_MIPS,
88 SCMP_ARCH_MIPSEL,
89 SCMP_ARCH_MIPS64,
90 SCMP_ARCH_MIPSEL64,
91 SCMP_ARCH_MIPS64N32,
92 SCMP_ARCH_MIPSEL64N32, /* native */
93#elif defined(__powerpc64__) && __BYTE_ORDER == __BIG_ENDIAN
469830d1 94 SCMP_ARCH_PPC,
469830d1 95 SCMP_ARCH_PPC64LE,
f2d9751c
LP
96 SCMP_ARCH_PPC64, /* native */
97#elif defined(__powerpc64__) && __BYTE_ORDER == __LITTLE_ENDIAN
98 SCMP_ARCH_PPC,
99 SCMP_ARCH_PPC64,
100 SCMP_ARCH_PPC64LE, /* native */
101#elif defined(__powerpc__)
102 SCMP_ARCH_PPC,
103#elif defined(__s390x__)
104 SCMP_ARCH_S390,
105 SCMP_ARCH_S390X, /* native */
106#elif defined(__s390__)
469830d1 107 SCMP_ARCH_S390,
469830d1
LP
108#endif
109 (uint32_t) -1
110 };
57183d11
LP
111
112const char* seccomp_arch_to_string(uint32_t c) {
aa34055f
ZJS
113 /* Maintain order used in <seccomp.h>.
114 *
115 * Names used here should be the same as those used for ConditionArchitecture=,
116 * except for "subarchitectures" like x32. */
57183d11 117
aa34055f
ZJS
118 switch(c) {
119 case SCMP_ARCH_NATIVE:
57183d11 120 return "native";
aa34055f 121 case SCMP_ARCH_X86:
57183d11 122 return "x86";
aa34055f 123 case SCMP_ARCH_X86_64:
57183d11 124 return "x86-64";
aa34055f 125 case SCMP_ARCH_X32:
57183d11 126 return "x32";
aa34055f 127 case SCMP_ARCH_ARM:
57183d11 128 return "arm";
aa34055f
ZJS
129 case SCMP_ARCH_AARCH64:
130 return "arm64";
131 case SCMP_ARCH_MIPS:
132 return "mips";
133 case SCMP_ARCH_MIPS64:
134 return "mips64";
135 case SCMP_ARCH_MIPS64N32:
136 return "mips64-n32";
137 case SCMP_ARCH_MIPSEL:
138 return "mips-le";
139 case SCMP_ARCH_MIPSEL64:
140 return "mips64-le";
141 case SCMP_ARCH_MIPSEL64N32:
142 return "mips64-le-n32";
143 case SCMP_ARCH_PPC:
144 return "ppc";
145 case SCMP_ARCH_PPC64:
146 return "ppc64";
147 case SCMP_ARCH_PPC64LE:
148 return "ppc64-le";
149 case SCMP_ARCH_S390:
6abfd303 150 return "s390";
aa34055f 151 case SCMP_ARCH_S390X:
6abfd303 152 return "s390x";
aa34055f
ZJS
153 default:
154 return NULL;
155 }
57183d11
LP
156}
157
158int seccomp_arch_from_string(const char *n, uint32_t *ret) {
159 if (!n)
160 return -EINVAL;
161
162 assert(ret);
163
164 if (streq(n, "native"))
165 *ret = SCMP_ARCH_NATIVE;
166 else if (streq(n, "x86"))
167 *ret = SCMP_ARCH_X86;
168 else if (streq(n, "x86-64"))
169 *ret = SCMP_ARCH_X86_64;
170 else if (streq(n, "x32"))
171 *ret = SCMP_ARCH_X32;
172 else if (streq(n, "arm"))
173 *ret = SCMP_ARCH_ARM;
aa34055f
ZJS
174 else if (streq(n, "arm64"))
175 *ret = SCMP_ARCH_AARCH64;
176 else if (streq(n, "mips"))
177 *ret = SCMP_ARCH_MIPS;
178 else if (streq(n, "mips64"))
179 *ret = SCMP_ARCH_MIPS64;
180 else if (streq(n, "mips64-n32"))
181 *ret = SCMP_ARCH_MIPS64N32;
182 else if (streq(n, "mips-le"))
183 *ret = SCMP_ARCH_MIPSEL;
184 else if (streq(n, "mips64-le"))
185 *ret = SCMP_ARCH_MIPSEL64;
186 else if (streq(n, "mips64-le-n32"))
187 *ret = SCMP_ARCH_MIPSEL64N32;
188 else if (streq(n, "ppc"))
189 *ret = SCMP_ARCH_PPC;
190 else if (streq(n, "ppc64"))
191 *ret = SCMP_ARCH_PPC64;
192 else if (streq(n, "ppc64-le"))
193 *ret = SCMP_ARCH_PPC64LE;
6abfd303
HB
194 else if (streq(n, "s390"))
195 *ret = SCMP_ARCH_S390;
196 else if (streq(n, "s390x"))
197 *ret = SCMP_ARCH_S390X;
57183d11
LP
198 else
199 return -EINVAL;
200
201 return 0;
202}
e9642be2 203
469830d1 204int seccomp_init_for_arch(scmp_filter_ctx *ret, uint32_t arch, uint32_t default_action) {
8d7b0c8f
LP
205 scmp_filter_ctx seccomp;
206 int r;
207
469830d1
LP
208 /* Much like seccomp_init(), but initializes the filter for one specific architecture only, without affecting
209 * any others. Also, turns off the NNP fiddling. */
8d7b0c8f
LP
210
211 seccomp = seccomp_init(default_action);
212 if (!seccomp)
213 return -ENOMEM;
214
469830d1
LP
215 if (arch != SCMP_ARCH_NATIVE &&
216 arch != seccomp_arch_native()) {
217
1b52793d 218 r = seccomp_arch_remove(seccomp, seccomp_arch_native());
469830d1
LP
219 if (r < 0)
220 goto finish;
221
1b52793d 222 r = seccomp_arch_add(seccomp, arch);
469830d1
LP
223 if (r < 0)
224 goto finish;
225
226 assert(seccomp_arch_exist(seccomp, arch) >= 0);
227 assert(seccomp_arch_exist(seccomp, SCMP_ARCH_NATIVE) == -EEXIST);
228 assert(seccomp_arch_exist(seccomp, seccomp_arch_native()) == -EEXIST);
229 } else {
230 assert(seccomp_arch_exist(seccomp, SCMP_ARCH_NATIVE) >= 0);
231 assert(seccomp_arch_exist(seccomp, seccomp_arch_native()) >= 0);
232 }
233
234 r = seccomp_attr_set(seccomp, SCMP_FLTATR_ACT_BADARCH, SCMP_ACT_ALLOW);
8d7b0c8f
LP
235 if (r < 0)
236 goto finish;
237
238 r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
239 if (r < 0)
240 goto finish;
241
242 *ret = seccomp;
243 return 0;
244
245finish:
246 seccomp_release(seccomp);
247 return r;
248}
249
/* Probes for seccomp support by querying the current seccomp mode: any non-negative answer from
 * prctl(PR_GET_SECCOMP) counts as "available". */
static bool is_basic_seccomp_available(void) {
        int mode;

        mode = prctl(PR_GET_SECCOMP, 0, 0, 0, 0);
        return mode >= 0;
}
253
/* Probes for seccomp filter mode by requesting SECCOMP_MODE_FILTER with a NULL filter program. The probe is
 * considered successful only if the call fails and errno is EFAULT; a successful call or any other error
 * yields false. */
static bool is_seccomp_filter_available(void) {
        if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, NULL, 0, 0) >= 0)
                return false;

        return errno == EFAULT;
}
258
/* Returns whether both the basic seccomp probe and the filter-mode probe succeed. The result is computed on the
 * first call and remembered in a function-local static afterwards (negative means "not probed yet"). */
bool is_seccomp_available(void) {
        static int cached_enabled = -1;

        if (cached_enabled >= 0)
                return cached_enabled;

        cached_enabled = is_basic_seccomp_available() && is_seccomp_filter_available();
        return cached_enabled;
}
269
8130926d 270const SyscallFilterSet syscall_filter_sets[_SYSCALL_FILTER_SET_MAX] = {
40eb6a80 271 [SYSCALL_FILTER_SET_DEFAULT] = {
40eb6a80 272 .name = "@default",
d5efc18b 273 .help = "System calls that are always permitted",
40eb6a80
ZJS
274 .value =
275 "clock_getres\0"
276 "clock_gettime\0"
277 "clock_nanosleep\0"
278 "execve\0"
279 "exit\0"
280 "exit_group\0"
e41b0f42
LP
281 "futex\0"
282 "get_robust_list\0"
283 "get_thread_area\0"
09d3020b
DH
284 "getegid\0"
285 "getegid32\0"
286 "geteuid\0"
287 "geteuid32\0"
288 "getgid\0"
289 "getgid32\0"
290 "getgroups\0"
291 "getgroups32\0"
292 "getpgid\0"
293 "getpgrp\0"
294 "getpid\0"
295 "getppid\0"
296 "getresgid\0"
297 "getresgid32\0"
298 "getresuid\0"
299 "getresuid32\0"
40eb6a80 300 "getrlimit\0" /* make sure processes can query stack size and such */
09d3020b
DH
301 "getsid\0"
302 "gettid\0"
40eb6a80 303 "gettimeofday\0"
09d3020b
DH
304 "getuid\0"
305 "getuid32\0"
e41b0f42 306 "membarrier\0"
40eb6a80
ZJS
307 "nanosleep\0"
308 "pause\0"
e41b0f42 309 "restart_syscall\0"
40eb6a80 310 "rt_sigreturn\0"
e41b0f42
LP
311 "set_robust_list\0"
312 "set_thread_area\0"
313 "set_tid_address\0"
40eb6a80
ZJS
314 "sigreturn\0"
315 "time\0"
316 },
133ddbbe 317 [SYSCALL_FILTER_SET_BASIC_IO] = {
133ddbbe 318 .name = "@basic-io",
d5efc18b 319 .help = "Basic IO",
133ddbbe 320 .value =
648a0ed0 321 "_llseek\0"
133ddbbe 322 "close\0"
648a0ed0 323 "dup\0"
133ddbbe
LP
324 "dup2\0"
325 "dup3\0"
133ddbbe
LP
326 "lseek\0"
327 "pread64\0"
328 "preadv\0"
329 "pwrite64\0"
330 "pwritev\0"
331 "read\0"
332 "readv\0"
333 "write\0"
334 "writev\0"
335 },
8130926d 336 [SYSCALL_FILTER_SET_CLOCK] = {
8130926d 337 .name = "@clock",
d5efc18b 338 .help = "Change the system time",
201c1cc2
TM
339 .value =
340 "adjtimex\0"
1f9ac68b
LP
341 "clock_adjtime\0"
342 "clock_settime\0"
201c1cc2 343 "settimeofday\0"
1f9ac68b 344 "stime\0"
8130926d
LP
345 },
346 [SYSCALL_FILTER_SET_CPU_EMULATION] = {
8130926d 347 .name = "@cpu-emulation",
d5efc18b 348 .help = "System calls for CPU emulation functionality",
1f9ac68b
LP
349 .value =
350 "modify_ldt\0"
351 "subpage_prot\0"
352 "switch_endian\0"
353 "vm86\0"
354 "vm86old\0"
8130926d
LP
355 },
356 [SYSCALL_FILTER_SET_DEBUG] = {
8130926d 357 .name = "@debug",
d5efc18b 358 .help = "Debugging, performance monitoring and tracing functionality",
1f9ac68b
LP
359 .value =
360 "lookup_dcookie\0"
361 "perf_event_open\0"
362 "process_vm_readv\0"
363 "process_vm_writev\0"
364 "ptrace\0"
365 "rtas\0"
8130926d 366#ifdef __NR_s390_runtime_instr
1f9ac68b 367 "s390_runtime_instr\0"
8130926d 368#endif
1f9ac68b 369 "sys_debug_setcontext\0"
8130926d 370 },
1a1b13c9
LP
371 [SYSCALL_FILTER_SET_FILE_SYSTEM] = {
372 .name = "@file-system",
373 .help = "File system operations",
374 .value =
375 "access\0"
376 "chdir\0"
377 "chmod\0"
378 "close\0"
379 "creat\0"
380 "faccessat\0"
381 "fallocate\0"
382 "fchdir\0"
383 "fchmod\0"
384 "fchmodat\0"
1a1b13c9 385 "fcntl\0"
ceaa6aa7 386 "fcntl64\0"
1a1b13c9
LP
387 "fgetxattr\0"
388 "flistxattr\0"
ceaa6aa7 389 "fremovexattr\0"
1a1b13c9 390 "fsetxattr\0"
1a1b13c9 391 "fstat\0"
ceaa6aa7 392 "fstat64\0"
1a1b13c9 393 "fstatat64\0"
1a1b13c9 394 "fstatfs\0"
ceaa6aa7 395 "fstatfs64\0"
1a1b13c9 396 "ftruncate\0"
ceaa6aa7 397 "ftruncate64\0"
1a1b13c9
LP
398 "futimesat\0"
399 "getcwd\0"
1a1b13c9 400 "getdents\0"
ceaa6aa7 401 "getdents64\0"
1a1b13c9
LP
402 "getxattr\0"
403 "inotify_add_watch\0"
ceaa6aa7 404 "inotify_init\0"
1a1b13c9
LP
405 "inotify_init1\0"
406 "inotify_rm_watch\0"
407 "lgetxattr\0"
408 "link\0"
409 "linkat\0"
410 "listxattr\0"
411 "llistxattr\0"
412 "lremovexattr\0"
413 "lsetxattr\0"
1a1b13c9 414 "lstat\0"
ceaa6aa7 415 "lstat64\0"
1a1b13c9
LP
416 "mkdir\0"
417 "mkdirat\0"
418 "mknod\0"
419 "mknodat\0"
1a1b13c9 420 "mmap\0"
ceaa6aa7 421 "mmap2\0"
7961116e 422 "munmap\0"
1a1b13c9 423 "newfstatat\0"
ceaa6aa7
LP
424 "oldfstat\0"
425 "oldlstat\0"
426 "oldstat\0"
1a1b13c9
LP
427 "open\0"
428 "openat\0"
429 "readlink\0"
430 "readlinkat\0"
431 "removexattr\0"
432 "rename\0"
1a1b13c9 433 "renameat\0"
ceaa6aa7 434 "renameat2\0"
1a1b13c9
LP
435 "rmdir\0"
436 "setxattr\0"
1a1b13c9 437 "stat\0"
ceaa6aa7 438 "stat64\0"
1a1b13c9 439 "statfs\0"
ceaa6aa7
LP
440 "statfs64\0"
441#ifdef __PNR_statx
a4135a74 442 "statx\0"
ceaa6aa7 443#endif
1a1b13c9
LP
444 "symlink\0"
445 "symlinkat\0"
1a1b13c9 446 "truncate\0"
ceaa6aa7 447 "truncate64\0"
1a1b13c9
LP
448 "unlink\0"
449 "unlinkat\0"
ceaa6aa7 450 "utime\0"
1a1b13c9
LP
451 "utimensat\0"
452 "utimes\0"
453 },
8130926d 454 [SYSCALL_FILTER_SET_IO_EVENT] = {
8130926d 455 .name = "@io-event",
d5efc18b 456 .help = "Event loop system calls",
201c1cc2
TM
457 .value =
458 "_newselect\0"
201c1cc2 459 "epoll_create\0"
215728ff 460 "epoll_create1\0"
201c1cc2
TM
461 "epoll_ctl\0"
462 "epoll_ctl_old\0"
463 "epoll_pwait\0"
464 "epoll_wait\0"
465 "epoll_wait_old\0"
201c1cc2 466 "eventfd\0"
215728ff 467 "eventfd2\0"
201c1cc2
TM
468 "poll\0"
469 "ppoll\0"
470 "pselect6\0"
471 "select\0"
8130926d
LP
472 },
473 [SYSCALL_FILTER_SET_IPC] = {
8130926d 474 .name = "@ipc",
d5efc18b
ZJS
475 .help = "SysV IPC, POSIX Message Queues or other IPC",
476 .value =
477 "ipc\0"
cd5bfd7e 478 "memfd_create\0"
201c1cc2
TM
479 "mq_getsetattr\0"
480 "mq_notify\0"
481 "mq_open\0"
482 "mq_timedreceive\0"
483 "mq_timedsend\0"
484 "mq_unlink\0"
485 "msgctl\0"
486 "msgget\0"
487 "msgrcv\0"
488 "msgsnd\0"
cd5bfd7e 489 "pipe\0"
215728ff 490 "pipe2\0"
201c1cc2
TM
491 "process_vm_readv\0"
492 "process_vm_writev\0"
493 "semctl\0"
494 "semget\0"
495 "semop\0"
496 "semtimedop\0"
497 "shmat\0"
498 "shmctl\0"
499 "shmdt\0"
500 "shmget\0"
8130926d
LP
501 },
502 [SYSCALL_FILTER_SET_KEYRING] = {
8130926d 503 .name = "@keyring",
d5efc18b 504 .help = "Kernel keyring access",
1f9ac68b
LP
505 .value =
506 "add_key\0"
507 "keyctl\0"
508 "request_key\0"
8130926d 509 },
cd0ddf6f
LP
510 [SYSCALL_FILTER_SET_MEMLOCK] = {
511 .name = "@memlock",
512 .help = "Memory locking control",
513 .value =
514 "mlock\0"
515 "mlock2\0"
516 "mlockall\0"
517 "munlock\0"
518 "munlockall\0"
519 },
8130926d 520 [SYSCALL_FILTER_SET_MODULE] = {
8130926d 521 .name = "@module",
d5efc18b 522 .help = "Loading and unloading of kernel modules",
201c1cc2 523 .value =
201c1cc2
TM
524 "delete_module\0"
525 "finit_module\0"
526 "init_module\0"
8130926d
LP
527 },
528 [SYSCALL_FILTER_SET_MOUNT] = {
8130926d 529 .name = "@mount",
d5efc18b 530 .help = "Mounting and unmounting of file systems",
201c1cc2
TM
531 .value =
532 "chroot\0"
533 "mount\0"
201c1cc2 534 "pivot_root\0"
201c1cc2 535 "umount\0"
215728ff 536 "umount2\0"
8130926d
LP
537 },
538 [SYSCALL_FILTER_SET_NETWORK_IO] = {
8130926d 539 .name = "@network-io",
d5efc18b 540 .help = "Network or Unix socket IO, should not be needed if not network facing",
201c1cc2 541 .value =
201c1cc2 542 "accept\0"
215728ff 543 "accept4\0"
201c1cc2
TM
544 "bind\0"
545 "connect\0"
546 "getpeername\0"
547 "getsockname\0"
548 "getsockopt\0"
549 "listen\0"
550 "recv\0"
551 "recvfrom\0"
552 "recvmmsg\0"
553 "recvmsg\0"
554 "send\0"
555 "sendmmsg\0"
556 "sendmsg\0"
557 "sendto\0"
558 "setsockopt\0"
559 "shutdown\0"
560 "socket\0"
561 "socketcall\0"
562 "socketpair\0"
8130926d
LP
563 },
564 [SYSCALL_FILTER_SET_OBSOLETE] = {
d5efc18b 565 /* some unknown even to libseccomp */
8130926d 566 .name = "@obsolete",
d5efc18b 567 .help = "Unusual, obsolete or unimplemented system calls",
201c1cc2
TM
568 .value =
569 "_sysctl\0"
570 "afs_syscall\0"
802fa07a 571 "bdflush\0"
201c1cc2 572 "break\0"
1f9ac68b 573 "create_module\0"
201c1cc2
TM
574 "ftime\0"
575 "get_kernel_syms\0"
201c1cc2
TM
576 "getpmsg\0"
577 "gtty\0"
7e0c3b8f 578 "idle\0"
201c1cc2 579 "lock\0"
201c1cc2 580 "mpx\0"
201c1cc2
TM
581 "prof\0"
582 "profil\0"
201c1cc2
TM
583 "putpmsg\0"
584 "query_module\0"
201c1cc2
TM
585 "security\0"
586 "sgetmask\0"
587 "ssetmask\0"
588 "stty\0"
1f9ac68b 589 "sysfs\0"
201c1cc2
TM
590 "tuxcall\0"
591 "ulimit\0"
592 "uselib\0"
1f9ac68b 593 "ustat\0"
201c1cc2 594 "vserver\0"
8130926d
LP
595 },
596 [SYSCALL_FILTER_SET_PRIVILEGED] = {
8130926d 597 .name = "@privileged",
d5efc18b 598 .help = "All system calls which need super-user capabilities",
201c1cc2
TM
599 .value =
600 "@clock\0"
601 "@module\0"
602 "@raw-io\0"
215728ff 603 "_sysctl\0"
201c1cc2 604 "acct\0"
201c1cc2 605 "bpf\0"
1f9ac68b 606 "capset\0"
201c1cc2 607 "chown\0"
215728ff 608 "chown32\0"
201c1cc2 609 "chroot\0"
201c1cc2 610 "fchown\0"
215728ff 611 "fchown32\0"
201c1cc2
TM
612 "fchownat\0"
613 "kexec_file_load\0"
614 "kexec_load\0"
201c1cc2 615 "lchown\0"
215728ff 616 "lchown32\0"
201c1cc2
TM
617 "nfsservctl\0"
618 "pivot_root\0"
619 "quotactl\0"
620 "reboot\0"
621 "setdomainname\0"
201c1cc2 622 "setfsuid\0"
215728ff 623 "setfsuid32\0"
201c1cc2 624 "setgroups\0"
215728ff 625 "setgroups32\0"
201c1cc2 626 "sethostname\0"
201c1cc2 627 "setresuid\0"
215728ff 628 "setresuid32\0"
201c1cc2 629 "setreuid\0"
215728ff 630 "setreuid32\0"
201c1cc2 631 "setuid\0"
215728ff 632 "setuid32\0"
201c1cc2
TM
633 "swapoff\0"
634 "swapon\0"
201c1cc2 635 "vhangup\0"
8130926d
LP
636 },
637 [SYSCALL_FILTER_SET_PROCESS] = {
8130926d 638 .name = "@process",
d5efc18b 639 .help = "Process control, execution, namespaceing operations",
201c1cc2
TM
640 .value =
641 "arch_prctl\0"
09d3020b 642 "capget\0" /* Able to query arbitrary processes */
201c1cc2 643 "clone\0"
201c1cc2
TM
644 "execveat\0"
645 "fork\0"
b887d2eb
LP
646 "getrusage\0"
647 "gettid\0"
201c1cc2
TM
648 "kill\0"
649 "prctl\0"
b887d2eb
LP
650 "rt_sigqueueinfo\0"
651 "rt_tgsigqueueinfo\0"
201c1cc2
TM
652 "setns\0"
653 "tgkill\0"
b887d2eb 654 "times\0"
201c1cc2
TM
655 "tkill\0"
656 "unshare\0"
657 "vfork\0"
b887d2eb
LP
658 "wait4\0"
659 "waitid\0"
660 "waitpid\0"
8130926d
LP
661 },
662 [SYSCALL_FILTER_SET_RAW_IO] = {
8130926d 663 .name = "@raw-io",
d5efc18b 664 .help = "Raw I/O port access",
201c1cc2
TM
665 .value =
666 "ioperm\0"
667 "iopl\0"
1f9ac68b 668 "pciconfig_iobase\0"
201c1cc2
TM
669 "pciconfig_read\0"
670 "pciconfig_write\0"
8130926d 671#ifdef __NR_s390_pci_mmio_read
201c1cc2 672 "s390_pci_mmio_read\0"
8130926d
LP
673#endif
674#ifdef __NR_s390_pci_mmio_write
201c1cc2 675 "s390_pci_mmio_write\0"
8130926d
LP
676#endif
677 },
bd2ab3f4
LP
678 [SYSCALL_FILTER_SET_REBOOT] = {
679 .name = "@reboot",
680 .help = "Reboot and reboot preparation/kexec",
681 .value =
682 "kexec\0"
683 "kexec_file_load\0"
684 "reboot\0"
685 },
133ddbbe 686 [SYSCALL_FILTER_SET_RESOURCES] = {
133ddbbe 687 .name = "@resources",
58a8f68b 688 .help = "Alter resource settings",
133ddbbe 689 .value =
0963c053
LP
690 "ioprio_set\0"
691 "mbind\0"
692 "migrate_pages\0"
693 "move_pages\0"
694 "nice\0"
695 "prlimit64\0"
696 "sched_setaffinity\0"
697 "sched_setattr\0"
133ddbbe
LP
698 "sched_setparam\0"
699 "sched_setscheduler\0"
0963c053 700 "set_mempolicy\0"
133ddbbe
LP
701 "setpriority\0"
702 "setrlimit\0"
133ddbbe 703 },
6eaaeee9
LP
704 [SYSCALL_FILTER_SET_SETUID] = {
705 .name = "@setuid",
706 .help = "Operations for changing user/group credentials",
707 .value =
6eaaeee9 708 "setgid\0"
215728ff 709 "setgid32\0"
6eaaeee9 710 "setgroups\0"
215728ff 711 "setgroups32\0"
6eaaeee9 712 "setregid\0"
215728ff 713 "setregid32\0"
6eaaeee9 714 "setresgid\0"
215728ff 715 "setresgid32\0"
6eaaeee9 716 "setresuid\0"
215728ff 717 "setresuid32\0"
6eaaeee9 718 "setreuid\0"
215728ff 719 "setreuid32\0"
6eaaeee9 720 "setuid\0"
215728ff 721 "setuid32\0"
6eaaeee9 722 },
cd0ddf6f
LP
723 [SYSCALL_FILTER_SET_SIGNAL] = {
724 .name = "@signal",
725 .help = "Process signal handling",
726 .value =
727 "rt_sigaction\0"
728 "rt_sigpending\0"
729 "rt_sigprocmask\0"
730 "rt_sigsuspend\0"
731 "rt_sigtimedwait\0"
732 "sigaction\0"
733 "sigaltstack\0"
734 "signal\0"
735 "signalfd\0"
736 "signalfd4\0"
737 "sigpending\0"
738 "sigprocmask\0"
739 "sigsuspend\0"
740 },
bd2ab3f4
LP
741 [SYSCALL_FILTER_SET_SWAP] = {
742 .name = "@swap",
743 .help = "Enable/disable swap devices",
744 .value =
745 "swapoff\0"
746 "swapon\0"
747 },
cd0ddf6f
LP
748 [SYSCALL_FILTER_SET_TIMER] = {
749 .name = "@timer",
750 .help = "Schedule operations by time",
751 .value =
752 "alarm\0"
753 "getitimer\0"
754 "setitimer\0"
755 "timer_create\0"
756 "timer_delete\0"
757 "timer_getoverrun\0"
758 "timer_gettime\0"
759 "timer_settime\0"
760 "timerfd_create\0"
761 "timerfd_gettime\0"
762 "timerfd_settime\0"
763 "times\0"
764 },
201c1cc2 765};
8130926d
LP
766
767const SyscallFilterSet *syscall_filter_set_find(const char *name) {
768 unsigned i;
769
770 if (isempty(name) || name[0] != '@')
771 return NULL;
772
773 for (i = 0; i < _SYSCALL_FILTER_SET_MAX; i++)
774 if (streq(syscall_filter_sets[i].name, name))
775 return syscall_filter_sets + i;
776
777 return NULL;
778}
779
960e4569 780static int seccomp_add_syscall_filter_set(scmp_filter_ctx seccomp, const SyscallFilterSet *set, uint32_t action, char **exclude);
69b1b241 781
960e4569 782int seccomp_add_syscall_filter_item(scmp_filter_ctx *seccomp, const char *name, uint32_t action, char **exclude) {
69b1b241
LP
783 int r;
784
785 assert(seccomp);
786 assert(name);
787
960e4569
LP
788 if (strv_contains(exclude, name))
789 return 0;
790
69b1b241
LP
791 if (name[0] == '@') {
792 const SyscallFilterSet *other;
793
794 other = syscall_filter_set_find(name);
cff7bff8
LP
795 if (!other) {
796 log_debug("Filter set %s is not known!", name);
69b1b241 797 return -EINVAL;
cff7bff8 798 }
69b1b241 799
960e4569 800 r = seccomp_add_syscall_filter_set(seccomp, other, action, exclude);
69b1b241
LP
801 if (r < 0)
802 return r;
803 } else {
804 int id;
805
806 id = seccomp_syscall_resolve_name(name);
cff7bff8
LP
807 if (id == __NR_SCMP_ERROR) {
808 log_debug("System call %s is not known!", name);
69b1b241 809 return -EINVAL; /* Not known at all? Then that's a real error */
cff7bff8 810 }
69b1b241
LP
811
812 r = seccomp_rule_add_exact(seccomp, action, id, 0);
813 if (r < 0)
814 /* If the system call is not known on this architecture, then that's fine, let's ignore it */
815 log_debug_errno(r, "Failed to add rule for system call %s() / %d, ignoring: %m", name, id);
816 }
817
818 return 0;
819}
820
469830d1
LP
821static int seccomp_add_syscall_filter_set(
822 scmp_filter_ctx seccomp,
469830d1 823 const SyscallFilterSet *set,
960e4569
LP
824 uint32_t action,
825 char **exclude) {
469830d1 826
8130926d
LP
827 const char *sys;
828 int r;
829
830 assert(seccomp);
831 assert(set);
832
833 NULSTR_FOREACH(sys, set->value) {
960e4569 834 r = seccomp_add_syscall_filter_item(seccomp, sys, action, exclude);
69b1b241
LP
835 if (r < 0)
836 return r;
469830d1
LP
837 }
838
839 return 0;
840}
841
842int seccomp_load_syscall_filter_set(uint32_t default_action, const SyscallFilterSet *set, uint32_t action) {
843 uint32_t arch;
844 int r;
845
846 assert(set);
847
848 /* The one-stop solution: allocate a seccomp object, add the specified filter to it, and apply it. Once for
849 * earch local arch. */
850
851 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
852 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
853
854 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
855
856 r = seccomp_init_for_arch(&seccomp, arch, default_action);
8130926d
LP
857 if (r < 0)
858 return r;
469830d1 859
960e4569 860 r = seccomp_add_syscall_filter_set(seccomp, set, action, NULL);
469830d1
LP
861 if (r < 0) {
862 log_debug_errno(r, "Failed to add filter set, ignoring: %m");
863 continue;
864 }
865
866 r = seccomp_load(seccomp);
867 if (IN_SET(r, -EPERM, -EACCES))
868 return r;
869 if (r < 0)
870 log_debug_errno(r, "Failed to install filter set for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
8130926d
LP
871 }
872
873 return 0;
874}
a3be2849 875
469830d1
LP
876int seccomp_load_syscall_filter_set_raw(uint32_t default_action, Set* set, uint32_t action) {
877 uint32_t arch;
a3be2849
LP
878 int r;
879
469830d1
LP
880 /* Similar to seccomp_load_syscall_filter_set(), but takes a raw Set* of syscalls, instead of a
881 * SyscallFilterSet* table. */
a3be2849 882
469830d1
LP
883 if (set_isempty(set) && default_action == SCMP_ACT_ALLOW)
884 return 0;
a3be2849 885
469830d1
LP
886 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
887 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
888 Iterator i;
889 void *id;
a3be2849 890
469830d1 891 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
a3be2849 892
469830d1
LP
893 r = seccomp_init_for_arch(&seccomp, arch, default_action);
894 if (r < 0)
895 return r;
a3be2849 896
469830d1
LP
897 SET_FOREACH(id, set, i) {
898 r = seccomp_rule_add_exact(seccomp, action, PTR_TO_INT(id) - 1, 0);
899 if (r < 0) {
900 /* If the system call is not known on this architecture, then that's fine, let's ignore it */
901 _cleanup_free_ char *n = NULL;
902
903 n = seccomp_syscall_resolve_num_arch(arch, PTR_TO_INT(id) - 1);
e7854c46 904 log_debug_errno(r, "Failed to add rule for system call %s() / %d, ignoring: %m", strna(n), PTR_TO_INT(id) - 1);
469830d1
LP
905 }
906 }
907
908 r = seccomp_load(seccomp);
909 if (IN_SET(r, -EPERM, -EACCES))
910 return r;
911 if (r < 0)
912 log_debug_errno(r, "Failed to install filter set for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
913 }
914
915 return 0;
add00535
LP
916}
917
918int seccomp_restrict_namespaces(unsigned long retain) {
469830d1 919 uint32_t arch;
add00535
LP
920 int r;
921
922 if (log_get_max_level() >= LOG_DEBUG) {
923 _cleanup_free_ char *s = NULL;
924
925 (void) namespace_flag_to_string_many(retain, &s);
926 log_debug("Restricting namespace to: %s.", strna(s));
927 }
928
929 /* NOOP? */
930 if ((retain & NAMESPACE_FLAGS_ALL) == NAMESPACE_FLAGS_ALL)
931 return 0;
932
469830d1
LP
933 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
934 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
935 unsigned i;
add00535 936
469830d1
LP
937 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
938
939 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
940 if (r < 0)
941 return r;
942
943 if ((retain & NAMESPACE_FLAGS_ALL) == 0)
944 /* If every single kind of namespace shall be prohibited, then let's block the whole setns() syscall
945 * altogether. */
946 r = seccomp_rule_add_exact(
947 seccomp,
948 SCMP_ACT_ERRNO(EPERM),
949 SCMP_SYS(setns),
950 0);
951 else
952 /* Otherwise, block only the invocations with the appropriate flags in the loop below, but also the
953 * special invocation with a zero flags argument, right here. */
954 r = seccomp_rule_add_exact(
955 seccomp,
956 SCMP_ACT_ERRNO(EPERM),
957 SCMP_SYS(setns),
958 1,
959 SCMP_A1(SCMP_CMP_EQ, 0));
960 if (r < 0) {
961 log_debug_errno(r, "Failed to add setns() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
962 continue;
963 }
964
965 for (i = 0; namespace_flag_map[i].name; i++) {
966 unsigned long f;
967
968 f = namespace_flag_map[i].flag;
969 if ((retain & f) == f) {
970 log_debug("Permitting %s.", namespace_flag_map[i].name);
971 continue;
972 }
973
974 log_debug("Blocking %s.", namespace_flag_map[i].name);
975
976 r = seccomp_rule_add_exact(
977 seccomp,
978 SCMP_ACT_ERRNO(EPERM),
979 SCMP_SYS(unshare),
980 1,
981 SCMP_A0(SCMP_CMP_MASKED_EQ, f, f));
982 if (r < 0) {
983 log_debug_errno(r, "Failed to add unshare() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
984 break;
985 }
986
511ceb1f
ZJS
987 /* On s390/s390x the first two parameters to clone are switched */
988 if (!IN_SET(arch, SCMP_ARCH_S390, SCMP_ARCH_S390X))
ae9d60ce
LP
989 r = seccomp_rule_add_exact(
990 seccomp,
991 SCMP_ACT_ERRNO(EPERM),
992 SCMP_SYS(clone),
993 1,
994 SCMP_A0(SCMP_CMP_MASKED_EQ, f, f));
995 else
996 r = seccomp_rule_add_exact(
997 seccomp,
998 SCMP_ACT_ERRNO(EPERM),
999 SCMP_SYS(clone),
1000 1,
1001 SCMP_A1(SCMP_CMP_MASKED_EQ, f, f));
469830d1
LP
1002 if (r < 0) {
1003 log_debug_errno(r, "Failed to add clone() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1004 break;
1005 }
1006
1007 if ((retain & NAMESPACE_FLAGS_ALL) != 0) {
1008 r = seccomp_rule_add_exact(
1009 seccomp,
1010 SCMP_ACT_ERRNO(EPERM),
1011 SCMP_SYS(setns),
1012 1,
1013 SCMP_A1(SCMP_CMP_MASKED_EQ, f, f));
1014 if (r < 0) {
1015 log_debug_errno(r, "Failed to add setns() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1016 break;
1017 }
1018 }
1019 }
1020 if (r < 0)
1021 continue;
1022
1023 r = seccomp_load(seccomp);
1024 if (IN_SET(r, -EPERM, -EACCES))
1025 return r;
1026 if (r < 0)
1027 log_debug_errno(r, "Failed to install namespace restriction rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1028 }
1029
1030 return 0;
1031}
1032
1033int seccomp_protect_sysctl(void) {
1034 uint32_t arch;
1035 int r;
1036
1037 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1038 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1039
1040 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1041
2e64e8f4
ZJS
1042 if (IN_SET(arch, SCMP_ARCH_X32, SCMP_ARCH_AARCH64))
1043 /* No _sysctl syscall */
1044 continue;
1045
469830d1
LP
1046 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1047 if (r < 0)
1048 return r;
1049
1050 r = seccomp_rule_add_exact(
add00535
LP
1051 seccomp,
1052 SCMP_ACT_ERRNO(EPERM),
469830d1 1053 SCMP_SYS(_sysctl),
add00535 1054 0);
469830d1
LP
1055 if (r < 0) {
1056 log_debug_errno(r, "Failed to add _sysctl() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1057 continue;
1058 }
1059
1060 r = seccomp_load(seccomp);
1061 if (IN_SET(r, -EPERM, -EACCES))
1062 return r;
1063 if (r < 0)
1064 log_debug_errno(r, "Failed to install sysctl protection rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1065 }
1066
1067 return 0;
1068}
1069
1070int seccomp_restrict_address_families(Set *address_families, bool whitelist) {
1071 uint32_t arch;
1072 int r;
1073
1074 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1075 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
9606bc4b 1076 bool supported;
469830d1
LP
1077 Iterator i;
1078
1079 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1080
9606bc4b
LP
1081 switch (arch) {
1082
1083 case SCMP_ARCH_X86_64:
1084 case SCMP_ARCH_X32:
1085 case SCMP_ARCH_ARM:
1086 case SCMP_ARCH_AARCH64:
da1921a5
ZJS
1087 case SCMP_ARCH_PPC64:
1088 case SCMP_ARCH_PPC64LE:
9606bc4b
LP
1089 /* These we know we support (i.e. are the ones that do not use socketcall()) */
1090 supported = true;
1091 break;
1092
9606bc4b
LP
1093 case SCMP_ARCH_S390:
1094 case SCMP_ARCH_S390X:
1095 case SCMP_ARCH_PPC:
da1921a5 1096 case SCMP_ARCH_X86:
9606bc4b
LP
1097 default:
1098 /* These we either know we don't support (i.e. are the ones that do use socketcall()), or we
1099 * don't know */
1100 supported = false;
1101 break;
1102 }
1103
1104 if (!supported)
1105 continue;
1106
469830d1
LP
1107 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1108 if (r < 0)
1109 return r;
1110
1111 if (whitelist) {
1112 int af, first = 0, last = 0;
1113 void *afp;
1114
1115 /* If this is a whitelist, we first block the address families that are out of range and then
1116 * everything that is not in the set. First, we find the lowest and highest address family in
1117 * the set. */
1118
1119 SET_FOREACH(afp, address_families, i) {
1120 af = PTR_TO_INT(afp);
1121
1122 if (af <= 0 || af >= af_max())
1123 continue;
1124
1125 if (first == 0 || af < first)
1126 first = af;
1127
1128 if (last == 0 || af > last)
1129 last = af;
1130 }
1131
1132 assert((first == 0) == (last == 0));
1133
1134 if (first == 0) {
1135
1136 /* No entries in the valid range, block everything */
1137 r = seccomp_rule_add_exact(
1138 seccomp,
1139 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1140 SCMP_SYS(socket),
1141 0);
1142 if (r < 0) {
1143 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1144 continue;
1145 }
1146
1147 } else {
1148
1149 /* Block everything below the first entry */
1150 r = seccomp_rule_add_exact(
1151 seccomp,
1152 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1153 SCMP_SYS(socket),
1154 1,
1155 SCMP_A0(SCMP_CMP_LT, first));
1156 if (r < 0) {
1157 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1158 continue;
1159 }
1160
1161 /* Block everything above the last entry */
1162 r = seccomp_rule_add_exact(
1163 seccomp,
1164 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1165 SCMP_SYS(socket),
1166 1,
1167 SCMP_A0(SCMP_CMP_GT, last));
1168 if (r < 0) {
1169 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1170 continue;
1171 }
1172
1173 /* Block everything between the first and last entry */
1174 for (af = 1; af < af_max(); af++) {
1175
1176 if (set_contains(address_families, INT_TO_PTR(af)))
1177 continue;
1178
1179 r = seccomp_rule_add_exact(
1180 seccomp,
1181 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1182 SCMP_SYS(socket),
1183 1,
1184 SCMP_A0(SCMP_CMP_EQ, af));
1185 if (r < 0)
1186 break;
1187 }
1188
1189 if (r < 0) {
1190 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1191 continue;
1192 }
1193 }
1194
1195 } else {
1196 void *af;
1197
1198 /* If this is a blacklist, then generate one rule for
1199 * each address family that are then combined in OR
1200 * checks. */
1201
1202 SET_FOREACH(af, address_families, i) {
1203
1204 r = seccomp_rule_add_exact(
1205 seccomp,
1206 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1207 SCMP_SYS(socket),
1208 1,
1209 SCMP_A0(SCMP_CMP_EQ, PTR_TO_INT(af)));
1210 if (r < 0)
1211 break;
1212 }
1213
1214 if (r < 0) {
1215 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1216 continue;
1217 }
1218 }
1219
1220 r = seccomp_load(seccomp);
1221 if (IN_SET(r, -EPERM, -EACCES))
1222 return r;
1223 if (r < 0)
1224 log_debug_errno(r, "Failed to install socket family rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1225 }
1226
1227 return 0;
1228}
1229
1230int seccomp_restrict_realtime(void) {
1231 static const int permitted_policies[] = {
1232 SCHED_OTHER,
1233 SCHED_BATCH,
1234 SCHED_IDLE,
1235 };
1236
1237 int r, max_policy = 0;
1238 uint32_t arch;
1239 unsigned i;
1240
1241 /* Determine the highest policy constant we want to allow */
1242 for (i = 0; i < ELEMENTSOF(permitted_policies); i++)
1243 if (permitted_policies[i] > max_policy)
1244 max_policy = permitted_policies[i];
1245
1246 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1247 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1248 int p;
1249
1250 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1251
1252 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1253 if (r < 0)
1254 return r;
1255
1256 /* Go through all policies with lower values than that, and block them -- unless they appear in the
1257 * whitelist. */
1258 for (p = 0; p < max_policy; p++) {
1259 bool good = false;
1260
1261 /* Check if this is in the whitelist. */
1262 for (i = 0; i < ELEMENTSOF(permitted_policies); i++)
1263 if (permitted_policies[i] == p) {
1264 good = true;
1265 break;
1266 }
1267
1268 if (good)
1269 continue;
1270
1271 /* Deny this policy */
1272 r = seccomp_rule_add_exact(
1273 seccomp,
1274 SCMP_ACT_ERRNO(EPERM),
1275 SCMP_SYS(sched_setscheduler),
1276 1,
1277 SCMP_A1(SCMP_CMP_EQ, p));
1278 if (r < 0) {
1279 log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1280 continue;
1281 }
1282 }
1283
1284 /* Blacklist all other policies, i.e. the ones with higher values. Note that all comparisons are
1285 * unsigned here, hence no need no check for < 0 values. */
1286 r = seccomp_rule_add_exact(
add00535
LP
1287 seccomp,
1288 SCMP_ACT_ERRNO(EPERM),
469830d1 1289 SCMP_SYS(sched_setscheduler),
add00535 1290 1,
469830d1
LP
1291 SCMP_A1(SCMP_CMP_GT, max_policy));
1292 if (r < 0) {
1293 log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1294 continue;
1295 }
add00535 1296
469830d1
LP
1297 r = seccomp_load(seccomp);
1298 if (IN_SET(r, -EPERM, -EACCES))
1299 return r;
1300 if (r < 0)
1301 log_debug_errno(r, "Failed to install realtime protection rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1302 }
1303
1304 return 0;
1305}
1306
6dc66688
ZJS
1307static int add_seccomp_syscall_filter(scmp_filter_ctx seccomp,
1308 uint32_t arch,
1309 int nr,
1310 unsigned int arg_cnt,
1311 const struct scmp_arg_cmp arg) {
1312 int r;
1313
1314 r = seccomp_rule_add_exact(seccomp, SCMP_ACT_ERRNO(EPERM), nr, arg_cnt, arg);
1315 if (r < 0) {
1316 _cleanup_free_ char *n = NULL;
1317
1318 n = seccomp_syscall_resolve_num_arch(arch, nr);
1319 log_debug_errno(r, "Failed to add %s() rule for architecture %s, skipping: %m",
1320 strna(n),
1321 seccomp_arch_to_string(arch));
1322 }
1323
1324 return r;
1325}
1326
2a8d6e63 1327/* For known architectures, check that syscalls are indeed defined or not. */
4278d1f5 1328#if defined(__x86_64__) || defined(__arm__) || defined(__aarch64__)
2a8d6e63
ZJS
1329assert_cc(SCMP_SYS(shmget) > 0);
1330assert_cc(SCMP_SYS(shmat) > 0);
1331assert_cc(SCMP_SYS(shmdt) > 0);
1332#elif defined(__i386__) || defined(__powerpc64__)
1333assert_cc(SCMP_SYS(shmget) < 0);
1334assert_cc(SCMP_SYS(shmat) < 0);
1335assert_cc(SCMP_SYS(shmdt) < 0);
1336#endif
6dc66688 1337
469830d1 1338int seccomp_memory_deny_write_execute(void) {
8a50cf69 1339
469830d1
LP
1340 uint32_t arch;
1341 int r;
1342
1343 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1344 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
8a50cf69 1345 int filter_syscall = 0, block_syscall = 0, shmat_syscall = 0;
add00535 1346
469830d1
LP
1347 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1348
8a50cf69
LP
1349 switch (arch) {
1350
1351 case SCMP_ARCH_X86:
1352 filter_syscall = SCMP_SYS(mmap2);
1353 block_syscall = SCMP_SYS(mmap);
2a8d6e63
ZJS
1354 break;
1355
1356 case SCMP_ARCH_PPC64:
1357 case SCMP_ARCH_PPC64LE:
1358 filter_syscall = SCMP_SYS(mmap);
1359
1360 /* Note that shmat() isn't available, and the call is multiplexed through ipc().
1361 * We ignore that here, which means there's still a way to get writable/executable
1362 * memory, if an IPC key is mapped like this. That's a pity, but no total loss. */
8a50cf69 1363
8a50cf69
LP
1364 break;
1365
4278d1f5
ZJS
1366 case SCMP_ARCH_ARM:
1367 filter_syscall = SCMP_SYS(mmap2); /* arm has only mmap2 */
1368 shmat_syscall = SCMP_SYS(shmat);
1369 break;
1370
8a50cf69
LP
1371 case SCMP_ARCH_X86_64:
1372 case SCMP_ARCH_X32:
79873bc8
ZJS
1373 case SCMP_ARCH_AARCH64:
1374 filter_syscall = SCMP_SYS(mmap); /* amd64, x32, and arm64 have only mmap */
8a50cf69
LP
1375 shmat_syscall = SCMP_SYS(shmat);
1376 break;
1377
1378 /* Please add more definitions here, if you port systemd to other architectures! */
1379
4278d1f5 1380#if !defined(__i386__) && !defined(__x86_64__) && !defined(__powerpc64__) && !defined(__arm__) && !defined(__aarch64__)
8a50cf69
LP
1381#warning "Consider adding the right mmap() syscall definitions here!"
1382#endif
1383 }
1384
1385 /* Can't filter mmap() on this arch, then skip it */
1386 if (filter_syscall == 0)
1387 continue;
1388
469830d1
LP
1389 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1390 if (r < 0)
1391 return r;
1392
6dc66688
ZJS
1393 r = add_seccomp_syscall_filter(seccomp, arch, filter_syscall,
1394 1,
1395 SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC|PROT_WRITE, PROT_EXEC|PROT_WRITE));
1396 if (r < 0)
1397 continue;
8a50cf69
LP
1398
1399 if (block_syscall != 0) {
6dc66688
ZJS
1400 r = add_seccomp_syscall_filter(seccomp, arch, block_syscall, 0, (const struct scmp_arg_cmp){} );
1401 if (r < 0)
8a50cf69 1402 continue;
add00535 1403 }
a3be2849 1404
6dc66688
ZJS
1405 r = add_seccomp_syscall_filter(seccomp, arch, SCMP_SYS(mprotect),
1406 1,
1407 SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC, PROT_EXEC));
1408 if (r < 0)
469830d1 1409 continue;
add00535 1410
8a50cf69 1411 if (shmat_syscall != 0) {
6dc66688
ZJS
1412 r = add_seccomp_syscall_filter(seccomp, arch, SCMP_SYS(shmat),
1413 1,
1414 SCMP_A2(SCMP_CMP_MASKED_EQ, SHM_EXEC, SHM_EXEC));
1415 if (r < 0)
8a50cf69 1416 continue;
469830d1
LP
1417 }
1418
1419 r = seccomp_load(seccomp);
1420 if (IN_SET(r, -EPERM, -EACCES))
1421 return r;
add00535 1422 if (r < 0)
469830d1
LP
1423 log_debug_errno(r, "Failed to install MemoryDenyWriteExecute= rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1424 }
add00535 1425
469830d1
LP
1426 return 0;
1427}
1428
1429int seccomp_restrict_archs(Set *archs) {
1430 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1431 Iterator i;
1432 void *id;
1433 int r;
1434
1435 /* This installs a filter with no rules, but that restricts the system call architectures to the specified
1436 * list. */
1437
1438 seccomp = seccomp_init(SCMP_ACT_ALLOW);
1439 if (!seccomp)
1440 return -ENOMEM;
1441
1442 SET_FOREACH(id, archs, i) {
1443 r = seccomp_arch_add(seccomp, PTR_TO_UINT32(id) - 1);
1444 if (r == -EEXIST)
1445 continue;
1446 if (r < 0)
1447 return r;
add00535
LP
1448 }
1449
469830d1
LP
1450 r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
1451 if (r < 0)
1452 return r;
add00535 1453
469830d1 1454 return seccomp_load(seccomp);
a3be2849 1455}
b16bd535
YW
1456
1457int parse_syscall_archs(char **l, Set **archs) {
1458 _cleanup_set_free_ Set *_archs;
1459 char **s;
1460 int r;
1461
1462 assert(l);
1463 assert(archs);
1464
1465 r = set_ensure_allocated(&_archs, NULL);
1466 if (r < 0)
1467 return r;
1468
1469 STRV_FOREACH(s, l) {
1470 uint32_t a;
1471
1472 r = seccomp_arch_from_string(*s, &a);
1473 if (r < 0)
1474 return -EINVAL;
1475
1476 r = set_put(_archs, UINT32_TO_PTR(a + 1));
1477 if (r < 0)
1478 return -ENOMEM;
1479 }
1480
1481 *archs = _archs;
1482 _archs = NULL;
1483
1484 return 0;
1485}
165a31c0
LP
1486
1487int seccomp_filter_set_add(Set *filter, bool add, const SyscallFilterSet *set) {
1488 const char *i;
1489 int r;
1490
1491 assert(set);
1492
1493 NULSTR_FOREACH(i, set->value) {
1494
1495 if (i[0] == '@') {
1496 const SyscallFilterSet *more;
1497
1498 more = syscall_filter_set_find(i);
1499 if (!more)
1500 return -ENXIO;
1501
1502
1503 r = seccomp_filter_set_add(filter, add, more);
1504 if (r < 0)
1505 return r;
1506 } else {
1507 int id;
1508
1509 id = seccomp_syscall_resolve_name(i);
1510 if (id == __NR_SCMP_ERROR)
1511 return -ENXIO;
1512
1513 if (add) {
1514 r = set_put(filter, INT_TO_PTR(id + 1));
1515 if (r < 0)
1516 return r;
1517 } else
1518 (void) set_remove(filter, INT_TO_PTR(id + 1));
1519 }
1520 }
1521
1522 return 0;
1523}
78e864e5
TM
1524
1525int seccomp_lock_personality(unsigned long personality) {
72eafe71 1526 uint32_t arch;
78e864e5
TM
1527 int r;
1528
72eafe71
LP
1529 if (personality >= PERSONALITY_INVALID)
1530 return -EINVAL;
78e864e5 1531
72eafe71
LP
1532 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1533 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
78e864e5 1534
72eafe71
LP
1535 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1536 if (r < 0)
1537 return r;
1538
1539 r = seccomp_rule_add_exact(
1540 seccomp,
1541 SCMP_ACT_ERRNO(EPERM),
1542 SCMP_SYS(personality),
1543 1,
1544 SCMP_A0(SCMP_CMP_NE, personality));
1545 if (r < 0)
1546 return r;
1547
1548 r = seccomp_load(seccomp);
1549 if (IN_SET(r, -EPERM, -EACCES))
1550 return r;
1551 if (r < 0)
1552 log_debug_errno(r, "Failed to enable personality lock for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1553 }
1554
1555 return 0;
78e864e5 1556}