]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/shared/seccomp-util.c
networkd: improve interface rename log message a bit (#7299)
[thirdparty/systemd.git] / src / shared / seccomp-util.c
CommitLineData
57183d11
LP
1/***
2 This file is part of systemd.
3
4 Copyright 2014 Lennart Poettering
5
6 systemd is free software; you can redistribute it and/or modify it
7 under the terms of the GNU Lesser General Public License as published by
8 the Free Software Foundation; either version 2.1 of the License, or
9 (at your option) any later version.
10
11 systemd is distributed in the hope that it will be useful, but
12 WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public License
17 along with systemd; If not, see <http://www.gnu.org/licenses/>.
18***/
19
a8fbdf54 20#include <errno.h>
469830d1 21#include <linux/seccomp.h>
57183d11 22#include <seccomp.h>
a8fbdf54 23#include <stddef.h>
469830d1 24#include <sys/mman.h>
d347d902 25#include <sys/prctl.h>
469830d1 26#include <sys/shm.h>
57183d11 27
469830d1 28#include "af-list.h"
add00535 29#include "alloc-util.h"
a8fbdf54 30#include "macro.h"
add00535 31#include "nsflags.h"
78e864e5 32#include "process-util.h"
cf0fbc49 33#include "seccomp-util.h"
b16bd535 34#include "set.h"
07630cea 35#include "string-util.h"
b16bd535 36#include "strv.h"
8130926d 37#include "util.h"
469830d1
LP
38#include "errno-list.h"
39
40const uint32_t seccomp_local_archs[] = {
41
f2d9751c
LP
42 /* Note: always list the native arch we are compiled as last, so that users can blacklist seccomp(), but our own calls to it still succeed */
43
44#if defined(__x86_64__) && defined(__ILP32__)
469830d1
LP
45 SCMP_ARCH_X86,
46 SCMP_ARCH_X86_64,
f2d9751c
LP
47 SCMP_ARCH_X32, /* native */
48#elif defined(__x86_64__) && !defined(__ILP32__)
49 SCMP_ARCH_X86,
469830d1 50 SCMP_ARCH_X32,
f2d9751c
LP
51 SCMP_ARCH_X86_64, /* native */
52#elif defined(__i386__)
53 SCMP_ARCH_X86,
54#elif defined(__aarch64__)
469830d1 55 SCMP_ARCH_ARM,
f2d9751c
LP
56 SCMP_ARCH_AARCH64, /* native */
57#elif defined(__arm__)
58 SCMP_ARCH_ARM,
59#elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI32
60 SCMP_ARCH_MIPSEL,
61 SCMP_ARCH_MIPS, /* native */
62#elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI32
469830d1 63 SCMP_ARCH_MIPS,
f2d9751c
LP
64 SCMP_ARCH_MIPSEL, /* native */
65#elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI64
66 SCMP_ARCH_MIPSEL,
67 SCMP_ARCH_MIPS,
68 SCMP_ARCH_MIPSEL64N32,
469830d1 69 SCMP_ARCH_MIPS64N32,
f2d9751c
LP
70 SCMP_ARCH_MIPSEL64,
71 SCMP_ARCH_MIPS64, /* native */
72#elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI64
73 SCMP_ARCH_MIPS,
469830d1 74 SCMP_ARCH_MIPSEL,
f2d9751c
LP
75 SCMP_ARCH_MIPS64N32,
76 SCMP_ARCH_MIPSEL64N32,
77 SCMP_ARCH_MIPS64,
78 SCMP_ARCH_MIPSEL64, /* native */
79#elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_NABI32
80 SCMP_ARCH_MIPSEL,
81 SCMP_ARCH_MIPS,
469830d1 82 SCMP_ARCH_MIPSEL64,
f2d9751c 83 SCMP_ARCH_MIPS64,
469830d1 84 SCMP_ARCH_MIPSEL64N32,
f2d9751c
LP
85 SCMP_ARCH_MIPS64N32, /* native */
86#elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_NABI32
87 SCMP_ARCH_MIPS,
88 SCMP_ARCH_MIPSEL,
89 SCMP_ARCH_MIPS64,
90 SCMP_ARCH_MIPSEL64,
91 SCMP_ARCH_MIPS64N32,
92 SCMP_ARCH_MIPSEL64N32, /* native */
93#elif defined(__powerpc64__) && __BYTE_ORDER == __BIG_ENDIAN
469830d1 94 SCMP_ARCH_PPC,
469830d1 95 SCMP_ARCH_PPC64LE,
f2d9751c
LP
96 SCMP_ARCH_PPC64, /* native */
97#elif defined(__powerpc64__) && __BYTE_ORDER == __LITTLE_ENDIAN
98 SCMP_ARCH_PPC,
99 SCMP_ARCH_PPC64,
100 SCMP_ARCH_PPC64LE, /* native */
101#elif defined(__powerpc__)
102 SCMP_ARCH_PPC,
103#elif defined(__s390x__)
104 SCMP_ARCH_S390,
105 SCMP_ARCH_S390X, /* native */
106#elif defined(__s390__)
469830d1 107 SCMP_ARCH_S390,
469830d1
LP
108#endif
109 (uint32_t) -1
110 };
57183d11
LP
111
112const char* seccomp_arch_to_string(uint32_t c) {
aa34055f
ZJS
113 /* Maintain order used in <seccomp.h>.
114 *
115 * Names used here should be the same as those used for ConditionArchitecture=,
116 * except for "subarchitectures" like x32. */
57183d11 117
aa34055f
ZJS
118 switch(c) {
119 case SCMP_ARCH_NATIVE:
57183d11 120 return "native";
aa34055f 121 case SCMP_ARCH_X86:
57183d11 122 return "x86";
aa34055f 123 case SCMP_ARCH_X86_64:
57183d11 124 return "x86-64";
aa34055f 125 case SCMP_ARCH_X32:
57183d11 126 return "x32";
aa34055f 127 case SCMP_ARCH_ARM:
57183d11 128 return "arm";
aa34055f
ZJS
129 case SCMP_ARCH_AARCH64:
130 return "arm64";
131 case SCMP_ARCH_MIPS:
132 return "mips";
133 case SCMP_ARCH_MIPS64:
134 return "mips64";
135 case SCMP_ARCH_MIPS64N32:
136 return "mips64-n32";
137 case SCMP_ARCH_MIPSEL:
138 return "mips-le";
139 case SCMP_ARCH_MIPSEL64:
140 return "mips64-le";
141 case SCMP_ARCH_MIPSEL64N32:
142 return "mips64-le-n32";
143 case SCMP_ARCH_PPC:
144 return "ppc";
145 case SCMP_ARCH_PPC64:
146 return "ppc64";
147 case SCMP_ARCH_PPC64LE:
148 return "ppc64-le";
149 case SCMP_ARCH_S390:
6abfd303 150 return "s390";
aa34055f 151 case SCMP_ARCH_S390X:
6abfd303 152 return "s390x";
aa34055f
ZJS
153 default:
154 return NULL;
155 }
57183d11
LP
156}
157
158int seccomp_arch_from_string(const char *n, uint32_t *ret) {
159 if (!n)
160 return -EINVAL;
161
162 assert(ret);
163
164 if (streq(n, "native"))
165 *ret = SCMP_ARCH_NATIVE;
166 else if (streq(n, "x86"))
167 *ret = SCMP_ARCH_X86;
168 else if (streq(n, "x86-64"))
169 *ret = SCMP_ARCH_X86_64;
170 else if (streq(n, "x32"))
171 *ret = SCMP_ARCH_X32;
172 else if (streq(n, "arm"))
173 *ret = SCMP_ARCH_ARM;
aa34055f
ZJS
174 else if (streq(n, "arm64"))
175 *ret = SCMP_ARCH_AARCH64;
176 else if (streq(n, "mips"))
177 *ret = SCMP_ARCH_MIPS;
178 else if (streq(n, "mips64"))
179 *ret = SCMP_ARCH_MIPS64;
180 else if (streq(n, "mips64-n32"))
181 *ret = SCMP_ARCH_MIPS64N32;
182 else if (streq(n, "mips-le"))
183 *ret = SCMP_ARCH_MIPSEL;
184 else if (streq(n, "mips64-le"))
185 *ret = SCMP_ARCH_MIPSEL64;
186 else if (streq(n, "mips64-le-n32"))
187 *ret = SCMP_ARCH_MIPSEL64N32;
188 else if (streq(n, "ppc"))
189 *ret = SCMP_ARCH_PPC;
190 else if (streq(n, "ppc64"))
191 *ret = SCMP_ARCH_PPC64;
192 else if (streq(n, "ppc64-le"))
193 *ret = SCMP_ARCH_PPC64LE;
6abfd303
HB
194 else if (streq(n, "s390"))
195 *ret = SCMP_ARCH_S390;
196 else if (streq(n, "s390x"))
197 *ret = SCMP_ARCH_S390X;
57183d11
LP
198 else
199 return -EINVAL;
200
201 return 0;
202}
e9642be2 203
469830d1 204int seccomp_init_for_arch(scmp_filter_ctx *ret, uint32_t arch, uint32_t default_action) {
8d7b0c8f
LP
205 scmp_filter_ctx seccomp;
206 int r;
207
469830d1
LP
208 /* Much like seccomp_init(), but initializes the filter for one specific architecture only, without affecting
209 * any others. Also, turns off the NNP fiddling. */
8d7b0c8f
LP
210
211 seccomp = seccomp_init(default_action);
212 if (!seccomp)
213 return -ENOMEM;
214
469830d1
LP
215 if (arch != SCMP_ARCH_NATIVE &&
216 arch != seccomp_arch_native()) {
217
1b52793d 218 r = seccomp_arch_remove(seccomp, seccomp_arch_native());
469830d1
LP
219 if (r < 0)
220 goto finish;
221
1b52793d 222 r = seccomp_arch_add(seccomp, arch);
469830d1
LP
223 if (r < 0)
224 goto finish;
225
226 assert(seccomp_arch_exist(seccomp, arch) >= 0);
227 assert(seccomp_arch_exist(seccomp, SCMP_ARCH_NATIVE) == -EEXIST);
228 assert(seccomp_arch_exist(seccomp, seccomp_arch_native()) == -EEXIST);
229 } else {
230 assert(seccomp_arch_exist(seccomp, SCMP_ARCH_NATIVE) >= 0);
231 assert(seccomp_arch_exist(seccomp, seccomp_arch_native()) >= 0);
232 }
233
234 r = seccomp_attr_set(seccomp, SCMP_FLTATR_ACT_BADARCH, SCMP_ACT_ALLOW);
8d7b0c8f
LP
235 if (r < 0)
236 goto finish;
237
238 r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
239 if (r < 0)
240 goto finish;
241
242 *ret = seccomp;
243 return 0;
244
245finish:
246 seccomp_release(seccomp);
247 return r;
248}
249
static bool is_basic_seccomp_available(void) {
        int mode;

        /* Probe via PR_GET_SECCOMP: any non-negative answer counts as "seccomp is there" */
        mode = prctl(PR_GET_SECCOMP, 0, 0, 0, 0);

        return mode >= 0;
}
253
static bool is_seccomp_filter_available(void) {
        /* Probe for filter mode by requesting it with a NULL filter program. Only a failure with
         * EFAULT is taken as a sign that filter mode is available; success or any other error is not. */

        if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, NULL, 0, 0) >= 0)
                return false;

        return errno == EFAULT;
}
258
bool is_seccomp_available(void) {
        /* Result of the probes, remembered across calls: -1 = not probed yet, 0 = no, 1 = yes */
        static int probed = -1;

        if (probed >= 0)
                return probed;

        probed = is_basic_seccomp_available() &&
                 is_seccomp_filter_available();

        return probed;
}
269
8130926d 270const SyscallFilterSet syscall_filter_sets[_SYSCALL_FILTER_SET_MAX] = {
40eb6a80 271 [SYSCALL_FILTER_SET_DEFAULT] = {
40eb6a80 272 .name = "@default",
d5efc18b 273 .help = "System calls that are always permitted",
40eb6a80
ZJS
274 .value =
275 "clock_getres\0"
276 "clock_gettime\0"
277 "clock_nanosleep\0"
278 "execve\0"
279 "exit\0"
280 "exit_group\0"
e41b0f42
LP
281 "futex\0"
282 "get_robust_list\0"
283 "get_thread_area\0"
09d3020b
DH
284 "getegid\0"
285 "getegid32\0"
286 "geteuid\0"
287 "geteuid32\0"
288 "getgid\0"
289 "getgid32\0"
290 "getgroups\0"
291 "getgroups32\0"
292 "getpgid\0"
293 "getpgrp\0"
294 "getpid\0"
295 "getppid\0"
296 "getresgid\0"
297 "getresgid32\0"
298 "getresuid\0"
299 "getresuid32\0"
40eb6a80 300 "getrlimit\0" /* make sure processes can query stack size and such */
09d3020b
DH
301 "getsid\0"
302 "gettid\0"
40eb6a80 303 "gettimeofday\0"
09d3020b
DH
304 "getuid\0"
305 "getuid32\0"
e41b0f42 306 "membarrier\0"
40eb6a80
ZJS
307 "nanosleep\0"
308 "pause\0"
4c3a9176 309 "prlimit64\0"
e41b0f42 310 "restart_syscall\0"
40eb6a80 311 "rt_sigreturn\0"
8f44de08 312 "sched_yield\0"
e41b0f42
LP
313 "set_robust_list\0"
314 "set_thread_area\0"
315 "set_tid_address\0"
40eb6a80
ZJS
316 "sigreturn\0"
317 "time\0"
4c3a9176 318 "ugetrlimit\0"
40eb6a80 319 },
44898c53
LP
320 [SYSCALL_FILTER_SET_AIO] = {
321 .name = "@aio",
322 .help = "Asynchronous IO",
323 .value =
324 "io_cancel\0"
325 "io_destroy\0"
326 "io_getevents\0"
327 "io_setup\0"
328 "io_submit\0"
329 },
133ddbbe 330 [SYSCALL_FILTER_SET_BASIC_IO] = {
133ddbbe 331 .name = "@basic-io",
d5efc18b 332 .help = "Basic IO",
133ddbbe 333 .value =
648a0ed0 334 "_llseek\0"
133ddbbe 335 "close\0"
648a0ed0 336 "dup\0"
133ddbbe
LP
337 "dup2\0"
338 "dup3\0"
133ddbbe
LP
339 "lseek\0"
340 "pread64\0"
341 "preadv\0"
44898c53 342 "preadv2\0"
133ddbbe
LP
343 "pwrite64\0"
344 "pwritev\0"
44898c53 345 "pwritev2\0"
133ddbbe
LP
346 "read\0"
347 "readv\0"
348 "write\0"
349 "writev\0"
350 },
44898c53
LP
351 [SYSCALL_FILTER_SET_CHOWN] = {
352 .name = "@chown",
353 .help = "Change ownership of files and directories",
354 .value =
355 "chown\0"
356 "chown32\0"
357 "fchown\0"
358 "fchown32\0"
359 "fchownat\0"
360 "lchown\0"
361 "lchown32\0"
362 },
8130926d 363 [SYSCALL_FILTER_SET_CLOCK] = {
8130926d 364 .name = "@clock",
d5efc18b 365 .help = "Change the system time",
201c1cc2
TM
366 .value =
367 "adjtimex\0"
1f9ac68b
LP
368 "clock_adjtime\0"
369 "clock_settime\0"
201c1cc2 370 "settimeofday\0"
1f9ac68b 371 "stime\0"
8130926d
LP
372 },
373 [SYSCALL_FILTER_SET_CPU_EMULATION] = {
8130926d 374 .name = "@cpu-emulation",
d5efc18b 375 .help = "System calls for CPU emulation functionality",
1f9ac68b
LP
376 .value =
377 "modify_ldt\0"
378 "subpage_prot\0"
379 "switch_endian\0"
380 "vm86\0"
381 "vm86old\0"
8130926d
LP
382 },
383 [SYSCALL_FILTER_SET_DEBUG] = {
8130926d 384 .name = "@debug",
d5efc18b 385 .help = "Debugging, performance monitoring and tracing functionality",
1f9ac68b
LP
386 .value =
387 "lookup_dcookie\0"
388 "perf_event_open\0"
389 "process_vm_readv\0"
390 "process_vm_writev\0"
391 "ptrace\0"
392 "rtas\0"
8130926d 393#ifdef __NR_s390_runtime_instr
1f9ac68b 394 "s390_runtime_instr\0"
8130926d 395#endif
1f9ac68b 396 "sys_debug_setcontext\0"
8130926d 397 },
1a1b13c9
LP
398 [SYSCALL_FILTER_SET_FILE_SYSTEM] = {
399 .name = "@file-system",
400 .help = "File system operations",
401 .value =
402 "access\0"
403 "chdir\0"
404 "chmod\0"
405 "close\0"
406 "creat\0"
407 "faccessat\0"
408 "fallocate\0"
409 "fchdir\0"
410 "fchmod\0"
411 "fchmodat\0"
1a1b13c9 412 "fcntl\0"
ceaa6aa7 413 "fcntl64\0"
1a1b13c9
LP
414 "fgetxattr\0"
415 "flistxattr\0"
ceaa6aa7 416 "fremovexattr\0"
1a1b13c9 417 "fsetxattr\0"
1a1b13c9 418 "fstat\0"
ceaa6aa7 419 "fstat64\0"
1a1b13c9 420 "fstatat64\0"
1a1b13c9 421 "fstatfs\0"
ceaa6aa7 422 "fstatfs64\0"
1a1b13c9 423 "ftruncate\0"
ceaa6aa7 424 "ftruncate64\0"
1a1b13c9
LP
425 "futimesat\0"
426 "getcwd\0"
1a1b13c9 427 "getdents\0"
ceaa6aa7 428 "getdents64\0"
1a1b13c9
LP
429 "getxattr\0"
430 "inotify_add_watch\0"
ceaa6aa7 431 "inotify_init\0"
1a1b13c9
LP
432 "inotify_init1\0"
433 "inotify_rm_watch\0"
434 "lgetxattr\0"
435 "link\0"
436 "linkat\0"
437 "listxattr\0"
438 "llistxattr\0"
439 "lremovexattr\0"
440 "lsetxattr\0"
1a1b13c9 441 "lstat\0"
ceaa6aa7 442 "lstat64\0"
1a1b13c9
LP
443 "mkdir\0"
444 "mkdirat\0"
445 "mknod\0"
446 "mknodat\0"
1a1b13c9 447 "mmap\0"
ceaa6aa7 448 "mmap2\0"
7961116e 449 "munmap\0"
1a1b13c9 450 "newfstatat\0"
ceaa6aa7
LP
451 "oldfstat\0"
452 "oldlstat\0"
453 "oldstat\0"
1a1b13c9
LP
454 "open\0"
455 "openat\0"
456 "readlink\0"
457 "readlinkat\0"
458 "removexattr\0"
459 "rename\0"
1a1b13c9 460 "renameat\0"
ceaa6aa7 461 "renameat2\0"
1a1b13c9
LP
462 "rmdir\0"
463 "setxattr\0"
1a1b13c9 464 "stat\0"
ceaa6aa7 465 "stat64\0"
1a1b13c9 466 "statfs\0"
ceaa6aa7 467 "statfs64\0"
8e6a7a8b 468#ifdef __NR_statx
a4135a74 469 "statx\0"
ceaa6aa7 470#endif
1a1b13c9
LP
471 "symlink\0"
472 "symlinkat\0"
1a1b13c9 473 "truncate\0"
ceaa6aa7 474 "truncate64\0"
1a1b13c9
LP
475 "unlink\0"
476 "unlinkat\0"
ceaa6aa7 477 "utime\0"
1a1b13c9
LP
478 "utimensat\0"
479 "utimes\0"
480 },
8130926d 481 [SYSCALL_FILTER_SET_IO_EVENT] = {
8130926d 482 .name = "@io-event",
d5efc18b 483 .help = "Event loop system calls",
201c1cc2
TM
484 .value =
485 "_newselect\0"
201c1cc2 486 "epoll_create\0"
215728ff 487 "epoll_create1\0"
201c1cc2
TM
488 "epoll_ctl\0"
489 "epoll_ctl_old\0"
490 "epoll_pwait\0"
491 "epoll_wait\0"
492 "epoll_wait_old\0"
201c1cc2 493 "eventfd\0"
215728ff 494 "eventfd2\0"
201c1cc2
TM
495 "poll\0"
496 "ppoll\0"
497 "pselect6\0"
498 "select\0"
8130926d
LP
499 },
500 [SYSCALL_FILTER_SET_IPC] = {
8130926d 501 .name = "@ipc",
d5efc18b
ZJS
502 .help = "SysV IPC, POSIX Message Queues or other IPC",
503 .value =
504 "ipc\0"
cd5bfd7e 505 "memfd_create\0"
201c1cc2
TM
506 "mq_getsetattr\0"
507 "mq_notify\0"
508 "mq_open\0"
509 "mq_timedreceive\0"
510 "mq_timedsend\0"
511 "mq_unlink\0"
512 "msgctl\0"
513 "msgget\0"
514 "msgrcv\0"
515 "msgsnd\0"
cd5bfd7e 516 "pipe\0"
215728ff 517 "pipe2\0"
201c1cc2
TM
518 "process_vm_readv\0"
519 "process_vm_writev\0"
520 "semctl\0"
521 "semget\0"
522 "semop\0"
523 "semtimedop\0"
524 "shmat\0"
525 "shmctl\0"
526 "shmdt\0"
527 "shmget\0"
8130926d
LP
528 },
529 [SYSCALL_FILTER_SET_KEYRING] = {
8130926d 530 .name = "@keyring",
d5efc18b 531 .help = "Kernel keyring access",
1f9ac68b
LP
532 .value =
533 "add_key\0"
534 "keyctl\0"
535 "request_key\0"
8130926d 536 },
cd0ddf6f
LP
537 [SYSCALL_FILTER_SET_MEMLOCK] = {
538 .name = "@memlock",
539 .help = "Memory locking control",
540 .value =
541 "mlock\0"
542 "mlock2\0"
543 "mlockall\0"
544 "munlock\0"
545 "munlockall\0"
546 },
8130926d 547 [SYSCALL_FILTER_SET_MODULE] = {
8130926d 548 .name = "@module",
d5efc18b 549 .help = "Loading and unloading of kernel modules",
201c1cc2 550 .value =
201c1cc2
TM
551 "delete_module\0"
552 "finit_module\0"
553 "init_module\0"
8130926d
LP
554 },
555 [SYSCALL_FILTER_SET_MOUNT] = {
8130926d 556 .name = "@mount",
d5efc18b 557 .help = "Mounting and unmounting of file systems",
201c1cc2
TM
558 .value =
559 "chroot\0"
560 "mount\0"
201c1cc2 561 "pivot_root\0"
201c1cc2 562 "umount\0"
215728ff 563 "umount2\0"
8130926d
LP
564 },
565 [SYSCALL_FILTER_SET_NETWORK_IO] = {
8130926d 566 .name = "@network-io",
d5efc18b 567 .help = "Network or Unix socket IO, should not be needed if not network facing",
201c1cc2 568 .value =
201c1cc2 569 "accept\0"
215728ff 570 "accept4\0"
201c1cc2
TM
571 "bind\0"
572 "connect\0"
573 "getpeername\0"
574 "getsockname\0"
575 "getsockopt\0"
576 "listen\0"
577 "recv\0"
578 "recvfrom\0"
579 "recvmmsg\0"
580 "recvmsg\0"
581 "send\0"
582 "sendmmsg\0"
583 "sendmsg\0"
584 "sendto\0"
585 "setsockopt\0"
586 "shutdown\0"
587 "socket\0"
588 "socketcall\0"
589 "socketpair\0"
8130926d
LP
590 },
591 [SYSCALL_FILTER_SET_OBSOLETE] = {
d5efc18b 592 /* some unknown even to libseccomp */
8130926d 593 .name = "@obsolete",
d5efc18b 594 .help = "Unusual, obsolete or unimplemented system calls",
201c1cc2
TM
595 .value =
596 "_sysctl\0"
597 "afs_syscall\0"
802fa07a 598 "bdflush\0"
201c1cc2 599 "break\0"
1f9ac68b 600 "create_module\0"
201c1cc2
TM
601 "ftime\0"
602 "get_kernel_syms\0"
201c1cc2
TM
603 "getpmsg\0"
604 "gtty\0"
7e0c3b8f 605 "idle\0"
201c1cc2 606 "lock\0"
201c1cc2 607 "mpx\0"
201c1cc2
TM
608 "prof\0"
609 "profil\0"
201c1cc2
TM
610 "putpmsg\0"
611 "query_module\0"
201c1cc2
TM
612 "security\0"
613 "sgetmask\0"
614 "ssetmask\0"
615 "stty\0"
1f9ac68b 616 "sysfs\0"
201c1cc2
TM
617 "tuxcall\0"
618 "ulimit\0"
619 "uselib\0"
1f9ac68b 620 "ustat\0"
201c1cc2 621 "vserver\0"
8130926d
LP
622 },
623 [SYSCALL_FILTER_SET_PRIVILEGED] = {
8130926d 624 .name = "@privileged",
d5efc18b 625 .help = "All system calls which need super-user capabilities",
201c1cc2 626 .value =
44898c53 627 "@chown\0"
201c1cc2
TM
628 "@clock\0"
629 "@module\0"
630 "@raw-io\0"
af0f047b
LP
631 "@reboot\0"
632 "@swap\0"
215728ff 633 "_sysctl\0"
201c1cc2 634 "acct\0"
201c1cc2 635 "bpf\0"
1f9ac68b 636 "capset\0"
201c1cc2 637 "chroot\0"
201c1cc2
TM
638 "nfsservctl\0"
639 "pivot_root\0"
640 "quotactl\0"
201c1cc2 641 "setdomainname\0"
201c1cc2 642 "setfsuid\0"
215728ff 643 "setfsuid32\0"
201c1cc2 644 "setgroups\0"
215728ff 645 "setgroups32\0"
201c1cc2 646 "sethostname\0"
201c1cc2 647 "setresuid\0"
215728ff 648 "setresuid32\0"
201c1cc2 649 "setreuid\0"
215728ff 650 "setreuid32\0"
201c1cc2 651 "setuid\0"
215728ff 652 "setuid32\0"
201c1cc2 653 "vhangup\0"
8130926d
LP
654 },
655 [SYSCALL_FILTER_SET_PROCESS] = {
8130926d 656 .name = "@process",
d5efc18b 657 .help = "Process control, execution, namespaceing operations",
201c1cc2
TM
658 .value =
659 "arch_prctl\0"
09d3020b 660 "capget\0" /* Able to query arbitrary processes */
201c1cc2 661 "clone\0"
201c1cc2
TM
662 "execveat\0"
663 "fork\0"
b887d2eb 664 "getrusage\0"
201c1cc2
TM
665 "kill\0"
666 "prctl\0"
b887d2eb
LP
667 "rt_sigqueueinfo\0"
668 "rt_tgsigqueueinfo\0"
201c1cc2
TM
669 "setns\0"
670 "tgkill\0"
b887d2eb 671 "times\0"
201c1cc2
TM
672 "tkill\0"
673 "unshare\0"
674 "vfork\0"
b887d2eb
LP
675 "wait4\0"
676 "waitid\0"
677 "waitpid\0"
8130926d
LP
678 },
679 [SYSCALL_FILTER_SET_RAW_IO] = {
8130926d 680 .name = "@raw-io",
d5efc18b 681 .help = "Raw I/O port access",
201c1cc2
TM
682 .value =
683 "ioperm\0"
684 "iopl\0"
1f9ac68b 685 "pciconfig_iobase\0"
201c1cc2
TM
686 "pciconfig_read\0"
687 "pciconfig_write\0"
8130926d 688#ifdef __NR_s390_pci_mmio_read
201c1cc2 689 "s390_pci_mmio_read\0"
8130926d
LP
690#endif
691#ifdef __NR_s390_pci_mmio_write
201c1cc2 692 "s390_pci_mmio_write\0"
8130926d
LP
693#endif
694 },
bd2ab3f4
LP
695 [SYSCALL_FILTER_SET_REBOOT] = {
696 .name = "@reboot",
697 .help = "Reboot and reboot preparation/kexec",
698 .value =
bd2ab3f4 699 "kexec_file_load\0"
e59608fa 700 "kexec_load\0"
bd2ab3f4
LP
701 "reboot\0"
702 },
133ddbbe 703 [SYSCALL_FILTER_SET_RESOURCES] = {
133ddbbe 704 .name = "@resources",
58a8f68b 705 .help = "Alter resource settings",
133ddbbe 706 .value =
0963c053
LP
707 "ioprio_set\0"
708 "mbind\0"
709 "migrate_pages\0"
710 "move_pages\0"
711 "nice\0"
0963c053
LP
712 "sched_setaffinity\0"
713 "sched_setattr\0"
133ddbbe
LP
714 "sched_setparam\0"
715 "sched_setscheduler\0"
0963c053 716 "set_mempolicy\0"
133ddbbe
LP
717 "setpriority\0"
718 "setrlimit\0"
133ddbbe 719 },
6eaaeee9
LP
720 [SYSCALL_FILTER_SET_SETUID] = {
721 .name = "@setuid",
722 .help = "Operations for changing user/group credentials",
723 .value =
6eaaeee9 724 "setgid\0"
215728ff 725 "setgid32\0"
6eaaeee9 726 "setgroups\0"
215728ff 727 "setgroups32\0"
6eaaeee9 728 "setregid\0"
215728ff 729 "setregid32\0"
6eaaeee9 730 "setresgid\0"
215728ff 731 "setresgid32\0"
6eaaeee9 732 "setresuid\0"
215728ff 733 "setresuid32\0"
6eaaeee9 734 "setreuid\0"
215728ff 735 "setreuid32\0"
6eaaeee9 736 "setuid\0"
215728ff 737 "setuid32\0"
6eaaeee9 738 },
cd0ddf6f
LP
739 [SYSCALL_FILTER_SET_SIGNAL] = {
740 .name = "@signal",
741 .help = "Process signal handling",
742 .value =
743 "rt_sigaction\0"
744 "rt_sigpending\0"
745 "rt_sigprocmask\0"
746 "rt_sigsuspend\0"
747 "rt_sigtimedwait\0"
748 "sigaction\0"
749 "sigaltstack\0"
750 "signal\0"
751 "signalfd\0"
752 "signalfd4\0"
753 "sigpending\0"
754 "sigprocmask\0"
755 "sigsuspend\0"
756 },
bd2ab3f4
LP
757 [SYSCALL_FILTER_SET_SWAP] = {
758 .name = "@swap",
759 .help = "Enable/disable swap devices",
760 .value =
761 "swapoff\0"
762 "swapon\0"
763 },
44898c53
LP
764 [SYSCALL_FILTER_SET_SYNC] = {
765 .name = "@sync",
766 .help = "Synchronize files and memory to storage",
767 .value =
768 "fdatasync\0"
769 "fsync\0"
770 "msync\0"
771 "sync\0"
772 "sync_file_range\0"
773 "syncfs\0"
774 },
cd0ddf6f
LP
775 [SYSCALL_FILTER_SET_TIMER] = {
776 .name = "@timer",
777 .help = "Schedule operations by time",
778 .value =
779 "alarm\0"
780 "getitimer\0"
781 "setitimer\0"
782 "timer_create\0"
783 "timer_delete\0"
784 "timer_getoverrun\0"
785 "timer_gettime\0"
786 "timer_settime\0"
787 "timerfd_create\0"
788 "timerfd_gettime\0"
789 "timerfd_settime\0"
790 "times\0"
791 },
201c1cc2 792};
8130926d
LP
793
794const SyscallFilterSet *syscall_filter_set_find(const char *name) {
795 unsigned i;
796
797 if (isempty(name) || name[0] != '@')
798 return NULL;
799
800 for (i = 0; i < _SYSCALL_FILTER_SET_MAX; i++)
801 if (streq(syscall_filter_sets[i].name, name))
802 return syscall_filter_sets + i;
803
804 return NULL;
805}
806
960e4569 807static int seccomp_add_syscall_filter_set(scmp_filter_ctx seccomp, const SyscallFilterSet *set, uint32_t action, char **exclude);
69b1b241 808
960e4569 809int seccomp_add_syscall_filter_item(scmp_filter_ctx *seccomp, const char *name, uint32_t action, char **exclude) {
69b1b241
LP
810 int r;
811
812 assert(seccomp);
813 assert(name);
814
960e4569
LP
815 if (strv_contains(exclude, name))
816 return 0;
817
69b1b241
LP
818 if (name[0] == '@') {
819 const SyscallFilterSet *other;
820
821 other = syscall_filter_set_find(name);
cff7bff8
LP
822 if (!other) {
823 log_debug("Filter set %s is not known!", name);
69b1b241 824 return -EINVAL;
cff7bff8 825 }
69b1b241 826
960e4569 827 r = seccomp_add_syscall_filter_set(seccomp, other, action, exclude);
69b1b241
LP
828 if (r < 0)
829 return r;
830 } else {
831 int id;
832
833 id = seccomp_syscall_resolve_name(name);
cff7bff8 834 if (id == __NR_SCMP_ERROR) {
ff217dc3
LP
835 log_debug("System call %s is not known, ignoring.", name);
836 return 0;
cff7bff8 837 }
69b1b241
LP
838
839 r = seccomp_rule_add_exact(seccomp, action, id, 0);
840 if (r < 0)
841 /* If the system call is not known on this architecture, then that's fine, let's ignore it */
842 log_debug_errno(r, "Failed to add rule for system call %s() / %d, ignoring: %m", name, id);
843 }
844
845 return 0;
846}
847
469830d1
LP
848static int seccomp_add_syscall_filter_set(
849 scmp_filter_ctx seccomp,
469830d1 850 const SyscallFilterSet *set,
960e4569
LP
851 uint32_t action,
852 char **exclude) {
469830d1 853
8130926d
LP
854 const char *sys;
855 int r;
856
857 assert(seccomp);
858 assert(set);
859
860 NULSTR_FOREACH(sys, set->value) {
960e4569 861 r = seccomp_add_syscall_filter_item(seccomp, sys, action, exclude);
69b1b241
LP
862 if (r < 0)
863 return r;
469830d1
LP
864 }
865
866 return 0;
867}
868
869int seccomp_load_syscall_filter_set(uint32_t default_action, const SyscallFilterSet *set, uint32_t action) {
870 uint32_t arch;
871 int r;
872
873 assert(set);
874
875 /* The one-stop solution: allocate a seccomp object, add the specified filter to it, and apply it. Once for
876 * earch local arch. */
877
878 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
879 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
880
881 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
882
883 r = seccomp_init_for_arch(&seccomp, arch, default_action);
8130926d
LP
884 if (r < 0)
885 return r;
469830d1 886
960e4569 887 r = seccomp_add_syscall_filter_set(seccomp, set, action, NULL);
469830d1
LP
888 if (r < 0) {
889 log_debug_errno(r, "Failed to add filter set, ignoring: %m");
890 continue;
891 }
892
893 r = seccomp_load(seccomp);
894 if (IN_SET(r, -EPERM, -EACCES))
895 return r;
896 if (r < 0)
897 log_debug_errno(r, "Failed to install filter set for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
8130926d
LP
898 }
899
900 return 0;
901}
a3be2849 902
8cfa775f 903int seccomp_load_syscall_filter_set_raw(uint32_t default_action, Hashmap* set, uint32_t action) {
469830d1 904 uint32_t arch;
a3be2849
LP
905 int r;
906
469830d1
LP
907 /* Similar to seccomp_load_syscall_filter_set(), but takes a raw Set* of syscalls, instead of a
908 * SyscallFilterSet* table. */
a3be2849 909
8cfa775f 910 if (hashmap_isempty(set) && default_action == SCMP_ACT_ALLOW)
469830d1 911 return 0;
a3be2849 912
469830d1
LP
913 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
914 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
915 Iterator i;
8cfa775f 916 void *id, *val;
a3be2849 917
469830d1 918 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
a3be2849 919
469830d1
LP
920 r = seccomp_init_for_arch(&seccomp, arch, default_action);
921 if (r < 0)
922 return r;
a3be2849 923
8cfa775f
YW
924 HASHMAP_FOREACH_KEY(val, id, set, i) {
925 uint32_t a = action;
926 int e = PTR_TO_INT(val);
927
928 if (action != SCMP_ACT_ALLOW && e >= 0)
929 a = SCMP_ACT_ERRNO(e);
930
931 r = seccomp_rule_add_exact(seccomp, a, PTR_TO_INT(id) - 1, 0);
469830d1
LP
932 if (r < 0) {
933 /* If the system call is not known on this architecture, then that's fine, let's ignore it */
934 _cleanup_free_ char *n = NULL;
935
936 n = seccomp_syscall_resolve_num_arch(arch, PTR_TO_INT(id) - 1);
e7854c46 937 log_debug_errno(r, "Failed to add rule for system call %s() / %d, ignoring: %m", strna(n), PTR_TO_INT(id) - 1);
469830d1
LP
938 }
939 }
940
941 r = seccomp_load(seccomp);
942 if (IN_SET(r, -EPERM, -EACCES))
943 return r;
944 if (r < 0)
945 log_debug_errno(r, "Failed to install filter set for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
946 }
947
948 return 0;
add00535
LP
949}
950
951int seccomp_restrict_namespaces(unsigned long retain) {
469830d1 952 uint32_t arch;
add00535
LP
953 int r;
954
955 if (log_get_max_level() >= LOG_DEBUG) {
956 _cleanup_free_ char *s = NULL;
957
958 (void) namespace_flag_to_string_many(retain, &s);
959 log_debug("Restricting namespace to: %s.", strna(s));
960 }
961
962 /* NOOP? */
963 if ((retain & NAMESPACE_FLAGS_ALL) == NAMESPACE_FLAGS_ALL)
964 return 0;
965
469830d1
LP
966 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
967 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
968 unsigned i;
add00535 969
469830d1
LP
970 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
971
972 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
973 if (r < 0)
974 return r;
975
976 if ((retain & NAMESPACE_FLAGS_ALL) == 0)
977 /* If every single kind of namespace shall be prohibited, then let's block the whole setns() syscall
978 * altogether. */
979 r = seccomp_rule_add_exact(
980 seccomp,
981 SCMP_ACT_ERRNO(EPERM),
982 SCMP_SYS(setns),
983 0);
984 else
985 /* Otherwise, block only the invocations with the appropriate flags in the loop below, but also the
986 * special invocation with a zero flags argument, right here. */
987 r = seccomp_rule_add_exact(
988 seccomp,
989 SCMP_ACT_ERRNO(EPERM),
990 SCMP_SYS(setns),
991 1,
992 SCMP_A1(SCMP_CMP_EQ, 0));
993 if (r < 0) {
994 log_debug_errno(r, "Failed to add setns() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
995 continue;
996 }
997
998 for (i = 0; namespace_flag_map[i].name; i++) {
999 unsigned long f;
1000
1001 f = namespace_flag_map[i].flag;
1002 if ((retain & f) == f) {
1003 log_debug("Permitting %s.", namespace_flag_map[i].name);
1004 continue;
1005 }
1006
1007 log_debug("Blocking %s.", namespace_flag_map[i].name);
1008
1009 r = seccomp_rule_add_exact(
1010 seccomp,
1011 SCMP_ACT_ERRNO(EPERM),
1012 SCMP_SYS(unshare),
1013 1,
1014 SCMP_A0(SCMP_CMP_MASKED_EQ, f, f));
1015 if (r < 0) {
1016 log_debug_errno(r, "Failed to add unshare() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1017 break;
1018 }
1019
511ceb1f
ZJS
1020 /* On s390/s390x the first two parameters to clone are switched */
1021 if (!IN_SET(arch, SCMP_ARCH_S390, SCMP_ARCH_S390X))
ae9d60ce
LP
1022 r = seccomp_rule_add_exact(
1023 seccomp,
1024 SCMP_ACT_ERRNO(EPERM),
1025 SCMP_SYS(clone),
1026 1,
1027 SCMP_A0(SCMP_CMP_MASKED_EQ, f, f));
1028 else
1029 r = seccomp_rule_add_exact(
1030 seccomp,
1031 SCMP_ACT_ERRNO(EPERM),
1032 SCMP_SYS(clone),
1033 1,
1034 SCMP_A1(SCMP_CMP_MASKED_EQ, f, f));
469830d1
LP
1035 if (r < 0) {
1036 log_debug_errno(r, "Failed to add clone() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1037 break;
1038 }
1039
1040 if ((retain & NAMESPACE_FLAGS_ALL) != 0) {
1041 r = seccomp_rule_add_exact(
1042 seccomp,
1043 SCMP_ACT_ERRNO(EPERM),
1044 SCMP_SYS(setns),
1045 1,
1046 SCMP_A1(SCMP_CMP_MASKED_EQ, f, f));
1047 if (r < 0) {
1048 log_debug_errno(r, "Failed to add setns() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1049 break;
1050 }
1051 }
1052 }
1053 if (r < 0)
1054 continue;
1055
1056 r = seccomp_load(seccomp);
1057 if (IN_SET(r, -EPERM, -EACCES))
1058 return r;
1059 if (r < 0)
1060 log_debug_errno(r, "Failed to install namespace restriction rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1061 }
1062
1063 return 0;
1064}
1065
1066int seccomp_protect_sysctl(void) {
1067 uint32_t arch;
1068 int r;
1069
1070 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1071 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1072
1073 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1074
2e64e8f4
ZJS
1075 if (IN_SET(arch, SCMP_ARCH_X32, SCMP_ARCH_AARCH64))
1076 /* No _sysctl syscall */
1077 continue;
1078
469830d1
LP
1079 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1080 if (r < 0)
1081 return r;
1082
1083 r = seccomp_rule_add_exact(
add00535
LP
1084 seccomp,
1085 SCMP_ACT_ERRNO(EPERM),
469830d1 1086 SCMP_SYS(_sysctl),
add00535 1087 0);
469830d1
LP
1088 if (r < 0) {
1089 log_debug_errno(r, "Failed to add _sysctl() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1090 continue;
1091 }
1092
1093 r = seccomp_load(seccomp);
1094 if (IN_SET(r, -EPERM, -EACCES))
1095 return r;
1096 if (r < 0)
1097 log_debug_errno(r, "Failed to install sysctl protection rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1098 }
1099
1100 return 0;
1101}
1102
1103int seccomp_restrict_address_families(Set *address_families, bool whitelist) {
1104 uint32_t arch;
1105 int r;
1106
1107 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1108 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
9606bc4b 1109 bool supported;
469830d1
LP
1110 Iterator i;
1111
1112 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1113
9606bc4b
LP
1114 switch (arch) {
1115
1116 case SCMP_ARCH_X86_64:
1117 case SCMP_ARCH_X32:
1118 case SCMP_ARCH_ARM:
1119 case SCMP_ARCH_AARCH64:
da1921a5
ZJS
1120 case SCMP_ARCH_PPC64:
1121 case SCMP_ARCH_PPC64LE:
9606bc4b
LP
1122 /* These we know we support (i.e. are the ones that do not use socketcall()) */
1123 supported = true;
1124 break;
1125
9606bc4b
LP
1126 case SCMP_ARCH_S390:
1127 case SCMP_ARCH_S390X:
1128 case SCMP_ARCH_PPC:
da1921a5 1129 case SCMP_ARCH_X86:
9606bc4b
LP
1130 default:
1131 /* These we either know we don't support (i.e. are the ones that do use socketcall()), or we
1132 * don't know */
1133 supported = false;
1134 break;
1135 }
1136
1137 if (!supported)
1138 continue;
1139
469830d1
LP
1140 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1141 if (r < 0)
1142 return r;
1143
1144 if (whitelist) {
1145 int af, first = 0, last = 0;
1146 void *afp;
1147
1148 /* If this is a whitelist, we first block the address families that are out of range and then
1149 * everything that is not in the set. First, we find the lowest and highest address family in
1150 * the set. */
1151
1152 SET_FOREACH(afp, address_families, i) {
1153 af = PTR_TO_INT(afp);
1154
1155 if (af <= 0 || af >= af_max())
1156 continue;
1157
1158 if (first == 0 || af < first)
1159 first = af;
1160
1161 if (last == 0 || af > last)
1162 last = af;
1163 }
1164
1165 assert((first == 0) == (last == 0));
1166
1167 if (first == 0) {
1168
1169 /* No entries in the valid range, block everything */
1170 r = seccomp_rule_add_exact(
1171 seccomp,
1172 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1173 SCMP_SYS(socket),
1174 0);
1175 if (r < 0) {
1176 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1177 continue;
1178 }
1179
1180 } else {
1181
1182 /* Block everything below the first entry */
1183 r = seccomp_rule_add_exact(
1184 seccomp,
1185 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1186 SCMP_SYS(socket),
1187 1,
1188 SCMP_A0(SCMP_CMP_LT, first));
1189 if (r < 0) {
1190 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1191 continue;
1192 }
1193
1194 /* Block everything above the last entry */
1195 r = seccomp_rule_add_exact(
1196 seccomp,
1197 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1198 SCMP_SYS(socket),
1199 1,
1200 SCMP_A0(SCMP_CMP_GT, last));
1201 if (r < 0) {
1202 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1203 continue;
1204 }
1205
1206 /* Block everything between the first and last entry */
1207 for (af = 1; af < af_max(); af++) {
1208
1209 if (set_contains(address_families, INT_TO_PTR(af)))
1210 continue;
1211
1212 r = seccomp_rule_add_exact(
1213 seccomp,
1214 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1215 SCMP_SYS(socket),
1216 1,
1217 SCMP_A0(SCMP_CMP_EQ, af));
1218 if (r < 0)
1219 break;
1220 }
469830d1
LP
1221 if (r < 0) {
1222 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1223 continue;
1224 }
1225 }
1226
1227 } else {
1228 void *af;
1229
1230 /* If this is a blacklist, then generate one rule for
1231 * each address family that are then combined in OR
1232 * checks. */
1233
1234 SET_FOREACH(af, address_families, i) {
1235
1236 r = seccomp_rule_add_exact(
1237 seccomp,
1238 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1239 SCMP_SYS(socket),
1240 1,
1241 SCMP_A0(SCMP_CMP_EQ, PTR_TO_INT(af)));
1242 if (r < 0)
1243 break;
1244 }
469830d1
LP
1245 if (r < 0) {
1246 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1247 continue;
1248 }
1249 }
1250
1251 r = seccomp_load(seccomp);
1252 if (IN_SET(r, -EPERM, -EACCES))
1253 return r;
1254 if (r < 0)
1255 log_debug_errno(r, "Failed to install socket family rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1256 }
1257
1258 return 0;
1259}
1260
1261int seccomp_restrict_realtime(void) {
1262 static const int permitted_policies[] = {
1263 SCHED_OTHER,
1264 SCHED_BATCH,
1265 SCHED_IDLE,
1266 };
1267
1268 int r, max_policy = 0;
1269 uint32_t arch;
1270 unsigned i;
1271
1272 /* Determine the highest policy constant we want to allow */
1273 for (i = 0; i < ELEMENTSOF(permitted_policies); i++)
1274 if (permitted_policies[i] > max_policy)
1275 max_policy = permitted_policies[i];
1276
1277 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1278 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1279 int p;
1280
1281 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1282
1283 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1284 if (r < 0)
1285 return r;
1286
1287 /* Go through all policies with lower values than that, and block them -- unless they appear in the
1288 * whitelist. */
1289 for (p = 0; p < max_policy; p++) {
1290 bool good = false;
1291
1292 /* Check if this is in the whitelist. */
1293 for (i = 0; i < ELEMENTSOF(permitted_policies); i++)
1294 if (permitted_policies[i] == p) {
1295 good = true;
1296 break;
1297 }
1298
1299 if (good)
1300 continue;
1301
1302 /* Deny this policy */
1303 r = seccomp_rule_add_exact(
1304 seccomp,
1305 SCMP_ACT_ERRNO(EPERM),
1306 SCMP_SYS(sched_setscheduler),
1307 1,
1308 SCMP_A1(SCMP_CMP_EQ, p));
1309 if (r < 0) {
1310 log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1311 continue;
1312 }
1313 }
1314
1315 /* Blacklist all other policies, i.e. the ones with higher values. Note that all comparisons are
1316 * unsigned here, hence no need no check for < 0 values. */
1317 r = seccomp_rule_add_exact(
add00535
LP
1318 seccomp,
1319 SCMP_ACT_ERRNO(EPERM),
469830d1 1320 SCMP_SYS(sched_setscheduler),
add00535 1321 1,
469830d1
LP
1322 SCMP_A1(SCMP_CMP_GT, max_policy));
1323 if (r < 0) {
1324 log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1325 continue;
1326 }
add00535 1327
469830d1
LP
1328 r = seccomp_load(seccomp);
1329 if (IN_SET(r, -EPERM, -EACCES))
1330 return r;
1331 if (r < 0)
1332 log_debug_errno(r, "Failed to install realtime protection rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1333 }
1334
1335 return 0;
1336}
1337
6dc66688
ZJS
1338static int add_seccomp_syscall_filter(scmp_filter_ctx seccomp,
1339 uint32_t arch,
1340 int nr,
1341 unsigned int arg_cnt,
1342 const struct scmp_arg_cmp arg) {
1343 int r;
1344
1345 r = seccomp_rule_add_exact(seccomp, SCMP_ACT_ERRNO(EPERM), nr, arg_cnt, arg);
1346 if (r < 0) {
1347 _cleanup_free_ char *n = NULL;
1348
1349 n = seccomp_syscall_resolve_num_arch(arch, nr);
1350 log_debug_errno(r, "Failed to add %s() rule for architecture %s, skipping: %m",
1351 strna(n),
1352 seccomp_arch_to_string(arch));
1353 }
1354
1355 return r;
1356}
1357
/* For known architectures, check that syscalls are indeed defined or not. These are compile-time
 * assertions on the build architecture: SCMP_SYS() of shmget/shmat/shmdt is expected to be positive on
 * x86-64, arm and arm64, and negative on i386 and ppc64. (Presumably a negative value is libseccomp's
 * way of flagging a call that has no syscall number of its own on that architecture -- see the
 * libseccomp documentation to confirm.) */
#if defined(__x86_64__) || defined(__arm__) || defined(__aarch64__)
assert_cc(SCMP_SYS(shmget) > 0);
assert_cc(SCMP_SYS(shmat) > 0);
assert_cc(SCMP_SYS(shmdt) > 0);
#elif defined(__i386__) || defined(__powerpc64__)
assert_cc(SCMP_SYS(shmget) < 0);
assert_cc(SCMP_SYS(shmat) < 0);
assert_cc(SCMP_SYS(shmdt) < 0);
#endif
6dc66688 1368
469830d1 1369int seccomp_memory_deny_write_execute(void) {
8a50cf69 1370
469830d1
LP
1371 uint32_t arch;
1372 int r;
1373
1374 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1375 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
8a50cf69 1376 int filter_syscall = 0, block_syscall = 0, shmat_syscall = 0;
add00535 1377
469830d1
LP
1378 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1379
8a50cf69
LP
1380 switch (arch) {
1381
1382 case SCMP_ARCH_X86:
1383 filter_syscall = SCMP_SYS(mmap2);
1384 block_syscall = SCMP_SYS(mmap);
2a8d6e63
ZJS
1385 break;
1386
1387 case SCMP_ARCH_PPC64:
1388 case SCMP_ARCH_PPC64LE:
1389 filter_syscall = SCMP_SYS(mmap);
1390
1391 /* Note that shmat() isn't available, and the call is multiplexed through ipc().
1392 * We ignore that here, which means there's still a way to get writable/executable
1393 * memory, if an IPC key is mapped like this. That's a pity, but no total loss. */
8a50cf69 1394
8a50cf69
LP
1395 break;
1396
4278d1f5
ZJS
1397 case SCMP_ARCH_ARM:
1398 filter_syscall = SCMP_SYS(mmap2); /* arm has only mmap2 */
1399 shmat_syscall = SCMP_SYS(shmat);
1400 break;
1401
8a50cf69
LP
1402 case SCMP_ARCH_X86_64:
1403 case SCMP_ARCH_X32:
79873bc8
ZJS
1404 case SCMP_ARCH_AARCH64:
1405 filter_syscall = SCMP_SYS(mmap); /* amd64, x32, and arm64 have only mmap */
8a50cf69
LP
1406 shmat_syscall = SCMP_SYS(shmat);
1407 break;
1408
1409 /* Please add more definitions here, if you port systemd to other architectures! */
1410
4278d1f5 1411#if !defined(__i386__) && !defined(__x86_64__) && !defined(__powerpc64__) && !defined(__arm__) && !defined(__aarch64__)
8a50cf69
LP
1412#warning "Consider adding the right mmap() syscall definitions here!"
1413#endif
1414 }
1415
1416 /* Can't filter mmap() on this arch, then skip it */
1417 if (filter_syscall == 0)
1418 continue;
1419
469830d1
LP
1420 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1421 if (r < 0)
1422 return r;
1423
6dc66688
ZJS
1424 r = add_seccomp_syscall_filter(seccomp, arch, filter_syscall,
1425 1,
1426 SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC|PROT_WRITE, PROT_EXEC|PROT_WRITE));
1427 if (r < 0)
1428 continue;
8a50cf69
LP
1429
1430 if (block_syscall != 0) {
6dc66688
ZJS
1431 r = add_seccomp_syscall_filter(seccomp, arch, block_syscall, 0, (const struct scmp_arg_cmp){} );
1432 if (r < 0)
8a50cf69 1433 continue;
add00535 1434 }
a3be2849 1435
6dc66688
ZJS
1436 r = add_seccomp_syscall_filter(seccomp, arch, SCMP_SYS(mprotect),
1437 1,
1438 SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC, PROT_EXEC));
1439 if (r < 0)
469830d1 1440 continue;
add00535 1441
8a50cf69 1442 if (shmat_syscall != 0) {
6dc66688
ZJS
1443 r = add_seccomp_syscall_filter(seccomp, arch, SCMP_SYS(shmat),
1444 1,
1445 SCMP_A2(SCMP_CMP_MASKED_EQ, SHM_EXEC, SHM_EXEC));
1446 if (r < 0)
8a50cf69 1447 continue;
469830d1
LP
1448 }
1449
1450 r = seccomp_load(seccomp);
1451 if (IN_SET(r, -EPERM, -EACCES))
1452 return r;
add00535 1453 if (r < 0)
469830d1
LP
1454 log_debug_errno(r, "Failed to install MemoryDenyWriteExecute= rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1455 }
add00535 1456
469830d1
LP
1457 return 0;
1458}
1459
1460int seccomp_restrict_archs(Set *archs) {
1461 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1462 Iterator i;
1463 void *id;
1464 int r;
1465
1466 /* This installs a filter with no rules, but that restricts the system call architectures to the specified
1467 * list. */
1468
1469 seccomp = seccomp_init(SCMP_ACT_ALLOW);
1470 if (!seccomp)
1471 return -ENOMEM;
1472
1473 SET_FOREACH(id, archs, i) {
1474 r = seccomp_arch_add(seccomp, PTR_TO_UINT32(id) - 1);
1475 if (r == -EEXIST)
1476 continue;
1477 if (r < 0)
1478 return r;
add00535
LP
1479 }
1480
469830d1
LP
1481 r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
1482 if (r < 0)
1483 return r;
add00535 1484
1c6af69b
LP
1485 r = seccomp_load(seccomp);
1486 if (IN_SET(r, -EPERM, -EACCES))
1487 return r;
1488 if (r < 0)
1489 log_debug_errno(r, "Failed to restrict system call architectures, skipping: %m");
1490
1491 return 0;
a3be2849 1492}
b16bd535
YW
1493
1494int parse_syscall_archs(char **l, Set **archs) {
1495 _cleanup_set_free_ Set *_archs;
1496 char **s;
1497 int r;
1498
1499 assert(l);
1500 assert(archs);
1501
1502 r = set_ensure_allocated(&_archs, NULL);
1503 if (r < 0)
1504 return r;
1505
1506 STRV_FOREACH(s, l) {
1507 uint32_t a;
1508
1509 r = seccomp_arch_from_string(*s, &a);
1510 if (r < 0)
1511 return -EINVAL;
1512
1513 r = set_put(_archs, UINT32_TO_PTR(a + 1));
1514 if (r < 0)
1515 return -ENOMEM;
1516 }
1517
1518 *archs = _archs;
1519 _archs = NULL;
1520
1521 return 0;
1522}
165a31c0 1523
8cfa775f 1524int seccomp_filter_set_add(Hashmap *filter, bool add, const SyscallFilterSet *set) {
165a31c0
LP
1525 const char *i;
1526 int r;
1527
1528 assert(set);
1529
1530 NULSTR_FOREACH(i, set->value) {
1531
1532 if (i[0] == '@') {
1533 const SyscallFilterSet *more;
1534
1535 more = syscall_filter_set_find(i);
1536 if (!more)
1537 return -ENXIO;
1538
165a31c0
LP
1539 r = seccomp_filter_set_add(filter, add, more);
1540 if (r < 0)
1541 return r;
1542 } else {
1543 int id;
1544
1545 id = seccomp_syscall_resolve_name(i);
ff217dc3
LP
1546 if (id == __NR_SCMP_ERROR) {
1547 log_debug("Couldn't resolve system call, ignoring: %s", i);
1548 continue;
1549 }
165a31c0
LP
1550
1551 if (add) {
8cfa775f 1552 r = hashmap_put(filter, INT_TO_PTR(id + 1), INT_TO_PTR(-1));
165a31c0
LP
1553 if (r < 0)
1554 return r;
1555 } else
8cfa775f 1556 (void) hashmap_remove(filter, INT_TO_PTR(id + 1));
165a31c0
LP
1557 }
1558 }
1559
1560 return 0;
1561}
78e864e5
TM
1562
1563int seccomp_lock_personality(unsigned long personality) {
72eafe71 1564 uint32_t arch;
78e864e5
TM
1565 int r;
1566
72eafe71
LP
1567 if (personality >= PERSONALITY_INVALID)
1568 return -EINVAL;
78e864e5 1569
72eafe71
LP
1570 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1571 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
78e864e5 1572
72eafe71
LP
1573 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1574 if (r < 0)
1575 return r;
1576
1577 r = seccomp_rule_add_exact(
1578 seccomp,
1579 SCMP_ACT_ERRNO(EPERM),
1580 SCMP_SYS(personality),
1581 1,
1582 SCMP_A0(SCMP_CMP_NE, personality));
448ac526
LP
1583 if (r < 0) {
1584 log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1585 continue;
1586 }
72eafe71
LP
1587
1588 r = seccomp_load(seccomp);
1589 if (IN_SET(r, -EPERM, -EACCES))
1590 return r;
1591 if (r < 0)
1592 log_debug_errno(r, "Failed to enable personality lock for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1593 }
1594
1595 return 0;
78e864e5 1596}