]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/shared/seccomp-util.c
tree-wide: use TAKE_PTR() and TAKE_FD() macros
[thirdparty/systemd.git] / src / shared / seccomp-util.c
CommitLineData
53e1b683 1/* SPDX-License-Identifier: LGPL-2.1+ */
57183d11
LP
2/***
3 This file is part of systemd.
4
5 Copyright 2014 Lennart Poettering
6
7 systemd is free software; you can redistribute it and/or modify it
8 under the terms of the GNU Lesser General Public License as published by
9 the Free Software Foundation; either version 2.1 of the License, or
10 (at your option) any later version.
11
12 systemd is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 Lesser General Public License for more details.
16
17 You should have received a copy of the GNU Lesser General Public License
18 along with systemd; If not, see <http://www.gnu.org/licenses/>.
19***/
20
a8fbdf54 21#include <errno.h>
469830d1 22#include <linux/seccomp.h>
57183d11 23#include <seccomp.h>
a8fbdf54 24#include <stddef.h>
469830d1 25#include <sys/mman.h>
d347d902 26#include <sys/prctl.h>
469830d1 27#include <sys/shm.h>
57183d11 28
469830d1 29#include "af-list.h"
add00535 30#include "alloc-util.h"
a8fbdf54 31#include "macro.h"
add00535 32#include "nsflags.h"
78e864e5 33#include "process-util.h"
cf0fbc49 34#include "seccomp-util.h"
b16bd535 35#include "set.h"
07630cea 36#include "string-util.h"
b16bd535 37#include "strv.h"
8130926d 38#include "util.h"
469830d1
LP
39#include "errno-list.h"
40
41const uint32_t seccomp_local_archs[] = {
42
f2d9751c
LP
43 /* Note: always list the native arch we are compiled as last, so that users can blacklist seccomp(), but our own calls to it still succeed */
44
45#if defined(__x86_64__) && defined(__ILP32__)
469830d1
LP
46 SCMP_ARCH_X86,
47 SCMP_ARCH_X86_64,
f2d9751c
LP
48 SCMP_ARCH_X32, /* native */
49#elif defined(__x86_64__) && !defined(__ILP32__)
50 SCMP_ARCH_X86,
469830d1 51 SCMP_ARCH_X32,
f2d9751c
LP
52 SCMP_ARCH_X86_64, /* native */
53#elif defined(__i386__)
54 SCMP_ARCH_X86,
55#elif defined(__aarch64__)
469830d1 56 SCMP_ARCH_ARM,
f2d9751c
LP
57 SCMP_ARCH_AARCH64, /* native */
58#elif defined(__arm__)
59 SCMP_ARCH_ARM,
60#elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI32
61 SCMP_ARCH_MIPSEL,
62 SCMP_ARCH_MIPS, /* native */
63#elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI32
469830d1 64 SCMP_ARCH_MIPS,
f2d9751c
LP
65 SCMP_ARCH_MIPSEL, /* native */
66#elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI64
67 SCMP_ARCH_MIPSEL,
68 SCMP_ARCH_MIPS,
69 SCMP_ARCH_MIPSEL64N32,
469830d1 70 SCMP_ARCH_MIPS64N32,
f2d9751c
LP
71 SCMP_ARCH_MIPSEL64,
72 SCMP_ARCH_MIPS64, /* native */
73#elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI64
74 SCMP_ARCH_MIPS,
469830d1 75 SCMP_ARCH_MIPSEL,
f2d9751c
LP
76 SCMP_ARCH_MIPS64N32,
77 SCMP_ARCH_MIPSEL64N32,
78 SCMP_ARCH_MIPS64,
79 SCMP_ARCH_MIPSEL64, /* native */
80#elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_NABI32
81 SCMP_ARCH_MIPSEL,
82 SCMP_ARCH_MIPS,
469830d1 83 SCMP_ARCH_MIPSEL64,
f2d9751c 84 SCMP_ARCH_MIPS64,
469830d1 85 SCMP_ARCH_MIPSEL64N32,
f2d9751c
LP
86 SCMP_ARCH_MIPS64N32, /* native */
87#elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_NABI32
88 SCMP_ARCH_MIPS,
89 SCMP_ARCH_MIPSEL,
90 SCMP_ARCH_MIPS64,
91 SCMP_ARCH_MIPSEL64,
92 SCMP_ARCH_MIPS64N32,
93 SCMP_ARCH_MIPSEL64N32, /* native */
94#elif defined(__powerpc64__) && __BYTE_ORDER == __BIG_ENDIAN
469830d1 95 SCMP_ARCH_PPC,
469830d1 96 SCMP_ARCH_PPC64LE,
f2d9751c
LP
97 SCMP_ARCH_PPC64, /* native */
98#elif defined(__powerpc64__) && __BYTE_ORDER == __LITTLE_ENDIAN
99 SCMP_ARCH_PPC,
100 SCMP_ARCH_PPC64,
101 SCMP_ARCH_PPC64LE, /* native */
102#elif defined(__powerpc__)
103 SCMP_ARCH_PPC,
104#elif defined(__s390x__)
105 SCMP_ARCH_S390,
106 SCMP_ARCH_S390X, /* native */
107#elif defined(__s390__)
469830d1 108 SCMP_ARCH_S390,
469830d1
LP
109#endif
110 (uint32_t) -1
111 };
57183d11
LP
112
113const char* seccomp_arch_to_string(uint32_t c) {
aa34055f
ZJS
114 /* Maintain order used in <seccomp.h>.
115 *
116 * Names used here should be the same as those used for ConditionArchitecture=,
117 * except for "subarchitectures" like x32. */
57183d11 118
aa34055f
ZJS
119 switch(c) {
120 case SCMP_ARCH_NATIVE:
57183d11 121 return "native";
aa34055f 122 case SCMP_ARCH_X86:
57183d11 123 return "x86";
aa34055f 124 case SCMP_ARCH_X86_64:
57183d11 125 return "x86-64";
aa34055f 126 case SCMP_ARCH_X32:
57183d11 127 return "x32";
aa34055f 128 case SCMP_ARCH_ARM:
57183d11 129 return "arm";
aa34055f
ZJS
130 case SCMP_ARCH_AARCH64:
131 return "arm64";
132 case SCMP_ARCH_MIPS:
133 return "mips";
134 case SCMP_ARCH_MIPS64:
135 return "mips64";
136 case SCMP_ARCH_MIPS64N32:
137 return "mips64-n32";
138 case SCMP_ARCH_MIPSEL:
139 return "mips-le";
140 case SCMP_ARCH_MIPSEL64:
141 return "mips64-le";
142 case SCMP_ARCH_MIPSEL64N32:
143 return "mips64-le-n32";
144 case SCMP_ARCH_PPC:
145 return "ppc";
146 case SCMP_ARCH_PPC64:
147 return "ppc64";
148 case SCMP_ARCH_PPC64LE:
149 return "ppc64-le";
150 case SCMP_ARCH_S390:
6abfd303 151 return "s390";
aa34055f 152 case SCMP_ARCH_S390X:
6abfd303 153 return "s390x";
aa34055f
ZJS
154 default:
155 return NULL;
156 }
57183d11
LP
157}
158
159int seccomp_arch_from_string(const char *n, uint32_t *ret) {
160 if (!n)
161 return -EINVAL;
162
163 assert(ret);
164
165 if (streq(n, "native"))
166 *ret = SCMP_ARCH_NATIVE;
167 else if (streq(n, "x86"))
168 *ret = SCMP_ARCH_X86;
169 else if (streq(n, "x86-64"))
170 *ret = SCMP_ARCH_X86_64;
171 else if (streq(n, "x32"))
172 *ret = SCMP_ARCH_X32;
173 else if (streq(n, "arm"))
174 *ret = SCMP_ARCH_ARM;
aa34055f
ZJS
175 else if (streq(n, "arm64"))
176 *ret = SCMP_ARCH_AARCH64;
177 else if (streq(n, "mips"))
178 *ret = SCMP_ARCH_MIPS;
179 else if (streq(n, "mips64"))
180 *ret = SCMP_ARCH_MIPS64;
181 else if (streq(n, "mips64-n32"))
182 *ret = SCMP_ARCH_MIPS64N32;
183 else if (streq(n, "mips-le"))
184 *ret = SCMP_ARCH_MIPSEL;
185 else if (streq(n, "mips64-le"))
186 *ret = SCMP_ARCH_MIPSEL64;
187 else if (streq(n, "mips64-le-n32"))
188 *ret = SCMP_ARCH_MIPSEL64N32;
189 else if (streq(n, "ppc"))
190 *ret = SCMP_ARCH_PPC;
191 else if (streq(n, "ppc64"))
192 *ret = SCMP_ARCH_PPC64;
193 else if (streq(n, "ppc64-le"))
194 *ret = SCMP_ARCH_PPC64LE;
6abfd303
HB
195 else if (streq(n, "s390"))
196 *ret = SCMP_ARCH_S390;
197 else if (streq(n, "s390x"))
198 *ret = SCMP_ARCH_S390X;
57183d11
LP
199 else
200 return -EINVAL;
201
202 return 0;
203}
e9642be2 204
469830d1 205int seccomp_init_for_arch(scmp_filter_ctx *ret, uint32_t arch, uint32_t default_action) {
8d7b0c8f
LP
206 scmp_filter_ctx seccomp;
207 int r;
208
469830d1
LP
209 /* Much like seccomp_init(), but initializes the filter for one specific architecture only, without affecting
210 * any others. Also, turns off the NNP fiddling. */
8d7b0c8f
LP
211
212 seccomp = seccomp_init(default_action);
213 if (!seccomp)
214 return -ENOMEM;
215
469830d1
LP
216 if (arch != SCMP_ARCH_NATIVE &&
217 arch != seccomp_arch_native()) {
218
1b52793d 219 r = seccomp_arch_remove(seccomp, seccomp_arch_native());
469830d1
LP
220 if (r < 0)
221 goto finish;
222
1b52793d 223 r = seccomp_arch_add(seccomp, arch);
469830d1
LP
224 if (r < 0)
225 goto finish;
226
227 assert(seccomp_arch_exist(seccomp, arch) >= 0);
228 assert(seccomp_arch_exist(seccomp, SCMP_ARCH_NATIVE) == -EEXIST);
229 assert(seccomp_arch_exist(seccomp, seccomp_arch_native()) == -EEXIST);
230 } else {
231 assert(seccomp_arch_exist(seccomp, SCMP_ARCH_NATIVE) >= 0);
232 assert(seccomp_arch_exist(seccomp, seccomp_arch_native()) >= 0);
233 }
234
235 r = seccomp_attr_set(seccomp, SCMP_FLTATR_ACT_BADARCH, SCMP_ACT_ALLOW);
8d7b0c8f
LP
236 if (r < 0)
237 goto finish;
238
239 r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
240 if (r < 0)
241 goto finish;
242
243 *ret = seccomp;
244 return 0;
245
246finish:
247 seccomp_release(seccomp);
248 return r;
249}
250
/* Returns true if querying the current seccomp mode via prctl(PR_GET_SECCOMP) succeeds. */
static bool is_basic_seccomp_available(void) {
        int mode;

        mode = prctl(PR_GET_SECCOMP, 0, 0, 0, 0);
        return mode >= 0;
}
254
/* Probes for seccomp filter mode by asking the kernel to install a NULL filter program: the probe counts as
 * positive only if that call fails and the failure is EFAULT. */
static bool is_seccomp_filter_available(void) {
        if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, NULL, 0, 0) >= 0)
                return false;

        return errno == EFAULT;
}
259
/* Returns whether both the basic seccomp probe and the seccomp filter probe succeed. The probes are run on
 * the first call only; the outcome is kept in a function-local static and returned on later calls. */
bool is_seccomp_available(void) {
        static int cached_enabled = -1; /* negative: not probed yet */

        if (cached_enabled >= 0)
                return cached_enabled;

        cached_enabled =
                is_basic_seccomp_available() &&
                is_seccomp_filter_available();

        return cached_enabled;
}
270
8130926d 271const SyscallFilterSet syscall_filter_sets[_SYSCALL_FILTER_SET_MAX] = {
40eb6a80 272 [SYSCALL_FILTER_SET_DEFAULT] = {
40eb6a80 273 .name = "@default",
d5efc18b 274 .help = "System calls that are always permitted",
40eb6a80
ZJS
275 .value =
276 "clock_getres\0"
277 "clock_gettime\0"
278 "clock_nanosleep\0"
279 "execve\0"
280 "exit\0"
281 "exit_group\0"
e41b0f42
LP
282 "futex\0"
283 "get_robust_list\0"
284 "get_thread_area\0"
09d3020b
DH
285 "getegid\0"
286 "getegid32\0"
287 "geteuid\0"
288 "geteuid32\0"
289 "getgid\0"
290 "getgid32\0"
291 "getgroups\0"
292 "getgroups32\0"
293 "getpgid\0"
294 "getpgrp\0"
295 "getpid\0"
296 "getppid\0"
297 "getresgid\0"
298 "getresgid32\0"
299 "getresuid\0"
300 "getresuid32\0"
40eb6a80 301 "getrlimit\0" /* make sure processes can query stack size and such */
09d3020b
DH
302 "getsid\0"
303 "gettid\0"
40eb6a80 304 "gettimeofday\0"
09d3020b
DH
305 "getuid\0"
306 "getuid32\0"
e41b0f42 307 "membarrier\0"
40eb6a80
ZJS
308 "nanosleep\0"
309 "pause\0"
4c3a9176 310 "prlimit64\0"
e41b0f42 311 "restart_syscall\0"
40eb6a80 312 "rt_sigreturn\0"
8f44de08 313 "sched_yield\0"
e41b0f42
LP
314 "set_robust_list\0"
315 "set_thread_area\0"
316 "set_tid_address\0"
ce5faeac 317 "set_tls\0"
40eb6a80
ZJS
318 "sigreturn\0"
319 "time\0"
4c3a9176 320 "ugetrlimit\0"
40eb6a80 321 },
44898c53
LP
322 [SYSCALL_FILTER_SET_AIO] = {
323 .name = "@aio",
324 .help = "Asynchronous IO",
325 .value =
326 "io_cancel\0"
327 "io_destroy\0"
328 "io_getevents\0"
329 "io_setup\0"
330 "io_submit\0"
331 },
133ddbbe 332 [SYSCALL_FILTER_SET_BASIC_IO] = {
133ddbbe 333 .name = "@basic-io",
d5efc18b 334 .help = "Basic IO",
133ddbbe 335 .value =
648a0ed0 336 "_llseek\0"
133ddbbe 337 "close\0"
648a0ed0 338 "dup\0"
133ddbbe
LP
339 "dup2\0"
340 "dup3\0"
133ddbbe
LP
341 "lseek\0"
342 "pread64\0"
343 "preadv\0"
44898c53 344 "preadv2\0"
133ddbbe
LP
345 "pwrite64\0"
346 "pwritev\0"
44898c53 347 "pwritev2\0"
133ddbbe
LP
348 "read\0"
349 "readv\0"
350 "write\0"
351 "writev\0"
352 },
44898c53
LP
353 [SYSCALL_FILTER_SET_CHOWN] = {
354 .name = "@chown",
355 .help = "Change ownership of files and directories",
356 .value =
357 "chown\0"
358 "chown32\0"
359 "fchown\0"
360 "fchown32\0"
361 "fchownat\0"
362 "lchown\0"
363 "lchown32\0"
364 },
8130926d 365 [SYSCALL_FILTER_SET_CLOCK] = {
8130926d 366 .name = "@clock",
d5efc18b 367 .help = "Change the system time",
201c1cc2
TM
368 .value =
369 "adjtimex\0"
1f9ac68b
LP
370 "clock_adjtime\0"
371 "clock_settime\0"
201c1cc2 372 "settimeofday\0"
1f9ac68b 373 "stime\0"
8130926d
LP
374 },
375 [SYSCALL_FILTER_SET_CPU_EMULATION] = {
8130926d 376 .name = "@cpu-emulation",
d5efc18b 377 .help = "System calls for CPU emulation functionality",
1f9ac68b
LP
378 .value =
379 "modify_ldt\0"
380 "subpage_prot\0"
381 "switch_endian\0"
382 "vm86\0"
383 "vm86old\0"
8130926d
LP
384 },
385 [SYSCALL_FILTER_SET_DEBUG] = {
8130926d 386 .name = "@debug",
d5efc18b 387 .help = "Debugging, performance monitoring and tracing functionality",
1f9ac68b
LP
388 .value =
389 "lookup_dcookie\0"
390 "perf_event_open\0"
391 "process_vm_readv\0"
392 "process_vm_writev\0"
393 "ptrace\0"
394 "rtas\0"
8130926d 395#ifdef __NR_s390_runtime_instr
1f9ac68b 396 "s390_runtime_instr\0"
8130926d 397#endif
1f9ac68b 398 "sys_debug_setcontext\0"
8130926d 399 },
1a1b13c9
LP
400 [SYSCALL_FILTER_SET_FILE_SYSTEM] = {
401 .name = "@file-system",
402 .help = "File system operations",
403 .value =
404 "access\0"
405 "chdir\0"
406 "chmod\0"
407 "close\0"
408 "creat\0"
409 "faccessat\0"
410 "fallocate\0"
411 "fchdir\0"
412 "fchmod\0"
413 "fchmodat\0"
1a1b13c9 414 "fcntl\0"
ceaa6aa7 415 "fcntl64\0"
1a1b13c9
LP
416 "fgetxattr\0"
417 "flistxattr\0"
ceaa6aa7 418 "fremovexattr\0"
1a1b13c9 419 "fsetxattr\0"
1a1b13c9 420 "fstat\0"
ceaa6aa7 421 "fstat64\0"
1a1b13c9 422 "fstatat64\0"
1a1b13c9 423 "fstatfs\0"
ceaa6aa7 424 "fstatfs64\0"
1a1b13c9 425 "ftruncate\0"
ceaa6aa7 426 "ftruncate64\0"
1a1b13c9
LP
427 "futimesat\0"
428 "getcwd\0"
1a1b13c9 429 "getdents\0"
ceaa6aa7 430 "getdents64\0"
1a1b13c9
LP
431 "getxattr\0"
432 "inotify_add_watch\0"
ceaa6aa7 433 "inotify_init\0"
1a1b13c9
LP
434 "inotify_init1\0"
435 "inotify_rm_watch\0"
436 "lgetxattr\0"
437 "link\0"
438 "linkat\0"
439 "listxattr\0"
440 "llistxattr\0"
441 "lremovexattr\0"
442 "lsetxattr\0"
1a1b13c9 443 "lstat\0"
ceaa6aa7 444 "lstat64\0"
1a1b13c9
LP
445 "mkdir\0"
446 "mkdirat\0"
447 "mknod\0"
448 "mknodat\0"
1a1b13c9 449 "mmap\0"
ceaa6aa7 450 "mmap2\0"
7961116e 451 "munmap\0"
1a1b13c9 452 "newfstatat\0"
ceaa6aa7
LP
453 "oldfstat\0"
454 "oldlstat\0"
455 "oldstat\0"
1a1b13c9
LP
456 "open\0"
457 "openat\0"
458 "readlink\0"
459 "readlinkat\0"
460 "removexattr\0"
461 "rename\0"
1a1b13c9 462 "renameat\0"
ceaa6aa7 463 "renameat2\0"
1a1b13c9
LP
464 "rmdir\0"
465 "setxattr\0"
1a1b13c9 466 "stat\0"
ceaa6aa7 467 "stat64\0"
1a1b13c9 468 "statfs\0"
ceaa6aa7 469 "statfs64\0"
8e6a7a8b 470#ifdef __NR_statx
a4135a74 471 "statx\0"
ceaa6aa7 472#endif
1a1b13c9
LP
473 "symlink\0"
474 "symlinkat\0"
1a1b13c9 475 "truncate\0"
ceaa6aa7 476 "truncate64\0"
1a1b13c9
LP
477 "unlink\0"
478 "unlinkat\0"
ceaa6aa7 479 "utime\0"
1a1b13c9
LP
480 "utimensat\0"
481 "utimes\0"
482 },
8130926d 483 [SYSCALL_FILTER_SET_IO_EVENT] = {
8130926d 484 .name = "@io-event",
d5efc18b 485 .help = "Event loop system calls",
201c1cc2
TM
486 .value =
487 "_newselect\0"
201c1cc2 488 "epoll_create\0"
215728ff 489 "epoll_create1\0"
201c1cc2
TM
490 "epoll_ctl\0"
491 "epoll_ctl_old\0"
492 "epoll_pwait\0"
493 "epoll_wait\0"
494 "epoll_wait_old\0"
201c1cc2 495 "eventfd\0"
215728ff 496 "eventfd2\0"
201c1cc2
TM
497 "poll\0"
498 "ppoll\0"
499 "pselect6\0"
500 "select\0"
8130926d
LP
501 },
502 [SYSCALL_FILTER_SET_IPC] = {
8130926d 503 .name = "@ipc",
d5efc18b
ZJS
504 .help = "SysV IPC, POSIX Message Queues or other IPC",
505 .value =
506 "ipc\0"
cd5bfd7e 507 "memfd_create\0"
201c1cc2
TM
508 "mq_getsetattr\0"
509 "mq_notify\0"
510 "mq_open\0"
511 "mq_timedreceive\0"
512 "mq_timedsend\0"
513 "mq_unlink\0"
514 "msgctl\0"
515 "msgget\0"
516 "msgrcv\0"
517 "msgsnd\0"
cd5bfd7e 518 "pipe\0"
215728ff 519 "pipe2\0"
201c1cc2
TM
520 "process_vm_readv\0"
521 "process_vm_writev\0"
522 "semctl\0"
523 "semget\0"
524 "semop\0"
525 "semtimedop\0"
526 "shmat\0"
527 "shmctl\0"
528 "shmdt\0"
529 "shmget\0"
8130926d
LP
530 },
531 [SYSCALL_FILTER_SET_KEYRING] = {
8130926d 532 .name = "@keyring",
d5efc18b 533 .help = "Kernel keyring access",
1f9ac68b
LP
534 .value =
535 "add_key\0"
536 "keyctl\0"
537 "request_key\0"
8130926d 538 },
cd0ddf6f
LP
539 [SYSCALL_FILTER_SET_MEMLOCK] = {
540 .name = "@memlock",
541 .help = "Memory locking control",
542 .value =
543 "mlock\0"
544 "mlock2\0"
545 "mlockall\0"
546 "munlock\0"
547 "munlockall\0"
548 },
8130926d 549 [SYSCALL_FILTER_SET_MODULE] = {
8130926d 550 .name = "@module",
d5efc18b 551 .help = "Loading and unloading of kernel modules",
201c1cc2 552 .value =
201c1cc2
TM
553 "delete_module\0"
554 "finit_module\0"
555 "init_module\0"
8130926d
LP
556 },
557 [SYSCALL_FILTER_SET_MOUNT] = {
8130926d 558 .name = "@mount",
d5efc18b 559 .help = "Mounting and unmounting of file systems",
201c1cc2
TM
560 .value =
561 "chroot\0"
562 "mount\0"
201c1cc2 563 "pivot_root\0"
201c1cc2 564 "umount\0"
215728ff 565 "umount2\0"
8130926d
LP
566 },
567 [SYSCALL_FILTER_SET_NETWORK_IO] = {
8130926d 568 .name = "@network-io",
d5efc18b 569 .help = "Network or Unix socket IO, should not be needed if not network facing",
201c1cc2 570 .value =
201c1cc2 571 "accept\0"
215728ff 572 "accept4\0"
201c1cc2
TM
573 "bind\0"
574 "connect\0"
575 "getpeername\0"
576 "getsockname\0"
577 "getsockopt\0"
578 "listen\0"
579 "recv\0"
580 "recvfrom\0"
581 "recvmmsg\0"
582 "recvmsg\0"
583 "send\0"
584 "sendmmsg\0"
585 "sendmsg\0"
586 "sendto\0"
587 "setsockopt\0"
588 "shutdown\0"
589 "socket\0"
590 "socketcall\0"
591 "socketpair\0"
8130926d
LP
592 },
593 [SYSCALL_FILTER_SET_OBSOLETE] = {
d5efc18b 594 /* some unknown even to libseccomp */
8130926d 595 .name = "@obsolete",
d5efc18b 596 .help = "Unusual, obsolete or unimplemented system calls",
201c1cc2
TM
597 .value =
598 "_sysctl\0"
599 "afs_syscall\0"
802fa07a 600 "bdflush\0"
201c1cc2 601 "break\0"
1f9ac68b 602 "create_module\0"
201c1cc2
TM
603 "ftime\0"
604 "get_kernel_syms\0"
201c1cc2
TM
605 "getpmsg\0"
606 "gtty\0"
7e0c3b8f 607 "idle\0"
201c1cc2 608 "lock\0"
201c1cc2 609 "mpx\0"
201c1cc2
TM
610 "prof\0"
611 "profil\0"
201c1cc2
TM
612 "putpmsg\0"
613 "query_module\0"
201c1cc2
TM
614 "security\0"
615 "sgetmask\0"
616 "ssetmask\0"
617 "stty\0"
1f9ac68b 618 "sysfs\0"
201c1cc2
TM
619 "tuxcall\0"
620 "ulimit\0"
621 "uselib\0"
1f9ac68b 622 "ustat\0"
201c1cc2 623 "vserver\0"
8130926d
LP
624 },
625 [SYSCALL_FILTER_SET_PRIVILEGED] = {
8130926d 626 .name = "@privileged",
d5efc18b 627 .help = "All system calls which need super-user capabilities",
201c1cc2 628 .value =
44898c53 629 "@chown\0"
201c1cc2
TM
630 "@clock\0"
631 "@module\0"
632 "@raw-io\0"
af0f047b
LP
633 "@reboot\0"
634 "@swap\0"
215728ff 635 "_sysctl\0"
201c1cc2 636 "acct\0"
201c1cc2 637 "bpf\0"
1f9ac68b 638 "capset\0"
201c1cc2 639 "chroot\0"
201c1cc2
TM
640 "nfsservctl\0"
641 "pivot_root\0"
642 "quotactl\0"
201c1cc2 643 "setdomainname\0"
201c1cc2 644 "setfsuid\0"
215728ff 645 "setfsuid32\0"
201c1cc2 646 "setgroups\0"
215728ff 647 "setgroups32\0"
201c1cc2 648 "sethostname\0"
201c1cc2 649 "setresuid\0"
215728ff 650 "setresuid32\0"
201c1cc2 651 "setreuid\0"
215728ff 652 "setreuid32\0"
201c1cc2 653 "setuid\0"
215728ff 654 "setuid32\0"
201c1cc2 655 "vhangup\0"
8130926d
LP
656 },
657 [SYSCALL_FILTER_SET_PROCESS] = {
8130926d 658 .name = "@process",
d5efc18b 659 .help = "Process control, execution, namespaceing operations",
201c1cc2
TM
660 .value =
661 "arch_prctl\0"
09d3020b 662 "capget\0" /* Able to query arbitrary processes */
201c1cc2 663 "clone\0"
201c1cc2
TM
664 "execveat\0"
665 "fork\0"
b887d2eb 666 "getrusage\0"
201c1cc2
TM
667 "kill\0"
668 "prctl\0"
b887d2eb
LP
669 "rt_sigqueueinfo\0"
670 "rt_tgsigqueueinfo\0"
201c1cc2
TM
671 "setns\0"
672 "tgkill\0"
b887d2eb 673 "times\0"
201c1cc2
TM
674 "tkill\0"
675 "unshare\0"
676 "vfork\0"
b887d2eb
LP
677 "wait4\0"
678 "waitid\0"
679 "waitpid\0"
8130926d
LP
680 },
681 [SYSCALL_FILTER_SET_RAW_IO] = {
8130926d 682 .name = "@raw-io",
d5efc18b 683 .help = "Raw I/O port access",
201c1cc2
TM
684 .value =
685 "ioperm\0"
686 "iopl\0"
1f9ac68b 687 "pciconfig_iobase\0"
201c1cc2
TM
688 "pciconfig_read\0"
689 "pciconfig_write\0"
8130926d 690#ifdef __NR_s390_pci_mmio_read
201c1cc2 691 "s390_pci_mmio_read\0"
8130926d
LP
692#endif
693#ifdef __NR_s390_pci_mmio_write
201c1cc2 694 "s390_pci_mmio_write\0"
8130926d
LP
695#endif
696 },
bd2ab3f4
LP
697 [SYSCALL_FILTER_SET_REBOOT] = {
698 .name = "@reboot",
699 .help = "Reboot and reboot preparation/kexec",
700 .value =
bd2ab3f4 701 "kexec_file_load\0"
e59608fa 702 "kexec_load\0"
bd2ab3f4
LP
703 "reboot\0"
704 },
133ddbbe 705 [SYSCALL_FILTER_SET_RESOURCES] = {
133ddbbe 706 .name = "@resources",
58a8f68b 707 .help = "Alter resource settings",
133ddbbe 708 .value =
0963c053
LP
709 "ioprio_set\0"
710 "mbind\0"
711 "migrate_pages\0"
712 "move_pages\0"
713 "nice\0"
0963c053
LP
714 "sched_setaffinity\0"
715 "sched_setattr\0"
133ddbbe
LP
716 "sched_setparam\0"
717 "sched_setscheduler\0"
0963c053 718 "set_mempolicy\0"
133ddbbe
LP
719 "setpriority\0"
720 "setrlimit\0"
133ddbbe 721 },
6eaaeee9
LP
722 [SYSCALL_FILTER_SET_SETUID] = {
723 .name = "@setuid",
724 .help = "Operations for changing user/group credentials",
725 .value =
6eaaeee9 726 "setgid\0"
215728ff 727 "setgid32\0"
6eaaeee9 728 "setgroups\0"
215728ff 729 "setgroups32\0"
6eaaeee9 730 "setregid\0"
215728ff 731 "setregid32\0"
6eaaeee9 732 "setresgid\0"
215728ff 733 "setresgid32\0"
6eaaeee9 734 "setresuid\0"
215728ff 735 "setresuid32\0"
6eaaeee9 736 "setreuid\0"
215728ff 737 "setreuid32\0"
6eaaeee9 738 "setuid\0"
215728ff 739 "setuid32\0"
6eaaeee9 740 },
cd0ddf6f
LP
741 [SYSCALL_FILTER_SET_SIGNAL] = {
742 .name = "@signal",
743 .help = "Process signal handling",
744 .value =
745 "rt_sigaction\0"
746 "rt_sigpending\0"
747 "rt_sigprocmask\0"
748 "rt_sigsuspend\0"
749 "rt_sigtimedwait\0"
750 "sigaction\0"
751 "sigaltstack\0"
752 "signal\0"
753 "signalfd\0"
754 "signalfd4\0"
755 "sigpending\0"
756 "sigprocmask\0"
757 "sigsuspend\0"
758 },
bd2ab3f4
LP
759 [SYSCALL_FILTER_SET_SWAP] = {
760 .name = "@swap",
761 .help = "Enable/disable swap devices",
762 .value =
763 "swapoff\0"
764 "swapon\0"
765 },
44898c53
LP
766 [SYSCALL_FILTER_SET_SYNC] = {
767 .name = "@sync",
768 .help = "Synchronize files and memory to storage",
769 .value =
770 "fdatasync\0"
771 "fsync\0"
772 "msync\0"
773 "sync\0"
774 "sync_file_range\0"
775 "syncfs\0"
776 },
cd0ddf6f
LP
777 [SYSCALL_FILTER_SET_TIMER] = {
778 .name = "@timer",
779 .help = "Schedule operations by time",
780 .value =
781 "alarm\0"
782 "getitimer\0"
783 "setitimer\0"
784 "timer_create\0"
785 "timer_delete\0"
786 "timer_getoverrun\0"
787 "timer_gettime\0"
788 "timer_settime\0"
789 "timerfd_create\0"
790 "timerfd_gettime\0"
791 "timerfd_settime\0"
792 "times\0"
793 },
201c1cc2 794};
8130926d
LP
795
796const SyscallFilterSet *syscall_filter_set_find(const char *name) {
797 unsigned i;
798
799 if (isempty(name) || name[0] != '@')
800 return NULL;
801
802 for (i = 0; i < _SYSCALL_FILTER_SET_MAX; i++)
803 if (streq(syscall_filter_sets[i].name, name))
804 return syscall_filter_sets + i;
805
806 return NULL;
807}
808
960e4569 809static int seccomp_add_syscall_filter_set(scmp_filter_ctx seccomp, const SyscallFilterSet *set, uint32_t action, char **exclude);
69b1b241 810
960e4569 811int seccomp_add_syscall_filter_item(scmp_filter_ctx *seccomp, const char *name, uint32_t action, char **exclude) {
69b1b241
LP
812 int r;
813
814 assert(seccomp);
815 assert(name);
816
960e4569
LP
817 if (strv_contains(exclude, name))
818 return 0;
819
69b1b241
LP
820 if (name[0] == '@') {
821 const SyscallFilterSet *other;
822
823 other = syscall_filter_set_find(name);
cff7bff8
LP
824 if (!other) {
825 log_debug("Filter set %s is not known!", name);
69b1b241 826 return -EINVAL;
cff7bff8 827 }
69b1b241 828
960e4569 829 r = seccomp_add_syscall_filter_set(seccomp, other, action, exclude);
69b1b241
LP
830 if (r < 0)
831 return r;
832 } else {
833 int id;
834
835 id = seccomp_syscall_resolve_name(name);
cff7bff8 836 if (id == __NR_SCMP_ERROR) {
ff217dc3
LP
837 log_debug("System call %s is not known, ignoring.", name);
838 return 0;
cff7bff8 839 }
69b1b241
LP
840
841 r = seccomp_rule_add_exact(seccomp, action, id, 0);
842 if (r < 0)
843 /* If the system call is not known on this architecture, then that's fine, let's ignore it */
844 log_debug_errno(r, "Failed to add rule for system call %s() / %d, ignoring: %m", name, id);
845 }
846
847 return 0;
848}
849
469830d1
LP
850static int seccomp_add_syscall_filter_set(
851 scmp_filter_ctx seccomp,
469830d1 852 const SyscallFilterSet *set,
960e4569
LP
853 uint32_t action,
854 char **exclude) {
469830d1 855
8130926d
LP
856 const char *sys;
857 int r;
858
859 assert(seccomp);
860 assert(set);
861
862 NULSTR_FOREACH(sys, set->value) {
960e4569 863 r = seccomp_add_syscall_filter_item(seccomp, sys, action, exclude);
69b1b241
LP
864 if (r < 0)
865 return r;
469830d1
LP
866 }
867
868 return 0;
869}
870
871int seccomp_load_syscall_filter_set(uint32_t default_action, const SyscallFilterSet *set, uint32_t action) {
872 uint32_t arch;
873 int r;
874
875 assert(set);
876
877 /* The one-stop solution: allocate a seccomp object, add the specified filter to it, and apply it. Once for
878 * earch local arch. */
879
880 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
881 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
882
883 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
884
885 r = seccomp_init_for_arch(&seccomp, arch, default_action);
8130926d
LP
886 if (r < 0)
887 return r;
469830d1 888
960e4569 889 r = seccomp_add_syscall_filter_set(seccomp, set, action, NULL);
469830d1
LP
890 if (r < 0) {
891 log_debug_errno(r, "Failed to add filter set, ignoring: %m");
892 continue;
893 }
894
895 r = seccomp_load(seccomp);
896 if (IN_SET(r, -EPERM, -EACCES))
897 return r;
898 if (r < 0)
899 log_debug_errno(r, "Failed to install filter set for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
8130926d
LP
900 }
901
902 return 0;
903}
a3be2849 904
8cfa775f 905int seccomp_load_syscall_filter_set_raw(uint32_t default_action, Hashmap* set, uint32_t action) {
469830d1 906 uint32_t arch;
a3be2849
LP
907 int r;
908
469830d1
LP
909 /* Similar to seccomp_load_syscall_filter_set(), but takes a raw Set* of syscalls, instead of a
910 * SyscallFilterSet* table. */
a3be2849 911
8cfa775f 912 if (hashmap_isempty(set) && default_action == SCMP_ACT_ALLOW)
469830d1 913 return 0;
a3be2849 914
469830d1
LP
915 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
916 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
917 Iterator i;
8cfa775f 918 void *id, *val;
a3be2849 919
469830d1 920 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
a3be2849 921
469830d1
LP
922 r = seccomp_init_for_arch(&seccomp, arch, default_action);
923 if (r < 0)
924 return r;
a3be2849 925
8cfa775f
YW
926 HASHMAP_FOREACH_KEY(val, id, set, i) {
927 uint32_t a = action;
928 int e = PTR_TO_INT(val);
929
930 if (action != SCMP_ACT_ALLOW && e >= 0)
931 a = SCMP_ACT_ERRNO(e);
932
933 r = seccomp_rule_add_exact(seccomp, a, PTR_TO_INT(id) - 1, 0);
469830d1
LP
934 if (r < 0) {
935 /* If the system call is not known on this architecture, then that's fine, let's ignore it */
936 _cleanup_free_ char *n = NULL;
937
5c19ff79 938 n = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, PTR_TO_INT(id) - 1);
e7854c46 939 log_debug_errno(r, "Failed to add rule for system call %s() / %d, ignoring: %m", strna(n), PTR_TO_INT(id) - 1);
469830d1
LP
940 }
941 }
942
943 r = seccomp_load(seccomp);
944 if (IN_SET(r, -EPERM, -EACCES))
945 return r;
946 if (r < 0)
947 log_debug_errno(r, "Failed to install filter set for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
948 }
949
950 return 0;
add00535
LP
951}
952
13d92c63 953int seccomp_parse_syscall_filter_full(
898748d8
YW
954 const char *name,
955 int errno_num,
956 Hashmap *filter,
13d92c63 957 SeccompParseFlags flags,
898748d8
YW
958 const char *unit,
959 const char *filename,
960 unsigned line) {
961
962 int r;
963
964 assert(name);
965 assert(filter);
966
967 if (name[0] == '@') {
968 const SyscallFilterSet *set;
969 const char *i;
970
971 set = syscall_filter_set_find(name);
972 if (!set) {
13d92c63 973 if (!(flags & SECCOMP_PARSE_PERMISSIVE))
898748d8 974 return -EINVAL;
13d92c63
LP
975
976 log_syntax(unit, flags & SECCOMP_PARSE_LOG ? LOG_WARNING : LOG_DEBUG, filename, line, 0,
977 "Unknown system call group, ignoring: %s", name);
978 return 0;
898748d8
YW
979 }
980
981 NULSTR_FOREACH(i, set->value) {
13d92c63
LP
982 /* Call ourselves again, for the group to parse. Note that we downgrade logging here (i.e. take
983 * away the SECCOMP_PARSE_LOG flag) since any issues in the group table are our own problem,
984 * not a problem in user configuration data and we shouldn't pretend otherwise by complaining
985 * about them. */
986 r = seccomp_parse_syscall_filter_full(i, errno_num, filter, flags &~ SECCOMP_PARSE_LOG, unit, filename, line);
898748d8
YW
987 if (r < 0)
988 return r;
989 }
990 } else {
991 int id;
992
993 id = seccomp_syscall_resolve_name(name);
994 if (id == __NR_SCMP_ERROR) {
13d92c63 995 if (!(flags & SECCOMP_PARSE_PERMISSIVE))
898748d8 996 return -EINVAL;
13d92c63
LP
997
998 log_syntax(unit, flags & SECCOMP_PARSE_LOG ? LOG_WARNING : LOG_DEBUG, filename, line, 0,
999 "Failed to parse system call, ignoring: %s", name);
1000 return 0;
898748d8
YW
1001 }
1002
1003 /* If we previously wanted to forbid a syscall and now
1004 * we want to allow it, then remove it from the list. */
13d92c63 1005 if (!(flags & SECCOMP_PARSE_INVERT) == !!(flags & SECCOMP_PARSE_WHITELIST)) {
898748d8
YW
1006 r = hashmap_put(filter, INT_TO_PTR(id + 1), INT_TO_PTR(errno_num));
1007 if (r < 0)
13d92c63 1008 return flags & SECCOMP_PARSE_LOG ? log_oom() : -ENOMEM;
898748d8
YW
1009 } else
1010 (void) hashmap_remove(filter, INT_TO_PTR(id + 1));
1011 }
1012
1013 return 0;
1014}
1015
add00535 1016int seccomp_restrict_namespaces(unsigned long retain) {
469830d1 1017 uint32_t arch;
add00535
LP
1018 int r;
1019
f1d34068 1020 if (DEBUG_LOGGING) {
add00535
LP
1021 _cleanup_free_ char *s = NULL;
1022
1023 (void) namespace_flag_to_string_many(retain, &s);
1024 log_debug("Restricting namespace to: %s.", strna(s));
1025 }
1026
1027 /* NOOP? */
1028 if ((retain & NAMESPACE_FLAGS_ALL) == NAMESPACE_FLAGS_ALL)
1029 return 0;
1030
469830d1
LP
1031 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1032 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1033 unsigned i;
add00535 1034
469830d1
LP
1035 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1036
1037 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1038 if (r < 0)
1039 return r;
1040
1041 if ((retain & NAMESPACE_FLAGS_ALL) == 0)
1042 /* If every single kind of namespace shall be prohibited, then let's block the whole setns() syscall
1043 * altogether. */
1044 r = seccomp_rule_add_exact(
1045 seccomp,
1046 SCMP_ACT_ERRNO(EPERM),
1047 SCMP_SYS(setns),
1048 0);
1049 else
1050 /* Otherwise, block only the invocations with the appropriate flags in the loop below, but also the
1051 * special invocation with a zero flags argument, right here. */
1052 r = seccomp_rule_add_exact(
1053 seccomp,
1054 SCMP_ACT_ERRNO(EPERM),
1055 SCMP_SYS(setns),
1056 1,
1057 SCMP_A1(SCMP_CMP_EQ, 0));
1058 if (r < 0) {
1059 log_debug_errno(r, "Failed to add setns() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1060 continue;
1061 }
1062
1063 for (i = 0; namespace_flag_map[i].name; i++) {
1064 unsigned long f;
1065
1066 f = namespace_flag_map[i].flag;
1067 if ((retain & f) == f) {
1068 log_debug("Permitting %s.", namespace_flag_map[i].name);
1069 continue;
1070 }
1071
1072 log_debug("Blocking %s.", namespace_flag_map[i].name);
1073
1074 r = seccomp_rule_add_exact(
1075 seccomp,
1076 SCMP_ACT_ERRNO(EPERM),
1077 SCMP_SYS(unshare),
1078 1,
1079 SCMP_A0(SCMP_CMP_MASKED_EQ, f, f));
1080 if (r < 0) {
1081 log_debug_errno(r, "Failed to add unshare() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1082 break;
1083 }
1084
511ceb1f
ZJS
1085 /* On s390/s390x the first two parameters to clone are switched */
1086 if (!IN_SET(arch, SCMP_ARCH_S390, SCMP_ARCH_S390X))
ae9d60ce
LP
1087 r = seccomp_rule_add_exact(
1088 seccomp,
1089 SCMP_ACT_ERRNO(EPERM),
1090 SCMP_SYS(clone),
1091 1,
1092 SCMP_A0(SCMP_CMP_MASKED_EQ, f, f));
1093 else
1094 r = seccomp_rule_add_exact(
1095 seccomp,
1096 SCMP_ACT_ERRNO(EPERM),
1097 SCMP_SYS(clone),
1098 1,
1099 SCMP_A1(SCMP_CMP_MASKED_EQ, f, f));
469830d1
LP
1100 if (r < 0) {
1101 log_debug_errno(r, "Failed to add clone() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1102 break;
1103 }
1104
1105 if ((retain & NAMESPACE_FLAGS_ALL) != 0) {
1106 r = seccomp_rule_add_exact(
1107 seccomp,
1108 SCMP_ACT_ERRNO(EPERM),
1109 SCMP_SYS(setns),
1110 1,
1111 SCMP_A1(SCMP_CMP_MASKED_EQ, f, f));
1112 if (r < 0) {
1113 log_debug_errno(r, "Failed to add setns() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1114 break;
1115 }
1116 }
1117 }
1118 if (r < 0)
1119 continue;
1120
1121 r = seccomp_load(seccomp);
1122 if (IN_SET(r, -EPERM, -EACCES))
1123 return r;
1124 if (r < 0)
1125 log_debug_errno(r, "Failed to install namespace restriction rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1126 }
1127
1128 return 0;
1129}
1130
1131int seccomp_protect_sysctl(void) {
1132 uint32_t arch;
1133 int r;
1134
1135 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1136 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1137
1138 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1139
2e64e8f4
ZJS
1140 if (IN_SET(arch, SCMP_ARCH_X32, SCMP_ARCH_AARCH64))
1141 /* No _sysctl syscall */
1142 continue;
1143
469830d1
LP
1144 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1145 if (r < 0)
1146 return r;
1147
1148 r = seccomp_rule_add_exact(
add00535
LP
1149 seccomp,
1150 SCMP_ACT_ERRNO(EPERM),
469830d1 1151 SCMP_SYS(_sysctl),
add00535 1152 0);
469830d1
LP
1153 if (r < 0) {
1154 log_debug_errno(r, "Failed to add _sysctl() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1155 continue;
1156 }
1157
1158 r = seccomp_load(seccomp);
1159 if (IN_SET(r, -EPERM, -EACCES))
1160 return r;
1161 if (r < 0)
1162 log_debug_errno(r, "Failed to install sysctl protection rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1163 }
1164
1165 return 0;
1166}
1167
1168int seccomp_restrict_address_families(Set *address_families, bool whitelist) {
1169 uint32_t arch;
1170 int r;
1171
1172 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1173 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
9606bc4b 1174 bool supported;
469830d1
LP
1175 Iterator i;
1176
1177 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1178
9606bc4b
LP
1179 switch (arch) {
1180
1181 case SCMP_ARCH_X86_64:
1182 case SCMP_ARCH_X32:
1183 case SCMP_ARCH_ARM:
1184 case SCMP_ARCH_AARCH64:
0d9fca76 1185 case SCMP_ARCH_PPC:
da1921a5
ZJS
1186 case SCMP_ARCH_PPC64:
1187 case SCMP_ARCH_PPC64LE:
f5aeac14
JC
1188 case SCMP_ARCH_MIPSEL64N32:
1189 case SCMP_ARCH_MIPS64N32:
1190 case SCMP_ARCH_MIPSEL64:
1191 case SCMP_ARCH_MIPS64:
9606bc4b
LP
1192 /* These we know we support (i.e. are the ones that do not use socketcall()) */
1193 supported = true;
1194 break;
1195
9606bc4b
LP
1196 case SCMP_ARCH_S390:
1197 case SCMP_ARCH_S390X:
da1921a5 1198 case SCMP_ARCH_X86:
f5aeac14
JC
1199 case SCMP_ARCH_MIPSEL:
1200 case SCMP_ARCH_MIPS:
9606bc4b
LP
1201 default:
1202 /* These we either know we don't support (i.e. are the ones that do use socketcall()), or we
1203 * don't know */
1204 supported = false;
1205 break;
1206 }
1207
1208 if (!supported)
1209 continue;
1210
469830d1
LP
1211 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1212 if (r < 0)
1213 return r;
1214
1215 if (whitelist) {
1216 int af, first = 0, last = 0;
1217 void *afp;
1218
1219 /* If this is a whitelist, we first block the address families that are out of range and then
1220 * everything that is not in the set. First, we find the lowest and highest address family in
1221 * the set. */
1222
1223 SET_FOREACH(afp, address_families, i) {
1224 af = PTR_TO_INT(afp);
1225
1226 if (af <= 0 || af >= af_max())
1227 continue;
1228
1229 if (first == 0 || af < first)
1230 first = af;
1231
1232 if (last == 0 || af > last)
1233 last = af;
1234 }
1235
1236 assert((first == 0) == (last == 0));
1237
1238 if (first == 0) {
1239
1240 /* No entries in the valid range, block everything */
1241 r = seccomp_rule_add_exact(
1242 seccomp,
1243 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1244 SCMP_SYS(socket),
1245 0);
1246 if (r < 0) {
1247 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1248 continue;
1249 }
1250
1251 } else {
1252
1253 /* Block everything below the first entry */
1254 r = seccomp_rule_add_exact(
1255 seccomp,
1256 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1257 SCMP_SYS(socket),
1258 1,
1259 SCMP_A0(SCMP_CMP_LT, first));
1260 if (r < 0) {
1261 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1262 continue;
1263 }
1264
1265 /* Block everything above the last entry */
1266 r = seccomp_rule_add_exact(
1267 seccomp,
1268 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1269 SCMP_SYS(socket),
1270 1,
1271 SCMP_A0(SCMP_CMP_GT, last));
1272 if (r < 0) {
1273 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1274 continue;
1275 }
1276
1277 /* Block everything between the first and last entry */
1278 for (af = 1; af < af_max(); af++) {
1279
1280 if (set_contains(address_families, INT_TO_PTR(af)))
1281 continue;
1282
1283 r = seccomp_rule_add_exact(
1284 seccomp,
1285 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1286 SCMP_SYS(socket),
1287 1,
1288 SCMP_A0(SCMP_CMP_EQ, af));
1289 if (r < 0)
1290 break;
1291 }
469830d1
LP
1292 if (r < 0) {
1293 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1294 continue;
1295 }
1296 }
1297
1298 } else {
1299 void *af;
1300
1301 /* If this is a blacklist, then generate one rule for
1302 * each address family that are then combined in OR
1303 * checks. */
1304
1305 SET_FOREACH(af, address_families, i) {
1306
1307 r = seccomp_rule_add_exact(
1308 seccomp,
1309 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1310 SCMP_SYS(socket),
1311 1,
1312 SCMP_A0(SCMP_CMP_EQ, PTR_TO_INT(af)));
1313 if (r < 0)
1314 break;
1315 }
469830d1
LP
1316 if (r < 0) {
1317 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1318 continue;
1319 }
1320 }
1321
1322 r = seccomp_load(seccomp);
1323 if (IN_SET(r, -EPERM, -EACCES))
1324 return r;
1325 if (r < 0)
1326 log_debug_errno(r, "Failed to install socket family rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1327 }
1328
1329 return 0;
1330}
1331
1332int seccomp_restrict_realtime(void) {
1333 static const int permitted_policies[] = {
1334 SCHED_OTHER,
1335 SCHED_BATCH,
1336 SCHED_IDLE,
1337 };
1338
1339 int r, max_policy = 0;
1340 uint32_t arch;
1341 unsigned i;
1342
1343 /* Determine the highest policy constant we want to allow */
1344 for (i = 0; i < ELEMENTSOF(permitted_policies); i++)
1345 if (permitted_policies[i] > max_policy)
1346 max_policy = permitted_policies[i];
1347
1348 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1349 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1350 int p;
1351
1352 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1353
1354 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1355 if (r < 0)
1356 return r;
1357
1358 /* Go through all policies with lower values than that, and block them -- unless they appear in the
1359 * whitelist. */
1360 for (p = 0; p < max_policy; p++) {
1361 bool good = false;
1362
1363 /* Check if this is in the whitelist. */
1364 for (i = 0; i < ELEMENTSOF(permitted_policies); i++)
1365 if (permitted_policies[i] == p) {
1366 good = true;
1367 break;
1368 }
1369
1370 if (good)
1371 continue;
1372
1373 /* Deny this policy */
1374 r = seccomp_rule_add_exact(
1375 seccomp,
1376 SCMP_ACT_ERRNO(EPERM),
1377 SCMP_SYS(sched_setscheduler),
1378 1,
1379 SCMP_A1(SCMP_CMP_EQ, p));
1380 if (r < 0) {
1381 log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1382 continue;
1383 }
1384 }
1385
1386 /* Blacklist all other policies, i.e. the ones with higher values. Note that all comparisons are
1387 * unsigned here, hence no need no check for < 0 values. */
1388 r = seccomp_rule_add_exact(
add00535
LP
1389 seccomp,
1390 SCMP_ACT_ERRNO(EPERM),
469830d1 1391 SCMP_SYS(sched_setscheduler),
add00535 1392 1,
469830d1
LP
1393 SCMP_A1(SCMP_CMP_GT, max_policy));
1394 if (r < 0) {
1395 log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1396 continue;
1397 }
add00535 1398
469830d1
LP
1399 r = seccomp_load(seccomp);
1400 if (IN_SET(r, -EPERM, -EACCES))
1401 return r;
1402 if (r < 0)
1403 log_debug_errno(r, "Failed to install realtime protection rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1404 }
1405
1406 return 0;
1407}
1408
6dc66688
ZJS
1409static int add_seccomp_syscall_filter(scmp_filter_ctx seccomp,
1410 uint32_t arch,
1411 int nr,
1412 unsigned int arg_cnt,
1413 const struct scmp_arg_cmp arg) {
1414 int r;
1415
1416 r = seccomp_rule_add_exact(seccomp, SCMP_ACT_ERRNO(EPERM), nr, arg_cnt, arg);
1417 if (r < 0) {
1418 _cleanup_free_ char *n = NULL;
1419
1420 n = seccomp_syscall_resolve_num_arch(arch, nr);
1421 log_debug_errno(r, "Failed to add %s() rule for architecture %s, skipping: %m",
1422 strna(n),
1423 seccomp_arch_to_string(arch));
1424 }
1425
1426 return r;
1427}
1428
2a8d6e63 1429/* For known architectures, check that syscalls are indeed defined or not. */
303d6b4c 1430#if defined(__x86_64__) || defined(__arm__) || defined(__aarch64__)
2a8d6e63
ZJS
1431assert_cc(SCMP_SYS(shmget) > 0);
1432assert_cc(SCMP_SYS(shmat) > 0);
1433assert_cc(SCMP_SYS(shmdt) > 0);
303d6b4c 1434#elif defined(__i386__) || defined(__powerpc64__)
2a8d6e63
ZJS
1435assert_cc(SCMP_SYS(shmget) < 0);
1436assert_cc(SCMP_SYS(shmat) < 0);
1437assert_cc(SCMP_SYS(shmdt) < 0);
1438#endif
6dc66688 1439
469830d1 1440int seccomp_memory_deny_write_execute(void) {
8a50cf69 1441
469830d1
LP
1442 uint32_t arch;
1443 int r;
1444
1445 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1446 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
8a50cf69 1447 int filter_syscall = 0, block_syscall = 0, shmat_syscall = 0;
add00535 1448
469830d1
LP
1449 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1450
8a50cf69
LP
1451 switch (arch) {
1452
1453 case SCMP_ARCH_X86:
1454 filter_syscall = SCMP_SYS(mmap2);
1455 block_syscall = SCMP_SYS(mmap);
2a8d6e63
ZJS
1456 break;
1457
63d00dfb 1458 case SCMP_ARCH_PPC:
2a8d6e63
ZJS
1459 case SCMP_ARCH_PPC64:
1460 case SCMP_ARCH_PPC64LE:
1461 filter_syscall = SCMP_SYS(mmap);
1462
1463 /* Note that shmat() isn't available, and the call is multiplexed through ipc().
1464 * We ignore that here, which means there's still a way to get writable/executable
1465 * memory, if an IPC key is mapped like this. That's a pity, but no total loss. */
8a50cf69 1466
8a50cf69
LP
1467 break;
1468
4278d1f5
ZJS
1469 case SCMP_ARCH_ARM:
1470 filter_syscall = SCMP_SYS(mmap2); /* arm has only mmap2 */
1471 shmat_syscall = SCMP_SYS(shmat);
1472 break;
1473
8a50cf69
LP
1474 case SCMP_ARCH_X86_64:
1475 case SCMP_ARCH_X32:
79873bc8 1476 case SCMP_ARCH_AARCH64:
303d6b4c 1477 filter_syscall = SCMP_SYS(mmap); /* amd64, x32, and arm64 have only mmap */
8a50cf69
LP
1478 shmat_syscall = SCMP_SYS(shmat);
1479 break;
1480
1481 /* Please add more definitions here, if you port systemd to other architectures! */
1482
303d6b4c 1483#if !defined(__i386__) && !defined(__x86_64__) && !defined(__powerpc__) && !defined(__powerpc64__) && !defined(__arm__) && !defined(__aarch64__)
8a50cf69
LP
1484#warning "Consider adding the right mmap() syscall definitions here!"
1485#endif
1486 }
1487
1488 /* Can't filter mmap() on this arch, then skip it */
1489 if (filter_syscall == 0)
1490 continue;
1491
469830d1
LP
1492 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1493 if (r < 0)
1494 return r;
1495
6dc66688
ZJS
1496 r = add_seccomp_syscall_filter(seccomp, arch, filter_syscall,
1497 1,
1498 SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC|PROT_WRITE, PROT_EXEC|PROT_WRITE));
1499 if (r < 0)
1500 continue;
8a50cf69
LP
1501
1502 if (block_syscall != 0) {
6dc66688
ZJS
1503 r = add_seccomp_syscall_filter(seccomp, arch, block_syscall, 0, (const struct scmp_arg_cmp){} );
1504 if (r < 0)
8a50cf69 1505 continue;
add00535 1506 }
a3be2849 1507
6dc66688
ZJS
1508 r = add_seccomp_syscall_filter(seccomp, arch, SCMP_SYS(mprotect),
1509 1,
b835eeb4
ZJS
1510 SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC, PROT_EXEC));
1511 if (r < 0)
1512 continue;
1513
91691f1d 1514#ifdef __NR_pkey_mprotect
b835eeb4
ZJS
1515 r = add_seccomp_syscall_filter(seccomp, arch, SCMP_SYS(pkey_mprotect),
1516 1,
6dc66688
ZJS
1517 SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC, PROT_EXEC));
1518 if (r < 0)
469830d1 1519 continue;
91691f1d 1520#endif
add00535 1521
8a50cf69 1522 if (shmat_syscall != 0) {
6dc66688
ZJS
1523 r = add_seccomp_syscall_filter(seccomp, arch, SCMP_SYS(shmat),
1524 1,
1525 SCMP_A2(SCMP_CMP_MASKED_EQ, SHM_EXEC, SHM_EXEC));
1526 if (r < 0)
8a50cf69 1527 continue;
469830d1
LP
1528 }
1529
1530 r = seccomp_load(seccomp);
1531 if (IN_SET(r, -EPERM, -EACCES))
1532 return r;
add00535 1533 if (r < 0)
469830d1
LP
1534 log_debug_errno(r, "Failed to install MemoryDenyWriteExecute= rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1535 }
add00535 1536
469830d1
LP
1537 return 0;
1538}
1539
1540int seccomp_restrict_archs(Set *archs) {
1541 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1542 Iterator i;
1543 void *id;
1544 int r;
1545
1546 /* This installs a filter with no rules, but that restricts the system call architectures to the specified
2428aaf8
AJ
1547 * list.
1548 *
1549 * There are some qualifications. However the most important use is to stop processes from bypassing
1550 * system call restrictions, in case they used a broader (multiplexing) syscall which is only available
1551 * in a non-native architecture. There are no holes in this use case, at least so far. */
469830d1 1552
2428aaf8
AJ
1553 /* Note libseccomp includes our "native" (current) architecture in the filter by default.
1554 * We do not remove it. For example, our callers expect to be able to call execve() afterwards
1555 * to run a program with the restrictions applied. */
469830d1
LP
1556 seccomp = seccomp_init(SCMP_ACT_ALLOW);
1557 if (!seccomp)
1558 return -ENOMEM;
1559
1560 SET_FOREACH(id, archs, i) {
1561 r = seccomp_arch_add(seccomp, PTR_TO_UINT32(id) - 1);
2428aaf8
AJ
1562 if (r < 0 && r != -EEXIST)
1563 return r;
1564 }
1565
1566 /* The vdso for x32 assumes that x86-64 syscalls are available. Let's allow them, since x32
1567 * x32 syscalls should basically match x86-64 for everything except the pointer type.
1568 * The important thing is that you can block the old 32-bit x86 syscalls.
1569 * https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=850047 */
1570
1571 if (seccomp_arch_native() == SCMP_ARCH_X32 ||
1572 set_contains(archs, UINT32_TO_PTR(SCMP_ARCH_X32 + 1))) {
1573
1574 r = seccomp_arch_add(seccomp, SCMP_ARCH_X86_64);
1575 if (r < 0 && r != -EEXIST)
469830d1 1576 return r;
add00535
LP
1577 }
1578
469830d1
LP
1579 r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
1580 if (r < 0)
1581 return r;
add00535 1582
1c6af69b
LP
1583 r = seccomp_load(seccomp);
1584 if (IN_SET(r, -EPERM, -EACCES))
1585 return r;
1586 if (r < 0)
1587 log_debug_errno(r, "Failed to restrict system call architectures, skipping: %m");
1588
1589 return 0;
a3be2849 1590}
b16bd535
YW
1591
1592int parse_syscall_archs(char **l, Set **archs) {
1593 _cleanup_set_free_ Set *_archs;
1594 char **s;
1595 int r;
1596
1597 assert(l);
1598 assert(archs);
1599
1600 r = set_ensure_allocated(&_archs, NULL);
1601 if (r < 0)
1602 return r;
1603
1604 STRV_FOREACH(s, l) {
1605 uint32_t a;
1606
1607 r = seccomp_arch_from_string(*s, &a);
1608 if (r < 0)
1609 return -EINVAL;
1610
1611 r = set_put(_archs, UINT32_TO_PTR(a + 1));
1612 if (r < 0)
1613 return -ENOMEM;
1614 }
1615
1cc6c93a 1616 *archs = TAKE_PTR(_archs);
b16bd535
YW
1617
1618 return 0;
1619}
165a31c0 1620
8cfa775f 1621int seccomp_filter_set_add(Hashmap *filter, bool add, const SyscallFilterSet *set) {
165a31c0
LP
1622 const char *i;
1623 int r;
1624
1625 assert(set);
1626
1627 NULSTR_FOREACH(i, set->value) {
1628
1629 if (i[0] == '@') {
1630 const SyscallFilterSet *more;
1631
1632 more = syscall_filter_set_find(i);
1633 if (!more)
1634 return -ENXIO;
1635
165a31c0
LP
1636 r = seccomp_filter_set_add(filter, add, more);
1637 if (r < 0)
1638 return r;
1639 } else {
1640 int id;
1641
1642 id = seccomp_syscall_resolve_name(i);
ff217dc3
LP
1643 if (id == __NR_SCMP_ERROR) {
1644 log_debug("Couldn't resolve system call, ignoring: %s", i);
1645 continue;
1646 }
165a31c0
LP
1647
1648 if (add) {
8cfa775f 1649 r = hashmap_put(filter, INT_TO_PTR(id + 1), INT_TO_PTR(-1));
165a31c0
LP
1650 if (r < 0)
1651 return r;
1652 } else
8cfa775f 1653 (void) hashmap_remove(filter, INT_TO_PTR(id + 1));
165a31c0
LP
1654 }
1655 }
1656
1657 return 0;
1658}
78e864e5
TM
1659
1660int seccomp_lock_personality(unsigned long personality) {
72eafe71 1661 uint32_t arch;
78e864e5
TM
1662 int r;
1663
72eafe71
LP
1664 if (personality >= PERSONALITY_INVALID)
1665 return -EINVAL;
78e864e5 1666
72eafe71
LP
1667 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1668 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
78e864e5 1669
72eafe71
LP
1670 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1671 if (r < 0)
1672 return r;
1673
1674 r = seccomp_rule_add_exact(
1675 seccomp,
1676 SCMP_ACT_ERRNO(EPERM),
1677 SCMP_SYS(personality),
1678 1,
1679 SCMP_A0(SCMP_CMP_NE, personality));
448ac526
LP
1680 if (r < 0) {
1681 log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1682 continue;
1683 }
72eafe71
LP
1684
1685 r = seccomp_load(seccomp);
1686 if (IN_SET(r, -EPERM, -EACCES))
1687 return r;
1688 if (r < 0)
1689 log_debug_errno(r, "Failed to enable personality lock for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1690 }
1691
1692 return 0;
78e864e5 1693}