]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/shared/seccomp-util.c
seccomp: add three more seccomp groups
[thirdparty/systemd.git] / src / shared / seccomp-util.c
CommitLineData
57183d11
LP
1/***
2 This file is part of systemd.
3
4 Copyright 2014 Lennart Poettering
5
6 systemd is free software; you can redistribute it and/or modify it
7 under the terms of the GNU Lesser General Public License as published by
8 the Free Software Foundation; either version 2.1 of the License, or
9 (at your option) any later version.
10
11 systemd is distributed in the hope that it will be useful, but
12 WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public License
17 along with systemd; If not, see <http://www.gnu.org/licenses/>.
18***/
19
a8fbdf54 20#include <errno.h>
469830d1 21#include <linux/seccomp.h>
57183d11 22#include <seccomp.h>
a8fbdf54 23#include <stddef.h>
469830d1 24#include <sys/mman.h>
d347d902 25#include <sys/prctl.h>
469830d1 26#include <sys/shm.h>
57183d11 27
469830d1 28#include "af-list.h"
add00535 29#include "alloc-util.h"
a8fbdf54 30#include "macro.h"
add00535 31#include "nsflags.h"
78e864e5 32#include "process-util.h"
cf0fbc49 33#include "seccomp-util.h"
b16bd535 34#include "set.h"
07630cea 35#include "string-util.h"
b16bd535 36#include "strv.h"
8130926d 37#include "util.h"
469830d1
LP
38#include "errno-list.h"
39
40const uint32_t seccomp_local_archs[] = {
41
f2d9751c
LP
42 /* Note: always list the native arch we are compiled as last, so that users can blacklist seccomp(), but our own calls to it still succeed */
43
44#if defined(__x86_64__) && defined(__ILP32__)
469830d1
LP
45 SCMP_ARCH_X86,
46 SCMP_ARCH_X86_64,
f2d9751c
LP
47 SCMP_ARCH_X32, /* native */
48#elif defined(__x86_64__) && !defined(__ILP32__)
49 SCMP_ARCH_X86,
469830d1 50 SCMP_ARCH_X32,
f2d9751c
LP
51 SCMP_ARCH_X86_64, /* native */
52#elif defined(__i386__)
53 SCMP_ARCH_X86,
54#elif defined(__aarch64__)
469830d1 55 SCMP_ARCH_ARM,
f2d9751c
LP
56 SCMP_ARCH_AARCH64, /* native */
57#elif defined(__arm__)
58 SCMP_ARCH_ARM,
59#elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI32
60 SCMP_ARCH_MIPSEL,
61 SCMP_ARCH_MIPS, /* native */
62#elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI32
469830d1 63 SCMP_ARCH_MIPS,
f2d9751c
LP
64 SCMP_ARCH_MIPSEL, /* native */
65#elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI64
66 SCMP_ARCH_MIPSEL,
67 SCMP_ARCH_MIPS,
68 SCMP_ARCH_MIPSEL64N32,
469830d1 69 SCMP_ARCH_MIPS64N32,
f2d9751c
LP
70 SCMP_ARCH_MIPSEL64,
71 SCMP_ARCH_MIPS64, /* native */
72#elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI64
73 SCMP_ARCH_MIPS,
469830d1 74 SCMP_ARCH_MIPSEL,
f2d9751c
LP
75 SCMP_ARCH_MIPS64N32,
76 SCMP_ARCH_MIPSEL64N32,
77 SCMP_ARCH_MIPS64,
78 SCMP_ARCH_MIPSEL64, /* native */
79#elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_NABI32
80 SCMP_ARCH_MIPSEL,
81 SCMP_ARCH_MIPS,
469830d1 82 SCMP_ARCH_MIPSEL64,
f2d9751c 83 SCMP_ARCH_MIPS64,
469830d1 84 SCMP_ARCH_MIPSEL64N32,
f2d9751c
LP
85 SCMP_ARCH_MIPS64N32, /* native */
86#elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_NABI32
87 SCMP_ARCH_MIPS,
88 SCMP_ARCH_MIPSEL,
89 SCMP_ARCH_MIPS64,
90 SCMP_ARCH_MIPSEL64,
91 SCMP_ARCH_MIPS64N32,
92 SCMP_ARCH_MIPSEL64N32, /* native */
93#elif defined(__powerpc64__) && __BYTE_ORDER == __BIG_ENDIAN
469830d1 94 SCMP_ARCH_PPC,
469830d1 95 SCMP_ARCH_PPC64LE,
f2d9751c
LP
96 SCMP_ARCH_PPC64, /* native */
97#elif defined(__powerpc64__) && __BYTE_ORDER == __LITTLE_ENDIAN
98 SCMP_ARCH_PPC,
99 SCMP_ARCH_PPC64,
100 SCMP_ARCH_PPC64LE, /* native */
101#elif defined(__powerpc__)
102 SCMP_ARCH_PPC,
103#elif defined(__s390x__)
104 SCMP_ARCH_S390,
105 SCMP_ARCH_S390X, /* native */
106#elif defined(__s390__)
469830d1 107 SCMP_ARCH_S390,
469830d1
LP
108#endif
109 (uint32_t) -1
110 };
57183d11
LP
111
112const char* seccomp_arch_to_string(uint32_t c) {
aa34055f
ZJS
113 /* Maintain order used in <seccomp.h>.
114 *
115 * Names used here should be the same as those used for ConditionArchitecture=,
116 * except for "subarchitectures" like x32. */
57183d11 117
aa34055f
ZJS
118 switch(c) {
119 case SCMP_ARCH_NATIVE:
57183d11 120 return "native";
aa34055f 121 case SCMP_ARCH_X86:
57183d11 122 return "x86";
aa34055f 123 case SCMP_ARCH_X86_64:
57183d11 124 return "x86-64";
aa34055f 125 case SCMP_ARCH_X32:
57183d11 126 return "x32";
aa34055f 127 case SCMP_ARCH_ARM:
57183d11 128 return "arm";
aa34055f
ZJS
129 case SCMP_ARCH_AARCH64:
130 return "arm64";
131 case SCMP_ARCH_MIPS:
132 return "mips";
133 case SCMP_ARCH_MIPS64:
134 return "mips64";
135 case SCMP_ARCH_MIPS64N32:
136 return "mips64-n32";
137 case SCMP_ARCH_MIPSEL:
138 return "mips-le";
139 case SCMP_ARCH_MIPSEL64:
140 return "mips64-le";
141 case SCMP_ARCH_MIPSEL64N32:
142 return "mips64-le-n32";
143 case SCMP_ARCH_PPC:
144 return "ppc";
145 case SCMP_ARCH_PPC64:
146 return "ppc64";
147 case SCMP_ARCH_PPC64LE:
148 return "ppc64-le";
149 case SCMP_ARCH_S390:
6abfd303 150 return "s390";
aa34055f 151 case SCMP_ARCH_S390X:
6abfd303 152 return "s390x";
aa34055f
ZJS
153 default:
154 return NULL;
155 }
57183d11
LP
156}
157
158int seccomp_arch_from_string(const char *n, uint32_t *ret) {
159 if (!n)
160 return -EINVAL;
161
162 assert(ret);
163
164 if (streq(n, "native"))
165 *ret = SCMP_ARCH_NATIVE;
166 else if (streq(n, "x86"))
167 *ret = SCMP_ARCH_X86;
168 else if (streq(n, "x86-64"))
169 *ret = SCMP_ARCH_X86_64;
170 else if (streq(n, "x32"))
171 *ret = SCMP_ARCH_X32;
172 else if (streq(n, "arm"))
173 *ret = SCMP_ARCH_ARM;
aa34055f
ZJS
174 else if (streq(n, "arm64"))
175 *ret = SCMP_ARCH_AARCH64;
176 else if (streq(n, "mips"))
177 *ret = SCMP_ARCH_MIPS;
178 else if (streq(n, "mips64"))
179 *ret = SCMP_ARCH_MIPS64;
180 else if (streq(n, "mips64-n32"))
181 *ret = SCMP_ARCH_MIPS64N32;
182 else if (streq(n, "mips-le"))
183 *ret = SCMP_ARCH_MIPSEL;
184 else if (streq(n, "mips64-le"))
185 *ret = SCMP_ARCH_MIPSEL64;
186 else if (streq(n, "mips64-le-n32"))
187 *ret = SCMP_ARCH_MIPSEL64N32;
188 else if (streq(n, "ppc"))
189 *ret = SCMP_ARCH_PPC;
190 else if (streq(n, "ppc64"))
191 *ret = SCMP_ARCH_PPC64;
192 else if (streq(n, "ppc64-le"))
193 *ret = SCMP_ARCH_PPC64LE;
6abfd303
HB
194 else if (streq(n, "s390"))
195 *ret = SCMP_ARCH_S390;
196 else if (streq(n, "s390x"))
197 *ret = SCMP_ARCH_S390X;
57183d11
LP
198 else
199 return -EINVAL;
200
201 return 0;
202}
e9642be2 203
469830d1 204int seccomp_init_for_arch(scmp_filter_ctx *ret, uint32_t arch, uint32_t default_action) {
8d7b0c8f
LP
205 scmp_filter_ctx seccomp;
206 int r;
207
469830d1
LP
208 /* Much like seccomp_init(), but initializes the filter for one specific architecture only, without affecting
209 * any others. Also, turns off the NNP fiddling. */
8d7b0c8f
LP
210
211 seccomp = seccomp_init(default_action);
212 if (!seccomp)
213 return -ENOMEM;
214
469830d1
LP
215 if (arch != SCMP_ARCH_NATIVE &&
216 arch != seccomp_arch_native()) {
217
1b52793d 218 r = seccomp_arch_remove(seccomp, seccomp_arch_native());
469830d1
LP
219 if (r < 0)
220 goto finish;
221
1b52793d 222 r = seccomp_arch_add(seccomp, arch);
469830d1
LP
223 if (r < 0)
224 goto finish;
225
226 assert(seccomp_arch_exist(seccomp, arch) >= 0);
227 assert(seccomp_arch_exist(seccomp, SCMP_ARCH_NATIVE) == -EEXIST);
228 assert(seccomp_arch_exist(seccomp, seccomp_arch_native()) == -EEXIST);
229 } else {
230 assert(seccomp_arch_exist(seccomp, SCMP_ARCH_NATIVE) >= 0);
231 assert(seccomp_arch_exist(seccomp, seccomp_arch_native()) >= 0);
232 }
233
234 r = seccomp_attr_set(seccomp, SCMP_FLTATR_ACT_BADARCH, SCMP_ACT_ALLOW);
8d7b0c8f
LP
235 if (r < 0)
236 goto finish;
237
238 r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
239 if (r < 0)
240 goto finish;
241
242 *ret = seccomp;
243 return 0;
244
245finish:
246 seccomp_release(seccomp);
247 return r;
248}
249
/* Returns true if querying the current seccomp mode via prctl(PR_GET_SECCOMP) does not fail. */
static bool is_basic_seccomp_available(void) {
        int mode;

        mode = prctl(PR_GET_SECCOMP, 0, 0, 0, 0);

        return mode >= 0;
}
253
/* Probes for seccomp filter mode: issues PR_SET_SECCOMP with SECCOMP_MODE_FILTER and a NULL filter pointer,
 * and reports true only if that call fails with errno set to EFAULT. */
static bool is_seccomp_filter_available(void) {
        /* If the call unexpectedly succeeds, filter mode is not considered available */
        if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, NULL, 0, 0) >= 0)
                return false;

        return errno == EFAULT;
}
258
/* Returns whether both the basic and the filter seccomp probes succeed. The result is computed on the first
 * call and kept in a function-local static afterwards. */
bool is_seccomp_available(void) {
        static int cached_enabled = -1; /* -1: not probed yet, otherwise the boolean result */

        if (cached_enabled >= 0)
                return cached_enabled;

        cached_enabled = is_basic_seccomp_available() && is_seccomp_filter_available();

        return cached_enabled;
}
269
8130926d 270const SyscallFilterSet syscall_filter_sets[_SYSCALL_FILTER_SET_MAX] = {
40eb6a80 271 [SYSCALL_FILTER_SET_DEFAULT] = {
40eb6a80 272 .name = "@default",
d5efc18b 273 .help = "System calls that are always permitted",
40eb6a80
ZJS
274 .value =
275 "clock_getres\0"
276 "clock_gettime\0"
277 "clock_nanosleep\0"
278 "execve\0"
279 "exit\0"
280 "exit_group\0"
e41b0f42
LP
281 "futex\0"
282 "get_robust_list\0"
283 "get_thread_area\0"
09d3020b
DH
284 "getegid\0"
285 "getegid32\0"
286 "geteuid\0"
287 "geteuid32\0"
288 "getgid\0"
289 "getgid32\0"
290 "getgroups\0"
291 "getgroups32\0"
292 "getpgid\0"
293 "getpgrp\0"
294 "getpid\0"
295 "getppid\0"
296 "getresgid\0"
297 "getresgid32\0"
298 "getresuid\0"
299 "getresuid32\0"
40eb6a80 300 "getrlimit\0" /* make sure processes can query stack size and such */
09d3020b
DH
301 "getsid\0"
302 "gettid\0"
40eb6a80 303 "gettimeofday\0"
09d3020b
DH
304 "getuid\0"
305 "getuid32\0"
e41b0f42 306 "membarrier\0"
40eb6a80
ZJS
307 "nanosleep\0"
308 "pause\0"
4c3a9176 309 "prlimit64\0"
e41b0f42 310 "restart_syscall\0"
40eb6a80 311 "rt_sigreturn\0"
8f44de08 312 "sched_yield\0"
e41b0f42
LP
313 "set_robust_list\0"
314 "set_thread_area\0"
315 "set_tid_address\0"
40eb6a80
ZJS
316 "sigreturn\0"
317 "time\0"
4c3a9176 318 "ugetrlimit\0"
40eb6a80 319 },
44898c53
LP
320 [SYSCALL_FILTER_SET_AIO] = {
321 .name = "@aio",
322 .help = "Asynchronous IO",
323 .value =
324 "io_cancel\0"
325 "io_destroy\0"
326 "io_getevents\0"
327 "io_setup\0"
328 "io_submit\0"
329 },
133ddbbe 330 [SYSCALL_FILTER_SET_BASIC_IO] = {
133ddbbe 331 .name = "@basic-io",
d5efc18b 332 .help = "Basic IO",
133ddbbe 333 .value =
648a0ed0 334 "_llseek\0"
133ddbbe 335 "close\0"
648a0ed0 336 "dup\0"
133ddbbe
LP
337 "dup2\0"
338 "dup3\0"
133ddbbe
LP
339 "lseek\0"
340 "pread64\0"
341 "preadv\0"
44898c53 342 "preadv2\0"
133ddbbe
LP
343 "pwrite64\0"
344 "pwritev\0"
44898c53 345 "pwritev2\0"
133ddbbe
LP
346 "read\0"
347 "readv\0"
348 "write\0"
349 "writev\0"
350 },
44898c53
LP
351 [SYSCALL_FILTER_SET_CHOWN] = {
352 .name = "@chown",
353 .help = "Change ownership of files and directories",
354 .value =
355 "chown\0"
356 "chown32\0"
357 "fchown\0"
358 "fchown32\0"
359 "fchownat\0"
360 "lchown\0"
361 "lchown32\0"
362 },
8130926d 363 [SYSCALL_FILTER_SET_CLOCK] = {
8130926d 364 .name = "@clock",
d5efc18b 365 .help = "Change the system time",
201c1cc2
TM
366 .value =
367 "adjtimex\0"
1f9ac68b
LP
368 "clock_adjtime\0"
369 "clock_settime\0"
201c1cc2 370 "settimeofday\0"
1f9ac68b 371 "stime\0"
8130926d
LP
372 },
373 [SYSCALL_FILTER_SET_CPU_EMULATION] = {
8130926d 374 .name = "@cpu-emulation",
d5efc18b 375 .help = "System calls for CPU emulation functionality",
1f9ac68b
LP
376 .value =
377 "modify_ldt\0"
378 "subpage_prot\0"
379 "switch_endian\0"
380 "vm86\0"
381 "vm86old\0"
8130926d
LP
382 },
383 [SYSCALL_FILTER_SET_DEBUG] = {
8130926d 384 .name = "@debug",
d5efc18b 385 .help = "Debugging, performance monitoring and tracing functionality",
1f9ac68b
LP
386 .value =
387 "lookup_dcookie\0"
388 "perf_event_open\0"
389 "process_vm_readv\0"
390 "process_vm_writev\0"
391 "ptrace\0"
392 "rtas\0"
8130926d 393#ifdef __NR_s390_runtime_instr
1f9ac68b 394 "s390_runtime_instr\0"
8130926d 395#endif
1f9ac68b 396 "sys_debug_setcontext\0"
8130926d 397 },
1a1b13c9
LP
398 [SYSCALL_FILTER_SET_FILE_SYSTEM] = {
399 .name = "@file-system",
400 .help = "File system operations",
401 .value =
402 "access\0"
403 "chdir\0"
404 "chmod\0"
405 "close\0"
406 "creat\0"
407 "faccessat\0"
408 "fallocate\0"
409 "fchdir\0"
410 "fchmod\0"
411 "fchmodat\0"
1a1b13c9 412 "fcntl\0"
ceaa6aa7 413 "fcntl64\0"
1a1b13c9
LP
414 "fgetxattr\0"
415 "flistxattr\0"
ceaa6aa7 416 "fremovexattr\0"
1a1b13c9 417 "fsetxattr\0"
1a1b13c9 418 "fstat\0"
ceaa6aa7 419 "fstat64\0"
1a1b13c9 420 "fstatat64\0"
1a1b13c9 421 "fstatfs\0"
ceaa6aa7 422 "fstatfs64\0"
1a1b13c9 423 "ftruncate\0"
ceaa6aa7 424 "ftruncate64\0"
1a1b13c9
LP
425 "futimesat\0"
426 "getcwd\0"
1a1b13c9 427 "getdents\0"
ceaa6aa7 428 "getdents64\0"
1a1b13c9
LP
429 "getxattr\0"
430 "inotify_add_watch\0"
ceaa6aa7 431 "inotify_init\0"
1a1b13c9
LP
432 "inotify_init1\0"
433 "inotify_rm_watch\0"
434 "lgetxattr\0"
435 "link\0"
436 "linkat\0"
437 "listxattr\0"
438 "llistxattr\0"
439 "lremovexattr\0"
440 "lsetxattr\0"
1a1b13c9 441 "lstat\0"
ceaa6aa7 442 "lstat64\0"
1a1b13c9
LP
443 "mkdir\0"
444 "mkdirat\0"
445 "mknod\0"
446 "mknodat\0"
1a1b13c9 447 "mmap\0"
ceaa6aa7 448 "mmap2\0"
7961116e 449 "munmap\0"
1a1b13c9 450 "newfstatat\0"
ceaa6aa7
LP
451 "oldfstat\0"
452 "oldlstat\0"
453 "oldstat\0"
1a1b13c9
LP
454 "open\0"
455 "openat\0"
456 "readlink\0"
457 "readlinkat\0"
458 "removexattr\0"
459 "rename\0"
1a1b13c9 460 "renameat\0"
ceaa6aa7 461 "renameat2\0"
1a1b13c9
LP
462 "rmdir\0"
463 "setxattr\0"
1a1b13c9 464 "stat\0"
ceaa6aa7 465 "stat64\0"
1a1b13c9 466 "statfs\0"
ceaa6aa7
LP
467 "statfs64\0"
468#ifdef __PNR_statx
a4135a74 469 "statx\0"
ceaa6aa7 470#endif
1a1b13c9
LP
471 "symlink\0"
472 "symlinkat\0"
1a1b13c9 473 "truncate\0"
ceaa6aa7 474 "truncate64\0"
1a1b13c9
LP
475 "unlink\0"
476 "unlinkat\0"
ceaa6aa7 477 "utime\0"
1a1b13c9
LP
478 "utimensat\0"
479 "utimes\0"
480 },
8130926d 481 [SYSCALL_FILTER_SET_IO_EVENT] = {
8130926d 482 .name = "@io-event",
d5efc18b 483 .help = "Event loop system calls",
201c1cc2
TM
484 .value =
485 "_newselect\0"
201c1cc2 486 "epoll_create\0"
215728ff 487 "epoll_create1\0"
201c1cc2
TM
488 "epoll_ctl\0"
489 "epoll_ctl_old\0"
490 "epoll_pwait\0"
491 "epoll_wait\0"
492 "epoll_wait_old\0"
201c1cc2 493 "eventfd\0"
215728ff 494 "eventfd2\0"
201c1cc2
TM
495 "poll\0"
496 "ppoll\0"
497 "pselect6\0"
498 "select\0"
8130926d
LP
499 },
500 [SYSCALL_FILTER_SET_IPC] = {
8130926d 501 .name = "@ipc",
d5efc18b
ZJS
502 .help = "SysV IPC, POSIX Message Queues or other IPC",
503 .value =
504 "ipc\0"
cd5bfd7e 505 "memfd_create\0"
201c1cc2
TM
506 "mq_getsetattr\0"
507 "mq_notify\0"
508 "mq_open\0"
509 "mq_timedreceive\0"
510 "mq_timedsend\0"
511 "mq_unlink\0"
512 "msgctl\0"
513 "msgget\0"
514 "msgrcv\0"
515 "msgsnd\0"
cd5bfd7e 516 "pipe\0"
215728ff 517 "pipe2\0"
201c1cc2
TM
518 "process_vm_readv\0"
519 "process_vm_writev\0"
520 "semctl\0"
521 "semget\0"
522 "semop\0"
523 "semtimedop\0"
524 "shmat\0"
525 "shmctl\0"
526 "shmdt\0"
527 "shmget\0"
8130926d
LP
528 },
529 [SYSCALL_FILTER_SET_KEYRING] = {
8130926d 530 .name = "@keyring",
d5efc18b 531 .help = "Kernel keyring access",
1f9ac68b
LP
532 .value =
533 "add_key\0"
534 "keyctl\0"
535 "request_key\0"
8130926d 536 },
cd0ddf6f
LP
537 [SYSCALL_FILTER_SET_MEMLOCK] = {
538 .name = "@memlock",
539 .help = "Memory locking control",
540 .value =
541 "mlock\0"
542 "mlock2\0"
543 "mlockall\0"
544 "munlock\0"
545 "munlockall\0"
546 },
8130926d 547 [SYSCALL_FILTER_SET_MODULE] = {
8130926d 548 .name = "@module",
d5efc18b 549 .help = "Loading and unloading of kernel modules",
201c1cc2 550 .value =
201c1cc2
TM
551 "delete_module\0"
552 "finit_module\0"
553 "init_module\0"
8130926d
LP
554 },
555 [SYSCALL_FILTER_SET_MOUNT] = {
8130926d 556 .name = "@mount",
d5efc18b 557 .help = "Mounting and unmounting of file systems",
201c1cc2
TM
558 .value =
559 "chroot\0"
560 "mount\0"
201c1cc2 561 "pivot_root\0"
201c1cc2 562 "umount\0"
215728ff 563 "umount2\0"
8130926d
LP
564 },
565 [SYSCALL_FILTER_SET_NETWORK_IO] = {
8130926d 566 .name = "@network-io",
d5efc18b 567 .help = "Network or Unix socket IO, should not be needed if not network facing",
201c1cc2 568 .value =
201c1cc2 569 "accept\0"
215728ff 570 "accept4\0"
201c1cc2
TM
571 "bind\0"
572 "connect\0"
573 "getpeername\0"
574 "getsockname\0"
575 "getsockopt\0"
576 "listen\0"
577 "recv\0"
578 "recvfrom\0"
579 "recvmmsg\0"
580 "recvmsg\0"
581 "send\0"
582 "sendmmsg\0"
583 "sendmsg\0"
584 "sendto\0"
585 "setsockopt\0"
586 "shutdown\0"
587 "socket\0"
588 "socketcall\0"
589 "socketpair\0"
8130926d
LP
590 },
591 [SYSCALL_FILTER_SET_OBSOLETE] = {
d5efc18b 592 /* some unknown even to libseccomp */
8130926d 593 .name = "@obsolete",
d5efc18b 594 .help = "Unusual, obsolete or unimplemented system calls",
201c1cc2
TM
595 .value =
596 "_sysctl\0"
597 "afs_syscall\0"
802fa07a 598 "bdflush\0"
201c1cc2 599 "break\0"
1f9ac68b 600 "create_module\0"
201c1cc2
TM
601 "ftime\0"
602 "get_kernel_syms\0"
201c1cc2
TM
603 "getpmsg\0"
604 "gtty\0"
7e0c3b8f 605 "idle\0"
201c1cc2 606 "lock\0"
201c1cc2 607 "mpx\0"
201c1cc2
TM
608 "prof\0"
609 "profil\0"
201c1cc2
TM
610 "putpmsg\0"
611 "query_module\0"
201c1cc2
TM
612 "security\0"
613 "sgetmask\0"
614 "ssetmask\0"
615 "stty\0"
1f9ac68b 616 "sysfs\0"
201c1cc2
TM
617 "tuxcall\0"
618 "ulimit\0"
619 "uselib\0"
1f9ac68b 620 "ustat\0"
201c1cc2 621 "vserver\0"
8130926d
LP
622 },
623 [SYSCALL_FILTER_SET_PRIVILEGED] = {
8130926d 624 .name = "@privileged",
d5efc18b 625 .help = "All system calls which need super-user capabilities",
201c1cc2 626 .value =
44898c53 627 "@chown\0"
201c1cc2
TM
628 "@clock\0"
629 "@module\0"
630 "@raw-io\0"
215728ff 631 "_sysctl\0"
201c1cc2 632 "acct\0"
201c1cc2 633 "bpf\0"
1f9ac68b 634 "capset\0"
201c1cc2 635 "chroot\0"
201c1cc2
TM
636 "kexec_file_load\0"
637 "kexec_load\0"
201c1cc2
TM
638 "nfsservctl\0"
639 "pivot_root\0"
640 "quotactl\0"
641 "reboot\0"
642 "setdomainname\0"
201c1cc2 643 "setfsuid\0"
215728ff 644 "setfsuid32\0"
201c1cc2 645 "setgroups\0"
215728ff 646 "setgroups32\0"
201c1cc2 647 "sethostname\0"
201c1cc2 648 "setresuid\0"
215728ff 649 "setresuid32\0"
201c1cc2 650 "setreuid\0"
215728ff 651 "setreuid32\0"
201c1cc2 652 "setuid\0"
215728ff 653 "setuid32\0"
201c1cc2
TM
654 "swapoff\0"
655 "swapon\0"
201c1cc2 656 "vhangup\0"
8130926d
LP
657 },
658 [SYSCALL_FILTER_SET_PROCESS] = {
8130926d 659 .name = "@process",
d5efc18b 660 .help = "Process control, execution, namespaceing operations",
201c1cc2
TM
661 .value =
662 "arch_prctl\0"
09d3020b 663 "capget\0" /* Able to query arbitrary processes */
201c1cc2 664 "clone\0"
201c1cc2
TM
665 "execveat\0"
666 "fork\0"
b887d2eb 667 "getrusage\0"
201c1cc2
TM
668 "kill\0"
669 "prctl\0"
b887d2eb
LP
670 "rt_sigqueueinfo\0"
671 "rt_tgsigqueueinfo\0"
201c1cc2
TM
672 "setns\0"
673 "tgkill\0"
b887d2eb 674 "times\0"
201c1cc2
TM
675 "tkill\0"
676 "unshare\0"
677 "vfork\0"
b887d2eb
LP
678 "wait4\0"
679 "waitid\0"
680 "waitpid\0"
8130926d
LP
681 },
682 [SYSCALL_FILTER_SET_RAW_IO] = {
8130926d 683 .name = "@raw-io",
d5efc18b 684 .help = "Raw I/O port access",
201c1cc2
TM
685 .value =
686 "ioperm\0"
687 "iopl\0"
1f9ac68b 688 "pciconfig_iobase\0"
201c1cc2
TM
689 "pciconfig_read\0"
690 "pciconfig_write\0"
8130926d 691#ifdef __NR_s390_pci_mmio_read
201c1cc2 692 "s390_pci_mmio_read\0"
8130926d
LP
693#endif
694#ifdef __NR_s390_pci_mmio_write
201c1cc2 695 "s390_pci_mmio_write\0"
8130926d
LP
696#endif
697 },
bd2ab3f4
LP
698 [SYSCALL_FILTER_SET_REBOOT] = {
699 .name = "@reboot",
700 .help = "Reboot and reboot preparation/kexec",
701 .value =
702 "kexec\0"
703 "kexec_file_load\0"
704 "reboot\0"
705 },
133ddbbe 706 [SYSCALL_FILTER_SET_RESOURCES] = {
133ddbbe 707 .name = "@resources",
58a8f68b 708 .help = "Alter resource settings",
133ddbbe 709 .value =
0963c053
LP
710 "ioprio_set\0"
711 "mbind\0"
712 "migrate_pages\0"
713 "move_pages\0"
714 "nice\0"
0963c053
LP
715 "sched_setaffinity\0"
716 "sched_setattr\0"
133ddbbe
LP
717 "sched_setparam\0"
718 "sched_setscheduler\0"
0963c053 719 "set_mempolicy\0"
133ddbbe
LP
720 "setpriority\0"
721 "setrlimit\0"
133ddbbe 722 },
6eaaeee9
LP
723 [SYSCALL_FILTER_SET_SETUID] = {
724 .name = "@setuid",
725 .help = "Operations for changing user/group credentials",
726 .value =
6eaaeee9 727 "setgid\0"
215728ff 728 "setgid32\0"
6eaaeee9 729 "setgroups\0"
215728ff 730 "setgroups32\0"
6eaaeee9 731 "setregid\0"
215728ff 732 "setregid32\0"
6eaaeee9 733 "setresgid\0"
215728ff 734 "setresgid32\0"
6eaaeee9 735 "setresuid\0"
215728ff 736 "setresuid32\0"
6eaaeee9 737 "setreuid\0"
215728ff 738 "setreuid32\0"
6eaaeee9 739 "setuid\0"
215728ff 740 "setuid32\0"
6eaaeee9 741 },
cd0ddf6f
LP
742 [SYSCALL_FILTER_SET_SIGNAL] = {
743 .name = "@signal",
744 .help = "Process signal handling",
745 .value =
746 "rt_sigaction\0"
747 "rt_sigpending\0"
748 "rt_sigprocmask\0"
749 "rt_sigsuspend\0"
750 "rt_sigtimedwait\0"
751 "sigaction\0"
752 "sigaltstack\0"
753 "signal\0"
754 "signalfd\0"
755 "signalfd4\0"
756 "sigpending\0"
757 "sigprocmask\0"
758 "sigsuspend\0"
759 },
bd2ab3f4
LP
760 [SYSCALL_FILTER_SET_SWAP] = {
761 .name = "@swap",
762 .help = "Enable/disable swap devices",
763 .value =
764 "swapoff\0"
765 "swapon\0"
766 },
44898c53
LP
767 [SYSCALL_FILTER_SET_SYNC] = {
768 .name = "@sync",
769 .help = "Synchronize files and memory to storage",
770 .value =
771 "fdatasync\0"
772 "fsync\0"
773 "msync\0"
774 "sync\0"
775 "sync_file_range\0"
776 "syncfs\0"
777 },
cd0ddf6f
LP
778 [SYSCALL_FILTER_SET_TIMER] = {
779 .name = "@timer",
780 .help = "Schedule operations by time",
781 .value =
782 "alarm\0"
783 "getitimer\0"
784 "setitimer\0"
785 "timer_create\0"
786 "timer_delete\0"
787 "timer_getoverrun\0"
788 "timer_gettime\0"
789 "timer_settime\0"
790 "timerfd_create\0"
791 "timerfd_gettime\0"
792 "timerfd_settime\0"
793 "times\0"
794 },
201c1cc2 795};
8130926d
LP
796
797const SyscallFilterSet *syscall_filter_set_find(const char *name) {
798 unsigned i;
799
800 if (isempty(name) || name[0] != '@')
801 return NULL;
802
803 for (i = 0; i < _SYSCALL_FILTER_SET_MAX; i++)
804 if (streq(syscall_filter_sets[i].name, name))
805 return syscall_filter_sets + i;
806
807 return NULL;
808}
809
960e4569 810static int seccomp_add_syscall_filter_set(scmp_filter_ctx seccomp, const SyscallFilterSet *set, uint32_t action, char **exclude);
69b1b241 811
960e4569 812int seccomp_add_syscall_filter_item(scmp_filter_ctx *seccomp, const char *name, uint32_t action, char **exclude) {
69b1b241
LP
813 int r;
814
815 assert(seccomp);
816 assert(name);
817
960e4569
LP
818 if (strv_contains(exclude, name))
819 return 0;
820
69b1b241
LP
821 if (name[0] == '@') {
822 const SyscallFilterSet *other;
823
824 other = syscall_filter_set_find(name);
cff7bff8
LP
825 if (!other) {
826 log_debug("Filter set %s is not known!", name);
69b1b241 827 return -EINVAL;
cff7bff8 828 }
69b1b241 829
960e4569 830 r = seccomp_add_syscall_filter_set(seccomp, other, action, exclude);
69b1b241
LP
831 if (r < 0)
832 return r;
833 } else {
834 int id;
835
836 id = seccomp_syscall_resolve_name(name);
cff7bff8 837 if (id == __NR_SCMP_ERROR) {
ff217dc3
LP
838 log_debug("System call %s is not known, ignoring.", name);
839 return 0;
cff7bff8 840 }
69b1b241
LP
841
842 r = seccomp_rule_add_exact(seccomp, action, id, 0);
843 if (r < 0)
844 /* If the system call is not known on this architecture, then that's fine, let's ignore it */
845 log_debug_errno(r, "Failed to add rule for system call %s() / %d, ignoring: %m", name, id);
846 }
847
848 return 0;
849}
850
469830d1
LP
851static int seccomp_add_syscall_filter_set(
852 scmp_filter_ctx seccomp,
469830d1 853 const SyscallFilterSet *set,
960e4569
LP
854 uint32_t action,
855 char **exclude) {
469830d1 856
8130926d
LP
857 const char *sys;
858 int r;
859
860 assert(seccomp);
861 assert(set);
862
863 NULSTR_FOREACH(sys, set->value) {
960e4569 864 r = seccomp_add_syscall_filter_item(seccomp, sys, action, exclude);
69b1b241
LP
865 if (r < 0)
866 return r;
469830d1
LP
867 }
868
869 return 0;
870}
871
872int seccomp_load_syscall_filter_set(uint32_t default_action, const SyscallFilterSet *set, uint32_t action) {
873 uint32_t arch;
874 int r;
875
876 assert(set);
877
878 /* The one-stop solution: allocate a seccomp object, add the specified filter to it, and apply it. Once for
879 * earch local arch. */
880
881 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
882 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
883
884 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
885
886 r = seccomp_init_for_arch(&seccomp, arch, default_action);
8130926d
LP
887 if (r < 0)
888 return r;
469830d1 889
960e4569 890 r = seccomp_add_syscall_filter_set(seccomp, set, action, NULL);
469830d1
LP
891 if (r < 0) {
892 log_debug_errno(r, "Failed to add filter set, ignoring: %m");
893 continue;
894 }
895
896 r = seccomp_load(seccomp);
897 if (IN_SET(r, -EPERM, -EACCES))
898 return r;
899 if (r < 0)
900 log_debug_errno(r, "Failed to install filter set for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
8130926d
LP
901 }
902
903 return 0;
904}
a3be2849 905
469830d1
LP
906int seccomp_load_syscall_filter_set_raw(uint32_t default_action, Set* set, uint32_t action) {
907 uint32_t arch;
a3be2849
LP
908 int r;
909
469830d1
LP
910 /* Similar to seccomp_load_syscall_filter_set(), but takes a raw Set* of syscalls, instead of a
911 * SyscallFilterSet* table. */
a3be2849 912
469830d1
LP
913 if (set_isempty(set) && default_action == SCMP_ACT_ALLOW)
914 return 0;
a3be2849 915
469830d1
LP
916 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
917 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
918 Iterator i;
919 void *id;
a3be2849 920
469830d1 921 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
a3be2849 922
469830d1
LP
923 r = seccomp_init_for_arch(&seccomp, arch, default_action);
924 if (r < 0)
925 return r;
a3be2849 926
469830d1
LP
927 SET_FOREACH(id, set, i) {
928 r = seccomp_rule_add_exact(seccomp, action, PTR_TO_INT(id) - 1, 0);
929 if (r < 0) {
930 /* If the system call is not known on this architecture, then that's fine, let's ignore it */
931 _cleanup_free_ char *n = NULL;
932
933 n = seccomp_syscall_resolve_num_arch(arch, PTR_TO_INT(id) - 1);
e7854c46 934 log_debug_errno(r, "Failed to add rule for system call %s() / %d, ignoring: %m", strna(n), PTR_TO_INT(id) - 1);
469830d1
LP
935 }
936 }
937
938 r = seccomp_load(seccomp);
939 if (IN_SET(r, -EPERM, -EACCES))
940 return r;
941 if (r < 0)
942 log_debug_errno(r, "Failed to install filter set for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
943 }
944
945 return 0;
add00535
LP
946}
947
948int seccomp_restrict_namespaces(unsigned long retain) {
469830d1 949 uint32_t arch;
add00535
LP
950 int r;
951
952 if (log_get_max_level() >= LOG_DEBUG) {
953 _cleanup_free_ char *s = NULL;
954
955 (void) namespace_flag_to_string_many(retain, &s);
956 log_debug("Restricting namespace to: %s.", strna(s));
957 }
958
959 /* NOOP? */
960 if ((retain & NAMESPACE_FLAGS_ALL) == NAMESPACE_FLAGS_ALL)
961 return 0;
962
469830d1
LP
963 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
964 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
965 unsigned i;
add00535 966
469830d1
LP
967 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
968
969 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
970 if (r < 0)
971 return r;
972
973 if ((retain & NAMESPACE_FLAGS_ALL) == 0)
974 /* If every single kind of namespace shall be prohibited, then let's block the whole setns() syscall
975 * altogether. */
976 r = seccomp_rule_add_exact(
977 seccomp,
978 SCMP_ACT_ERRNO(EPERM),
979 SCMP_SYS(setns),
980 0);
981 else
982 /* Otherwise, block only the invocations with the appropriate flags in the loop below, but also the
983 * special invocation with a zero flags argument, right here. */
984 r = seccomp_rule_add_exact(
985 seccomp,
986 SCMP_ACT_ERRNO(EPERM),
987 SCMP_SYS(setns),
988 1,
989 SCMP_A1(SCMP_CMP_EQ, 0));
990 if (r < 0) {
991 log_debug_errno(r, "Failed to add setns() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
992 continue;
993 }
994
995 for (i = 0; namespace_flag_map[i].name; i++) {
996 unsigned long f;
997
998 f = namespace_flag_map[i].flag;
999 if ((retain & f) == f) {
1000 log_debug("Permitting %s.", namespace_flag_map[i].name);
1001 continue;
1002 }
1003
1004 log_debug("Blocking %s.", namespace_flag_map[i].name);
1005
1006 r = seccomp_rule_add_exact(
1007 seccomp,
1008 SCMP_ACT_ERRNO(EPERM),
1009 SCMP_SYS(unshare),
1010 1,
1011 SCMP_A0(SCMP_CMP_MASKED_EQ, f, f));
1012 if (r < 0) {
1013 log_debug_errno(r, "Failed to add unshare() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1014 break;
1015 }
1016
511ceb1f
ZJS
1017 /* On s390/s390x the first two parameters to clone are switched */
1018 if (!IN_SET(arch, SCMP_ARCH_S390, SCMP_ARCH_S390X))
ae9d60ce
LP
1019 r = seccomp_rule_add_exact(
1020 seccomp,
1021 SCMP_ACT_ERRNO(EPERM),
1022 SCMP_SYS(clone),
1023 1,
1024 SCMP_A0(SCMP_CMP_MASKED_EQ, f, f));
1025 else
1026 r = seccomp_rule_add_exact(
1027 seccomp,
1028 SCMP_ACT_ERRNO(EPERM),
1029 SCMP_SYS(clone),
1030 1,
1031 SCMP_A1(SCMP_CMP_MASKED_EQ, f, f));
469830d1
LP
1032 if (r < 0) {
1033 log_debug_errno(r, "Failed to add clone() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1034 break;
1035 }
1036
1037 if ((retain & NAMESPACE_FLAGS_ALL) != 0) {
1038 r = seccomp_rule_add_exact(
1039 seccomp,
1040 SCMP_ACT_ERRNO(EPERM),
1041 SCMP_SYS(setns),
1042 1,
1043 SCMP_A1(SCMP_CMP_MASKED_EQ, f, f));
1044 if (r < 0) {
1045 log_debug_errno(r, "Failed to add setns() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1046 break;
1047 }
1048 }
1049 }
1050 if (r < 0)
1051 continue;
1052
1053 r = seccomp_load(seccomp);
1054 if (IN_SET(r, -EPERM, -EACCES))
1055 return r;
1056 if (r < 0)
1057 log_debug_errno(r, "Failed to install namespace restriction rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1058 }
1059
1060 return 0;
1061}
1062
1063int seccomp_protect_sysctl(void) {
1064 uint32_t arch;
1065 int r;
1066
1067 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1068 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1069
1070 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1071
2e64e8f4
ZJS
1072 if (IN_SET(arch, SCMP_ARCH_X32, SCMP_ARCH_AARCH64))
1073 /* No _sysctl syscall */
1074 continue;
1075
469830d1
LP
1076 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1077 if (r < 0)
1078 return r;
1079
1080 r = seccomp_rule_add_exact(
add00535
LP
1081 seccomp,
1082 SCMP_ACT_ERRNO(EPERM),
469830d1 1083 SCMP_SYS(_sysctl),
add00535 1084 0);
469830d1
LP
1085 if (r < 0) {
1086 log_debug_errno(r, "Failed to add _sysctl() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1087 continue;
1088 }
1089
1090 r = seccomp_load(seccomp);
1091 if (IN_SET(r, -EPERM, -EACCES))
1092 return r;
1093 if (r < 0)
1094 log_debug_errno(r, "Failed to install sysctl protection rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1095 }
1096
1097 return 0;
1098}
1099
1100int seccomp_restrict_address_families(Set *address_families, bool whitelist) {
1101 uint32_t arch;
1102 int r;
1103
1104 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1105 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
9606bc4b 1106 bool supported;
469830d1
LP
1107 Iterator i;
1108
1109 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1110
9606bc4b
LP
1111 switch (arch) {
1112
1113 case SCMP_ARCH_X86_64:
1114 case SCMP_ARCH_X32:
1115 case SCMP_ARCH_ARM:
1116 case SCMP_ARCH_AARCH64:
da1921a5
ZJS
1117 case SCMP_ARCH_PPC64:
1118 case SCMP_ARCH_PPC64LE:
9606bc4b
LP
1119 /* These we know we support (i.e. are the ones that do not use socketcall()) */
1120 supported = true;
1121 break;
1122
9606bc4b
LP
1123 case SCMP_ARCH_S390:
1124 case SCMP_ARCH_S390X:
1125 case SCMP_ARCH_PPC:
da1921a5 1126 case SCMP_ARCH_X86:
9606bc4b
LP
1127 default:
1128 /* These we either know we don't support (i.e. are the ones that do use socketcall()), or we
1129 * don't know */
1130 supported = false;
1131 break;
1132 }
1133
1134 if (!supported)
1135 continue;
1136
469830d1
LP
1137 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1138 if (r < 0)
1139 return r;
1140
1141 if (whitelist) {
1142 int af, first = 0, last = 0;
1143 void *afp;
1144
1145 /* If this is a whitelist, we first block the address families that are out of range and then
1146 * everything that is not in the set. First, we find the lowest and highest address family in
1147 * the set. */
1148
1149 SET_FOREACH(afp, address_families, i) {
1150 af = PTR_TO_INT(afp);
1151
1152 if (af <= 0 || af >= af_max())
1153 continue;
1154
1155 if (first == 0 || af < first)
1156 first = af;
1157
1158 if (last == 0 || af > last)
1159 last = af;
1160 }
1161
1162 assert((first == 0) == (last == 0));
1163
1164 if (first == 0) {
1165
1166 /* No entries in the valid range, block everything */
1167 r = seccomp_rule_add_exact(
1168 seccomp,
1169 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1170 SCMP_SYS(socket),
1171 0);
1172 if (r < 0) {
1173 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1174 continue;
1175 }
1176
1177 } else {
1178
1179 /* Block everything below the first entry */
1180 r = seccomp_rule_add_exact(
1181 seccomp,
1182 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1183 SCMP_SYS(socket),
1184 1,
1185 SCMP_A0(SCMP_CMP_LT, first));
1186 if (r < 0) {
1187 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1188 continue;
1189 }
1190
1191 /* Block everything above the last entry */
1192 r = seccomp_rule_add_exact(
1193 seccomp,
1194 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1195 SCMP_SYS(socket),
1196 1,
1197 SCMP_A0(SCMP_CMP_GT, last));
1198 if (r < 0) {
1199 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1200 continue;
1201 }
1202
1203 /* Block everything between the first and last entry */
1204 for (af = 1; af < af_max(); af++) {
1205
1206 if (set_contains(address_families, INT_TO_PTR(af)))
1207 continue;
1208
1209 r = seccomp_rule_add_exact(
1210 seccomp,
1211 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1212 SCMP_SYS(socket),
1213 1,
1214 SCMP_A0(SCMP_CMP_EQ, af));
1215 if (r < 0)
1216 break;
1217 }
469830d1
LP
1218 if (r < 0) {
1219 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1220 continue;
1221 }
1222 }
1223
1224 } else {
1225 void *af;
1226
1227 /* If this is a blacklist, then generate one rule for
1228 * each address family that are then combined in OR
1229 * checks. */
1230
1231 SET_FOREACH(af, address_families, i) {
1232
1233 r = seccomp_rule_add_exact(
1234 seccomp,
1235 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1236 SCMP_SYS(socket),
1237 1,
1238 SCMP_A0(SCMP_CMP_EQ, PTR_TO_INT(af)));
1239 if (r < 0)
1240 break;
1241 }
469830d1
LP
1242 if (r < 0) {
1243 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1244 continue;
1245 }
1246 }
1247
1248 r = seccomp_load(seccomp);
1249 if (IN_SET(r, -EPERM, -EACCES))
1250 return r;
1251 if (r < 0)
1252 log_debug_errno(r, "Failed to install socket family rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1253 }
1254
1255 return 0;
1256}
1257
1258int seccomp_restrict_realtime(void) {
1259 static const int permitted_policies[] = {
1260 SCHED_OTHER,
1261 SCHED_BATCH,
1262 SCHED_IDLE,
1263 };
1264
1265 int r, max_policy = 0;
1266 uint32_t arch;
1267 unsigned i;
1268
1269 /* Determine the highest policy constant we want to allow */
1270 for (i = 0; i < ELEMENTSOF(permitted_policies); i++)
1271 if (permitted_policies[i] > max_policy)
1272 max_policy = permitted_policies[i];
1273
1274 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1275 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1276 int p;
1277
1278 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1279
1280 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1281 if (r < 0)
1282 return r;
1283
1284 /* Go through all policies with lower values than that, and block them -- unless they appear in the
1285 * whitelist. */
1286 for (p = 0; p < max_policy; p++) {
1287 bool good = false;
1288
1289 /* Check if this is in the whitelist. */
1290 for (i = 0; i < ELEMENTSOF(permitted_policies); i++)
1291 if (permitted_policies[i] == p) {
1292 good = true;
1293 break;
1294 }
1295
1296 if (good)
1297 continue;
1298
1299 /* Deny this policy */
1300 r = seccomp_rule_add_exact(
1301 seccomp,
1302 SCMP_ACT_ERRNO(EPERM),
1303 SCMP_SYS(sched_setscheduler),
1304 1,
1305 SCMP_A1(SCMP_CMP_EQ, p));
1306 if (r < 0) {
1307 log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1308 continue;
1309 }
1310 }
1311
1312 /* Blacklist all other policies, i.e. the ones with higher values. Note that all comparisons are
1313 * unsigned here, hence no need no check for < 0 values. */
1314 r = seccomp_rule_add_exact(
add00535
LP
1315 seccomp,
1316 SCMP_ACT_ERRNO(EPERM),
469830d1 1317 SCMP_SYS(sched_setscheduler),
add00535 1318 1,
469830d1
LP
1319 SCMP_A1(SCMP_CMP_GT, max_policy));
1320 if (r < 0) {
1321 log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1322 continue;
1323 }
add00535 1324
469830d1
LP
1325 r = seccomp_load(seccomp);
1326 if (IN_SET(r, -EPERM, -EACCES))
1327 return r;
1328 if (r < 0)
1329 log_debug_errno(r, "Failed to install realtime protection rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1330 }
1331
1332 return 0;
1333}
1334
6dc66688
ZJS
1335static int add_seccomp_syscall_filter(scmp_filter_ctx seccomp,
1336 uint32_t arch,
1337 int nr,
1338 unsigned int arg_cnt,
1339 const struct scmp_arg_cmp arg) {
1340 int r;
1341
1342 r = seccomp_rule_add_exact(seccomp, SCMP_ACT_ERRNO(EPERM), nr, arg_cnt, arg);
1343 if (r < 0) {
1344 _cleanup_free_ char *n = NULL;
1345
1346 n = seccomp_syscall_resolve_num_arch(arch, nr);
1347 log_debug_errno(r, "Failed to add %s() rule for architecture %s, skipping: %m",
1348 strna(n),
1349 seccomp_arch_to_string(arch));
1350 }
1351
1352 return r;
1353}
1354
2a8d6e63 1355/* For known architectures, check that syscalls are indeed defined or not. */
4278d1f5 1356#if defined(__x86_64__) || defined(__arm__) || defined(__aarch64__)
2a8d6e63
ZJS
1357assert_cc(SCMP_SYS(shmget) > 0);
1358assert_cc(SCMP_SYS(shmat) > 0);
1359assert_cc(SCMP_SYS(shmdt) > 0);
1360#elif defined(__i386__) || defined(__powerpc64__)
1361assert_cc(SCMP_SYS(shmget) < 0);
1362assert_cc(SCMP_SYS(shmat) < 0);
1363assert_cc(SCMP_SYS(shmdt) < 0);
1364#endif
6dc66688 1365
469830d1 1366int seccomp_memory_deny_write_execute(void) {
8a50cf69 1367
469830d1
LP
1368 uint32_t arch;
1369 int r;
1370
1371 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1372 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
8a50cf69 1373 int filter_syscall = 0, block_syscall = 0, shmat_syscall = 0;
add00535 1374
469830d1
LP
1375 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1376
8a50cf69
LP
1377 switch (arch) {
1378
1379 case SCMP_ARCH_X86:
1380 filter_syscall = SCMP_SYS(mmap2);
1381 block_syscall = SCMP_SYS(mmap);
2a8d6e63
ZJS
1382 break;
1383
1384 case SCMP_ARCH_PPC64:
1385 case SCMP_ARCH_PPC64LE:
1386 filter_syscall = SCMP_SYS(mmap);
1387
1388 /* Note that shmat() isn't available, and the call is multiplexed through ipc().
1389 * We ignore that here, which means there's still a way to get writable/executable
1390 * memory, if an IPC key is mapped like this. That's a pity, but no total loss. */
8a50cf69 1391
8a50cf69
LP
1392 break;
1393
4278d1f5
ZJS
1394 case SCMP_ARCH_ARM:
1395 filter_syscall = SCMP_SYS(mmap2); /* arm has only mmap2 */
1396 shmat_syscall = SCMP_SYS(shmat);
1397 break;
1398
8a50cf69
LP
1399 case SCMP_ARCH_X86_64:
1400 case SCMP_ARCH_X32:
79873bc8
ZJS
1401 case SCMP_ARCH_AARCH64:
1402 filter_syscall = SCMP_SYS(mmap); /* amd64, x32, and arm64 have only mmap */
8a50cf69
LP
1403 shmat_syscall = SCMP_SYS(shmat);
1404 break;
1405
1406 /* Please add more definitions here, if you port systemd to other architectures! */
1407
4278d1f5 1408#if !defined(__i386__) && !defined(__x86_64__) && !defined(__powerpc64__) && !defined(__arm__) && !defined(__aarch64__)
8a50cf69
LP
1409#warning "Consider adding the right mmap() syscall definitions here!"
1410#endif
1411 }
1412
1413 /* Can't filter mmap() on this arch, then skip it */
1414 if (filter_syscall == 0)
1415 continue;
1416
469830d1
LP
1417 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1418 if (r < 0)
1419 return r;
1420
6dc66688
ZJS
1421 r = add_seccomp_syscall_filter(seccomp, arch, filter_syscall,
1422 1,
1423 SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC|PROT_WRITE, PROT_EXEC|PROT_WRITE));
1424 if (r < 0)
1425 continue;
8a50cf69
LP
1426
1427 if (block_syscall != 0) {
6dc66688
ZJS
1428 r = add_seccomp_syscall_filter(seccomp, arch, block_syscall, 0, (const struct scmp_arg_cmp){} );
1429 if (r < 0)
8a50cf69 1430 continue;
add00535 1431 }
a3be2849 1432
6dc66688
ZJS
1433 r = add_seccomp_syscall_filter(seccomp, arch, SCMP_SYS(mprotect),
1434 1,
1435 SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC, PROT_EXEC));
1436 if (r < 0)
469830d1 1437 continue;
add00535 1438
8a50cf69 1439 if (shmat_syscall != 0) {
6dc66688
ZJS
1440 r = add_seccomp_syscall_filter(seccomp, arch, SCMP_SYS(shmat),
1441 1,
1442 SCMP_A2(SCMP_CMP_MASKED_EQ, SHM_EXEC, SHM_EXEC));
1443 if (r < 0)
8a50cf69 1444 continue;
469830d1
LP
1445 }
1446
1447 r = seccomp_load(seccomp);
1448 if (IN_SET(r, -EPERM, -EACCES))
1449 return r;
add00535 1450 if (r < 0)
469830d1
LP
1451 log_debug_errno(r, "Failed to install MemoryDenyWriteExecute= rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1452 }
add00535 1453
469830d1
LP
1454 return 0;
1455}
1456
1457int seccomp_restrict_archs(Set *archs) {
1458 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1459 Iterator i;
1460 void *id;
1461 int r;
1462
1463 /* This installs a filter with no rules, but that restricts the system call architectures to the specified
1464 * list. */
1465
1466 seccomp = seccomp_init(SCMP_ACT_ALLOW);
1467 if (!seccomp)
1468 return -ENOMEM;
1469
1470 SET_FOREACH(id, archs, i) {
1471 r = seccomp_arch_add(seccomp, PTR_TO_UINT32(id) - 1);
1472 if (r == -EEXIST)
1473 continue;
1474 if (r < 0)
1475 return r;
add00535
LP
1476 }
1477
469830d1
LP
1478 r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
1479 if (r < 0)
1480 return r;
add00535 1481
1c6af69b
LP
1482 r = seccomp_load(seccomp);
1483 if (IN_SET(r, -EPERM, -EACCES))
1484 return r;
1485 if (r < 0)
1486 log_debug_errno(r, "Failed to restrict system call architectures, skipping: %m");
1487
1488 return 0;
a3be2849 1489}
b16bd535
YW
1490
1491int parse_syscall_archs(char **l, Set **archs) {
1492 _cleanup_set_free_ Set *_archs;
1493 char **s;
1494 int r;
1495
1496 assert(l);
1497 assert(archs);
1498
1499 r = set_ensure_allocated(&_archs, NULL);
1500 if (r < 0)
1501 return r;
1502
1503 STRV_FOREACH(s, l) {
1504 uint32_t a;
1505
1506 r = seccomp_arch_from_string(*s, &a);
1507 if (r < 0)
1508 return -EINVAL;
1509
1510 r = set_put(_archs, UINT32_TO_PTR(a + 1));
1511 if (r < 0)
1512 return -ENOMEM;
1513 }
1514
1515 *archs = _archs;
1516 _archs = NULL;
1517
1518 return 0;
1519}
165a31c0
LP
1520
1521int seccomp_filter_set_add(Set *filter, bool add, const SyscallFilterSet *set) {
1522 const char *i;
1523 int r;
1524
1525 assert(set);
1526
1527 NULSTR_FOREACH(i, set->value) {
1528
1529 if (i[0] == '@') {
1530 const SyscallFilterSet *more;
1531
1532 more = syscall_filter_set_find(i);
1533 if (!more)
1534 return -ENXIO;
1535
165a31c0
LP
1536 r = seccomp_filter_set_add(filter, add, more);
1537 if (r < 0)
1538 return r;
1539 } else {
1540 int id;
1541
1542 id = seccomp_syscall_resolve_name(i);
ff217dc3
LP
1543 if (id == __NR_SCMP_ERROR) {
1544 log_debug("Couldn't resolve system call, ignoring: %s", i);
1545 continue;
1546 }
165a31c0
LP
1547
1548 if (add) {
1549 r = set_put(filter, INT_TO_PTR(id + 1));
1550 if (r < 0)
1551 return r;
1552 } else
1553 (void) set_remove(filter, INT_TO_PTR(id + 1));
1554 }
1555 }
1556
1557 return 0;
1558}
78e864e5
TM
1559
1560int seccomp_lock_personality(unsigned long personality) {
72eafe71 1561 uint32_t arch;
78e864e5
TM
1562 int r;
1563
72eafe71
LP
1564 if (personality >= PERSONALITY_INVALID)
1565 return -EINVAL;
78e864e5 1566
72eafe71
LP
1567 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1568 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
78e864e5 1569
72eafe71
LP
1570 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1571 if (r < 0)
1572 return r;
1573
1574 r = seccomp_rule_add_exact(
1575 seccomp,
1576 SCMP_ACT_ERRNO(EPERM),
1577 SCMP_SYS(personality),
1578 1,
1579 SCMP_A0(SCMP_CMP_NE, personality));
448ac526
LP
1580 if (r < 0) {
1581 log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1582 continue;
1583 }
72eafe71
LP
1584
1585 r = seccomp_load(seccomp);
1586 if (IN_SET(r, -EPERM, -EACCES))
1587 return r;
1588 if (r < 0)
1589 log_debug_errno(r, "Failed to enable personality lock for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1590 }
1591
1592 return 0;
78e864e5 1593}