]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/shared/seccomp-util.c
seccomp: add two new filter sets: @reboot and @swap
[thirdparty/systemd.git] / src / shared / seccomp-util.c
1 /***
2 This file is part of systemd.
3
4 Copyright 2014 Lennart Poettering
5
6 systemd is free software; you can redistribute it and/or modify it
7 under the terms of the GNU Lesser General Public License as published by
8 the Free Software Foundation; either version 2.1 of the License, or
9 (at your option) any later version.
10
11 systemd is distributed in the hope that it will be useful, but
12 WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public License
17 along with systemd; If not, see <http://www.gnu.org/licenses/>.
18 ***/
19
20 #include <errno.h>
21 #include <seccomp.h>
22 #include <stddef.h>
23 #include <sys/prctl.h>
24 #include <linux/seccomp.h>
25
26 #include "alloc-util.h"
27 #include "macro.h"
28 #include "nsflags.h"
29 #include "seccomp-util.h"
30 #include "string-util.h"
31 #include "util.h"
32
33 const char* seccomp_arch_to_string(uint32_t c) {
34 /* Maintain order used in <seccomp.h>.
35 *
36 * Names used here should be the same as those used for ConditionArchitecture=,
37 * except for "subarchitectures" like x32. */
38
39 switch(c) {
40 case SCMP_ARCH_NATIVE:
41 return "native";
42 case SCMP_ARCH_X86:
43 return "x86";
44 case SCMP_ARCH_X86_64:
45 return "x86-64";
46 case SCMP_ARCH_X32:
47 return "x32";
48 case SCMP_ARCH_ARM:
49 return "arm";
50 case SCMP_ARCH_AARCH64:
51 return "arm64";
52 case SCMP_ARCH_MIPS:
53 return "mips";
54 case SCMP_ARCH_MIPS64:
55 return "mips64";
56 case SCMP_ARCH_MIPS64N32:
57 return "mips64-n32";
58 case SCMP_ARCH_MIPSEL:
59 return "mips-le";
60 case SCMP_ARCH_MIPSEL64:
61 return "mips64-le";
62 case SCMP_ARCH_MIPSEL64N32:
63 return "mips64-le-n32";
64 case SCMP_ARCH_PPC:
65 return "ppc";
66 case SCMP_ARCH_PPC64:
67 return "ppc64";
68 case SCMP_ARCH_PPC64LE:
69 return "ppc64-le";
70 case SCMP_ARCH_S390:
71 return "s390";
72 case SCMP_ARCH_S390X:
73 return "s390x";
74 default:
75 return NULL;
76 }
77 }
78
79 int seccomp_arch_from_string(const char *n, uint32_t *ret) {
80 if (!n)
81 return -EINVAL;
82
83 assert(ret);
84
85 if (streq(n, "native"))
86 *ret = SCMP_ARCH_NATIVE;
87 else if (streq(n, "x86"))
88 *ret = SCMP_ARCH_X86;
89 else if (streq(n, "x86-64"))
90 *ret = SCMP_ARCH_X86_64;
91 else if (streq(n, "x32"))
92 *ret = SCMP_ARCH_X32;
93 else if (streq(n, "arm"))
94 *ret = SCMP_ARCH_ARM;
95 else if (streq(n, "arm64"))
96 *ret = SCMP_ARCH_AARCH64;
97 else if (streq(n, "mips"))
98 *ret = SCMP_ARCH_MIPS;
99 else if (streq(n, "mips64"))
100 *ret = SCMP_ARCH_MIPS64;
101 else if (streq(n, "mips64-n32"))
102 *ret = SCMP_ARCH_MIPS64N32;
103 else if (streq(n, "mips-le"))
104 *ret = SCMP_ARCH_MIPSEL;
105 else if (streq(n, "mips64-le"))
106 *ret = SCMP_ARCH_MIPSEL64;
107 else if (streq(n, "mips64-le-n32"))
108 *ret = SCMP_ARCH_MIPSEL64N32;
109 else if (streq(n, "ppc"))
110 *ret = SCMP_ARCH_PPC;
111 else if (streq(n, "ppc64"))
112 *ret = SCMP_ARCH_PPC64;
113 else if (streq(n, "ppc64-le"))
114 *ret = SCMP_ARCH_PPC64LE;
115 else if (streq(n, "s390"))
116 *ret = SCMP_ARCH_S390;
117 else if (streq(n, "s390x"))
118 *ret = SCMP_ARCH_S390X;
119 else
120 return -EINVAL;
121
122 return 0;
123 }
124
125 int seccomp_init_conservative(scmp_filter_ctx *ret, uint32_t default_action) {
126 scmp_filter_ctx seccomp;
127 int r;
128
129 /* Much like seccomp_init(), but tries to be a bit more conservative in its defaults: all secondary archs are
130 * added by default, and NNP is turned off. */
131
132 seccomp = seccomp_init(default_action);
133 if (!seccomp)
134 return -ENOMEM;
135
136 r = seccomp_add_secondary_archs(seccomp);
137 if (r < 0)
138 goto finish;
139
140 r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
141 if (r < 0)
142 goto finish;
143
144 *ret = seccomp;
145 return 0;
146
147 finish:
148 seccomp_release(seccomp);
149 return r;
150 }
151
152 int seccomp_add_secondary_archs(scmp_filter_ctx ctx) {
153
154 /* Add in all possible secondary archs we are aware of that
155 * this kernel might support. */
156
157 static const int seccomp_arches[] = {
158 #if defined(__i386__) || defined(__x86_64__)
159 SCMP_ARCH_X86,
160 SCMP_ARCH_X86_64,
161 SCMP_ARCH_X32,
162
163 #elif defined(__arm__) || defined(__aarch64__)
164 SCMP_ARCH_ARM,
165 SCMP_ARCH_AARCH64,
166
167 #elif defined(__arm__) || defined(__aarch64__)
168 SCMP_ARCH_ARM,
169 SCMP_ARCH_AARCH64,
170
171 #elif defined(__mips__) || defined(__mips64__)
172 SCMP_ARCH_MIPS,
173 SCMP_ARCH_MIPS64,
174 SCMP_ARCH_MIPS64N32,
175 SCMP_ARCH_MIPSEL,
176 SCMP_ARCH_MIPSEL64,
177 SCMP_ARCH_MIPSEL64N32,
178
179 #elif defined(__powerpc__) || defined(__powerpc64__)
180 SCMP_ARCH_PPC,
181 SCMP_ARCH_PPC64,
182 SCMP_ARCH_PPC64LE,
183
184 #elif defined(__s390__) || defined(__s390x__)
185 SCMP_ARCH_S390,
186 SCMP_ARCH_S390X,
187 #endif
188 };
189
190 unsigned i;
191 int r;
192
193 for (i = 0; i < ELEMENTSOF(seccomp_arches); i++) {
194 r = seccomp_arch_add(ctx, seccomp_arches[i]);
195 if (r < 0 && r != -EEXIST)
196 return r;
197 }
198
199 return 0;
200 }
201
static bool is_basic_seccomp_available(void) {
        /* Probe for basic seccomp support: true if and only if the PR_GET_SECCOMP query does not fail. */
        return prctl(PR_GET_SECCOMP, 0, 0, 0, 0) >= 0;
}
207
static bool is_seccomp_filter_available(void) {
        /* Probe for seccomp filter mode by trying to install a NULL filter. Only a failure with EFAULT
         * counts as "available" (presumably because it shows the kernel got as far as reading the filter
         * pointer); success or any other errno is reported as unavailable. */
        if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, NULL, 0, 0) >= 0)
                return false;

        return errno == EFAULT;
}
213
bool is_seccomp_available(void) {
        /* Tells whether both basic seccomp and seccomp filter mode are usable. The two probes are run on the
         * first call only; the outcome is remembered in a function-local static (-1 means "not probed
         * yet") and returned directly on later calls. */
        static int probe_result = -1;

        if (probe_result >= 0)
                return probe_result;

        probe_result = is_basic_seccomp_available() && is_seccomp_filter_available();
        return probe_result;
}
220
221 const SyscallFilterSet syscall_filter_sets[_SYSCALL_FILTER_SET_MAX] = {
222 [SYSCALL_FILTER_SET_DEFAULT] = {
223 .name = "@default",
224 .help = "System calls that are always permitted",
225 .value =
226 "clock_getres\0"
227 "clock_gettime\0"
228 "clock_nanosleep\0"
229 "execve\0"
230 "exit\0"
231 "exit_group\0"
232 "getrlimit\0" /* make sure processes can query stack size and such */
233 "gettimeofday\0"
234 "nanosleep\0"
235 "pause\0"
236 "rt_sigreturn\0"
237 "sigreturn\0"
238 "time\0"
239 },
240 [SYSCALL_FILTER_SET_BASIC_IO] = {
241 .name = "@basic-io",
242 .help = "Basic IO",
243 .value =
244 "close\0"
245 "dup2\0"
246 "dup3\0"
247 "dup\0"
248 "lseek\0"
249 "pread64\0"
250 "preadv\0"
251 "pwrite64\0"
252 "pwritev\0"
253 "read\0"
254 "readv\0"
255 "write\0"
256 "writev\0"
257 },
258 [SYSCALL_FILTER_SET_CLOCK] = {
259 .name = "@clock",
260 .help = "Change the system time",
261 .value =
262 "adjtimex\0"
263 "clock_adjtime\0"
264 "clock_settime\0"
265 "settimeofday\0"
266 "stime\0"
267 },
268 [SYSCALL_FILTER_SET_CPU_EMULATION] = {
269 .name = "@cpu-emulation",
270 .help = "System calls for CPU emulation functionality",
271 .value =
272 "modify_ldt\0"
273 "subpage_prot\0"
274 "switch_endian\0"
275 "vm86\0"
276 "vm86old\0"
277 },
278 [SYSCALL_FILTER_SET_DEBUG] = {
279 .name = "@debug",
280 .help = "Debugging, performance monitoring and tracing functionality",
281 .value =
282 "lookup_dcookie\0"
283 "perf_event_open\0"
284 "process_vm_readv\0"
285 "process_vm_writev\0"
286 "ptrace\0"
287 "rtas\0"
288 #ifdef __NR_s390_runtime_instr
289 "s390_runtime_instr\0"
290 #endif
291 "sys_debug_setcontext\0"
292 },
293 [SYSCALL_FILTER_SET_FILE_SYSTEM] = {
294 .name = "@file-system",
295 .help = "File system operations",
296 .value =
297 "access\0"
298 "chdir\0"
299 "chmod\0"
300 "close\0"
301 "creat\0"
302 "faccessat\0"
303 "fallocate\0"
304 "fchdir\0"
305 "fchmod\0"
306 "fchmodat\0"
307 "fcntl64\0"
308 "fcntl\0"
309 "fgetxattr\0"
310 "flistxattr\0"
311 "fsetxattr\0"
312 "fstat64\0"
313 "fstat\0"
314 "fstatat64\0"
315 "fstatfs64\0"
316 "fstatfs\0"
317 "ftruncate64\0"
318 "ftruncate\0"
319 "futimesat\0"
320 "getcwd\0"
321 "getdents64\0"
322 "getdents\0"
323 "getxattr\0"
324 "inotify_add_watch\0"
325 "inotify_init1\0"
326 "inotify_rm_watch\0"
327 "lgetxattr\0"
328 "link\0"
329 "linkat\0"
330 "listxattr\0"
331 "llistxattr\0"
332 "lremovexattr\0"
333 "lsetxattr\0"
334 "lstat64\0"
335 "lstat\0"
336 "mkdir\0"
337 "mkdirat\0"
338 "mknod\0"
339 "mknodat\0"
340 "mmap2\0"
341 "mmap\0"
342 "newfstatat\0"
343 "open\0"
344 "openat\0"
345 "readlink\0"
346 "readlinkat\0"
347 "removexattr\0"
348 "rename\0"
349 "renameat2\0"
350 "renameat\0"
351 "rmdir\0"
352 "setxattr\0"
353 "stat64\0"
354 "stat\0"
355 "statfs\0"
356 "symlink\0"
357 "symlinkat\0"
358 "truncate64\0"
359 "truncate\0"
360 "unlink\0"
361 "unlinkat\0"
362 "utimensat\0"
363 "utimes\0"
364 },
365 [SYSCALL_FILTER_SET_IO_EVENT] = {
366 .name = "@io-event",
367 .help = "Event loop system calls",
368 .value =
369 "_newselect\0"
370 "epoll_create1\0"
371 "epoll_create\0"
372 "epoll_ctl\0"
373 "epoll_ctl_old\0"
374 "epoll_pwait\0"
375 "epoll_wait\0"
376 "epoll_wait_old\0"
377 "eventfd2\0"
378 "eventfd\0"
379 "poll\0"
380 "ppoll\0"
381 "pselect6\0"
382 "select\0"
383 },
384 [SYSCALL_FILTER_SET_IPC] = {
385 .name = "@ipc",
386 .help = "SysV IPC, POSIX Message Queues or other IPC",
387 .value =
388 "ipc\0"
389 "memfd_create\0"
390 "mq_getsetattr\0"
391 "mq_notify\0"
392 "mq_open\0"
393 "mq_timedreceive\0"
394 "mq_timedsend\0"
395 "mq_unlink\0"
396 "msgctl\0"
397 "msgget\0"
398 "msgrcv\0"
399 "msgsnd\0"
400 "pipe2\0"
401 "pipe\0"
402 "process_vm_readv\0"
403 "process_vm_writev\0"
404 "semctl\0"
405 "semget\0"
406 "semop\0"
407 "semtimedop\0"
408 "shmat\0"
409 "shmctl\0"
410 "shmdt\0"
411 "shmget\0"
412 },
413 [SYSCALL_FILTER_SET_KEYRING] = {
414 .name = "@keyring",
415 .help = "Kernel keyring access",
416 .value =
417 "add_key\0"
418 "keyctl\0"
419 "request_key\0"
420 },
421 [SYSCALL_FILTER_SET_MODULE] = {
422 .name = "@module",
423 .help = "Loading and unloading of kernel modules",
424 .value =
425 "delete_module\0"
426 "finit_module\0"
427 "init_module\0"
428 },
429 [SYSCALL_FILTER_SET_MOUNT] = {
430 .name = "@mount",
431 .help = "Mounting and unmounting of file systems",
432 .value =
433 "chroot\0"
434 "mount\0"
435 "pivot_root\0"
436 "umount2\0"
437 "umount\0"
438 },
439 [SYSCALL_FILTER_SET_NETWORK_IO] = {
440 .name = "@network-io",
441 .help = "Network or Unix socket IO, should not be needed if not network facing",
442 .value =
443 "accept4\0"
444 "accept\0"
445 "bind\0"
446 "connect\0"
447 "getpeername\0"
448 "getsockname\0"
449 "getsockopt\0"
450 "listen\0"
451 "recv\0"
452 "recvfrom\0"
453 "recvmmsg\0"
454 "recvmsg\0"
455 "send\0"
456 "sendmmsg\0"
457 "sendmsg\0"
458 "sendto\0"
459 "setsockopt\0"
460 "shutdown\0"
461 "socket\0"
462 "socketcall\0"
463 "socketpair\0"
464 },
465 [SYSCALL_FILTER_SET_OBSOLETE] = {
466 /* some unknown even to libseccomp */
467 .name = "@obsolete",
468 .help = "Unusual, obsolete or unimplemented system calls",
469 .value =
470 "_sysctl\0"
471 "afs_syscall\0"
472 "break\0"
473 "create_module\0"
474 "ftime\0"
475 "get_kernel_syms\0"
476 "getpmsg\0"
477 "gtty\0"
478 "lock\0"
479 "mpx\0"
480 "prof\0"
481 "profil\0"
482 "putpmsg\0"
483 "query_module\0"
484 "security\0"
485 "sgetmask\0"
486 "ssetmask\0"
487 "stty\0"
488 "sysfs\0"
489 "tuxcall\0"
490 "ulimit\0"
491 "uselib\0"
492 "ustat\0"
493 "vserver\0"
494 },
495 [SYSCALL_FILTER_SET_PRIVILEGED] = {
496 .name = "@privileged",
497 .help = "All system calls which need super-user capabilities",
498 .value =
499 "@clock\0"
500 "@module\0"
501 "@raw-io\0"
502 "acct\0"
503 "bdflush\0"
504 "bpf\0"
505 "capset\0"
506 "chown32\0"
507 "chown\0"
508 "chroot\0"
509 "fchown32\0"
510 "fchown\0"
511 "fchownat\0"
512 "kexec_file_load\0"
513 "kexec_load\0"
514 "lchown32\0"
515 "lchown\0"
516 "nfsservctl\0"
517 "pivot_root\0"
518 "quotactl\0"
519 "reboot\0"
520 "setdomainname\0"
521 "setfsuid32\0"
522 "setfsuid\0"
523 "setgroups32\0"
524 "setgroups\0"
525 "sethostname\0"
526 "setresuid32\0"
527 "setresuid\0"
528 "setreuid32\0"
529 "setreuid\0"
530 "setuid32\0"
531 "setuid\0"
532 "swapoff\0"
533 "swapon\0"
534 "_sysctl\0"
535 "vhangup\0"
536 },
537 [SYSCALL_FILTER_SET_PROCESS] = {
538 .name = "@process",
539 .help = "Process control, execution, namespaceing operations",
540 .value =
541 "arch_prctl\0"
542 "clone\0"
543 "execveat\0"
544 "fork\0"
545 "kill\0"
546 "prctl\0"
547 "setns\0"
548 "tgkill\0"
549 "tkill\0"
550 "unshare\0"
551 "vfork\0"
552 },
553 [SYSCALL_FILTER_SET_RAW_IO] = {
554 .name = "@raw-io",
555 .help = "Raw I/O port access",
556 .value =
557 "ioperm\0"
558 "iopl\0"
559 "pciconfig_iobase\0"
560 "pciconfig_read\0"
561 "pciconfig_write\0"
562 #ifdef __NR_s390_pci_mmio_read
563 "s390_pci_mmio_read\0"
564 #endif
565 #ifdef __NR_s390_pci_mmio_write
566 "s390_pci_mmio_write\0"
567 #endif
568 },
569 [SYSCALL_FILTER_SET_REBOOT] = {
570 .name = "@reboot",
571 .help = "Reboot and reboot preparation/kexec",
572 .value =
573 "kexec\0"
574 "kexec_file_load\0"
575 "reboot\0"
576 },
577 [SYSCALL_FILTER_SET_RESOURCES] = {
578 /* Alter resource settings */
579 .name = "@resources",
580 .value =
581 "sched_setparam\0"
582 "sched_setscheduler\0"
583 "sched_setaffinity\0"
584 "setpriority\0"
585 "setrlimit\0"
586 "set_mempolicy\0"
587 "migrate_pages\0"
588 "move_pages\0"
589 "mbind\0"
590 "sched_setattr\0"
591 "prlimit64\0"
592 },
593 [SYSCALL_FILTER_SET_SWAP] = {
594 .name = "@swap",
595 .help = "Enable/disable swap devices",
596 .value =
597 "swapoff\0"
598 "swapon\0"
599 },
600 };
601
602 const SyscallFilterSet *syscall_filter_set_find(const char *name) {
603 unsigned i;
604
605 if (isempty(name) || name[0] != '@')
606 return NULL;
607
608 for (i = 0; i < _SYSCALL_FILTER_SET_MAX; i++)
609 if (streq(syscall_filter_sets[i].name, name))
610 return syscall_filter_sets + i;
611
612 return NULL;
613 }
614
615 int seccomp_add_syscall_filter_set(scmp_filter_ctx seccomp, const SyscallFilterSet *set, uint32_t action) {
616 const char *sys;
617 int r;
618
619 assert(seccomp);
620 assert(set);
621
622 NULSTR_FOREACH(sys, set->value) {
623 int id;
624
625 if (sys[0] == '@') {
626 const SyscallFilterSet *other;
627
628 other = syscall_filter_set_find(sys);
629 if (!other)
630 return -EINVAL;
631
632 r = seccomp_add_syscall_filter_set(seccomp, other, action);
633 } else {
634 id = seccomp_syscall_resolve_name(sys);
635 if (id == __NR_SCMP_ERROR)
636 return -EINVAL;
637
638 r = seccomp_rule_add(seccomp, action, id, 0);
639 }
640 if (r < 0)
641 return r;
642 }
643
644 return 0;
645 }
646
647 int seccomp_load_filter_set(uint32_t default_action, const SyscallFilterSet *set, uint32_t action) {
648 scmp_filter_ctx seccomp;
649 int r;
650
651 assert(set);
652
653 /* The one-stop solution: allocate a seccomp object, add a filter to it, and apply it */
654
655 r = seccomp_init_conservative(&seccomp, default_action);
656 if (r < 0)
657 return r;
658
659 r = seccomp_add_syscall_filter_set(seccomp, set, action);
660 if (r < 0)
661 goto finish;
662
663 r = seccomp_load(seccomp);
664
665 finish:
666 seccomp_release(seccomp);
667 return r;
668 }
669
670 int seccomp_restrict_namespaces(unsigned long retain) {
671 scmp_filter_ctx seccomp;
672 unsigned i;
673 int r;
674
675 if (log_get_max_level() >= LOG_DEBUG) {
676 _cleanup_free_ char *s = NULL;
677
678 (void) namespace_flag_to_string_many(retain, &s);
679 log_debug("Restricting namespace to: %s.", strna(s));
680 }
681
682 /* NOOP? */
683 if ((retain & NAMESPACE_FLAGS_ALL) == NAMESPACE_FLAGS_ALL)
684 return 0;
685
686 r = seccomp_init_conservative(&seccomp, SCMP_ACT_ALLOW);
687 if (r < 0)
688 return r;
689
690 if ((retain & NAMESPACE_FLAGS_ALL) == 0)
691 /* If every single kind of namespace shall be prohibited, then let's block the whole setns() syscall
692 * altogether. */
693 r = seccomp_rule_add(
694 seccomp,
695 SCMP_ACT_ERRNO(EPERM),
696 SCMP_SYS(setns),
697 0);
698 else
699 /* Otherwise, block only the invocations with the appropriate flags in the loop below, but also the
700 * special invocation with a zero flags argument, right here. */
701 r = seccomp_rule_add(
702 seccomp,
703 SCMP_ACT_ERRNO(EPERM),
704 SCMP_SYS(setns),
705 1,
706 SCMP_A1(SCMP_CMP_EQ, 0));
707 if (r < 0)
708 goto finish;
709
710 for (i = 0; namespace_flag_map[i].name; i++) {
711 unsigned long f;
712
713 f = namespace_flag_map[i].flag;
714 if ((retain & f) == f) {
715 log_debug("Permitting %s.", namespace_flag_map[i].name);
716 continue;
717 }
718
719 log_debug("Blocking %s.", namespace_flag_map[i].name);
720
721 r = seccomp_rule_add(
722 seccomp,
723 SCMP_ACT_ERRNO(EPERM),
724 SCMP_SYS(unshare),
725 1,
726 SCMP_A0(SCMP_CMP_MASKED_EQ, f, f));
727 if (r < 0)
728 goto finish;
729
730 r = seccomp_rule_add(
731 seccomp,
732 SCMP_ACT_ERRNO(EPERM),
733 SCMP_SYS(clone),
734 1,
735 SCMP_A0(SCMP_CMP_MASKED_EQ, f, f));
736 if (r < 0)
737 goto finish;
738
739 if ((retain & NAMESPACE_FLAGS_ALL) != 0) {
740 r = seccomp_rule_add(
741 seccomp,
742 SCMP_ACT_ERRNO(EPERM),
743 SCMP_SYS(setns),
744 1,
745 SCMP_A1(SCMP_CMP_MASKED_EQ, f, f));
746 if (r < 0)
747 goto finish;
748 }
749 }
750
751 r = seccomp_load(seccomp);
752
753 finish:
754 seccomp_release(seccomp);
755 return r;
756 }