2 This file is part of systemd.
4 Copyright 2014 Lennart Poettering
6 systemd is free software; you can redistribute it and/or modify it
7 under the terms of the GNU Lesser General Public License as published by
8 the Free Software Foundation; either version 2.1 of the License, or
9 (at your option) any later version.
11 systemd is distributed in the hope that it will be useful, but
12 WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public License
17 along with systemd; If not, see <http://www.gnu.org/licenses/>.
23 #include <sys/prctl.h>
24 #include <linux/seccomp.h>
26 #include "alloc-util.h"
29 #include "seccomp-util.h"
30 #include "string-util.h"
/* Translates a libseccomp architecture constant (SCMP_ARCH_*) into its textual name, e.g.
 * SCMP_ARCH_MIPSEL64N32 yields "mips64-le-n32".
 * NOTE(review): the result for a value without a matching case label is presumably NULL — confirm
 * against the callers. */
33 const char* seccomp_arch_to_string(uint32_t c
) {
34 /* Maintain order used in <seccomp.h>.
36 * Names used here should be the same as those used for ConditionArchitecture=,
37 * except for "subarchitectures" like x32. */
40 case SCMP_ARCH_NATIVE
:
44 case SCMP_ARCH_X86_64
:
50 case SCMP_ARCH_AARCH64
:
54 case SCMP_ARCH_MIPS64
:
56 case SCMP_ARCH_MIPS64N32
:
58 case SCMP_ARCH_MIPSEL
:
60 case SCMP_ARCH_MIPSEL64
:
62 case SCMP_ARCH_MIPSEL64N32
:
63 return "mips64-le-n32";
68 case SCMP_ARCH_PPC64LE
:
/* Parses the architecture name n (for example "native", "x86-64", "arm64" or "mips64-le-n32") and
 * stores the matching SCMP_ARCH_* constant in *ret. The name is compared with streq() against each
 * known spelling in turn.
 * NOTE(review): presumably returns 0 on success and a negative errno-style value for an unknown
 * name, leaving *ret untouched in that case — confirm. */
79 int seccomp_arch_from_string(const char *n
, uint32_t *ret
) {
85 if (streq(n
, "native"))
86 *ret
= SCMP_ARCH_NATIVE
;
87 else if (streq(n
, "x86"))
89 else if (streq(n
, "x86-64"))
90 *ret
= SCMP_ARCH_X86_64
;
91 else if (streq(n
, "x32"))
93 else if (streq(n
, "arm"))
95 else if (streq(n
, "arm64"))
96 *ret
= SCMP_ARCH_AARCH64
;
97 else if (streq(n
, "mips"))
98 *ret
= SCMP_ARCH_MIPS
;
99 else if (streq(n
, "mips64"))
100 *ret
= SCMP_ARCH_MIPS64
;
101 else if (streq(n
, "mips64-n32"))
102 *ret
= SCMP_ARCH_MIPS64N32
;
103 else if (streq(n
, "mips-le"))
104 *ret
= SCMP_ARCH_MIPSEL
;
105 else if (streq(n
, "mips64-le"))
106 *ret
= SCMP_ARCH_MIPSEL64
;
107 else if (streq(n
, "mips64-le-n32"))
108 *ret
= SCMP_ARCH_MIPSEL64N32
;
109 else if (streq(n
, "ppc"))
110 *ret
= SCMP_ARCH_PPC
;
111 else if (streq(n
, "ppc64"))
112 *ret
= SCMP_ARCH_PPC64
;
113 else if (streq(n
, "ppc64-le"))
114 *ret
= SCMP_ARCH_PPC64LE
;
115 else if (streq(n
, "s390"))
116 *ret
= SCMP_ARCH_S390
;
117 else if (streq(n
, "s390x"))
118 *ret
= SCMP_ARCH_S390X
;
/* Creates a libseccomp filter context with seccomp_init() using default_action, then registers the
 * secondary architectures on it and sets the SCMP_FLTATR_CTL_NNP attribute to 0.
 * NOTE(review): presumably the finished context is handed back through *ret on success, and the
 * seccomp_release() call at the end is the failure path — confirm. */
125 int seccomp_init_conservative(scmp_filter_ctx
*ret
, uint32_t default_action
) {
126 scmp_filter_ctx seccomp
;
129 /* Much like seccomp_init(), but tries to be a bit more conservative in its defaults: all secondary archs are
130 * added by default, and NNP is turned off. */
132 seccomp
= seccomp_init(default_action
);
136 r
= seccomp_add_secondary_archs(seccomp
);
140 r
= seccomp_attr_set(seccomp
, SCMP_FLTATR_CTL_NNP
, 0);
148 seccomp_release(seccomp
);
/* Registers every architecture listed in the per-platform seccomp_arches[] table with the filter
 * context ctx, calling seccomp_arch_add() once per entry. A result of -EEXIST is excluded from the
 * error check (presumably because that architecture is already part of the filter — confirm). */
152 int seccomp_add_secondary_archs(scmp_filter_ctx ctx
) {
154 /* Add in all possible secondary archs we are aware of that
155 * this kernel might support. */
157 static const int seccomp_arches
[] = {
158 #if defined(__i386__) || defined(__x86_64__)
163 #elif defined(__arm__) || defined(__aarch64__)
/* NOTE(review): the #elif below repeats the condition of the branch directly above, so the
 * preprocessor can never select it. Looks like a copy/paste leftover — confirm and drop. */
167 #elif defined(__arm__) || defined(__aarch64__)
171 #elif defined(__mips__) || defined(__mips64__)
177 SCMP_ARCH_MIPSEL64N32
,
179 #elif defined(__powerpc__) || defined(__powerpc64__)
184 #elif defined(__s390__) || defined(__s390x__)
193 for (i
= 0; i
< ELEMENTSOF(seccomp_arches
); i
++) {
194 r
= seccomp_arch_add(ctx
, seccomp_arches
[i
]);
195 if (r
< 0 && r
!= -EEXIST
)
/* Probes for basic seccomp support by issuing prctl(PR_GET_SECCOMP).
 * NOTE(review): presumably reports true whenever that call does not fail — confirm. */
202 static bool is_basic_seccomp_available(void) {
204 r
= prctl(PR_GET_SECCOMP
, 0, 0, 0, 0);
/* Probes for seccomp filter mode: issues prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER) with a NULL
 * filter argument and reports true only if the call fails with errno == EFAULT.
 * NOTE(review): presumably EFAULT shows that the kernel got as far as reading the filter pointer,
 * i.e. that filter mode exists, while nothing is actually installed — confirm against prctl(2). */
208 static bool is_seccomp_filter_available(void) {
210 r
= prctl(PR_SET_SECCOMP
, SECCOMP_MODE_FILTER
, NULL
, 0, 0);
211 return r
< 0 && errno
== EFAULT
;
/* Reports whether seccomp can be used: true only if both is_basic_seccomp_available() and
 * is_seccomp_filter_available() succeed. The probes run only while the function-local static still
 * holds its initial value of -1; afterwards the stored result is returned directly. */
214 bool is_seccomp_available(void) {
215 static int cached_enabled
= -1;
216 if (cached_enabled
< 0)
217 cached_enabled
= is_basic_seccomp_available() && is_seccomp_filter_available();
218 return cached_enabled
;
/* Table of the predefined system call groups, indexed by SYSCALL_FILTER_SET_*. Each entry carries a
 * .name (spelled with a leading "@"), a one-line .help text, and its member system calls written as a
 * sequence of NUL-terminated names. Some members are only compiled in when the matching __NR_* macro
 * is defined. */
221 const SyscallFilterSet syscall_filter_sets
[_SYSCALL_FILTER_SET_MAX
] = {
222 [SYSCALL_FILTER_SET_DEFAULT
] = {
224 .help
= "System calls that are always permitted",
232 "getrlimit\0" /* make sure processes can query stack size and such */
240 [SYSCALL_FILTER_SET_BASIC_IO
] = {
258 [SYSCALL_FILTER_SET_CLOCK
] = {
260 .help
= "Change the system time",
268 [SYSCALL_FILTER_SET_CPU_EMULATION
] = {
269 .name
= "@cpu-emulation",
270 .help
= "System calls for CPU emulation functionality",
278 [SYSCALL_FILTER_SET_DEBUG
] = {
280 .help
= "Debugging, performance monitoring and tracing functionality",
285 "process_vm_writev\0"
288 #ifdef __NR_s390_runtime_instr
289 "s390_runtime_instr\0"
291 "sys_debug_setcontext\0"
293 [SYSCALL_FILTER_SET_FILE_SYSTEM
] = {
294 .name
= "@file-system",
295 .help
= "File system operations",
324 "inotify_add_watch\0"
365 [SYSCALL_FILTER_SET_IO_EVENT
] = {
367 .help
= "Event loop system calls",
384 [SYSCALL_FILTER_SET_IPC
] = {
386 .help
= "SysV IPC, POSIX Message Queues or other IPC",
403 "process_vm_writev\0"
413 [SYSCALL_FILTER_SET_KEYRING
] = {
415 .help
= "Kernel keyring access",
421 [SYSCALL_FILTER_SET_MODULE
] = {
423 .help
= "Loading and unloading of kernel modules",
429 [SYSCALL_FILTER_SET_MOUNT
] = {
431 .help
= "Mounting and unmounting of file systems",
439 [SYSCALL_FILTER_SET_NETWORK_IO
] = {
440 .name
= "@network-io",
441 .help
= "Network or Unix socket IO, should not be needed if not network facing",
465 [SYSCALL_FILTER_SET_OBSOLETE
] = {
466 /* some unknown even to libseccomp */
468 .help
= "Unusual, obsolete or unimplemented system calls",
495 [SYSCALL_FILTER_SET_PRIVILEGED
] = {
496 .name
= "@privileged",
497 .help
= "All system calls which need super-user capabilities",
537 [SYSCALL_FILTER_SET_PROCESS
] = {
539 .help
= "Process control, execution, namespacing operations",
553 [SYSCALL_FILTER_SET_RAW_IO
] = {
555 .help
= "Raw I/O port access",
562 #ifdef __NR_s390_pci_mmio_read
563 "s390_pci_mmio_read\0"
565 #ifdef __NR_s390_pci_mmio_write
566 "s390_pci_mmio_write\0"
569 [SYSCALL_FILTER_SET_REBOOT
] = {
571 .help
= "Reboot and reboot preparation/kexec",
577 [SYSCALL_FILTER_SET_RESOURCES
] = {
578 /* Alter resource settings */
579 .name
= "@resources",
582 "sched_setscheduler\0"
583 "sched_setaffinity\0"
593 [SYSCALL_FILTER_SET_SWAP
] = {
595 .help
= "Enable/disable swap devices",
/* Looks up a predefined system call group by name. Names that are empty or do not start with '@'
 * are rejected before the table is searched; otherwise syscall_filter_sets[] is scanned linearly and
 * a pointer to the first entry whose .name equals name is returned.
 * NOTE(review): presumably NULL is returned both for rejected names and when nothing matches —
 * confirm. */
602 const SyscallFilterSet
*syscall_filter_set_find(const char *name
) {
605 if (isempty(name
) || name
[0] != '@')
608 for (i
= 0; i
< _SYSCALL_FILTER_SET_MAX
; i
++)
609 if (streq(syscall_filter_sets
[i
].name
, name
))
610 return syscall_filter_sets
+ i
;
/* Adds one rule with the given action to the filter context seccomp for each member of set.
 * The members are walked with NULSTR_FOREACH over set->value. A member that refers to another group
 * is looked up with syscall_filter_set_find() and expanded by a recursive call; any other member is
 * resolved to a system call number with seccomp_syscall_resolve_name() and added via
 * seccomp_rule_add() with an argument-comparison count of 0.
 * NOTE(review): presumably a member is treated as a group reference when it starts with '@', and a
 * name that resolves to __NR_SCMP_ERROR is handled specially — confirm both. */
615 int seccomp_add_syscall_filter_set(scmp_filter_ctx seccomp
, const SyscallFilterSet
*set
, uint32_t action
) {
622 NULSTR_FOREACH(sys
, set
->value
) {
626 const SyscallFilterSet
*other
;
628 other
= syscall_filter_set_find(sys
);
632 r
= seccomp_add_syscall_filter_set(seccomp
, other
, action
);
634 id
= seccomp_syscall_resolve_name(sys
);
635 if (id
== __NR_SCMP_ERROR
)
638 r
= seccomp_rule_add(seccomp
, action
, id
, 0);
/* Convenience wrapper: creates a filter context with seccomp_init_conservative() using
 * default_action, adds the members of set with the given action, loads the filter with
 * seccomp_load() and releases the context again with seccomp_release().
 * NOTE(review): presumably returns the first negative result of these steps, or the result of
 * seccomp_load() — confirm. */
647 int seccomp_load_filter_set(uint32_t default_action
, const SyscallFilterSet
*set
, uint32_t action
) {
648 scmp_filter_ctx seccomp
;
653 /* The one-stop solution: allocate a seccomp object, add a filter to it, and apply it */
655 r
= seccomp_init_conservative(&seccomp
, default_action
);
659 r
= seccomp_add_syscall_filter_set(seccomp
, set
, action
);
663 r
= seccomp_load(seccomp
);
666 seccomp_release(seccomp
);
/* Builds and loads a seccomp filter (default action SCMP_ACT_ALLOW) whose rules fail with EPERM for
 * every entry of namespace_flag_map[] whose flag is not fully contained in retain. Entries that are
 * contained in retain are only logged as permitted. When the maximum log level is at least
 * LOG_DEBUG, the retained set is formatted and logged first. The context is released with
 * seccomp_release() at the end.
 * NOTE(review): presumably the function returns early without installing a filter when retain
 * covers all of NAMESPACE_FLAGS_ALL, and the per-flag rules target unshare() and clone() (flags in
 * argument 0) as well as setns() (namespace type in argument 1) — confirm. */
670 int seccomp_restrict_namespaces(unsigned long retain
) {
671 scmp_filter_ctx seccomp
;
675 if (log_get_max_level() >= LOG_DEBUG
) {
676 _cleanup_free_
char *s
= NULL
;
678 (void) namespace_flag_to_string_many(retain
, &s
);
679 log_debug("Restricting namespace to: %s.", strna(s
));
683 if ((retain
& NAMESPACE_FLAGS_ALL
) == NAMESPACE_FLAGS_ALL
)
686 r
= seccomp_init_conservative(&seccomp
, SCMP_ACT_ALLOW
);
690 if ((retain
& NAMESPACE_FLAGS_ALL
) == 0)
691 /* If every single kind of namespace shall be prohibited, then let's block the whole setns() syscall
693 r
= seccomp_rule_add(
695 SCMP_ACT_ERRNO(EPERM
),
699 /* Otherwise, block only the invocations with the appropriate flags in the loop below, but also the
700 * special invocation with a zero flags argument, right here. */
701 r
= seccomp_rule_add(
703 SCMP_ACT_ERRNO(EPERM
),
706 SCMP_A1(SCMP_CMP_EQ
, 0));
710 for (i
= 0; namespace_flag_map
[i
].name
; i
++) {
713 f
= namespace_flag_map
[i
].flag
;
714 if ((retain
& f
) == f
) {
715 log_debug("Permitting %s.", namespace_flag_map
[i
].name
);
719 log_debug("Blocking %s.", namespace_flag_map
[i
].name
);
721 r
= seccomp_rule_add(
723 SCMP_ACT_ERRNO(EPERM
),
726 SCMP_A0(SCMP_CMP_MASKED_EQ
, f
, f
));
730 r
= seccomp_rule_add(
732 SCMP_ACT_ERRNO(EPERM
),
735 SCMP_A0(SCMP_CMP_MASKED_EQ
, f
, f
));
739 if ((retain
& NAMESPACE_FLAGS_ALL
) != 0) {
740 r
= seccomp_rule_add(
742 SCMP_ACT_ERRNO(EPERM
),
745 SCMP_A1(SCMP_CMP_MASKED_EQ
, f
, f
));
751 r
= seccomp_load(seccomp
);
754 seccomp_release(seccomp
);