]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/nspawn/nspawn-seccomp.c
Add SPDX license identifiers to source files under the LGPL
[thirdparty/systemd.git] / src / nspawn / nspawn-seccomp.c
CommitLineData
53e1b683 1/* SPDX-License-Identifier: LGPL-2.1+ */
f011b0b8
DH
2/***
3 This file is part of systemd.
4
5 Copyright 2016 Lennart Poettering
6
7 systemd is free software; you can redistribute it and/or modify it
8 under the terms of the GNU Lesser General Public License as published by
9 the Free Software Foundation; either version 2.1 of the License, or
10 (at your option) any later version.
11
12 systemd is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 Lesser General Public License for more details.
16
17 You should have received a copy of the GNU Lesser General Public License
18 along with systemd; If not, see <http://www.gnu.org/licenses/>.
19***/
20
21#include <errno.h>
22#include <linux/netlink.h>
23#include <sys/capability.h>
24#include <sys/types.h>
25
349cc4a5 26#if HAVE_SECCOMP
f011b0b8
DH
27#include <seccomp.h>
28#endif
29
469830d1 30#include "alloc-util.h"
f011b0b8 31#include "log.h"
469830d1 32#include "nspawn-seccomp.h"
349cc4a5 33#if HAVE_SECCOMP
f011b0b8
DH
34#include "seccomp-util.h"
35#endif
469830d1 36#include "string-util.h"
960e4569 37#include "strv.h"
f011b0b8 38
349cc4a5 39#if HAVE_SECCOMP
f011b0b8 40
469830d1
LP
41static int seccomp_add_default_syscall_filter(
42 scmp_filter_ctx ctx,
43 uint32_t arch,
960e4569
LP
44 uint64_t cap_list_retain,
45 char **syscall_whitelist,
46 char **syscall_blacklist) {
469830d1 47
f011b0b8
DH
48 static const struct {
49 uint64_t capability;
402530d9 50 const char* name;
96bedbe2
LP
51 } whitelist[] = {
52 /* Let's use set names where we can */
44898c53 53 { 0, "@aio" },
96bedbe2 54 { 0, "@basic-io" },
44898c53 55 { 0, "@chown" },
96bedbe2
LP
56 { 0, "@default" },
57 { 0, "@file-system" },
58 { 0, "@io-event" },
59 { 0, "@ipc" },
60 { 0, "@mount" },
61 { 0, "@network-io" },
62 { 0, "@process" },
63 { 0, "@resources" },
64 { 0, "@setuid" },
65 { 0, "@signal" },
44898c53 66 { 0, "@sync" },
96bedbe2
LP
67 { 0, "@timer" },
68
69 /* The following four are sets we optionally enable, in case the caps have been configured for it */
70 { CAP_SYS_TIME, "@clock" },
71 { CAP_SYS_MODULE, "@module" },
72 { CAP_SYS_RAWIO, "@raw-io" },
73 { CAP_IPC_LOCK, "@memlock" },
74
75 /* Plus a good set of additional syscalls which are not part of any of the groups above */
76 { 0, "brk" },
09d3020b 77 { 0, "capget" },
96bedbe2 78 { 0, "capset" },
96bedbe2
LP
79 { 0, "copy_file_range" },
80 { 0, "fadvise64" },
81 { 0, "fadvise64_64" },
96bedbe2 82 { 0, "flock" },
96bedbe2
LP
83 { 0, "get_mempolicy" },
84 { 0, "getcpu" },
85 { 0, "getpriority" },
86 { 0, "getrandom" },
96bedbe2
LP
87 { 0, "ioctl" },
88 { 0, "ioprio_get" },
89 { 0, "kcmp" },
96bedbe2
LP
90 { 0, "madvise" },
91 { 0, "mincore" },
92 { 0, "mprotect" },
93 { 0, "mremap" },
96bedbe2
LP
94 { 0, "name_to_handle_at" },
95 { 0, "oldolduname" },
96 { 0, "olduname" },
97 { 0, "personality" },
96bedbe2
LP
98 { 0, "readahead" },
99 { 0, "readdir" },
100 { 0, "remap_file_pages" },
101 { 0, "sched_get_priority_max" },
102 { 0, "sched_get_priority_min" },
103 { 0, "sched_getaffinity" },
104 { 0, "sched_getattr" },
105 { 0, "sched_getparam" },
106 { 0, "sched_getscheduler" },
107 { 0, "sched_rr_get_interval" },
108 { 0, "sched_yield" },
109 { 0, "seccomp" },
110 { 0, "sendfile" },
111 { 0, "sendfile64" },
112 { 0, "setdomainname" },
113 { 0, "setfsgid" },
114 { 0, "setfsgid32" },
115 { 0, "setfsuid" },
116 { 0, "setfsuid32" },
117 { 0, "sethostname" },
118 { 0, "setpgid" },
119 { 0, "setsid" },
120 { 0, "splice" },
96bedbe2
LP
121 { 0, "sysinfo" },
122 { 0, "tee" },
96bedbe2
LP
123 { 0, "umask" },
124 { 0, "uname" },
125 { 0, "userfaultfd" },
126 { 0, "vmsplice" },
127
128 /* The following individual syscalls are added depending on specified caps */
129 { CAP_SYS_PACCT, "acct" },
130 { CAP_SYS_PTRACE, "process_vm_readv" },
131 { CAP_SYS_PTRACE, "process_vm_writev" },
132 { CAP_SYS_PTRACE, "ptrace" },
133 { CAP_SYS_BOOT, "reboot" },
134 { CAP_SYSLOG, "syslog" },
135 { CAP_SYS_TTY_CONFIG, "vhangup" },
136
137 /*
138 * The following syscalls and groups are knowingly excluded:
139 *
140 * @cpu-emulation
141 * @keyring (NB: keyring is not namespaced!)
142 * @obsolete
143 * @swap
144 *
145 * bpf (NB: bpffs is not namespaced!)
146 * fanotify_init
147 * fanotify_mark
148 * kexec_file_load
149 * kexec_load
150 * lookup_dcookie
151 * nfsservctl
152 * open_by_handle_at
153 * perf_event_open
154 * pkey_alloc
155 * pkey_free
156 * pkey_mprotect
157 * quotactl
158 */
f011b0b8 159 };
402530d9 160
469830d1 161 int r, c = 0;
402530d9 162 size_t i;
960e4569 163 char **p;
f011b0b8 164
96bedbe2
LP
165 for (i = 0; i < ELEMENTSOF(whitelist); i++) {
166 if (whitelist[i].capability != 0 && (cap_list_retain & (1ULL << whitelist[i].capability)) == 0)
f011b0b8
DH
167 continue;
168
96bedbe2 169 r = seccomp_add_syscall_filter_item(ctx, whitelist[i].name, SCMP_ACT_ALLOW, syscall_blacklist);
402530d9 170 if (r < 0)
469830d1 171 /* If the system call is not known on this architecture, then that's fine, let's ignore it */
96bedbe2 172 log_debug_errno(r, "Failed to add rule for system call %s on %s, ignoring: %m", whitelist[i].name, seccomp_arch_to_string(arch));
402530d9 173 else
469830d1 174 c++;
f011b0b8
DH
175 }
176
96bedbe2
LP
177 STRV_FOREACH(p, syscall_whitelist) {
178 r = seccomp_add_syscall_filter_item(ctx, *p, SCMP_ACT_ALLOW, syscall_blacklist);
960e4569 179 if (r < 0)
96bedbe2 180 log_debug_errno(r, "Failed to add rule for system call %s on %s, ignoring: %m", *p, seccomp_arch_to_string(arch));
960e4569
LP
181 else
182 c++;
183 }
184
469830d1 185 return c;
f011b0b8
DH
186}
187
960e4569 188int setup_seccomp(uint64_t cap_list_retain, char **syscall_whitelist, char **syscall_blacklist) {
469830d1 189 uint32_t arch;
f011b0b8
DH
190 int r;
191
1cec406d 192 if (!is_seccomp_available()) {
960e4569 193 log_debug("SECCOMP features not detected in the kernel, disabling SECCOMP filterering");
1cec406d
FS
194 return 0;
195 }
196
469830d1
LP
197 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
198 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
469830d1 199
96bedbe2 200 log_debug("Applying whitelist on architecture: %s", seccomp_arch_to_string(arch));
469830d1 201
96bedbe2 202 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ERRNO(EPERM));
469830d1
LP
203 if (r < 0)
204 return log_error_errno(r, "Failed to allocate seccomp object: %m");
205
96bedbe2
LP
206 r = seccomp_add_default_syscall_filter(seccomp, arch, cap_list_retain, syscall_whitelist, syscall_blacklist);
207 if (r < 0)
208 return r;
209
210 r = seccomp_load(seccomp);
211 if (IN_SET(r, -EPERM, -EACCES))
212 return log_error_errno(r, "Failed to install seccomp filter: %m");
213 if (r < 0)
214 log_debug_errno(r, "Failed to install filter set for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
215 }
216
217 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
218 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
219
220 log_debug("Applying NETLINK_AUDIT mask on architecture: %s", seccomp_arch_to_string(arch));
221
222 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
223 if (r < 0)
224 return log_error_errno(r, "Failed to allocate seccomp object: %m");
469830d1
LP
225
226 /*
227 Audit is broken in containers, much of the userspace audit hookup will fail if running inside a
228 container. We don't care and just turn off creation of audit sockets.
229
230 This will make socket(AF_NETLINK, *, NETLINK_AUDIT) fail with EAFNOSUPPORT which audit userspace uses
231 as indication that audit is disabled in the kernel.
232 */
233
234 r = seccomp_rule_add_exact(
235 seccomp,
236 SCMP_ACT_ERRNO(EAFNOSUPPORT),
237 SCMP_SYS(socket),
238 2,
239 SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
240 SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
96bedbe2 241 if (r < 0) {
469830d1 242 log_debug_errno(r, "Failed to add audit seccomp rule, ignoring: %m");
469830d1 243 continue;
96bedbe2 244 }
f011b0b8 245
469830d1
LP
246 r = seccomp_load(seccomp);
247 if (IN_SET(r, -EPERM, -EACCES))
248 return log_error_errno(r, "Failed to install seccomp audit filter: %m");
249 if (r < 0)
250 log_debug_errno(r, "Failed to install filter set for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
f011b0b8
DH
251 }
252
469830d1 253 return 0;
f011b0b8
DH
254}
255
256#else
257
/*
 * Variant used when the build has no seccomp support (HAVE_SECCOMP is false):
 * no filter is installed, all arguments are ignored, and 0 is always returned.
 */
int setup_seccomp(uint64_t cap_list_retain, char **syscall_whitelist, char **syscall_blacklist) {
        return 0;
}
261
262#endif