]>
Commit | Line | Data |
---|---|---|
f011b0b8 DH |
1 | /*** |
2 | This file is part of systemd. | |
3 | ||
4 | Copyright 2016 Lennart Poettering | |
5 | ||
6 | systemd is free software; you can redistribute it and/or modify it | |
7 | under the terms of the GNU Lesser General Public License as published by | |
8 | the Free Software Foundation; either version 2.1 of the License, or | |
9 | (at your option) any later version. | |
10 | ||
11 | systemd is distributed in the hope that it will be useful, but | |
12 | WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
14 | Lesser General Public License for more details. | |
15 | ||
16 | You should have received a copy of the GNU Lesser General Public License | |
17 | along with systemd; If not, see <http://www.gnu.org/licenses/>. | |
18 | ***/ | |
19 | ||
20 | #include <errno.h> | |
21 | #include <linux/netlink.h> | |
22 | #include <sys/capability.h> | |
23 | #include <sys/types.h> | |
24 | ||
349cc4a5 | 25 | #if HAVE_SECCOMP |
f011b0b8 DH |
26 | #include <seccomp.h> |
27 | #endif | |
28 | ||
469830d1 | 29 | #include "alloc-util.h" |
f011b0b8 | 30 | #include "log.h" |
469830d1 | 31 | #include "nspawn-seccomp.h" |
349cc4a5 | 32 | #if HAVE_SECCOMP |
f011b0b8 DH |
33 | #include "seccomp-util.h" |
34 | #endif | |
469830d1 | 35 | #include "string-util.h" |
960e4569 | 36 | #include "strv.h" |
f011b0b8 | 37 | |
349cc4a5 | 38 | #if HAVE_SECCOMP |
f011b0b8 | 39 | |
469830d1 LP |
40 | static int seccomp_add_default_syscall_filter( |
41 | scmp_filter_ctx ctx, | |
42 | uint32_t arch, | |
960e4569 LP |
43 | uint64_t cap_list_retain, |
44 | char **syscall_whitelist, | |
45 | char **syscall_blacklist) { | |
469830d1 | 46 | |
f011b0b8 DH |
47 | static const struct { |
48 | uint64_t capability; | |
402530d9 | 49 | const char* name; |
96bedbe2 LP |
50 | } whitelist[] = { |
51 | /* Let's use set names where we can */ | |
52 | { 0, "@basic-io" }, | |
96bedbe2 LP |
53 | { 0, "@default" }, |
54 | { 0, "@file-system" }, | |
55 | { 0, "@io-event" }, | |
56 | { 0, "@ipc" }, | |
57 | { 0, "@mount" }, | |
58 | { 0, "@network-io" }, | |
59 | { 0, "@process" }, | |
60 | { 0, "@resources" }, | |
61 | { 0, "@setuid" }, | |
62 | { 0, "@signal" }, | |
63 | { 0, "@timer" }, | |
64 | ||
65 | /* The following four are sets we optionally enable, in case the caps have been configured for it */ | |
66 | { CAP_SYS_TIME, "@clock" }, | |
67 | { CAP_SYS_MODULE, "@module" }, | |
68 | { CAP_SYS_RAWIO, "@raw-io" }, | |
69 | { CAP_IPC_LOCK, "@memlock" }, | |
70 | ||
71 | /* Plus a good set of additional syscalls which are not part of any of the groups above */ | |
72 | { 0, "brk" }, | |
09d3020b | 73 | { 0, "capget" }, |
96bedbe2 LP |
74 | { 0, "capset" }, |
75 | { 0, "chown" }, | |
76 | { 0, "chown32" }, | |
77 | { 0, "copy_file_range" }, | |
78 | { 0, "fadvise64" }, | |
79 | { 0, "fadvise64_64" }, | |
80 | { 0, "fchown" }, | |
81 | { 0, "fchown32" }, | |
82 | { 0, "fchownat" }, | |
83 | { 0, "fdatasync" }, | |
84 | { 0, "flock" }, | |
85 | { 0, "fsync" }, | |
86 | { 0, "get_mempolicy" }, | |
87 | { 0, "getcpu" }, | |
88 | { 0, "getpriority" }, | |
89 | { 0, "getrandom" }, | |
90 | { 0, "io_cancel" }, | |
91 | { 0, "io_destroy" }, | |
92 | { 0, "io_getevents" }, | |
93 | { 0, "io_setup" }, | |
94 | { 0, "io_submit" }, | |
95 | { 0, "ioctl" }, | |
96 | { 0, "ioprio_get" }, | |
97 | { 0, "kcmp" }, | |
98 | { 0, "lchown" }, | |
99 | { 0, "lchown32" }, | |
100 | { 0, "madvise" }, | |
101 | { 0, "mincore" }, | |
102 | { 0, "mprotect" }, | |
103 | { 0, "mremap" }, | |
104 | { 0, "msync" }, | |
105 | { 0, "name_to_handle_at" }, | |
106 | { 0, "oldolduname" }, | |
107 | { 0, "olduname" }, | |
108 | { 0, "personality" }, | |
109 | { 0, "preadv2" }, | |
110 | { 0, "pwritev2" }, | |
111 | { 0, "readahead" }, | |
112 | { 0, "readdir" }, | |
113 | { 0, "remap_file_pages" }, | |
114 | { 0, "sched_get_priority_max" }, | |
115 | { 0, "sched_get_priority_min" }, | |
116 | { 0, "sched_getaffinity" }, | |
117 | { 0, "sched_getattr" }, | |
118 | { 0, "sched_getparam" }, | |
119 | { 0, "sched_getscheduler" }, | |
120 | { 0, "sched_rr_get_interval" }, | |
121 | { 0, "sched_yield" }, | |
122 | { 0, "seccomp" }, | |
123 | { 0, "sendfile" }, | |
124 | { 0, "sendfile64" }, | |
125 | { 0, "setdomainname" }, | |
126 | { 0, "setfsgid" }, | |
127 | { 0, "setfsgid32" }, | |
128 | { 0, "setfsuid" }, | |
129 | { 0, "setfsuid32" }, | |
130 | { 0, "sethostname" }, | |
131 | { 0, "setpgid" }, | |
132 | { 0, "setsid" }, | |
133 | { 0, "splice" }, | |
134 | { 0, "sync" }, | |
135 | { 0, "sync_file_range" }, | |
136 | { 0, "syncfs" }, | |
137 | { 0, "sysinfo" }, | |
138 | { 0, "tee" }, | |
139 | { 0, "ugetrlimit" }, | |
140 | { 0, "umask" }, | |
141 | { 0, "uname" }, | |
142 | { 0, "userfaultfd" }, | |
143 | { 0, "vmsplice" }, | |
144 | ||
145 | /* The following individual syscalls are added depending on specified caps */ | |
146 | { CAP_SYS_PACCT, "acct" }, | |
147 | { CAP_SYS_PTRACE, "process_vm_readv" }, | |
148 | { CAP_SYS_PTRACE, "process_vm_writev" }, | |
149 | { CAP_SYS_PTRACE, "ptrace" }, | |
150 | { CAP_SYS_BOOT, "reboot" }, | |
151 | { CAP_SYSLOG, "syslog" }, | |
152 | { CAP_SYS_TTY_CONFIG, "vhangup" }, | |
153 | ||
154 | /* | |
155 | * The following syscalls and groups are knowingly excluded: | |
156 | * | |
157 | * @cpu-emulation | |
158 | * @keyring (NB: keyring is not namespaced!) | |
159 | * @obsolete | |
160 | * @swap | |
161 | * | |
162 | * bpf (NB: bpffs is not namespaced!) | |
163 | * fanotify_init | |
164 | * fanotify_mark | |
165 | * kexec_file_load | |
166 | * kexec_load | |
167 | * lookup_dcookie | |
168 | * nfsservctl | |
169 | * open_by_handle_at | |
170 | * perf_event_open | |
171 | * pkey_alloc | |
172 | * pkey_free | |
173 | * pkey_mprotect | |
174 | * quotactl | |
175 | */ | |
f011b0b8 | 176 | }; |
402530d9 | 177 | |
469830d1 | 178 | int r, c = 0; |
402530d9 | 179 | size_t i; |
960e4569 | 180 | char **p; |
f011b0b8 | 181 | |
96bedbe2 LP |
182 | for (i = 0; i < ELEMENTSOF(whitelist); i++) { |
183 | if (whitelist[i].capability != 0 && (cap_list_retain & (1ULL << whitelist[i].capability)) == 0) | |
f011b0b8 DH |
184 | continue; |
185 | ||
96bedbe2 | 186 | r = seccomp_add_syscall_filter_item(ctx, whitelist[i].name, SCMP_ACT_ALLOW, syscall_blacklist); |
402530d9 | 187 | if (r < 0) |
469830d1 | 188 | /* If the system call is not known on this architecture, then that's fine, let's ignore it */ |
96bedbe2 | 189 | log_debug_errno(r, "Failed to add rule for system call %s on %s, ignoring: %m", whitelist[i].name, seccomp_arch_to_string(arch)); |
402530d9 | 190 | else |
469830d1 | 191 | c++; |
f011b0b8 DH |
192 | } |
193 | ||
96bedbe2 LP |
194 | STRV_FOREACH(p, syscall_whitelist) { |
195 | r = seccomp_add_syscall_filter_item(ctx, *p, SCMP_ACT_ALLOW, syscall_blacklist); | |
960e4569 | 196 | if (r < 0) |
96bedbe2 | 197 | log_debug_errno(r, "Failed to add rule for system call %s on %s, ignoring: %m", *p, seccomp_arch_to_string(arch)); |
960e4569 LP |
198 | else |
199 | c++; | |
200 | } | |
201 | ||
469830d1 | 202 | return c; |
f011b0b8 DH |
203 | } |
204 | ||
960e4569 | 205 | int setup_seccomp(uint64_t cap_list_retain, char **syscall_whitelist, char **syscall_blacklist) { |
469830d1 | 206 | uint32_t arch; |
f011b0b8 DH |
207 | int r; |
208 | ||
1cec406d | 209 | if (!is_seccomp_available()) { |
960e4569 | 210 | log_debug("SECCOMP features not detected in the kernel, disabling SECCOMP filterering"); |
1cec406d FS |
211 | return 0; |
212 | } | |
213 | ||
469830d1 LP |
214 | SECCOMP_FOREACH_LOCAL_ARCH(arch) { |
215 | _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL; | |
469830d1 | 216 | |
96bedbe2 | 217 | log_debug("Applying whitelist on architecture: %s", seccomp_arch_to_string(arch)); |
469830d1 | 218 | |
96bedbe2 | 219 | r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ERRNO(EPERM)); |
469830d1 LP |
220 | if (r < 0) |
221 | return log_error_errno(r, "Failed to allocate seccomp object: %m"); | |
222 | ||
96bedbe2 LP |
223 | r = seccomp_add_default_syscall_filter(seccomp, arch, cap_list_retain, syscall_whitelist, syscall_blacklist); |
224 | if (r < 0) | |
225 | return r; | |
226 | ||
227 | r = seccomp_load(seccomp); | |
228 | if (IN_SET(r, -EPERM, -EACCES)) | |
229 | return log_error_errno(r, "Failed to install seccomp filter: %m"); | |
230 | if (r < 0) | |
231 | log_debug_errno(r, "Failed to install filter set for architecture %s, skipping: %m", seccomp_arch_to_string(arch)); | |
232 | } | |
233 | ||
234 | SECCOMP_FOREACH_LOCAL_ARCH(arch) { | |
235 | _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL; | |
236 | ||
237 | log_debug("Applying NETLINK_AUDIT mask on architecture: %s", seccomp_arch_to_string(arch)); | |
238 | ||
239 | r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW); | |
240 | if (r < 0) | |
241 | return log_error_errno(r, "Failed to allocate seccomp object: %m"); | |
469830d1 LP |
242 | |
243 | /* | |
244 | Audit is broken in containers, much of the userspace audit hookup will fail if running inside a | |
245 | container. We don't care and just turn off creation of audit sockets. | |
246 | ||
247 | This will make socket(AF_NETLINK, *, NETLINK_AUDIT) fail with EAFNOSUPPORT which audit userspace uses | |
248 | as indication that audit is disabled in the kernel. | |
249 | */ | |
250 | ||
251 | r = seccomp_rule_add_exact( | |
252 | seccomp, | |
253 | SCMP_ACT_ERRNO(EAFNOSUPPORT), | |
254 | SCMP_SYS(socket), | |
255 | 2, | |
256 | SCMP_A0(SCMP_CMP_EQ, AF_NETLINK), | |
257 | SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT)); | |
96bedbe2 | 258 | if (r < 0) { |
469830d1 | 259 | log_debug_errno(r, "Failed to add audit seccomp rule, ignoring: %m"); |
469830d1 | 260 | continue; |
96bedbe2 | 261 | } |
f011b0b8 | 262 | |
469830d1 LP |
263 | r = seccomp_load(seccomp); |
264 | if (IN_SET(r, -EPERM, -EACCES)) | |
265 | return log_error_errno(r, "Failed to install seccomp audit filter: %m"); | |
266 | if (r < 0) | |
267 | log_debug_errno(r, "Failed to install filter set for architecture %s, skipping: %m", seccomp_arch_to_string(arch)); | |
f011b0b8 DH |
268 | } |
269 | ||
469830d1 | 270 | return 0; |
f011b0b8 DH |
271 | } |
272 | ||
273 | #else | |
274 | ||
/*
 * Variant used when the build has no seccomp support: there is nothing to
 * install, so all arguments are ignored and success (0) is reported.
 */
int setup_seccomp(uint64_t cap_list_retain, char **syscall_whitelist, char **syscall_blacklist) {
        /* Intentionally unused in this configuration. */
        (void) cap_list_retain;
        (void) syscall_whitelist;
        (void) syscall_blacklist;

        return 0;
}
278 | ||
279 | #endif |