]>
Commit | Line | Data |
---|---|---|
53e1b683 | 1 | /* SPDX-License-Identifier: LGPL-2.1+ */ |
f011b0b8 DH |
2 | /*** |
3 | This file is part of systemd. | |
4 | ||
5 | Copyright 2016 Lennart Poettering | |
6 | ||
7 | systemd is free software; you can redistribute it and/or modify it | |
8 | under the terms of the GNU Lesser General Public License as published by | |
9 | the Free Software Foundation; either version 2.1 of the License, or | |
10 | (at your option) any later version. | |
11 | ||
12 | systemd is distributed in the hope that it will be useful, but | |
13 | WITHOUT ANY WARRANTY; without even the implied warranty of | |
14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
15 | Lesser General Public License for more details. | |
16 | ||
17 | You should have received a copy of the GNU Lesser General Public License | |
18 | along with systemd; If not, see <http://www.gnu.org/licenses/>. | |
19 | ***/ | |
20 | ||
21 | #include <errno.h> | |
22 | #include <linux/netlink.h> | |
23 | #include <sys/capability.h> | |
24 | #include <sys/types.h> | |
25 | ||
349cc4a5 | 26 | #if HAVE_SECCOMP |
f011b0b8 DH |
27 | #include <seccomp.h> |
28 | #endif | |
29 | ||
469830d1 | 30 | #include "alloc-util.h" |
f011b0b8 | 31 | #include "log.h" |
469830d1 | 32 | #include "nspawn-seccomp.h" |
349cc4a5 | 33 | #if HAVE_SECCOMP |
f011b0b8 DH |
34 | #include "seccomp-util.h" |
35 | #endif | |
469830d1 | 36 | #include "string-util.h" |
960e4569 | 37 | #include "strv.h" |
f011b0b8 | 38 | |
349cc4a5 | 39 | #if HAVE_SECCOMP |
f011b0b8 | 40 | |
469830d1 LP |
41 | static int seccomp_add_default_syscall_filter( |
42 | scmp_filter_ctx ctx, | |
43 | uint32_t arch, | |
960e4569 LP |
44 | uint64_t cap_list_retain, |
45 | char **syscall_whitelist, | |
46 | char **syscall_blacklist) { | |
469830d1 | 47 | |
f011b0b8 DH |
48 | static const struct { |
49 | uint64_t capability; | |
402530d9 | 50 | const char* name; |
96bedbe2 LP |
51 | } whitelist[] = { |
52 | /* Let's use set names where we can */ | |
44898c53 | 53 | { 0, "@aio" }, |
96bedbe2 | 54 | { 0, "@basic-io" }, |
44898c53 | 55 | { 0, "@chown" }, |
96bedbe2 LP |
56 | { 0, "@default" }, |
57 | { 0, "@file-system" }, | |
58 | { 0, "@io-event" }, | |
59 | { 0, "@ipc" }, | |
60 | { 0, "@mount" }, | |
61 | { 0, "@network-io" }, | |
62 | { 0, "@process" }, | |
63 | { 0, "@resources" }, | |
64 | { 0, "@setuid" }, | |
65 | { 0, "@signal" }, | |
44898c53 | 66 | { 0, "@sync" }, |
96bedbe2 LP |
67 | { 0, "@timer" }, |
68 | ||
69 | /* The following four are sets we optionally enable, in case the caps have been configured for it */ | |
70 | { CAP_SYS_TIME, "@clock" }, | |
71 | { CAP_SYS_MODULE, "@module" }, | |
72 | { CAP_SYS_RAWIO, "@raw-io" }, | |
73 | { CAP_IPC_LOCK, "@memlock" }, | |
74 | ||
75 | /* Plus a good set of additional syscalls which are not part of any of the groups above */ | |
76 | { 0, "brk" }, | |
09d3020b | 77 | { 0, "capget" }, |
96bedbe2 | 78 | { 0, "capset" }, |
96bedbe2 LP |
79 | { 0, "copy_file_range" }, |
80 | { 0, "fadvise64" }, | |
81 | { 0, "fadvise64_64" }, | |
96bedbe2 | 82 | { 0, "flock" }, |
96bedbe2 LP |
83 | { 0, "get_mempolicy" }, |
84 | { 0, "getcpu" }, | |
85 | { 0, "getpriority" }, | |
86 | { 0, "getrandom" }, | |
96bedbe2 LP |
87 | { 0, "ioctl" }, |
88 | { 0, "ioprio_get" }, | |
89 | { 0, "kcmp" }, | |
96bedbe2 LP |
90 | { 0, "madvise" }, |
91 | { 0, "mincore" }, | |
92 | { 0, "mprotect" }, | |
93 | { 0, "mremap" }, | |
96bedbe2 LP |
94 | { 0, "name_to_handle_at" }, |
95 | { 0, "oldolduname" }, | |
96 | { 0, "olduname" }, | |
97 | { 0, "personality" }, | |
96bedbe2 LP |
98 | { 0, "readahead" }, |
99 | { 0, "readdir" }, | |
100 | { 0, "remap_file_pages" }, | |
101 | { 0, "sched_get_priority_max" }, | |
102 | { 0, "sched_get_priority_min" }, | |
103 | { 0, "sched_getaffinity" }, | |
104 | { 0, "sched_getattr" }, | |
105 | { 0, "sched_getparam" }, | |
106 | { 0, "sched_getscheduler" }, | |
107 | { 0, "sched_rr_get_interval" }, | |
108 | { 0, "sched_yield" }, | |
109 | { 0, "seccomp" }, | |
110 | { 0, "sendfile" }, | |
111 | { 0, "sendfile64" }, | |
112 | { 0, "setdomainname" }, | |
113 | { 0, "setfsgid" }, | |
114 | { 0, "setfsgid32" }, | |
115 | { 0, "setfsuid" }, | |
116 | { 0, "setfsuid32" }, | |
117 | { 0, "sethostname" }, | |
118 | { 0, "setpgid" }, | |
119 | { 0, "setsid" }, | |
120 | { 0, "splice" }, | |
96bedbe2 LP |
121 | { 0, "sysinfo" }, |
122 | { 0, "tee" }, | |
96bedbe2 LP |
123 | { 0, "umask" }, |
124 | { 0, "uname" }, | |
125 | { 0, "userfaultfd" }, | |
126 | { 0, "vmsplice" }, | |
127 | ||
128 | /* The following individual syscalls are added depending on specified caps */ | |
129 | { CAP_SYS_PACCT, "acct" }, | |
130 | { CAP_SYS_PTRACE, "process_vm_readv" }, | |
131 | { CAP_SYS_PTRACE, "process_vm_writev" }, | |
132 | { CAP_SYS_PTRACE, "ptrace" }, | |
133 | { CAP_SYS_BOOT, "reboot" }, | |
134 | { CAP_SYSLOG, "syslog" }, | |
135 | { CAP_SYS_TTY_CONFIG, "vhangup" }, | |
136 | ||
137 | /* | |
138 | * The following syscalls and groups are knowingly excluded: | |
139 | * | |
140 | * @cpu-emulation | |
141 | * @keyring (NB: keyring is not namespaced!) | |
142 | * @obsolete | |
143 | * @swap | |
144 | * | |
145 | * bpf (NB: bpffs is not namespaced!) | |
146 | * fanotify_init | |
147 | * fanotify_mark | |
148 | * kexec_file_load | |
149 | * kexec_load | |
150 | * lookup_dcookie | |
151 | * nfsservctl | |
152 | * open_by_handle_at | |
153 | * perf_event_open | |
154 | * pkey_alloc | |
155 | * pkey_free | |
156 | * pkey_mprotect | |
157 | * quotactl | |
158 | */ | |
f011b0b8 | 159 | }; |
402530d9 | 160 | |
469830d1 | 161 | int r, c = 0; |
402530d9 | 162 | size_t i; |
960e4569 | 163 | char **p; |
f011b0b8 | 164 | |
96bedbe2 LP |
165 | for (i = 0; i < ELEMENTSOF(whitelist); i++) { |
166 | if (whitelist[i].capability != 0 && (cap_list_retain & (1ULL << whitelist[i].capability)) == 0) | |
f011b0b8 DH |
167 | continue; |
168 | ||
96bedbe2 | 169 | r = seccomp_add_syscall_filter_item(ctx, whitelist[i].name, SCMP_ACT_ALLOW, syscall_blacklist); |
402530d9 | 170 | if (r < 0) |
469830d1 | 171 | /* If the system call is not known on this architecture, then that's fine, let's ignore it */ |
96bedbe2 | 172 | log_debug_errno(r, "Failed to add rule for system call %s on %s, ignoring: %m", whitelist[i].name, seccomp_arch_to_string(arch)); |
402530d9 | 173 | else |
469830d1 | 174 | c++; |
f011b0b8 DH |
175 | } |
176 | ||
96bedbe2 LP |
177 | STRV_FOREACH(p, syscall_whitelist) { |
178 | r = seccomp_add_syscall_filter_item(ctx, *p, SCMP_ACT_ALLOW, syscall_blacklist); | |
960e4569 | 179 | if (r < 0) |
96bedbe2 | 180 | log_debug_errno(r, "Failed to add rule for system call %s on %s, ignoring: %m", *p, seccomp_arch_to_string(arch)); |
960e4569 LP |
181 | else |
182 | c++; | |
183 | } | |
184 | ||
469830d1 | 185 | return c; |
f011b0b8 DH |
186 | } |
187 | ||
960e4569 | 188 | int setup_seccomp(uint64_t cap_list_retain, char **syscall_whitelist, char **syscall_blacklist) { |
469830d1 | 189 | uint32_t arch; |
f011b0b8 DH |
190 | int r; |
191 | ||
1cec406d | 192 | if (!is_seccomp_available()) { |
960e4569 | 193 | log_debug("SECCOMP features not detected in the kernel, disabling SECCOMP filterering"); |
1cec406d FS |
194 | return 0; |
195 | } | |
196 | ||
469830d1 LP |
197 | SECCOMP_FOREACH_LOCAL_ARCH(arch) { |
198 | _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL; | |
469830d1 | 199 | |
96bedbe2 | 200 | log_debug("Applying whitelist on architecture: %s", seccomp_arch_to_string(arch)); |
469830d1 | 201 | |
96bedbe2 | 202 | r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ERRNO(EPERM)); |
469830d1 LP |
203 | if (r < 0) |
204 | return log_error_errno(r, "Failed to allocate seccomp object: %m"); | |
205 | ||
96bedbe2 LP |
206 | r = seccomp_add_default_syscall_filter(seccomp, arch, cap_list_retain, syscall_whitelist, syscall_blacklist); |
207 | if (r < 0) | |
208 | return r; | |
209 | ||
210 | r = seccomp_load(seccomp); | |
211 | if (IN_SET(r, -EPERM, -EACCES)) | |
212 | return log_error_errno(r, "Failed to install seccomp filter: %m"); | |
213 | if (r < 0) | |
214 | log_debug_errno(r, "Failed to install filter set for architecture %s, skipping: %m", seccomp_arch_to_string(arch)); | |
215 | } | |
216 | ||
217 | SECCOMP_FOREACH_LOCAL_ARCH(arch) { | |
218 | _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL; | |
219 | ||
220 | log_debug("Applying NETLINK_AUDIT mask on architecture: %s", seccomp_arch_to_string(arch)); | |
221 | ||
222 | r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW); | |
223 | if (r < 0) | |
224 | return log_error_errno(r, "Failed to allocate seccomp object: %m"); | |
469830d1 LP |
225 | |
226 | /* | |
227 | Audit is broken in containers, much of the userspace audit hookup will fail if running inside a | |
228 | container. We don't care and just turn off creation of audit sockets. | |
229 | ||
230 | This will make socket(AF_NETLINK, *, NETLINK_AUDIT) fail with EAFNOSUPPORT which audit userspace uses | |
231 | as indication that audit is disabled in the kernel. | |
232 | */ | |
233 | ||
234 | r = seccomp_rule_add_exact( | |
235 | seccomp, | |
236 | SCMP_ACT_ERRNO(EAFNOSUPPORT), | |
237 | SCMP_SYS(socket), | |
238 | 2, | |
239 | SCMP_A0(SCMP_CMP_EQ, AF_NETLINK), | |
240 | SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT)); | |
96bedbe2 | 241 | if (r < 0) { |
469830d1 | 242 | log_debug_errno(r, "Failed to add audit seccomp rule, ignoring: %m"); |
469830d1 | 243 | continue; |
96bedbe2 | 244 | } |
f011b0b8 | 245 | |
469830d1 LP |
246 | r = seccomp_load(seccomp); |
247 | if (IN_SET(r, -EPERM, -EACCES)) | |
248 | return log_error_errno(r, "Failed to install seccomp audit filter: %m"); | |
249 | if (r < 0) | |
250 | log_debug_errno(r, "Failed to install filter set for architecture %s, skipping: %m", seccomp_arch_to_string(arch)); | |
f011b0b8 DH |
251 | } |
252 | ||
469830d1 | 253 | return 0; |
f011b0b8 DH |
254 | } |
255 | ||
256 | #else | |
257 | ||
/* Variant compiled when HAVE_SECCOMP is off: no filter is installed and success is reported. */
int setup_seccomp(uint64_t cap_list_retain, char **syscall_whitelist, char **syscall_blacklist) {
        /* Nothing to configure in this build, the arguments are intentionally unused */
        (void) cap_list_retain;
        (void) syscall_whitelist;
        (void) syscall_blacklist;

        return 0;
}
261 | ||
262 | #endif |