]>
Commit | Line | Data |
---|---|---|
53e1b683 | 1 | /* SPDX-License-Identifier: LGPL-2.1+ */ |
f011b0b8 DH |
2 | |
3 | #include <errno.h> | |
4 | #include <linux/netlink.h> | |
5 | #include <sys/capability.h> | |
6 | #include <sys/types.h> | |
7 | ||
349cc4a5 | 8 | #if HAVE_SECCOMP |
f011b0b8 DH |
9 | #include <seccomp.h> |
10 | #endif | |
11 | ||
469830d1 | 12 | #include "alloc-util.h" |
f011b0b8 | 13 | #include "log.h" |
469830d1 | 14 | #include "nspawn-seccomp.h" |
349cc4a5 | 15 | #if HAVE_SECCOMP |
f011b0b8 DH |
16 | #include "seccomp-util.h" |
17 | #endif | |
469830d1 | 18 | #include "string-util.h" |
960e4569 | 19 | #include "strv.h" |
f011b0b8 | 20 | |
349cc4a5 | 21 | #if HAVE_SECCOMP |
f011b0b8 | 22 | |
469830d1 LP |
23 | static int seccomp_add_default_syscall_filter( |
24 | scmp_filter_ctx ctx, | |
25 | uint32_t arch, | |
960e4569 LP |
26 | uint64_t cap_list_retain, |
27 | char **syscall_whitelist, | |
28 | char **syscall_blacklist) { | |
469830d1 | 29 | |
f011b0b8 DH |
30 | static const struct { |
31 | uint64_t capability; | |
402530d9 | 32 | const char* name; |
96bedbe2 LP |
33 | } whitelist[] = { |
34 | /* Let's use set names where we can */ | |
44898c53 | 35 | { 0, "@aio" }, |
96bedbe2 | 36 | { 0, "@basic-io" }, |
44898c53 | 37 | { 0, "@chown" }, |
96bedbe2 LP |
38 | { 0, "@default" }, |
39 | { 0, "@file-system" }, | |
40 | { 0, "@io-event" }, | |
41 | { 0, "@ipc" }, | |
42 | { 0, "@mount" }, | |
43 | { 0, "@network-io" }, | |
44 | { 0, "@process" }, | |
45 | { 0, "@resources" }, | |
46 | { 0, "@setuid" }, | |
47 | { 0, "@signal" }, | |
44898c53 | 48 | { 0, "@sync" }, |
96bedbe2 LP |
49 | { 0, "@timer" }, |
50 | ||
51 | /* The following four are sets we optionally enable, in case the caps have been configured for it */ | |
52 | { CAP_SYS_TIME, "@clock" }, | |
53 | { CAP_SYS_MODULE, "@module" }, | |
54 | { CAP_SYS_RAWIO, "@raw-io" }, | |
55 | { CAP_IPC_LOCK, "@memlock" }, | |
56 | ||
57 | /* Plus a good set of additional syscalls which are not part of any of the groups above */ | |
58 | { 0, "brk" }, | |
09d3020b | 59 | { 0, "capget" }, |
96bedbe2 | 60 | { 0, "capset" }, |
96bedbe2 LP |
61 | { 0, "copy_file_range" }, |
62 | { 0, "fadvise64" }, | |
63 | { 0, "fadvise64_64" }, | |
96bedbe2 | 64 | { 0, "flock" }, |
96bedbe2 LP |
65 | { 0, "get_mempolicy" }, |
66 | { 0, "getcpu" }, | |
67 | { 0, "getpriority" }, | |
68 | { 0, "getrandom" }, | |
96bedbe2 LP |
69 | { 0, "ioctl" }, |
70 | { 0, "ioprio_get" }, | |
71 | { 0, "kcmp" }, | |
96bedbe2 LP |
72 | { 0, "madvise" }, |
73 | { 0, "mincore" }, | |
74 | { 0, "mprotect" }, | |
75 | { 0, "mremap" }, | |
96bedbe2 LP |
76 | { 0, "name_to_handle_at" }, |
77 | { 0, "oldolduname" }, | |
78 | { 0, "olduname" }, | |
79 | { 0, "personality" }, | |
96bedbe2 LP |
80 | { 0, "readahead" }, |
81 | { 0, "readdir" }, | |
82 | { 0, "remap_file_pages" }, | |
83 | { 0, "sched_get_priority_max" }, | |
84 | { 0, "sched_get_priority_min" }, | |
85 | { 0, "sched_getaffinity" }, | |
86 | { 0, "sched_getattr" }, | |
87 | { 0, "sched_getparam" }, | |
88 | { 0, "sched_getscheduler" }, | |
89 | { 0, "sched_rr_get_interval" }, | |
90 | { 0, "sched_yield" }, | |
91 | { 0, "seccomp" }, | |
92 | { 0, "sendfile" }, | |
93 | { 0, "sendfile64" }, | |
94 | { 0, "setdomainname" }, | |
95 | { 0, "setfsgid" }, | |
96 | { 0, "setfsgid32" }, | |
97 | { 0, "setfsuid" }, | |
98 | { 0, "setfsuid32" }, | |
99 | { 0, "sethostname" }, | |
100 | { 0, "setpgid" }, | |
101 | { 0, "setsid" }, | |
102 | { 0, "splice" }, | |
96bedbe2 LP |
103 | { 0, "sysinfo" }, |
104 | { 0, "tee" }, | |
96bedbe2 LP |
105 | { 0, "umask" }, |
106 | { 0, "uname" }, | |
107 | { 0, "userfaultfd" }, | |
108 | { 0, "vmsplice" }, | |
109 | ||
110 | /* The following individual syscalls are added depending on specified caps */ | |
111 | { CAP_SYS_PACCT, "acct" }, | |
112 | { CAP_SYS_PTRACE, "process_vm_readv" }, | |
113 | { CAP_SYS_PTRACE, "process_vm_writev" }, | |
114 | { CAP_SYS_PTRACE, "ptrace" }, | |
115 | { CAP_SYS_BOOT, "reboot" }, | |
116 | { CAP_SYSLOG, "syslog" }, | |
117 | { CAP_SYS_TTY_CONFIG, "vhangup" }, | |
118 | ||
119 | /* | |
120 | * The following syscalls and groups are knowingly excluded: | |
121 | * | |
122 | * @cpu-emulation | |
123 | * @keyring (NB: keyring is not namespaced!) | |
124 | * @obsolete | |
125 | * @swap | |
126 | * | |
127 | * bpf (NB: bpffs is not namespaced!) | |
128 | * fanotify_init | |
129 | * fanotify_mark | |
130 | * kexec_file_load | |
131 | * kexec_load | |
132 | * lookup_dcookie | |
133 | * nfsservctl | |
134 | * open_by_handle_at | |
135 | * perf_event_open | |
136 | * pkey_alloc | |
137 | * pkey_free | |
138 | * pkey_mprotect | |
139 | * quotactl | |
140 | */ | |
f011b0b8 | 141 | }; |
402530d9 | 142 | |
469830d1 | 143 | int r, c = 0; |
402530d9 | 144 | size_t i; |
960e4569 | 145 | char **p; |
f011b0b8 | 146 | |
96bedbe2 LP |
147 | for (i = 0; i < ELEMENTSOF(whitelist); i++) { |
148 | if (whitelist[i].capability != 0 && (cap_list_retain & (1ULL << whitelist[i].capability)) == 0) | |
f011b0b8 DH |
149 | continue; |
150 | ||
96bedbe2 | 151 | r = seccomp_add_syscall_filter_item(ctx, whitelist[i].name, SCMP_ACT_ALLOW, syscall_blacklist); |
402530d9 | 152 | if (r < 0) |
469830d1 | 153 | /* If the system call is not known on this architecture, then that's fine, let's ignore it */ |
96bedbe2 | 154 | log_debug_errno(r, "Failed to add rule for system call %s on %s, ignoring: %m", whitelist[i].name, seccomp_arch_to_string(arch)); |
402530d9 | 155 | else |
469830d1 | 156 | c++; |
f011b0b8 DH |
157 | } |
158 | ||
96bedbe2 LP |
159 | STRV_FOREACH(p, syscall_whitelist) { |
160 | r = seccomp_add_syscall_filter_item(ctx, *p, SCMP_ACT_ALLOW, syscall_blacklist); | |
960e4569 | 161 | if (r < 0) |
96bedbe2 | 162 | log_debug_errno(r, "Failed to add rule for system call %s on %s, ignoring: %m", *p, seccomp_arch_to_string(arch)); |
960e4569 LP |
163 | else |
164 | c++; | |
165 | } | |
166 | ||
469830d1 | 167 | return c; |
f011b0b8 DH |
168 | } |
169 | ||
960e4569 | 170 | int setup_seccomp(uint64_t cap_list_retain, char **syscall_whitelist, char **syscall_blacklist) { |
469830d1 | 171 | uint32_t arch; |
f011b0b8 DH |
172 | int r; |
173 | ||
1cec406d | 174 | if (!is_seccomp_available()) { |
960e4569 | 175 | log_debug("SECCOMP features not detected in the kernel, disabling SECCOMP filterering"); |
1cec406d FS |
176 | return 0; |
177 | } | |
178 | ||
469830d1 LP |
179 | SECCOMP_FOREACH_LOCAL_ARCH(arch) { |
180 | _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL; | |
469830d1 | 181 | |
96bedbe2 | 182 | log_debug("Applying whitelist on architecture: %s", seccomp_arch_to_string(arch)); |
469830d1 | 183 | |
96bedbe2 | 184 | r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ERRNO(EPERM)); |
469830d1 LP |
185 | if (r < 0) |
186 | return log_error_errno(r, "Failed to allocate seccomp object: %m"); | |
187 | ||
96bedbe2 LP |
188 | r = seccomp_add_default_syscall_filter(seccomp, arch, cap_list_retain, syscall_whitelist, syscall_blacklist); |
189 | if (r < 0) | |
190 | return r; | |
191 | ||
192 | r = seccomp_load(seccomp); | |
193 | if (IN_SET(r, -EPERM, -EACCES)) | |
194 | return log_error_errno(r, "Failed to install seccomp filter: %m"); | |
195 | if (r < 0) | |
196 | log_debug_errno(r, "Failed to install filter set for architecture %s, skipping: %m", seccomp_arch_to_string(arch)); | |
197 | } | |
198 | ||
199 | SECCOMP_FOREACH_LOCAL_ARCH(arch) { | |
200 | _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL; | |
201 | ||
202 | log_debug("Applying NETLINK_AUDIT mask on architecture: %s", seccomp_arch_to_string(arch)); | |
203 | ||
204 | r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW); | |
205 | if (r < 0) | |
206 | return log_error_errno(r, "Failed to allocate seccomp object: %m"); | |
469830d1 LP |
207 | |
208 | /* | |
209 | Audit is broken in containers, much of the userspace audit hookup will fail if running inside a | |
210 | container. We don't care and just turn off creation of audit sockets. | |
211 | ||
212 | This will make socket(AF_NETLINK, *, NETLINK_AUDIT) fail with EAFNOSUPPORT which audit userspace uses | |
213 | as indication that audit is disabled in the kernel. | |
214 | */ | |
215 | ||
216 | r = seccomp_rule_add_exact( | |
217 | seccomp, | |
218 | SCMP_ACT_ERRNO(EAFNOSUPPORT), | |
219 | SCMP_SYS(socket), | |
220 | 2, | |
221 | SCMP_A0(SCMP_CMP_EQ, AF_NETLINK), | |
222 | SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT)); | |
96bedbe2 | 223 | if (r < 0) { |
469830d1 | 224 | log_debug_errno(r, "Failed to add audit seccomp rule, ignoring: %m"); |
469830d1 | 225 | continue; |
96bedbe2 | 226 | } |
f011b0b8 | 227 | |
469830d1 LP |
228 | r = seccomp_load(seccomp); |
229 | if (IN_SET(r, -EPERM, -EACCES)) | |
230 | return log_error_errno(r, "Failed to install seccomp audit filter: %m"); | |
231 | if (r < 0) | |
232 | log_debug_errno(r, "Failed to install filter set for architecture %s, skipping: %m", seccomp_arch_to_string(arch)); | |
f011b0b8 DH |
233 | } |
234 | ||
469830d1 | 235 | return 0; |
f011b0b8 DH |
236 | } |
237 | ||
238 | #else | |
239 | ||
/*
 * Fallback used when the build has no seccomp support (HAVE_SECCOMP is false): there is no filter to
 * install, so all arguments are ignored and success is reported.
 */
int setup_seccomp(
                uint64_t cap_list_retain,
                char **syscall_whitelist,
                char **syscall_blacklist) {

        return 0;
}
243 | ||
244 | #endif |