]>
Commit | Line | Data |
---|---|---|
53e1b683 | 1 | /* SPDX-License-Identifier: LGPL-2.1+ */ |
f011b0b8 DH |
2 | |
3 | #include <errno.h> | |
4 | #include <linux/netlink.h> | |
5 | #include <sys/capability.h> | |
ca78ad1d | 6 | #include <sys/socket.h> |
f011b0b8 DH |
7 | #include <sys/types.h> |
8 | ||
349cc4a5 | 9 | #if HAVE_SECCOMP |
f011b0b8 DH |
10 | #include <seccomp.h> |
11 | #endif | |
12 | ||
469830d1 | 13 | #include "alloc-util.h" |
f011b0b8 | 14 | #include "log.h" |
469830d1 | 15 | #include "nspawn-seccomp.h" |
349cc4a5 | 16 | #if HAVE_SECCOMP |
f011b0b8 DH |
17 | #include "seccomp-util.h" |
18 | #endif | |
469830d1 | 19 | #include "string-util.h" |
960e4569 | 20 | #include "strv.h" |
f011b0b8 | 21 | |
349cc4a5 | 22 | #if HAVE_SECCOMP |
f011b0b8 | 23 | |
469830d1 LP |
24 | static int seccomp_add_default_syscall_filter( |
25 | scmp_filter_ctx ctx, | |
26 | uint32_t arch, | |
960e4569 LP |
27 | uint64_t cap_list_retain, |
28 | char **syscall_whitelist, | |
29 | char **syscall_blacklist) { | |
469830d1 | 30 | |
f011b0b8 DH |
31 | static const struct { |
32 | uint64_t capability; | |
402530d9 | 33 | const char* name; |
96bedbe2 LP |
34 | } whitelist[] = { |
35 | /* Let's use set names where we can */ | |
44898c53 | 36 | { 0, "@aio" }, |
96bedbe2 | 37 | { 0, "@basic-io" }, |
44898c53 | 38 | { 0, "@chown" }, |
96bedbe2 LP |
39 | { 0, "@default" }, |
40 | { 0, "@file-system" }, | |
41 | { 0, "@io-event" }, | |
42 | { 0, "@ipc" }, | |
43 | { 0, "@mount" }, | |
44 | { 0, "@network-io" }, | |
45 | { 0, "@process" }, | |
46 | { 0, "@resources" }, | |
47 | { 0, "@setuid" }, | |
48 | { 0, "@signal" }, | |
44898c53 | 49 | { 0, "@sync" }, |
96bedbe2 LP |
50 | { 0, "@timer" }, |
51 | ||
52 | /* The following four are sets we optionally enable, in case the caps have been configured for it */ | |
53 | { CAP_SYS_TIME, "@clock" }, | |
54 | { CAP_SYS_MODULE, "@module" }, | |
55 | { CAP_SYS_RAWIO, "@raw-io" }, | |
56 | { CAP_IPC_LOCK, "@memlock" }, | |
57 | ||
58 | /* Plus a good set of additional syscalls which are not part of any of the groups above */ | |
59 | { 0, "brk" }, | |
09d3020b | 60 | { 0, "capget" }, |
96bedbe2 | 61 | { 0, "capset" }, |
96bedbe2 LP |
62 | { 0, "copy_file_range" }, |
63 | { 0, "fadvise64" }, | |
64 | { 0, "fadvise64_64" }, | |
96bedbe2 | 65 | { 0, "flock" }, |
96bedbe2 LP |
66 | { 0, "get_mempolicy" }, |
67 | { 0, "getcpu" }, | |
68 | { 0, "getpriority" }, | |
69 | { 0, "getrandom" }, | |
96bedbe2 LP |
70 | { 0, "ioctl" }, |
71 | { 0, "ioprio_get" }, | |
72 | { 0, "kcmp" }, | |
96bedbe2 LP |
73 | { 0, "madvise" }, |
74 | { 0, "mincore" }, | |
75 | { 0, "mprotect" }, | |
76 | { 0, "mremap" }, | |
96bedbe2 LP |
77 | { 0, "name_to_handle_at" }, |
78 | { 0, "oldolduname" }, | |
79 | { 0, "olduname" }, | |
80 | { 0, "personality" }, | |
96bedbe2 LP |
81 | { 0, "readahead" }, |
82 | { 0, "readdir" }, | |
83 | { 0, "remap_file_pages" }, | |
84 | { 0, "sched_get_priority_max" }, | |
85 | { 0, "sched_get_priority_min" }, | |
86 | { 0, "sched_getaffinity" }, | |
87 | { 0, "sched_getattr" }, | |
88 | { 0, "sched_getparam" }, | |
89 | { 0, "sched_getscheduler" }, | |
90 | { 0, "sched_rr_get_interval" }, | |
91 | { 0, "sched_yield" }, | |
92 | { 0, "seccomp" }, | |
93 | { 0, "sendfile" }, | |
94 | { 0, "sendfile64" }, | |
95 | { 0, "setdomainname" }, | |
96 | { 0, "setfsgid" }, | |
97 | { 0, "setfsgid32" }, | |
98 | { 0, "setfsuid" }, | |
99 | { 0, "setfsuid32" }, | |
100 | { 0, "sethostname" }, | |
101 | { 0, "setpgid" }, | |
102 | { 0, "setsid" }, | |
103 | { 0, "splice" }, | |
96bedbe2 LP |
104 | { 0, "sysinfo" }, |
105 | { 0, "tee" }, | |
96bedbe2 LP |
106 | { 0, "umask" }, |
107 | { 0, "uname" }, | |
108 | { 0, "userfaultfd" }, | |
109 | { 0, "vmsplice" }, | |
110 | ||
111 | /* The following individual syscalls are added depending on specified caps */ | |
112 | { CAP_SYS_PACCT, "acct" }, | |
113 | { CAP_SYS_PTRACE, "process_vm_readv" }, | |
114 | { CAP_SYS_PTRACE, "process_vm_writev" }, | |
115 | { CAP_SYS_PTRACE, "ptrace" }, | |
116 | { CAP_SYS_BOOT, "reboot" }, | |
117 | { CAP_SYSLOG, "syslog" }, | |
118 | { CAP_SYS_TTY_CONFIG, "vhangup" }, | |
119 | ||
120 | /* | |
121 | * The following syscalls and groups are knowingly excluded: | |
122 | * | |
123 | * @cpu-emulation | |
124 | * @keyring (NB: keyring is not namespaced!) | |
125 | * @obsolete | |
9493b168 | 126 | * @pkey |
96bedbe2 LP |
127 | * @swap |
128 | * | |
129 | * bpf (NB: bpffs is not namespaced!) | |
130 | * fanotify_init | |
131 | * fanotify_mark | |
132 | * kexec_file_load | |
133 | * kexec_load | |
134 | * lookup_dcookie | |
135 | * nfsservctl | |
136 | * open_by_handle_at | |
137 | * perf_event_open | |
96bedbe2 LP |
138 | * quotactl |
139 | */ | |
f011b0b8 | 140 | }; |
402530d9 | 141 | |
960e4569 | 142 | char **p; |
f47bd097 | 143 | int r; |
f011b0b8 | 144 | |
f47bd097 | 145 | for (size_t i = 0; i < ELEMENTSOF(whitelist); i++) { |
96bedbe2 | 146 | if (whitelist[i].capability != 0 && (cap_list_retain & (1ULL << whitelist[i].capability)) == 0) |
f011b0b8 DH |
147 | continue; |
148 | ||
b54f36c6 | 149 | r = seccomp_add_syscall_filter_item(ctx, whitelist[i].name, SCMP_ACT_ALLOW, syscall_blacklist, false); |
402530d9 | 150 | if (r < 0) |
7e86bd73 | 151 | return log_error_errno(r, "Failed to add syscall filter item %s: %m", whitelist[i].name); |
f011b0b8 DH |
152 | } |
153 | ||
96bedbe2 | 154 | STRV_FOREACH(p, syscall_whitelist) { |
f47bd097 | 155 | r = seccomp_add_syscall_filter_item(ctx, *p, SCMP_ACT_ALLOW, syscall_blacklist, true); |
960e4569 | 156 | if (r < 0) |
7e86bd73 ZJS |
157 | log_warning_errno(r, "Failed to add rule for system call %s on %s, ignoring: %m", |
158 | *p, seccomp_arch_to_string(arch)); | |
960e4569 LP |
159 | } |
160 | ||
7e86bd73 | 161 | return 0; |
f011b0b8 DH |
162 | } |
163 | ||
960e4569 | 164 | int setup_seccomp(uint64_t cap_list_retain, char **syscall_whitelist, char **syscall_blacklist) { |
469830d1 | 165 | uint32_t arch; |
f011b0b8 DH |
166 | int r; |
167 | ||
1cec406d | 168 | if (!is_seccomp_available()) { |
960e4569 | 169 | log_debug("SECCOMP features not detected in the kernel, disabling SECCOMP filterering"); |
1cec406d FS |
170 | return 0; |
171 | } | |
172 | ||
469830d1 LP |
173 | SECCOMP_FOREACH_LOCAL_ARCH(arch) { |
174 | _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL; | |
469830d1 | 175 | |
96bedbe2 | 176 | log_debug("Applying whitelist on architecture: %s", seccomp_arch_to_string(arch)); |
469830d1 | 177 | |
96bedbe2 | 178 | r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ERRNO(EPERM)); |
469830d1 LP |
179 | if (r < 0) |
180 | return log_error_errno(r, "Failed to allocate seccomp object: %m"); | |
181 | ||
96bedbe2 LP |
182 | r = seccomp_add_default_syscall_filter(seccomp, arch, cap_list_retain, syscall_whitelist, syscall_blacklist); |
183 | if (r < 0) | |
184 | return r; | |
185 | ||
186 | r = seccomp_load(seccomp); | |
7bc5e0b1 | 187 | if (ERRNO_IS_SECCOMP_FATAL(r)) |
96bedbe2 LP |
188 | return log_error_errno(r, "Failed to install seccomp filter: %m"); |
189 | if (r < 0) | |
190 | log_debug_errno(r, "Failed to install filter set for architecture %s, skipping: %m", seccomp_arch_to_string(arch)); | |
191 | } | |
192 | ||
193 | SECCOMP_FOREACH_LOCAL_ARCH(arch) { | |
194 | _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL; | |
195 | ||
196 | log_debug("Applying NETLINK_AUDIT mask on architecture: %s", seccomp_arch_to_string(arch)); | |
197 | ||
198 | r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW); | |
199 | if (r < 0) | |
200 | return log_error_errno(r, "Failed to allocate seccomp object: %m"); | |
469830d1 LP |
201 | |
202 | /* | |
203 | Audit is broken in containers, much of the userspace audit hookup will fail if running inside a | |
204 | container. We don't care and just turn off creation of audit sockets. | |
205 | ||
206 | This will make socket(AF_NETLINK, *, NETLINK_AUDIT) fail with EAFNOSUPPORT which audit userspace uses | |
207 | as indication that audit is disabled in the kernel. | |
208 | */ | |
209 | ||
210 | r = seccomp_rule_add_exact( | |
211 | seccomp, | |
212 | SCMP_ACT_ERRNO(EAFNOSUPPORT), | |
213 | SCMP_SYS(socket), | |
214 | 2, | |
215 | SCMP_A0(SCMP_CMP_EQ, AF_NETLINK), | |
216 | SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT)); | |
96bedbe2 | 217 | if (r < 0) { |
469830d1 | 218 | log_debug_errno(r, "Failed to add audit seccomp rule, ignoring: %m"); |
469830d1 | 219 | continue; |
96bedbe2 | 220 | } |
f011b0b8 | 221 | |
469830d1 | 222 | r = seccomp_load(seccomp); |
7bc5e0b1 | 223 | if (ERRNO_IS_SECCOMP_FATAL(r)) |
469830d1 LP |
224 | return log_error_errno(r, "Failed to install seccomp audit filter: %m"); |
225 | if (r < 0) | |
226 | log_debug_errno(r, "Failed to install filter set for architecture %s, skipping: %m", seccomp_arch_to_string(arch)); | |
f011b0b8 DH |
227 | } |
228 | ||
469830d1 | 229 | return 0; |
f011b0b8 DH |
230 | } |
231 | ||
232 | #else | |
233 | ||
/* Variant used when seccomp support is compiled out (HAVE_SECCOMP is false): no filter is installed,
 * all arguments are ignored, and success is always reported. */
int setup_seccomp(uint64_t cap_list_retain, char **syscall_whitelist, char **syscall_blacklist) {
        /* Nothing to set up without seccomp. */
        return 0;
}
237 | ||
238 | #endif |