]>
Commit | Line | Data |
---|---|---|
e9519f4f KC |
1 | .\" Copyright (C) 2014 Kees Cook <keescook@chromium.org> |
2 | .\" and Copyright (C) 2012 Will Drewry <wad@chromium.org> | |
1445a0ff MK |
3 | .\" and Copyright (C) 2008, 2014,2017 Michael Kerrisk <mtk.manpages@gmail.com> |
4 | .\" and Copyright (C) 2017 Tyler Hicks <tyhicks@canonical.com> | |
e9519f4f KC |
5 | .\" |
6 | .\" %%%LICENSE_START(VERBATIM) | |
7 | .\" Permission is granted to make and distribute verbatim copies of this | |
8 | .\" manual provided the copyright notice and this permission notice are | |
9 | .\" preserved on all copies. | |
10 | .\" | |
11 | .\" Permission is granted to copy and distribute modified versions of this | |
12 | .\" manual under the conditions for verbatim copying, provided that the | |
13 | .\" entire resulting derived work is distributed under the terms of a | |
14 | .\" permission notice identical to this one. | |
15 | .\" | |
16 | .\" Since the Linux kernel and libraries are constantly changing, this | |
17 | .\" manual page may be incorrect or out-of-date. The author(s) assume no | |
18 | .\" responsibility for errors or omissions, or for damages resulting from | |
19 | .\" the use of the information contained herein. The author(s) may not | |
20 | .\" have taken the same level of care in the production of this manual, | |
21 | .\" which is licensed free of charge, as they might when working | |
22 | .\" professionally. | |
23 | .\" | |
24 | .\" Formatted or processed versions of this manual, if unaccompanied by | |
25 | .\" the source, must acknowledge the copyright and authors of this work. | |
26 | .\" %%%LICENSE_END | |
27 | .\" | |
a5409de9 | 28 | .TH SECCOMP 2 2019-11-19 "Linux" "Linux Programmer's Manual" |
e9519f4f | 29 | .SH NAME |
50b4bc23 | 30 | seccomp \- operate on Secure Computing state of the process |
e9519f4f KC |
31 | .SH SYNOPSIS |
32 | .nf | |
33 | .B #include <linux/seccomp.h> | |
34 | .B #include <linux/filter.h> | |
35 | .B #include <linux/audit.h> | |
36 | .B #include <linux/signal.h> | |
37 | .B #include <sys/ptrace.h> | |
cbffd6ae MK |
38 | .\" Kees Cook noted: Anything that uses SECCOMP_RET_TRACE returns will |
39 | .\" need <sys/ptrace.h> | |
dbfe9c70 | 40 | .PP |
50b4bc23 MK |
41 | .BI "int seccomp(unsigned int " operation ", unsigned int " flags \ |
42 | ", void *" args ); | |
e9519f4f KC |
43 | .fi |
44 | .SH DESCRIPTION | |
45 | The | |
46 | .BR seccomp () | |
47 | system call operates on the Secure Computing (seccomp) state of the | |
50b4bc23 | 48 | calling process. |
efeece04 | 49 | .PP |
e9519f4f KC |
50 | Currently, Linux supports the following |
51 | .IR operation | |
52 | values: | |
53 | .TP | |
54 | .BR SECCOMP_SET_MODE_STRICT | |
cbffd6ae | 55 | The only system calls that the calling thread is permitted to make are |
e9519f4f KC |
56 | .BR read (2), |
57 | .BR write (2), | |
85fbef74 MK |
58 | .BR _exit (2) |
59 | (but not | |
60 | .BR exit_group (2)), | |
e9519f4f KC |
61 | and |
62 | .BR sigreturn (2). | |
63 | Other system calls result in the delivery of a | |
64 | .BR SIGKILL | |
cbffd6ae | 65 | signal. |
87aa1b2c | 66 | Strict secure computing mode is useful for number-crunching |
e9519f4f KC |
67 | applications that may need to execute untrusted byte code, perhaps |
68 | obtained by reading from a pipe or socket. | |
efeece04 | 69 | .IP |
bec56ee2 JH |
70 | Note that although the calling thread can no longer call |
71 | .BR sigprocmask (2), | |
72 | it can use | |
73 | .BR sigreturn (2) | |
74 | to block all signals apart from | |
75 | .BR SIGKILL | |
76 | and | |
77 | .BR SIGSTOP . | |
65be1b46 | 78 | This means that |
bec56ee2 | 79 | .BR alarm (2) |
65be1b46 MK |
80 | (for example) is not sufficient for restricting the process's execution time. |
81 | Instead, to reliably terminate the process, | |
82 | .BR SIGKILL | |
83 | must be used. | |
84 | This can be done by using | |
bec56ee2 JH |
85 | .BR timer_create (2) |
86 | with | |
87 | .BR SIGEV_SIGNAL | |
88 | and | |
65be1b46 | 89 | .IR sigev_signo |
bec56ee2 | 90 | set to |
65be1b46 MK |
91 | .BR SIGKILL , |
92 | or by using | |
bec56ee2 JH |
93 | .BR setrlimit (2) |
94 | to set the hard limit for | |
95 | .BR RLIMIT_CPU . | |
efeece04 | 96 | .IP |
e9519f4f KC |
97 | This operation is available only if the kernel is configured with |
98 | .BR CONFIG_SECCOMP | |
99 | enabled. | |
efeece04 | 100 | .IP |
e9519f4f KC |
101 | The value of |
102 | .IR flags | |
103 | must be 0, and | |
104 | .IR args | |
105 | must be NULL. | |
efeece04 | 106 | .IP |
50b4bc23 | 107 | This operation is functionally identical to the call: |
efeece04 | 108 | .IP |
f36293a2 MK |
109 | .in +4n |
110 | .EX | |
111 | prctl(PR_SET_SECCOMP, SECCOMP_MODE_STRICT); | |
112 | .EE | |
113 | .in | |
e9519f4f KC |
114 | .TP |
115 | .BR SECCOMP_SET_MODE_FILTER | |
116 | The system calls allowed are defined by a pointer to a Berkeley Packet | |
117 | Filter (BPF) passed via | |
118 | .IR args . | |
cbffd6ae | 119 | This argument is a pointer to a |
e9519f4f KC |
120 | .IR "struct\ sock_fprog" ; |
121 | it can be designed to filter arbitrary system calls and system call | |
50b4bc23 | 122 | arguments. |
87aa1b2c MK |
123 | If the filter is invalid, |
124 | .BR seccomp () | |
125 | fails, returning | |
cbffd6ae | 126 | .BR EINVAL |
e9519f4f KC |
127 | in |
128 | .IR errno . | |
efeece04 | 129 | .IP |
e9519f4f | 130 | If |
2d5fb4bb | 131 | .BR fork (2) |
e9519f4f | 132 | or |
2d5fb4bb MK |
133 | .BR clone (2) |
134 | is allowed by the filter, any child processes will be constrained to | |
cbffd6ae | 135 | the same system call filters as the parent. |
2d5fb4bb MK |
136 | If |
137 | .BR execve (2) | |
cbffd6ae MK |
138 | is allowed, |
139 | the existing filters will be preserved across a call to | |
2d5fb4bb | 140 | .BR execve (2). |
efeece04 | 141 | .IP |
955efdbc MK |
142 | In order to use the |
143 | .BR SECCOMP_SET_MODE_FILTER | |
400388b5 | 144 | operation, either the calling thread must have the |
e9519f4f | 145 | .BR CAP_SYS_ADMIN |
be8f1272 | 146 | capability in its user namespace, or the thread must already have the |
cbffd6ae MK |
147 | .I no_new_privs |
148 | bit set. | |
149 | If that bit was not already set by an ancestor of this thread, | |
150 | the thread must make the following call: | |
efeece04 | 151 | .IP |
f36293a2 MK |
152 | .in +4n |
153 | .EX | |
154 | prctl(PR_SET_NO_NEW_PRIVS, 1); | |
155 | .EE | |
156 | .in | |
efeece04 | 157 | .IP |
955efdbc MK |
158 | Otherwise, the |
159 | .BR SECCOMP_SET_MODE_FILTER | |
26cd31fd | 160 | operation fails and returns |
e9519f4f KC |
161 | .BR EACCES |
162 | in | |
163 | .IR errno . | |
b8ffefd7 MK |
164 | This requirement ensures that an unprivileged process cannot apply |
165 | a malicious filter and then invoke a set-user-ID or | |
166 | other privileged program using | |
167 | .BR execve (2), | |
cbffd6ae MK |
168 | thus potentially compromising that program. |
169 | (Such a malicious filter might, for example, cause an attempt to use | |
170 | .BR setuid (2) | |
58440555 | 171 | to set the caller's user IDs to nonzero values to instead |
cbffd6ae MK |
172 | return 0 without actually making the system call. |
173 | Thus, the program might be tricked into retaining superuser privileges | |
174 | in circumstances where it is possible to influence it to do | |
175 | dangerous things because it did not actually drop privileges.) | |
efeece04 | 176 | .IP |
7d596c7b | 177 | If |
e9519f4f KC |
178 | .BR prctl (2) |
179 | or | |
bf7bc8b8 | 180 | .BR seccomp () |
7d596c7b | 181 | is allowed by the attached filter, further filters may be added. |
e01ca3aa | 182 | This will increase evaluation time, but allows for further reduction of |
cbffd6ae | 183 | the attack surface during execution of a thread. |
efeece04 | 184 | .IP |
50b4bc23 MK |
185 | The |
186 | .BR SECCOMP_SET_MODE_FILTER | |
187 | operation is available only if the kernel is configured with | |
e9519f4f KC |
188 | .BR CONFIG_SECCOMP_FILTER |
189 | enabled. | |
efeece04 | 190 | .IP |
e9519f4f KC |
191 | When |
192 | .IR flags | |
50b4bc23 | 193 | is 0, this operation is functionally identical to the call: |
efeece04 | 194 | .IP |
f36293a2 MK |
195 | .in +4n |
196 | .EX | |
197 | prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, args); | |
198 | .EE | |
199 | .in | |
efeece04 | 200 | .IP |
e9519f4f KC |
201 | The recognized |
202 | .IR flags | |
203 | are: | |
204 | .RS | |
205 | .TP | |
206 | .BR SECCOMP_FILTER_FLAG_TSYNC | |
87aa1b2c | 207 | When adding a new filter, synchronize all other threads of the calling |
50b4bc23 | 208 | process to the same seccomp filter tree. |
b8ffefd7 MK |
209 | A "filter tree" is the ordered list of filters attached to a thread. |
210 | (Attaching identical filters in separate | |
211 | .BR seccomp () | |
212 | calls results in different filters from this perspective.) | |
efeece04 | 213 | .IP |
b8ffefd7 | 214 | If any thread cannot synchronize to the same filter tree, |
50b4bc23 MK |
215 | the call will not attach the new seccomp filter, |
216 | and will fail, returning the first thread ID found that cannot synchronize. | |
cbffd6ae | 217 | Synchronization will fail if another thread in the same process is in |
e9519f4f | 218 | .BR SECCOMP_MODE_STRICT |
50b4bc23 MK |
219 | or if it has attached new seccomp filters to itself, |
220 | diverging from the calling thread's filter tree. | |
17c56ad0 TH |
221 | .TP |
222 | .BR SECCOMP_FILTER_FLAG_LOG " (since Linux 4.14)" | |
223 | .\" commit e66a39977985b1e69e17c4042cb290768eca9b02 | |
224 | All filter return actions except | |
225 | .BR SECCOMP_RET_ALLOW | |
1e94218c MK |
226 | should be logged. |
227 | An administrator may override this filter flag by preventing specific | |
228 | actions from being logged via the | |
17c56ad0 TH |
229 | .IR /proc/sys/kernel/seccomp/actions_logged |
230 | file. | |
906e981b MK |
231 | .TP |
232 | .BR SECCOMP_FILTER_FLAG_SPEC_ALLOW " (since Linux 4.17)" | |
233 | .\" commit 00a02d0c502a06d15e07b857f8ff921e3e402675 | |
234 | Disable Speculative Store Bypass mitigation. | |
e9519f4f | 235 | .RE |
6d1728da TH |
236 | .TP |
237 | .BR SECCOMP_GET_ACTION_AVAIL " (since Linux 4.14)" | |
238 | .\" commit d612b1fd8010d0d67b5287fe146b8b55bcbb8655 | |
1e94218c MK |
239 | Test to see if an action is supported by the kernel. |
240 | This operation is helpful to confirm that the kernel knows | |
241 | of a more recently added filter return action | |
242 | since the kernel treats all unknown actions as | |
0771269c | 243 | .BR SECCOMP_RET_KILL_PROCESS . |
6d1728da TH |
244 | .IP |
245 | The value of | |
246 | .IR flags | |
247 | must be 0, and | |
248 | .IR args | |
249 | must be a pointer to an unsigned 32-bit filter return action. | |
93b9a9ee | 250 | .SS Filters |
e9519f4f KC |
251 | When adding filters via |
252 | .BR SECCOMP_SET_MODE_FILTER , | |
253 | .IR args | |
254 | points to a filter program: | |
efeece04 | 255 | .PP |
e9519f4f | 256 | .in +4n |
b8302363 | 257 | .EX |
e9519f4f | 258 | struct sock_fprog { |
1da13d15 MK |
259 | unsigned short len; /* Number of BPF instructions */ |
260 | struct sock_filter *filter; /* Pointer to array of | |
261 | BPF instructions */ | |
e9519f4f | 262 | }; |
b8302363 | 263 | .EE |
e9519f4f | 264 | .in |
efeece04 | 265 | .PP |
e9519f4f | 266 | Each program must contain one or more BPF instructions: |
efeece04 | 267 | .PP |
e9519f4f | 268 | .in +4n |
b8302363 | 269 | .EX |
1da13d15 MK |
270 | struct sock_filter { /* Filter block */ |
271 | __u16 code; /* Actual filter code */ | |
272 | __u8 jt; /* Jump true */ | |
273 | __u8 jf; /* Jump false */ | |
274 | __u32 k; /* Generic multiuse field */ | |
e9519f4f | 275 | }; |
b8302363 | 276 | .EE |
e9519f4f | 277 | .in |
efeece04 | 278 | .PP |
0c2e01b7 MK |
279 | When executing the instructions, the BPF program operates on the |
280 | system call information made available (i.e., use the | |
281 | .BR BPF_ABS | |
1a7e5113 MK |
282 | addressing mode) as a (read-only) |
283 | .\" Quoting Kees Cook: | |
284 | .\" If BPF even allows changing the data, it's not copied back to | |
285 | .\" the syscall when it runs. Anything wanting to do things like | |
3775c1ca | 286 | .\" that would need to use ptrace to catch the call and directly |
1a7e5113 MK |
287 | .\" modify the registers before continuing with the call. |
288 | buffer of the following form: | |
efeece04 | 289 | .PP |
e9519f4f | 290 | .in +4n |
b8302363 | 291 | .EX |
e9519f4f | 292 | struct seccomp_data { |
5b95c4ab | 293 | int nr; /* System call number */ |
a0a0c98b MK |
294 | __u32 arch; /* AUDIT_ARCH_* value |
295 | (see <linux/audit.h>) */ | |
e9519f4f | 296 | __u64 instruction_pointer; /* CPU instruction pointer */ |
1da13d15 | 297 | __u64 args[6]; /* Up to 6 system call arguments */ |
e9519f4f | 298 | }; |
b8302363 | 299 | .EE |
e9519f4f | 300 | .in |
efeece04 | 301 | .PP |
944a31e2 | 302 | Because numbering of system calls varies between architectures and |
65cfc712 | 303 | some architectures (e.g., x86-64) allow user-space code to use |
036b0619 MK |
304 | the calling conventions of multiple architectures |
305 | (and the convention being used may vary over the life of a process that uses | |
306 | .BR execve (2) | |
307 | to execute binaries that employ the different conventions), | |
308 | it is usually necessary to verify the value of the | |
b44088b4 JH |
309 | .IR arch |
310 | field. | |
efeece04 | 311 | .PP |
462ce23d | 312 | It is strongly recommended to use an allow-list approach whenever |
b44088b4 | 313 | possible because such an approach is more robust and simpler. |
462ce23d | 314 | A deny-list will have to be updated whenever a potentially |
37daa840 | 315 | dangerous system call is added (or a dangerous flag or option if those |
462ce23d | 316 | are deny-listed), and it is often possible to alter the |
b44088b4 | 317 | representation of a value without altering its meaning, leading to |
462ce23d | 318 | a deny-list bypass. |
5069617c MK |
319 | See also |
320 | .IR Caveats | |
321 | below. | |
efeece04 | 322 | .PP |
b44088b4 JH |
323 | The |
324 | .IR arch | |
37daa840 | 325 | field is not unique for all calling conventions. |
65cfc712 | 326 | The x86-64 ABI and the x32 ABI both use |
b44088b4 JH |
327 | .BR AUDIT_ARCH_X86_64 |
328 | as | |
329 | .IR arch , | |
37daa840 MK |
330 | and they run on the same processors. |
331 | Instead, the mask | |
b44088b4 JH |
332 | .BR __X32_SYSCALL_BIT |
333 | is used on the system call number to tell the two ABIs apart. | |
8cc4d071 | 334 | .\" As noted by Dave Drysdale in a note at the end of |
3c5ab770 MK |
335 | .\" https://lwn.net/Articles/604515/ |
336 | .\" One additional detail to point out for the x32 ABI case: | |
337 | .\" the syscall number gets a high bit set (__X32_SYSCALL_BIT), | |
338 | .\" to mark it as an x32 call. | |
339 | .\" | |
340 | .\" If x32 support is included in the kernel, then __SYSCALL_MASK | |
341 | .\" will have a value that is not all-ones, and this will trigger | |
342 | .\" an extra instruction in system_call to mask off the extra bit, | |
8cc4d071 | 343 | .\" so that the syscall table indexing still works. |
efeece04 | 344 | .PP |
b44088b4 | 345 | This means that in order to create a seccomp-based |
462ce23d | 346 | deny-list for system calls performed through the x86-64 ABI, |
b44088b4 JH |
347 | it is necessary to not only check that |
348 | .IR arch | |
349 | equals | |
350 | .BR AUDIT_ARCH_X86_64 , | |
7a79bb30 | 351 | but also to explicitly reject all system calls that contain |
b44088b4 JH |
352 | .BR __X32_SYSCALL_BIT |
353 | in | |
354 | .IR nr . | |
efeece04 | 355 | .PP |
712551ea MK |
356 | The |
357 | .I instruction_pointer | |
358 | field provides the address of the machine-language instruction that | |
359 | performed the system call. | |
360 | This might be useful in conjunction with the use of | |
361 | .I /proc/[pid]/maps | |
362 | to perform checks based on which region (mapping) of the program | |
363 | made the system call. | |
364 | (Probably, it is wise to lock down the | |
365 | .BR mmap (2) | |
366 | and | |
367 | .BR mprotect (2) | |
368 | system calls to prevent the program from subverting such checks.) | |
efeece04 | 369 | .PP |
b44088b4 JH |
370 | When checking values from |
371 | .IR args | |
462ce23d | 372 | against a deny-list, keep in mind that arguments are often |
37daa840 MK |
373 | silently truncated before being processed, but after the seccomp check. |
374 | For example, this happens if the i386 ABI is used on an | |
3c5ab770 | 375 | x86-64 kernel: although the kernel will normally not look beyond |
b44088b4 | 376 | the 32 lowest bits of the arguments, the values of the full |
37daa840 | 377 | 64-bit registers will be present in the seccomp data. |
65cfc712 | 378 | A less surprising example is that if the x86-64 ABI is used to perform |
37daa840 MK |
379 | a system call that takes an argument of type |
380 | .IR int , | |
381 | the more-significant half of the argument register is ignored by | |
382 | the system call, but visible in the seccomp data. | |
efeece04 | 383 | .PP |
cbffd6ae MK |
384 | A seccomp filter returns a 32-bit value consisting of two parts: |
385 | the most significant 16 bits | |
386 | (corresponding to the mask defined by the constant | |
0c438780 | 387 | .BR SECCOMP_RET_ACTION_FULL ) |
cbffd6ae MK |
388 | contain one of the "action" values listed below; |
389 | the least significant 16 bits (defined by the constant | |
390 | .BR SECCOMP_RET_DATA ) | |
391 | are "data" to be associated with this return value. | |
efeece04 | 392 | .PP |
755a656a MK |
393 | If multiple filters exist, they are \fIall\fP executed, |
394 | in reverse order of their addition to the filter tree\(emthat is, | |
395 | the most recently installed filter is executed first. | |
d74503a5 MK |
396 | (Note that all filters will be called |
397 | even if one of the earlier filters returns | |
398 | .BR SECCOMP_RET_KILL . | |
399 | This is done to simplify the kernel code and to provide a | |
400 | tiny speed-up in the execution of sets of filters by | |
401 | avoiding a check for this uncommon case.) | |
cb453c80 MK |
402 | .\" From an Aug 2015 conversation with Kees Cook where I asked why *all* |
403 | .\" filters are applied even if one of the early filters returns | |
404 | .\" SECCOMP_RET_KILL: | |
d74503a5 MK |
405 | .\" |
406 | .\" It's just because it would be an optimization that would only speed up | |
407 | .\" the RET_KILL case, but it's the uncommon one and the one that doesn't | |
408 | .\" benefit meaningfully from such a change (you need to kill the process | |
409 | .\" really quickly?). We would speed up killing a program at the (albeit | |
410 | .\" tiny) expense to all other filtered programs. Best to keep the filter | |
411 | .\" execution logic clear, simple, and as fast as possible for all | |
412 | .\" filters. | |
cbffd6ae | 413 | The return value for the evaluation of a given system call is the first-seen |
1d530819 | 414 | action value of highest precedence (along with its accompanying data) |
cbffd6ae | 415 | returned by execution of all of the filters. |
efeece04 | 416 | .PP |
cbffd6ae | 417 | In decreasing order of precedence, |
1d530819 | 418 | the action values that may be returned by a seccomp filter are: |
e9519f4f | 419 | .TP |
51c58a6c MK |
420 | .BR SECCOMP_RET_KILL_PROCESS " (since Linux 4.14)" |
421 | .\" commit 4d3b0b05aae9ee9ce0970dc4cc0fb3fad5e85945 | |
422 | .\" commit 0466bdb99e8744bc9befa8d62a317f0fd7fd7421 | |
423 | This value results in immediate termination of the process, | |
424 | with a core dump. | |
425 | The system call is not executed. | |
426 | By contrast with | |
427 | .BR SECCOMP_RET_KILL_THREAD | |
428 | below, all threads in the thread group are terminated. | |
429 | (For a discussion of thread groups, see the description of the | |
430 | .BR CLONE_THREAD | |
431 | flag in | |
432 | .BR clone (2).) | |
433 | .IP | |
434 | The process terminates | |
435 | .I "as though" | |
436 | killed by a | |
437 | .B SIGSYS | |
438 | signal. | |
439 | Even if a signal handler has been registered for | |
440 | .BR SIGSYS , | |
441 | the handler will be ignored in this case and the process always terminates. | |
442 | To a parent process that is waiting on this process (using | |
443 | .BR waitpid (2) | |
444 | or similar), the returned | |
445 | .I wstatus | |
446 | will indicate that its child was terminated as though by a | |
447 | .BR SIGSYS | |
448 | signal. | |
449 | .TP | |
6aa0baa4 | 450 | .BR SECCOMP_RET_KILL_THREAD " (or " SECCOMP_RET_KILL ) |
5cfa0627 MK |
451 | This value results in immediate termination of the thread |
452 | that made the system call. | |
453 | The system call is not executed. | |
454 | Other threads in the same thread group will continue to execute. | |
455 | .IP | |
b9c6742b MK |
456 | The thread terminates |
457 | .I "as though" | |
458 | killed by a | |
0a2a0df0 | 459 | .B SIGSYS |
5cfa0627 | 460 | signal. |
b9c6742b MK |
461 | See |
462 | .BR SECCOMP_RET_KILL_PROCESS | |
463 | above. | |
efeece04 | 464 | .IP |
089761d5 | 465 | .\" See these commits: |
6aa0baa4 MK |
466 | .\" seccomp: dump core when using SECCOMP_RET_KILL |
467 | .\" (b25e67161c295c98acda92123b2dd1e7d8642901) | |
468 | .\" seccomp: Only dump core when single-threaded | |
469 | .\" (d7276e321ff8a53106a59c85ca46d03e34288893) | |
baaf65e8 | 470 | Before Linux 4.11, |
6aa0baa4 | 471 | any process terminated in this way would not trigger a core dump |
089761d5 MF |
472 | (even though |
473 | .B SIGSYS | |
474 | is documented in | |
475 | .BR signal (7) | |
baaf65e8 MK |
476 | as having a default action of termination with a core dump). |
477 | Since Linux 4.11, | |
6aa0baa4 MK |
478 | a single-threaded process will dump core if terminated in this way. |
479 | .IP | |
480 | With the addition of | |
481 | .BR SECCOMP_RET_KILL_PROCESS | |
482 | in Linux 4.14, | |
483 | .BR SECCOMP_RET_KILL_THREAD | |
484 | was added as a synonym for | |
485 | .BR SECCOMP_RET_KILL , | |
486 | in order to more clearly distinguish the two actions. | |
e9519f4f KC |
487 | .TP |
488 | .BR SECCOMP_RET_TRAP | |
b94c5c5a | 489 | This value results in the kernel sending a thread-directed |
e9519f4f | 490 | .BR SIGSYS |
b94c5c5a MK |
491 | signal to the triggering thread. |
492 | (The system call is not executed.) | |
cbffd6ae MK |
493 | Various fields will be set in the |
494 | .I siginfo_t | |
db99c064 MK |
495 | structure (see |
496 | .BR sigaction (2)) | |
497 | associated with the signal: | |
cf690e13 MK |
498 | .RS |
499 | .IP * 3 | |
500 | .I si_signo | |
501 | will contain | |
502 | .BR SIGSYS . | |
503 | .IP * | |
db99c064 | 504 | .IR si_call_addr |
cf690e13 MK |
505 | will show the address of the system call instruction. |
506 | .IP * | |
db99c064 | 507 | .IR si_syscall |
e9519f4f | 508 | and |
db99c064 | 509 | .IR si_arch |
87aa1b2c | 510 | will indicate which system call was attempted. |
cf690e13 MK |
511 | .IP * |
512 | .I si_code | |
cf690e13 MK |
513 | will contain |
514 | .BR SYS_SECCOMP . | |
515 | .IP * | |
516 | .I si_errno | |
517 | will contain the | |
518 | .BR SECCOMP_RET_DATA | |
519 | portion of the filter return value. | |
520 | .RE | |
521 | .IP | |
50b4bc23 | 522 | The program counter will be as though the system call happened |
6dfb150c | 523 | (i.e., the program counter will not point to the system call instruction). |
50b4bc23 | 524 | The return value register will contain an architecture\-dependent value; |
06865301 | 525 | if resuming execution, set it to something appropriate for the system call. |
a79566fb | 526 | (The architecture dependency is because replacing it with |
e9519f4f KC |
527 | .BR ENOSYS |
528 | could overwrite some useful information.) | |
e9519f4f KC |
529 | .TP |
530 | .BR SECCOMP_RET_ERRNO | |
610082d9 | 531 | This value results in the |
cbffd6ae MK |
532 | .B SECCOMP_RET_DATA |
533 | portion of the filter's return value being passed to user space as the | |
e9519f4f | 534 | .IR errno |
cbffd6ae | 535 | value without executing the system call. |
e9519f4f KC |
536 | .TP |
537 | .BR SECCOMP_RET_TRACE | |
87aa1b2c MK |
538 | When returned, this value will cause the kernel to attempt to notify a |
539 | .BR ptrace (2)-based | |
540 | tracer prior to executing the system call. | |
50b4bc23 | 541 | If there is no tracer present, |
3fbe3aa3 MK |
542 | the system call is not executed and returns a failure status with |
543 | .I errno | |
544 | set to | |
545 | .BR ENOSYS . | |
efeece04 | 546 | .IP |
e9519f4f KC |
547 | A tracer will be notified if it requests |
548 | .BR PTRACE_O_TRACESECCOMP | |
549 | using | |
550 | .IR ptrace(PTRACE_SETOPTIONS) . | |
551 | The tracer will be notified of a | |
552 | .BR PTRACE_EVENT_SECCOMP | |
553 | and the | |
554 | .BR SECCOMP_RET_DATA | |
cbffd6ae | 555 | portion of the filter's return value will be available to the tracer via |
e9519f4f | 556 | .BR PTRACE_GETEVENTMSG . |
efeece04 | 557 | .IP |
50b4bc23 MK |
558 | The tracer can skip the system call by changing the system call number |
559 | to \-1. | |
560 | Alternatively, the tracer can change the system call | |
87aa1b2c | 561 | requested by changing the system call to a valid system call number. |
50b4bc23 MK |
562 | If the tracer asks to skip the system call, then the system call will |
563 | appear to return the value that the tracer puts in the return value register. | |
efeece04 | 564 | .IP |
09481df3 JH |
565 | .\" This was changed in ce6526e8afa4. |
566 | .\" A related hole, using PTRACE_SYSCALL instead of SECCOMP_RET_TRACE, was | |
567 | .\" changed in arch-specific commits, e.g. 93e35efb8de4 for X86 and | |
568 | .\" 0f3912fd934c for ARM. | |
569 | Before kernel 4.8, the seccomp check will not be run again after the tracer is | |
570 | notified. | |
571 | (This means that, on older kernels, seccomp-based sandboxes | |
50b4bc23 | 572 | .B "must not" |
87aa1b2c MK |
573 | allow use of |
574 | .BR ptrace (2)\(emeven | |
575 | of other | |
50b4bc23 | 576 | sandboxed processes\(emwithout extreme care; |
06865301 | 577 | ptracers can use this mechanism to escape from the seccomp sandbox.) |
e9519f4f | 578 | .TP |
f04207f4 TH |
579 | .BR SECCOMP_RET_LOG " (since Linux 4.14)" |
580 | .\" commit 59f5cf44a38284eb9e76270c786fb6cc62ef8ac4 | |
1e94218c MK |
581 | This value results in the system call being executed after |
582 | the filter return action is logged. | |
583 | An administrator may override the logging of this action via | |
f04207f4 TH |
584 | the |
585 | .IR /proc/sys/kernel/seccomp/actions_logged | |
586 | file. | |
587 | .TP | |
e9519f4f | 588 | .BR SECCOMP_RET_ALLOW |
610082d9 | 589 | This value results in the system call being executed. |
0771269c MK |
590 | .PP |
591 | If an action value other than one of the above is specified, | |
592 | then the filter action is treated as either | |
593 | .BR SECCOMP_RET_KILL_PROCESS | |
594 | (since Linux 4.14) | |
595 | .\" commit 4d3b0b05aae9ee9ce0970dc4cc0fb3fad5e85945 | |
596 | or | |
597 | .BR SECCOMP_RET_KILL_THREAD | |
598 | (in Linux 4.13 and earlier). | |
599 | .\" | |
2577dbba TH |
600 | .SS /proc interfaces |
601 | The files in the directory | |
602 | .IR /proc/sys/kernel/seccomp | |
603 | provide additional seccomp information and configuration: | |
604 | .TP | |
605 | .IR actions_avail " (since Linux 4.14)" | |
606 | .\" commit 8e5f1ad116df6b0de65eac458d5e7c318d1c05af | |
1e94218c MK |
607 | A read-only ordered list of seccomp filter return actions in string form. |
608 | The ordering, from left to right, is in decreasing order of precedence. | |
609 | The list represents the set of seccomp filter return actions | |
610 | supported by the kernel. | |
2577dbba TH |
611 | .TP |
612 | .IR actions_logged " (since Linux 4.14)" | |
613 | .\" commit 0ddec0fc8900201c0897b87b762b7c420436662f | |
1e94218c MK |
614 | A read-write ordered list of seccomp filter return actions that |
615 | are allowed to be logged. | |
616 | Writes to the file do not need to be in ordered form but reads from | |
2577dbba TH |
617 | the file will be ordered in the same way as the |
618 | .IR actions_avail | |
619 | file. | |
620 | .IP | |
621 | It is important to note that the value of | |
622 | .IR actions_logged | |
1e94218c MK |
623 | does not prevent certain filter return actions from being logged when |
624 | the audit subsystem is configured to audit a task. | |
625 | If the action is not found in the | |
2577dbba TH |
626 | .IR actions_logged |
627 | file, the final decision on whether to audit the action for that task is | |
628 | ultimately left up to the audit subsystem to decide for all filter return | |
629 | actions other than | |
630 | .BR SECCOMP_RET_ALLOW . | |
631 | .IP | |
632 | The "allow" string is not accepted in the | |
633 | .IR actions_logged | |
634 | file as it is not possible to log | |
635 | .BR SECCOMP_RET_ALLOW | |
1e94218c | 636 | actions. |
865c9c81 MK |
637 | Attempting to write "allow" to the file will fail with the error |
638 | .BR EINVAL . | |
df5b5f9a MK |
639 | .\" |
640 | .SS Audit logging of seccomp actions | |
641 | .\" commit 59f5cf44a38284eb9e76270c786fb6cc62ef8ac4 | |
642 | Since Linux 4.14, the kernel provides the facility to log the | |
643 | actions returned by seccomp filters in the audit log. | |
644 | The kernel makes the decision to log an action based on | |
645 | the action type, whether or not the action is present in the | |
646 | .I actions_logged | |
647 | file, and whether kernel auditing is enabled | |
648 | (e.g., via the kernel boot option | |
649 | .IR audit=1 ). | |
650 | .\" or auditing could be enabled via the netlink API (AUDIT_SET) | |
651 | The rules are as follows: | |
652 | .IP * 3 | |
653 | If the action is | |
654 | .BR SECCOMP_RET_ALLOW , | |
655 | the action is not logged. | |
656 | .IP * | |
657 | Otherwise, if the action is either | |
658 | .BR SECCOMP_RET_KILL_PROCESS | |
659 | or | |
660 | .BR SECCOMP_RET_KILL_THREAD , | |
c6688cd1 | 661 | and that action appears in the |
df5b5f9a MK |
662 | .IR actions_logged |
663 | file, the action is logged. | |
664 | .IP * | |
665 | Otherwise, if the filter has requested logging (the | |
666 | .BR SECCOMP_FILTER_FLAG_LOG | |
667 | flag) | |
c6688cd1 | 668 | and the action appears in the |
df5b5f9a MK |
669 | .IR actions_logged |
670 | file, the action is logged. | |
671 | .IP * | |
672 | Otherwise, if kernel auditing is enabled and the process is being audited | |
673 | .RB ( autrace (8)), | |
674 | the action is logged. | |
675 | .IP * | |
676 | Otherwise, the action is not logged. | |
e9519f4f KC |
677 | .SH RETURN VALUE |
678 | On success, | |
679 | .BR seccomp () | |
680 | returns 0. | |
681 | On error, if | |
682 | .BR SECCOMP_FILTER_FLAG_TSYNC | |
50b4bc23 | 683 | was used, |
880bc4eb MK |
684 | the return value is the ID of the thread |
685 | that caused the synchronization failure. | |
610082d9 MK |
686 | (This ID is a kernel thread ID of the type returned by |
687 | .BR clone (2) | |
688 | and | |
0e27eb0c | 689 | .BR gettid (2).) |
50b4bc23 | 690 | On other errors, \-1 is returned, and |
e9519f4f KC |
691 | .IR errno |
692 | is set to indicate the cause of the error. | |
693 | .SH ERRORS | |
694 | .BR seccomp () | |
695 | can fail for the following reasons: | |
696 | .TP | |
7b10f505 | 697 | .BR EACCES |
50b4bc23 | 698 | The caller did not have the |
e9519f4f | 699 | .BR CAP_SYS_ADMIN |
be8f1272 | 700 | capability in its user namespace, or had not set |
e9519f4f KC |
701 | .IR no_new_privs |
702 | before using | |
703 | .BR SECCOMP_SET_MODE_FILTER . | |
704 | .TP | |
705 | .BR EFAULT | |
706 | .IR args | |
43b265ff | 707 | was not a valid address. |
e9519f4f KC |
708 | .TP |
709 | .BR EINVAL | |
710 | .IR operation | |
1f5ad3c8 MK |
711 | is unknown or is not supported by this kernel version or configuration. |
712 | .TP | |
713 | .B EINVAL | |
714 | The specified | |
e9519f4f KC |
715 | .IR flags |
716 | are invalid for the given | |
043ed1c6 | 717 | .IR operation . |
29efefec MK |
718 | .TP |
719 | .BR EINVAL | |
882cf566 MK |
720 | .I operation |
721 | included | |
722 | .BR BPF_ABS , | |
723 | but the specified offset was not aligned to a 32-bit boundary or exceeded | |
724 | .IR "sizeof(struct\ seccomp_data)" . | |
882cf566 MK |
725 | .TP |
726 | .BR EINVAL | |
29efefec MK |
727 | .\" See kernel/seccomp.c::seccomp_may_assign_mode() in 3.18 sources |
728 | A secure computing mode has already been set, and | |
729 | .I operation | |
730 | differs from the existing setting. | |
29efefec MK |
731 | .TP |
732 | .BR EINVAL | |
29efefec MK |
733 | .I operation |
734 | specified | |
735 | .BR SECCOMP_SET_MODE_FILTER , | |
736 | but the filter program pointed to by | |
737 | .I args | |
738 | was not valid or the length of the filter program was zero or exceeded | |
739 | .B BPF_MAXINSNS | |
740 | (4096) instructions. | |
29efefec MK |
741 | .TP |
742 | .BR ENOMEM | |
743 | Out of memory. | |
29efefec MK |
744 | .TP |
745 | .BR ENOMEM | |
746 | .\" ENOMEM in kernel/seccomp.c::seccomp_attach_filter() in 3.18 sources | |
747 | The total length of all filter programs attached | |
748 | to the calling thread would exceed | |
749 | .B MAX_INSNS_PER_PATH | |
750 | (32768) instructions. | |
751 | Note that for the purposes of calculating this limit, | |
ce67ca23 MK |
752 | each already existing filter program incurs an |
753 | overhead penalty of 4 instructions. | |
e9519f4f | 754 | .TP |
6d1728da TH |
755 | .BR EOPNOTSUPP |
756 | .I operation | |
757 | specified | |
758 | .BR SECCOMP_GET_ACTION_AVAIL , | |
759 | but the kernel does not support the filter return action specified by | |
760 | .IR args . | |
761 | .TP | |
e9519f4f KC |
762 | .BR ESRCH |
763 | Another thread caused a failure during thread sync, but its ID could not | |
764 | be determined. | |
765 | .SH VERSIONS | |
87aa1b2c | 766 | The |
dff53954 | 767 | .BR seccomp () |
87aa1b2c | 768 | system call first appeared in Linux 3.17. |
cbffd6ae | 769 | .\" FIXME . Add glibc version |
e9519f4f | 770 | .SH CONFORMING TO |
87aa1b2c | 771 | The |
dff53954 | 772 | .BR seccomp () |
87aa1b2c | 773 | system call is a nonstandard Linux extension. |
e9519f4f | 774 | .SH NOTES |
aea38298 MK |
775 | Rather than hand-coding seccomp filters as shown in the example below, |
776 | you may prefer to employ the | |
777 | .I libseccomp | |
778 | library, which provides a front-end for generating seccomp filters. | |
efeece04 | 779 | .PP |
1367a60a MK |
780 | The |
781 | .IR Seccomp | |
782 | field of the | |
783 | .IR /proc/[pid]/status | |
784 | file provides a method of viewing the seccomp mode of a process; see | |
785 | .BR proc (5). | |
efeece04 | 786 | .PP |
e9519f4f | 787 | .BR seccomp () |
50b4bc23 MK |
788 | provides a superset of the functionality provided by the |
789 | .BR prctl (2) | |
b95b411b | 790 | .BR PR_SET_SECCOMP |
50b4bc23 MK |
791 | operation (which does not support |
792 | .IR flags ). | |
efeece04 | 793 | .PP |
88f02af6 | 794 | Since Linux 4.4, the |
d0eae5b5 | 795 | .BR ptrace (2) |
88f02af6 MK |
796 | .B PTRACE_SECCOMP_GET_FILTER |
797 | operation can be used to dump a process's seccomp filters. | |
798 | .\" | |
30c5868e MK |
799 | .SS Architecture support for seccomp BPF |
800 | Architecture support for seccomp BPF filtering | |
801 | .\" Check by grepping for HAVE_ARCH_SECCOMP_FILTER in Kconfig files in | |
802 | .\" kernel source. Last checked in Linux 4.16-rc source. | |
803 | is available on the following architectures: | |
804 | .IP * 3 | |
805 | x86-64, i386, x32 (since Linux 3.5) | |
806 | .PD 0 | |
807 | .IP * | |
808 | ARM (since Linux 3.8) | |
809 | .IP * | |
810 | s390 (since Linux 3.8) | |
811 | .IP * | |
812 | MIPS (since Linux 3.16) | |
813 | .IP * | |
814 | ARM-64 (since Linux 3.19) | |
815 | .IP * | |
816 | PowerPC (since Linux 4.3) | |
817 | .IP * | |
818 | Tile (since Linux 4.3) | |
819 | .IP * | |
820 | PA-RISC (since Linux 4.6) | |
821 | .\" User mode Linux since Linux 4.6 | |
822 | .PD | |
823 | .\" | |
5069617c MK |
824 | .SS Caveats |
825 | There are various subtleties to consider when applying seccomp filters | |
826 | to a program, including the following: | |
827 | .IP * 3 | |
828 | Some traditional system calls have user-space implementations in the | |
829 | .BR vdso (7) | |
830 | on many architectures. | |
831 | Notable examples include | |
832 | .BR clock_gettime (2), | |
833 | .BR gettimeofday (2), | |
834 | and | |
835 | .BR time (2). | |
836 | On such architectures, | |
837 | seccomp filtering for these system calls will have no effect. | |
42cfb355 MK |
838 | (However, there are cases where the |
839 | .BR vdso (7) | |
840 | implementations may fall back to invoking the true system call, | |
841 | in which case seccomp filters would see the system call.) | |
5069617c MK |
842 | .IP * |
843 | Seccomp filtering is based on system call numbers. | |
844 | However, applications typically do not directly invoke system calls, | |
845 | but instead call wrapper functions in the C library which | |
846 | in turn invoke the system calls. | |
847 | Consequently, one must be aware of the following: | |
848 | .RS | |
849 | .IP \(bu 3 | |
850 | The glibc wrappers for some traditional system calls may actually | |
851 | employ system calls with different names in the kernel. | |
852 | For example, the | |
853 | .BR exit (2) | |
854 | wrapper function actually employs the | |
855 | .BR exit_group (2) | |
856 | system call, and the | |
857 | .BR fork (2) | |
858 | wrapper function actually calls | |
859 | .BR clone (2). | |
860 | .IP \(bu | |
861 | The behavior of wrapper functions may vary across architectures, | |
862 | according to the range of system calls provided on those architectures. | |
863 | In other words, the same wrapper function may invoke | |
864 | different system calls on different architectures. | |
865 | .IP \(bu | |
866 | Finally, the behavior of wrapper functions can change across glibc versions. | |
867 | For example, in older versions, the glibc wrapper function for | |
868 | .BR open (2) | |
869 | invoked the system call of the same name, | |
870 | but starting in glibc 2.26, the implementation switched to calling | |
871 | .BR openat (2) | |
872 | on all architectures. | |
873 | .RE | |
874 | .PP | |
faec2136 | 875 | The consequence of the above points is that it may be necessary |
5069617c MK |
876 | to filter for a system call other than the one that might be expected. |
877 | Various manual pages in Section 2 provide helpful details | |
878 | about the differences between wrapper functions and | |
879 | the underlying system calls in subsections entitled | |
880 | .IR "C library/kernel differences" . | |
881 | .PP | |
882 | Furthermore, note that the application of seccomp filters | |
883 | even risks causing bugs in an application, | |
884 | when the filters cause unexpected failures for legitimate operations | |
885 | that the application might need to perform. | |
886 | Such bugs may not easily be discovered when testing the seccomp | |
887 | filters if the bugs occur in rarely used application code paths. | |
5069617c | 888 | .\" |
cecc8c48 MK |
889 | .SS Seccomp-specific BPF details |
890 | Note the following BPF details specific to seccomp filters: | |
891 | .IP * 3 | |
892 | The | |
893 | .B BPF_H | |
894 | and | |
895 | .B BPF_B | |
896 | size modifiers are not supported: all operations must load and store | |
897 | (4-byte) words | |
898 | .RB ( BPF_W ). | |
899 | .IP * | |
900 | To access the contents of the | |
901 | .I seccomp_data | |
902 | buffer, use the | |
903 | .B BPF_ABS | |
904 | addressing mode modifier. | |
cecc8c48 MK |
905 | .IP * |
906 | The | |
907 | .B BPF_LEN | |
908 | addressing mode modifier yields an immediate mode operand | |
909 | whose value is the size of the | |
910 | .IR seccomp_data | |
911 | buffer. | |
e9519f4f | 912 | .SH EXAMPLE |
35e4506b MK |
913 | The program below accepts four or more arguments. |
914 | The first three arguments are a system call number, | |
915 | a numeric architecture identifier, and an error number. | |
916 | The program uses these values to construct a BPF filter | |
917 | that is used at run time to perform the following checks: | |
918 | .IP [1] 4 | |
919 | If the program is not running on the specified architecture, | |
920 | the BPF filter causes the thread that made the system call to be killed | |
921 | .RB ( SECCOMP_RET_KILL ). | |
922 | .IP [2] | |
923 | If the program attempts to execute the system call with the specified number, | |
924 | the BPF filter causes the system call to fail, with | |
925 | .I errno | |
926 | being set to the specified error number. | |
927 | .PP | |
928 | The remaining command-line arguments specify | |
929 | the pathname and additional arguments of a program | |
930 | that the example program should attempt to execute using | |
71bb61ec | 931 | .BR execv (3) |
35e4506b MK |
932 | (a library function that employs the |
933 | .BR execve (2) | |
934 | system call). | |
935 | Some example runs of the program are shown below. | |
efeece04 | 936 | .PP |
35e4506b MK |
937 | First, we display the architecture that we are running on (x86-64) |
938 | and then construct a shell function that looks up system call | |
939 | numbers on this architecture: | |
efeece04 | 940 | .PP |
35e4506b | 941 | .in +4n |
b8302363 | 942 | .EX |
35e4506b MK |
943 | $ \fBuname -m\fP |
944 | x86_64 | |
945 | $ \fBsyscall_nr() { | |
d1a71985 | 946 | cat /usr/src/linux/arch/x86/syscalls/syscall_64.tbl | \e |
06865301 | 947 | awk '$2 != "x32" && $3 == "'$1'" { print $1 }' |
35e4506b | 948 | }\fP |
b8302363 | 949 | .EE |
e646a1ba | 950 | .in |
efeece04 | 951 | .PP |
35e4506b MK |
952 | When the BPF filter rejects a system call (case [2] above), |
953 | it causes the system call to fail with the error number | |
954 | specified on the command line. | |
955 | In the experiments shown here, we'll use error number 99: | |
efeece04 | 956 | .PP |
35e4506b | 957 | .in +4n |
b8302363 | 958 | .EX |
35e4506b MK |
959 | $ \fBerrno 99\fP |
960 | EADDRNOTAVAIL 99 Cannot assign requested address | |
b8302363 | 961 | .EE |
e646a1ba | 962 | .in |
efeece04 | 963 | .PP |
35e4506b MK |
964 | In the following example, we attempt to run the command |
965 | .BR whoami (1), | |
966 | but the BPF filter rejects the | |
967 | .BR execve (2) | |
968 | system call, so that the command is not even executed: | |
efeece04 | 969 | .PP |
35e4506b | 970 | .in +4n |
b8302363 | 971 | .EX |
35e4506b | 972 | $ \fBsyscall_nr execve\fP |
06865301 | 973 | 59 |
cbffd6ae MK |
974 | $ \fB./a.out\fP |
975 | Usage: ./a.out <syscall_nr> <arch> <errno> <prog> [<args>] | |
976 | Hint for <arch>: AUDIT_ARCH_I386: 0x40000003 | |
977 | AUDIT_ARCH_X86_64: 0xC000003E | |
35e4506b MK |
978 | $ \fB./a.out 59 0xC000003E 99 /bin/whoami\fP |
979 | execv: Cannot assign requested address | |
b8302363 | 980 | .EE |
e646a1ba | 981 | .in |
efeece04 | 982 | .PP |
35e4506b MK |
983 | In the next example, the BPF filter rejects the |
984 | .BR write (2) | |
985 | system call, so that, although it is successfully started, the | |
986 | .BR whoami (1) | |
987 | command is not able to write output: | |
efeece04 | 988 | .PP |
35e4506b | 989 | .in +4n |
b8302363 | 990 | .EX |
35e4506b MK |
991 | $ \fBsyscall_nr write\fP |
992 | 1 | |
993 | $ \fB./a.out 1 0xC000003E 99 /bin/whoami\fP | |
b8302363 | 994 | .EE |
e646a1ba | 995 | .in |
efeece04 | 996 | .PP |
35e4506b MK |
997 | In the final example, |
998 | the BPF filter rejects a system call that is not used by the | |
999 | .BR whoami (1) | |
1000 | command, so it is able to successfully execute and produce output: | |
efeece04 | 1001 | .PP |
35e4506b | 1002 | .in +4n |
b8302363 | 1003 | .EX |
35e4506b MK |
1004 | $ \fBsyscall_nr preadv\fP |
1005 | 295 | |
1006 | $ \fB./a.out 295 0xC000003E 99 /bin/whoami\fP | |
1007 | cecilia | |
b8302363 | 1008 | .EE |
e646a1ba | 1009 | .in |
35e4506b | 1010 | .SS Program source |
e7d0bb47 | 1011 | .EX |
e9519f4f KC |
1012 | #include <errno.h> |
1013 | #include <stddef.h> | |
1014 | #include <stdio.h> | |
1015 | #include <stdlib.h> | |
1016 | #include <unistd.h> | |
1017 | #include <linux/audit.h> | |
1018 | #include <linux/filter.h> | |
1019 | #include <linux/seccomp.h> | |
1020 | #include <sys/prctl.h> | |
1021 | ||
b44088b4 JH |
1022 | #define X32_SYSCALL_BIT 0x40000000 |
1023 | ||
/* Build and install a seccomp BPF filter for the calling thread.
   The filter makes system call number 'syscall_nr' fail with the error
   'f_errno', allows every other system call, and kills the task if the
   architecture reported in 'seccomp_data' is not 't_arch' (or, when
   't_arch' is AUDIT_ARCH_X86_64, if the system call number is in the
   x32 range).
   Returns 0 on success; on failure of seccomp(), prints a diagnostic
   with perror() and returns 1. */
d06d25cc | 1024 | static int |
69999632 | 1025 | install_filter(int syscall_nr, int t_arch, int f_errno) |
e9519f4f | 1026 | { |
b44088b4 | 1027 | unsigned int upper_nr_limit = 0xffffffff; |
e9855ef2 | 1028 |
a3dcaaa2 MK |
1029 | /* Assume that AUDIT_ARCH_X86_64 means the normal x86-64 ABI |
1030 | (in the x32 ABI, all system calls have bit 30 set in the | |
1031 | 'nr' field, meaning the numbers are >= X32_SYSCALL_BIT) */ | |
b44088b4 JH |
1032 | if (t_arch == AUDIT_ARCH_X86_64) |
1033 | upper_nr_limit = X32_SYSCALL_BIT - 1; | |
1034 | ||
e9519f4f | 1035 | struct sock_filter filter[] = { |
ef05ec71 | 1036 | /* [0] Load architecture from 'seccomp_data' buffer into |
64267236 | 1037 | accumulator */ |
06865301 | 1038 | BPF_STMT(BPF_LD | BPF_W | BPF_ABS, |
e9519f4f | 1039 | (offsetof(struct seccomp_data, arch))), |
d06d25cc | 1040 |
b44088b4 | 1041 | /* [1] Jump forward 5 instructions if architecture does not |
ef05ec71 | 1042 | match 't_arch' */ |
b44088b4 | 1043 | BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, t_arch, 0, 5), |
d06d25cc | 1044 |
ef05ec71 | 1045 | /* [2] Load system call number from 'seccomp_data' buffer into |
64267236 | 1046 | accumulator */ |
06865301 | 1047 | BPF_STMT(BPF_LD | BPF_W | BPF_ABS, |
e9519f4f | 1048 | (offsetof(struct seccomp_data, nr))), |
d06d25cc | 1049 |
462ce23d | 1050 | /* [3] Check ABI - only needed for x86-64 in deny-list use |
29aa163b | 1051 | cases. Use BPF_JGT instead of checking against the bit |
0582770c | 1052 | mask to avoid having to reload the syscall number. */ |
b44088b4 JH |
1053 | BPF_JUMP(BPF_JMP | BPF_JGT | BPF_K, upper_nr_limit, 3, 0), |
1054 | ||
1055 | /* [4] Jump forward 1 instruction if system call number | |
ef05ec71 | 1056 | does not match 'syscall_nr' */ |
06865301 | 1057 | BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, syscall_nr, 0, 1), |
d06d25cc | 1058 |
b44088b4 | 1059 | /* [5] Matching architecture and system call: don't execute |
c98fe9f8 | 1060 | the system call, and return 'f_errno' in 'errno' */ |
06865301 | 1061 | BPF_STMT(BPF_RET | BPF_K, |
69999632 | 1062 | SECCOMP_RET_ERRNO | (f_errno & SECCOMP_RET_DATA)), |
d06d25cc | 1063 |
b44088b4 | 1064 | /* [6] Destination of system call number mismatch: allow other |
87aa1b2c | 1065 | system calls */ |
06865301 | 1066 | BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW), |
d06d25cc | 1067 |
5defc77d | 1068 | /* [7] Destination of architecture mismatch (from [1]) or ABI mismatch (from [3]): kill task */ |
06865301 | 1069 | BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_KILL), |
e9519f4f | 1070 | }; |
d06d25cc | 1071 |
e9519f4f | 1072 | struct sock_fprog prog = { |
d06d25cc | 1073 | .len = (unsigned short) (sizeof(filter) / sizeof(filter[0])), |
e9519f4f KC |
1074 | .filter = filter, |
1075 | }; | |
d06d25cc | 1076 |
/* NOTE(review): none of the headers included by this listing is known to
   declare seccomp(); a wrapper built on syscall(2) may be required for
   this call to compile and link - confirm. */
e9519f4f KC |
1077 | if (seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog)) { |
1078 | perror("seccomp"); | |
d06d25cc | 1079 | return 1; |
e9519f4f | 1080 | } |
d06d25cc MK |
1081 |
1082 | return 0; | |
e9519f4f KC |
1083 | } |
1084 | ||
/* Usage: prog <syscall_nr> <arch> <errno> <prog> [<args>]
   Sets the no_new_privs attribute, installs the filter described by the
   first three arguments, and then executes <prog> with the remaining
   arguments.  Exits with EXIT_FAILURE if fewer than four arguments are
   given or if any step fails. */
d06d25cc MK |
1085 | int |
1086 | main(int argc, char **argv) | |
e9519f4f KC |
1087 | { |
1088 | if (argc < 5) { | |
cbffd6ae | 1089 | fprintf(stderr, "Usage: " |
d1a71985 MK |
1090 | "%s <syscall_nr> <arch> <errno> <prog> [<args>]\en" |
1091 | "Hint for <arch>: AUDIT_ARCH_I386: 0x%X\en" | |
1092 | " AUDIT_ARCH_X86_64: 0x%X\en" | |
1093 | "\en", argv[0], AUDIT_ARCH_I386, AUDIT_ARCH_X86_64); | |
d06d25cc | 1094 | exit(EXIT_FAILURE); |
e9519f4f | 1095 | } |
d06d25cc | 1096 |
/* Set no_new_privs first: without it (or CAP_SYS_ADMIN),
   SECCOMP_SET_MODE_FILTER fails with EACCES. */
e9519f4f KC |
1097 | if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) { |
1098 | perror("prctl"); | |
d06d25cc | 1099 | exit(EXIT_FAILURE); |
e9519f4f | 1100 | } |
d06d25cc | 1101 |
/* Base 0 lets strtol() accept decimal, octal, or hexadecimal input
   (e.g., 0xC000003E for <arch>); the conversions are not checked
   for errors. */
e9519f4f KC |
1102 | if (install_filter(strtol(argv[1], NULL, 0), |
1103 | strtol(argv[2], NULL, 0), | |
1104 | strtol(argv[3], NULL, 0))) | |
d06d25cc MK |
1105 | exit(EXIT_FAILURE); |
1106 | ||
/* execv() returns only on failure, so the lines after it report the error. */
e9519f4f KC |
1107 | execv(argv[4], &argv[4]); |
1108 | perror("execv"); | |
d06d25cc | 1109 | exit(EXIT_FAILURE); |
e9519f4f | 1110 | } |
e7d0bb47 | 1111 | .EE |
e9519f4f | 1112 | .SH SEE ALSO |
2337d75e | 1113 | .BR bpfc (1), |
85940258 | 1114 | .BR strace (1), |
00eaa6aa | 1115 | .BR bpf (2), |
e9519f4f KC |
1116 | .BR prctl (2), |
1117 | .BR ptrace (2), | |
3b4a59c4 | 1118 | .BR sigaction (2), |
712551ea | 1119 | .BR proc (5), |
e9519f4f KC |
1120 | .BR signal (7), |
1121 | .BR socket (7) | |
51f5698d | 1122 | .PP |
ca6f43fc MK |
1123 | Various pages from the |
1124 | .I libseccomp | |
1125 | library, including: | |
ccc70c81 | 1126 | .BR scmp_sys_resolver (1), |
ca6f43fc MK |
1127 | .BR seccomp_init (3), |
1128 | .BR seccomp_load (3), | |
1129 | .BR seccomp_rule_add (3), | |
1130 | and | |
1131 | .BR seccomp_export_bpf (3). | |
51f5698d | 1132 | .PP |
e28e21fe MK |
1133 | The kernel source files |
1134 | .IR Documentation/networking/filter.txt | |
1135 | and | |
28d96036 ES |
1136 | .IR Documentation/userspace\-api/seccomp_filter.rst |
1137 | .\" commit c061f33f35be0ccc80f4b8e0aea5dfd2ed7e01a3 | |
1138 | (or | |
1139 | .IR Documentation/prctl/seccomp_filter.txt | |
1140 | before Linux 4.13). | |
51f5698d | 1141 | .PP |
8fb01fde | 1142 | McCanne, S.\& and Jacobson, V.\& (1992) |
f74fd424 MK |
1143 | .IR "The BSD Packet Filter: A New Architecture for User-level Packet Capture" , |
1144 | Proceedings of the USENIX Winter 1993 Conference | |
5465ae95 | 1145 | .UR http://www.tcpdump.org/papers/bpf\-usenix93.pdf |
f74fd424 | 1146 | .UE |