]>
Commit | Line | Data |
---|---|---|
e9519f4f KC |
1 | .\" Copyright (C) 2014 Kees Cook <keescook@chromium.org> |
2 | .\" and Copyright (C) 2012 Will Drewry <wad@chromium.org> | |
35e4506b | 3 | .\" and Copyright (C) 2008, 2014 Michael Kerrisk <mtk.manpages@gmail.com> |
e9519f4f KC |
4 | .\" |
5 | .\" %%%LICENSE_START(VERBATIM) | |
6 | .\" Permission is granted to make and distribute verbatim copies of this | |
7 | .\" manual provided the copyright notice and this permission notice are | |
8 | .\" preserved on all copies. | |
9 | .\" | |
10 | .\" Permission is granted to copy and distribute modified versions of this | |
11 | .\" manual under the conditions for verbatim copying, provided that the | |
12 | .\" entire resulting derived work is distributed under the terms of a | |
13 | .\" permission notice identical to this one. | |
14 | .\" | |
15 | .\" Since the Linux kernel and libraries are constantly changing, this | |
16 | .\" manual page may be incorrect or out-of-date. The author(s) assume no | |
17 | .\" responsibility for errors or omissions, or for damages resulting from | |
18 | .\" the use of the information contained herein. The author(s) may not | |
19 | .\" have taken the same level of care in the production of this manual, | |
20 | .\" which is licensed free of charge, as they might when working | |
21 | .\" professionally. | |
22 | .\" | |
23 | .\" Formatted or processed versions of this manual, if unaccompanied by | |
24 | .\" the source, must acknowledge the copyright and authors of this work. | |
25 | .\" %%%LICENSE_END | |
26 | .\" | |
4b8c67d9 | 27 | .TH SECCOMP 2 2017-09-15 "Linux" "Linux Programmer's Manual" |
e9519f4f | 28 | .SH NAME |
50b4bc23 | 29 | seccomp \- operate on Secure Computing state of the process |
e9519f4f KC |
30 | .SH SYNOPSIS |
31 | .nf | |
32 | .B #include <linux/seccomp.h> | |
33 | .B #include <linux/filter.h> | |
34 | .B #include <linux/audit.h> | |
35 | .B #include <linux/signal.h> | |
36 | .B #include <sys/ptrace.h> | |
cbffd6ae MK |
37 | .\" Kees Cook noted: Anything that uses SECCOMP_RET_TRACE returns will |
38 | .\" need <sys/ptrace.h> | |
dbfe9c70 | 39 | .PP |
50b4bc23 MK |
40 | .BI "int seccomp(unsigned int " operation ", unsigned int " flags \ |
41 | ", void *" args ); | |
e9519f4f KC |
42 | .fi |
43 | .SH DESCRIPTION | |
44 | The | |
45 | .BR seccomp () | |
46 | system call operates on the Secure Computing (seccomp) state of the | |
50b4bc23 | 47 | calling process. |
efeece04 | 48 | .PP |
e9519f4f KC |
49 | Currently, Linux supports the following |
50 | .IR operation | |
51 | values: | |
52 | .TP | |
53 | .BR SECCOMP_SET_MODE_STRICT | |
cbffd6ae | 54 | The only system calls that the calling thread is permitted to make are |
e9519f4f KC |
55 | .BR read (2), |
56 | .BR write (2), | |
85fbef74 MK |
57 | .BR _exit (2) |
58 | (but not | |
59 | .BR exit_group (2)), | |
e9519f4f KC |
60 | and |
61 | .BR sigreturn (2). | |
62 | Other system calls result in the delivery of a | |
63 | .BR SIGKILL | |
cbffd6ae | 64 | signal. |
87aa1b2c | 65 | Strict secure computing mode is useful for number-crunching |
e9519f4f KC |
66 | applications that may need to execute untrusted byte code, perhaps |
67 | obtained by reading from a pipe or socket. | |
efeece04 | 68 | .IP |
bec56ee2 JH |
69 | Note that although the calling thread can no longer call |
70 | .BR sigprocmask (2), | |
71 | it can use | |
72 | .BR sigreturn (2) | |
73 | to block all signals apart from | |
74 | .BR SIGKILL | |
75 | and | |
76 | .BR SIGSTOP . | |
65be1b46 | 77 | This means that |
bec56ee2 | 78 | .BR alarm (2) |
65be1b46 MK |
79 | (for example) is not sufficient for restricting the process's execution time. |
80 | Instead, to reliably terminate the process, | |
81 | .BR SIGKILL | |
82 | must be used. | |
83 | This can be done by using | |
bec56ee2 JH |
84 | .BR timer_create (2) |
85 | with | |
86 | .BR SIGEV_SIGNAL | |
87 | and | |
65be1b46 | 88 | .IR sigev_signo |
bec56ee2 | 89 | set to |
65be1b46 MK |
90 | .BR SIGKILL , |
91 | or by using | |
bec56ee2 JH |
92 | .BR setrlimit (2) |
93 | to set the hard limit for | |
94 | .BR RLIMIT_CPU . | |
efeece04 | 95 | .IP |
e9519f4f KC |
96 | This operation is available only if the kernel is configured with |
97 | .BR CONFIG_SECCOMP | |
98 | enabled. | |
efeece04 | 99 | .IP |
e9519f4f KC |
100 | The value of |
101 | .IR flags | |
102 | must be 0, and | |
103 | .IR args | |
104 | must be NULL. | |
efeece04 | 105 | .IP |
50b4bc23 | 106 | This operation is functionally identical to the call: |
efeece04 | 107 | .IP |
50b4bc23 | 108 | prctl(PR_SET_SECCOMP, SECCOMP_MODE_STRICT); |
e9519f4f KC |
109 | .TP |
110 | .BR SECCOMP_SET_MODE_FILTER | |
111 | The system calls allowed are defined by a pointer to a Berkeley Packet | |
112 | Filter (BPF) passed via | |
113 | .IR args . | |
cbffd6ae | 114 | This argument is a pointer to a |
e9519f4f KC |
115 | .IR "struct\ sock_fprog" ; |
116 | it can be designed to filter arbitrary system calls and system call | |
50b4bc23 | 117 | arguments. |
87aa1b2c MK |
118 | If the filter is invalid, |
119 | .BR seccomp () | |
120 | fails, returning | |
cbffd6ae | 121 | .BR EINVAL |
e9519f4f KC |
122 | in |
123 | .IR errno . | |
efeece04 | 124 | .IP |
e9519f4f | 125 | If |
2d5fb4bb | 126 | .BR fork (2) |
e9519f4f | 127 | or |
2d5fb4bb MK |
128 | .BR clone (2) |
129 | is allowed by the filter, any child processes will be constrained to | |
cbffd6ae | 130 | the same system call filters as the parent. |
2d5fb4bb MK |
131 | If |
132 | .BR execve (2) | |
cbffd6ae MK |
133 | is allowed, |
134 | the existing filters will be preserved across a call to | |
2d5fb4bb | 135 | .BR execve (2). |
efeece04 | 136 | .IP |
955efdbc MK |
137 | In order to use the |
138 | .BR SECCOMP_SET_MODE_FILTER | |
139 | operation, either the caller must have the | |
e9519f4f | 140 | .BR CAP_SYS_ADMIN |
be8f1272 | 141 | capability in its user namespace, or the thread must already have the |
cbffd6ae MK |
142 | .I no_new_privs |
143 | bit set. | |
144 | If that bit was not already set by an ancestor of this thread, | |
145 | the thread must make the following call: | |
efeece04 | 146 | .IP |
955efdbc | 147 | prctl(PR_SET_NO_NEW_PRIVS, 1); |
efeece04 | 148 | .IP |
955efdbc MK |
149 | Otherwise, the |
150 | .BR SECCOMP_SET_MODE_FILTER | |
26cd31fd | 151 | operation fails and returns |
e9519f4f KC |
152 | .BR EACCES |
153 | in | |
154 | .IR errno . | |
b8ffefd7 MK |
155 | This requirement ensures that an unprivileged process cannot apply |
156 | a malicious filter and then invoke a set-user-ID or | |
157 | other privileged program using | |
158 | .BR execve (2), | |
cbffd6ae MK |
159 | thus potentially compromising that program. |
160 | (Such a malicious filter might, for example, cause an attempt to use | |
161 | .BR setuid (2) | |
162 | to set the caller's user IDs to non-zero values to instead | |
163 | return 0 without actually making the system call. | |
164 | Thus, the program might be tricked into retaining superuser privileges | |
165 | in circumstances where it is possible to influence it to do | |
166 | dangerous things because it did not actually drop privileges.) | |
efeece04 | 167 | .IP |
7d596c7b | 168 | If |
e9519f4f KC |
169 | .BR prctl (2) |
170 | or | |
bf7bc8b8 | 171 | .BR seccomp () |
7d596c7b | 172 | is allowed by the attached filter, further filters may be added. |
e01ca3aa | 173 | This will increase evaluation time, but allows for further reduction of |
cbffd6ae | 174 | the attack surface during execution of a thread. |
efeece04 | 175 | .IP |
50b4bc23 MK |
176 | The |
177 | .BR SECCOMP_SET_MODE_FILTER | |
178 | operation is available only if the kernel is configured with | |
e9519f4f KC |
179 | .BR CONFIG_SECCOMP_FILTER |
180 | enabled. | |
efeece04 | 181 | .IP |
e9519f4f KC |
182 | When |
183 | .IR flags | |
50b4bc23 | 184 | is 0, this operation is functionally identical to the call: |
efeece04 | 185 | .IP |
50b4bc23 | 186 | prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, args); |
efeece04 | 187 | .IP |
e9519f4f KC |
188 | The recognized |
189 | .IR flags | |
190 | are: | |
191 | .RS | |
192 | .TP | |
193 | .BR SECCOMP_FILTER_FLAG_TSYNC | |
87aa1b2c | 194 | When adding a new filter, synchronize all other threads of the calling |
50b4bc23 | 195 | process to the same seccomp filter tree. |
b8ffefd7 MK |
196 | A "filter tree" is the ordered list of filters attached to a thread. |
197 | (Attaching identical filters in separate | |
198 | .BR seccomp () | |
199 | calls results in different filters from this perspective.) | |
efeece04 | 200 | .IP |
b8ffefd7 | 201 | If any thread cannot synchronize to the same filter tree, |
50b4bc23 MK |
202 | the call will not attach the new seccomp filter, |
203 | and will fail, returning the first thread ID found that cannot synchronize. | |
cbffd6ae | 204 | Synchronization will fail if another thread in the same process is in |
e9519f4f | 205 | .BR SECCOMP_MODE_STRICT |
50b4bc23 MK |
206 | or if it has attached new seccomp filters to itself, |
207 | diverging from the calling thread's filter tree. | |
e9519f4f | 208 | .RE |
93b9a9ee | 209 | .SS Filters |
e9519f4f KC |
210 | When adding filters via |
211 | .BR SECCOMP_SET_MODE_FILTER , | |
212 | .IR args | |
213 | points to a filter program: | |
efeece04 | 214 | .PP |
e9519f4f | 215 | .in +4n |
b8302363 | 216 | .EX |
e9519f4f | 217 | struct sock_fprog { |
1da13d15 MK |
218 | unsigned short len; /* Number of BPF instructions */ |
219 | struct sock_filter *filter; /* Pointer to array of | |
220 | BPF instructions */ | |
e9519f4f | 221 | }; |
b8302363 | 222 | .EE |
e9519f4f | 223 | .in |
efeece04 | 224 | .PP |
e9519f4f | 225 | Each program must contain one or more BPF instructions: |
efeece04 | 226 | .PP |
e9519f4f | 227 | .in +4n |
b8302363 | 228 | .EX |
1da13d15 MK |
229 | struct sock_filter { /* Filter block */ |
230 | __u16 code; /* Actual filter code */ | |
231 | __u8 jt; /* Jump true */ | |
232 | __u8 jf; /* Jump false */ | |
233 | __u32 k; /* Generic multiuse field */ | |
e9519f4f | 234 | }; |
b8302363 | 235 | .EE |
e9519f4f | 236 | .in |
efeece04 | 237 | .PP |
0c2e01b7 MK |
238 | When executing the instructions, the BPF program operates on the |
239 | system call information made available (i.e., use the | |
240 | .BR BPF_ABS | |
1a7e5113 MK |
241 | addressing mode) as a (read-only) |
242 | .\" Quoting Kees Cook: | |
243 | .\" If BPF even allows changing the data, it's not copied back to | |
244 | .\" the syscall when it runs. Anything wanting to do things like | |
3775c1ca | 245 | .\" that would need to use ptrace to catch the call and directly |
1a7e5113 MK |
246 | .\" modify the registers before continuing with the call. |
247 | buffer of the following form: | |
efeece04 | 248 | .PP |
e9519f4f | 249 | .in +4n |
b8302363 | 250 | .EX |
e9519f4f | 251 | struct seccomp_data { |
5b95c4ab | 252 | int nr; /* System call number */ |
a0a0c98b MK |
253 | __u32 arch; /* AUDIT_ARCH_* value |
254 | (see <linux/audit.h>) */ | |
e9519f4f | 255 | __u64 instruction_pointer; /* CPU instruction pointer */ |
1da13d15 | 256 | __u64 args[6]; /* Up to 6 system call arguments */ |
e9519f4f | 257 | }; |
b8302363 | 258 | .EE |
e9519f4f | 259 | .in |
efeece04 | 260 | .PP |
944a31e2 | 261 | Because numbering of system calls varies between architectures and |
65cfc712 | 262 | some architectures (e.g., x86-64) allow user-space code to use |
b44088b4 JH |
263 | the calling conventions of multiple architectures, it is usually |
264 | necessary to verify the value of the | |
265 | .IR arch | |
266 | field. | |
efeece04 | 267 | .PP |
b44088b4 JH |
268 | It is strongly recommended to use a whitelisting approach whenever |
269 | possible because such an approach is more robust and simple. | |
270 | A blacklist will have to be updated whenever a potentially | |
37daa840 | 271 | dangerous system call is added (or a dangerous flag or option if those |
b44088b4 JH |
272 | are blacklisted), and it is often possible to alter the |
273 | representation of a value without altering its meaning, leading to | |
274 | a blacklist bypass. | |
efeece04 | 275 | .PP |
b44088b4 JH |
276 | The |
277 | .IR arch | |
37daa840 | 278 | field is not unique for all calling conventions. |
65cfc712 | 279 | The x86-64 ABI and the x32 ABI both use |
b44088b4 JH |
280 | .BR AUDIT_ARCH_X86_64 |
281 | as | |
282 | .IR arch , | |
37daa840 MK |
283 | and they run on the same processors. |
284 | Instead, the mask | |
b44088b4 JH |
285 | .BR __X32_SYSCALL_BIT |
286 | is used on the system call number to tell the two ABIs apart. | |
8cc4d071 | 287 | .\" As noted by Dave Drysdale in a note at the end of |
3c5ab770 MK |
288 | .\" https://lwn.net/Articles/604515/ |
289 | .\" One additional detail to point out for the x32 ABI case: | |
290 | .\" the syscall number gets a high bit set (__X32_SYSCALL_BIT), | |
291 | .\" to mark it as an x32 call. | |
292 | .\" | |
293 | .\" If x32 support is included in the kernel, then __SYSCALL_MASK | |
294 | .\" will have a value that is not all-ones, and this will trigger | |
295 | .\" an extra instruction in system_call to mask off the extra bit, | |
8cc4d071 | 296 | .\" so that the syscall table indexing still works. |
efeece04 | 297 | .PP |
b44088b4 | 298 | This means that in order to create a seccomp-based |
65cfc712 | 299 | blacklist for system calls performed through the x86-64 ABI, |
b44088b4 JH |
300 | it is necessary to not only check that |
301 | .IR arch | |
302 | equals | |
303 | .BR AUDIT_ARCH_X86_64 , | |
7a79bb30 | 304 | but also to explicitly reject all system calls that contain |
b44088b4 JH |
305 | .BR __X32_SYSCALL_BIT |
306 | in | |
307 | .IR nr . | |
efeece04 | 308 | .PP |
712551ea MK |
309 | The |
310 | .I instruction_pointer | |
311 | field provides the address of the machine-language instruction that | |
312 | performed the system call. | |
313 | This might be useful in conjunction with the use of | |
314 | .I /proc/[pid]/maps | |
315 | to perform checks based on which region (mapping) of the program | |
316 | made the system call. | |
317 | (Probably, it is wise to lock down the | |
318 | .BR mmap (2) | |
319 | and | |
320 | .BR mprotect (2) | |
321 | system calls to prevent the program from subverting such checks.) | |
efeece04 | 322 | .PP |
b44088b4 JH |
323 | When checking values from |
324 | .IR args | |
325 | against a blacklist, keep in mind that arguments are often | |
37daa840 MK |
326 | silently truncated before being processed, but after the seccomp check. |
327 | For example, this happens if the i386 ABI is used on an | |
3c5ab770 | 328 | x86-64 kernel: although the kernel will normally not look beyond |
b44088b4 | 329 | the 32 lowest bits of the arguments, the values of the full |
37daa840 | 330 | 64-bit registers will be present in the seccomp data. |
65cfc712 | 331 | A less surprising example is that if the x86-64 ABI is used to perform |
37daa840 MK |
332 | a system call that takes an argument of type |
333 | .IR int , | |
334 | the more-significant half of the argument register is ignored by | |
335 | the system call, but visible in the seccomp data. | |
efeece04 | 336 | .PP |
cbffd6ae MK |
337 | A seccomp filter returns a 32-bit value consisting of two parts: |
338 | the most significant 16 bits | |
339 | (corresponding to the mask defined by the constant | |
340 | .BR SECCOMP_RET_ACTION ) | |
341 | contain one of the "action" values listed below; | |
342 | the least significant 16 bits (defined by the constant | |
343 | .BR SECCOMP_RET_DATA ) | |
344 | are "data" to be associated with this return value. | |
efeece04 | 345 | .PP |
755a656a MK |
346 | If multiple filters exist, they are \fIall\fP executed, |
347 | in reverse order of their addition to the filter tree\(emthat is, | |
348 | the most recently installed filter is executed first. | |
d74503a5 MK |
349 | (Note that all filters will be called |
350 | even if one of the earlier filters returns | |
351 | .BR SECCOMP_RET_KILL . | |
352 | This is done to simplify the kernel code and to provide a | |
353 | tiny speed-up in the execution of sets of filters by | |
354 | avoiding a check for this uncommon case.) | |
355 | .\" From an Aug 2015 conversation with Kees Cook where I asked why *all* | |
356 | .\" filters even if one of the early filters returns SECCOMP_RET_KILL: | |
357 | .\" | |
358 | .\" It's just because it would be an optimization that would only speed up | |
359 | .\" the RET_KILL case, but it's the uncommon one and the one that doesn't | |
360 | .\" benefit meaningfully from such a change (you need to kill the process | |
361 | .\" really quickly?). We would speed up killing a program at the (albeit | |
362 | .\" tiny) expense to all other filtered programs. Best to keep the filter | |
363 | .\" execution logic clear, simple, and as fast as possible for all | |
364 | .\" filters. | |
cbffd6ae MK |
365 | The return value for the evaluation of a given system call is the first-seen |
366 | .BR SECCOMP_RET_ACTION | |
367 | value of highest precedence (along with its accompanying data) | |
368 | returned by execution of all of the filters. | |
efeece04 | 369 | .PP |
cbffd6ae | 370 | In decreasing order of precedence, |
ec2bb3c0 | 371 | the values that may be returned by a seccomp filter are: |
e9519f4f KC |
372 | .TP |
373 | .BR SECCOMP_RET_KILL | |
5defc77d | 374 | This value results in the task exiting immediately |
0fc339b2 | 375 | .\" We really do mean "task" here, not "process" |
610082d9 | 376 | without executing the system call. |
5defc77d | 377 | The task terminates as though killed by a |
0a2a0df0 MK |
378 | .B SIGSYS |
379 | signal | |
380 | .RI ( not | |
381 | .BR SIGKILL ). | |
089761d5 MF |
382 | Even if a signal handler has been registered and otherwise catches |
383 | .BR SIGSYS , | |
0fc339b2 | 384 | the handler will be ignored in this case and the task always terminates. |
efeece04 | 385 | .IP |
089761d5 MF |
386 | .\" See these commits: |
387 | .\" seccomp: dump core when using SECCOMP_RET_KILL (b25e67161c295c98acda92123b2dd1e7d8642901) | |
388 | .\" seccomp: Only dump core when single-threaded (d7276e321ff8a53106a59c85ca46d03e34288893) | |
baaf65e8 MK |
389 | Before Linux 4.11, |
390 | any process terminated this way would not trigger a coredump | |
089761d5 MF |
391 | (even though |
392 | .B SIGSYS | |
393 | is documented in | |
394 | .BR signal (7) | |
baaf65e8 MK |
395 | as having a default action of termination with a core dump). |
396 | Since Linux 4.11, | |
397 | single-threaded processes follow standard core dump behavior, | |
089761d5 MF |
398 | but multithreaded processes still do not. |
399 | There is no workaround currently for multithreaded processes. | |
e9519f4f KC |
400 | .TP |
401 | .BR SECCOMP_RET_TRAP | |
610082d9 | 402 | This value results in the kernel sending a |
e9519f4f | 403 | .BR SIGSYS |
cbffd6ae MK |
404 | signal to the triggering process without executing the system call. |
405 | Various fields will be set in the | |
406 | .I siginfo_t | |
db99c064 MK |
407 | structure (see |
408 | .BR sigaction (2)) | |
409 | associated with the signal: | |
cf690e13 MK |
410 | .RS |
411 | .IP * 3 | |
412 | .I si_signo | |
413 | will contain | |
414 | .BR SIGSYS . | |
415 | .IP * | |
db99c064 | 416 | .IR si_call_addr |
cf690e13 MK |
417 | will show the address of the system call instruction. |
418 | .IP * | |
db99c064 | 419 | .IR si_syscall |
e9519f4f | 420 | and |
db99c064 | 421 | .IR si_arch |
87aa1b2c | 422 | will indicate which system call was attempted. |
cf690e13 MK |
423 | .IP * |
424 | .I si_code | |
cf690e13 MK |
425 | will contain |
426 | .BR SYS_SECCOMP . | |
427 | .IP * | |
428 | .I si_errno | |
429 | will contain the | |
430 | .BR SECCOMP_RET_DATA | |
431 | portion of the filter return value. | |
432 | .RE | |
433 | .IP | |
50b4bc23 | 434 | The program counter will be as though the system call happened |
87aa1b2c | 435 | (i.e., it will not point to the system call instruction). |
50b4bc23 | 436 | The return value register will contain an architecture\-dependent value; |
06865301 | 437 | if resuming execution, set it to something appropriate for the system call. |
a79566fb | 438 | (The architecture dependency is because replacing it with |
e9519f4f KC |
439 | .BR ENOSYS |
440 | could overwrite some useful information.) | |
e9519f4f KC |
441 | .TP |
442 | .BR SECCOMP_RET_ERRNO | |
610082d9 | 443 | This value results in the |
cbffd6ae MK |
444 | .B SECCOMP_RET_DATA |
445 | portion of the filter's return value being passed to user space as the | |
e9519f4f | 446 | .IR errno |
cbffd6ae | 447 | value without executing the system call. |
e9519f4f KC |
448 | .TP |
449 | .BR SECCOMP_RET_TRACE | |
87aa1b2c MK |
450 | When returned, this value will cause the kernel to attempt to notify a |
451 | .BR ptrace (2)-based | |
452 | tracer prior to executing the system call. | |
50b4bc23 | 453 | If there is no tracer present, |
3fbe3aa3 MK |
454 | the system call is not executed and returns a failure status with |
455 | .I errno | |
456 | set to | |
457 | .BR ENOSYS . | |
efeece04 | 458 | .IP |
e9519f4f KC |
459 | A tracer will be notified if it requests |
460 | .BR PTRACE_O_TRACESECCOMP | |
461 | using | |
462 | .IR ptrace(PTRACE_SETOPTIONS) . | |
463 | The tracer will be notified of a | |
464 | .BR PTRACE_EVENT_SECCOMP | |
465 | and the | |
466 | .BR SECCOMP_RET_DATA | |
cbffd6ae | 467 | portion of the filter's return value will be available to the tracer via |
e9519f4f | 468 | .BR PTRACE_GETEVENTMSG . |
efeece04 | 469 | .IP |
50b4bc23 MK |
470 | The tracer can skip the system call by changing the system call number |
471 | to \-1. | |
472 | Alternatively, the tracer can change the system call | |
87aa1b2c | 473 | requested by changing the system call to a valid system call number. |
50b4bc23 MK |
474 | If the tracer asks to skip the system call, then the system call will |
475 | appear to return the value that the tracer puts in the return value register. | |
efeece04 | 476 | .IP |
09481df3 JH |
477 | .\" This was changed in ce6526e8afa4. |
478 | .\" A related hole, using PTRACE_SYSCALL instead of SECCOMP_RET_TRACE, was | |
479 | .\" changed in arch-specific commits, e.g. 93e35efb8de4 for X86 and | |
480 | .\" 0f3912fd934c for ARM. | |
481 | Before kernel 4.8, the seccomp check will not be run again after the tracer is | |
482 | notified. | |
483 | (This means that, on older kernels, seccomp-based sandboxes | |
50b4bc23 | 484 | .B "must not" |
87aa1b2c MK |
485 | allow use of |
486 | .BR ptrace (2)\(emeven | |
487 | of other | |
50b4bc23 | 488 | sandboxed processes\(emwithout extreme care; |
06865301 | 489 | ptracers can use this mechanism to escape from the seccomp sandbox.) |
e9519f4f KC |
490 | .TP |
491 | .BR SECCOMP_RET_ALLOW | |
610082d9 | 492 | This value results in the system call being executed. |
e9519f4f KC |
493 | .SH RETURN VALUE |
494 | On success, | |
495 | .BR seccomp () | |
496 | returns 0. | |
497 | On error, if | |
498 | .BR SECCOMP_FILTER_FLAG_TSYNC | |
50b4bc23 | 499 | was used, |
880bc4eb MK |
500 | the return value is the ID of the thread |
501 | that caused the synchronization failure. | |
610082d9 MK |
502 | (This ID is a kernel thread ID of the type returned by |
503 | .BR clone (2) | |
504 | and | |
0e27eb0c | 505 | .BR gettid (2).) |
50b4bc23 | 506 | On other errors, \-1 is returned, and |
e9519f4f KC |
507 | .IR errno |
508 | is set to indicate the cause of the error. | |
509 | .SH ERRORS | |
510 | .BR seccomp () | |
511 | can fail for the following reasons: | |
512 | .TP | |
513 | .BR EACCES | |
50b4bc23 | 514 | The caller did not have the |
e9519f4f | 515 | .BR CAP_SYS_ADMIN |
be8f1272 | 516 | capability in its user namespace, or had not set |
e9519f4f KC |
517 | .IR no_new_privs |
518 | before using | |
519 | .BR SECCOMP_SET_MODE_FILTER . | |
520 | .TP | |
521 | .BR EFAULT | |
522 | .IR args | |
43b265ff | 523 | was not a valid address. |
e9519f4f KC |
524 | .TP |
525 | .BR EINVAL | |
526 | .IR operation | |
527 | is unknown; or | |
528 | .IR flags | |
529 | are invalid for the given | |
043ed1c6 | 530 | .IR operation . |
29efefec MK |
531 | .TP |
532 | .BR EINVAL | |
882cf566 MK |
533 | The filter program |
534 | included | |
535 | .BR BPF_ABS , | |
536 | but the specified offset was not aligned to a 32-bit boundary or exceeded | |
537 | .IR "sizeof(struct\ seccomp_data)" . | |
882cf566 MK |
538 | .TP |
539 | .BR EINVAL | |
29efefec MK |
540 | .\" See kernel/seccomp.c::seccomp_may_assign_mode() in 3.18 sources |
541 | A secure computing mode has already been set, and | |
542 | .I operation | |
543 | differs from the existing setting. | |
29efefec MK |
544 | .TP |
545 | .BR EINVAL | |
546 | .\" See stub kernel/seccomp.c::seccomp_set_mode_filter() in 3.18 sources | |
547 | .I operation | |
548 | specified | |
549 | .BR SECCOMP_SET_MODE_FILTER , | |
550 | but the kernel was not built with | |
551 | .B CONFIG_SECCOMP_FILTER | |
552 | enabled. | |
29efefec MK |
553 | .TP |
554 | .BR EINVAL | |
555 | .I operation | |
556 | specified | |
557 | .BR SECCOMP_SET_MODE_FILTER , | |
558 | but the filter program pointed to by | |
559 | .I args | |
560 | was not valid or the length of the filter program was zero or exceeded | |
561 | .B BPF_MAXINSNS | |
562 | (4096) instructions. | |
29efefec MK |
563 | .TP |
564 | .BR ENOMEM | |
565 | Out of memory. | |
29efefec MK |
566 | .TP |
567 | .BR ENOMEM | |
568 | .\" ENOMEM in kernel/seccomp.c::seccomp_attach_filter() in 3.18 sources | |
569 | The total length of all filter programs attached | |
570 | to the calling thread would exceed | |
571 | .B MAX_INSNS_PER_PATH | |
572 | (32768) instructions. | |
573 | Note that for the purposes of calculating this limit, | |
ce67ca23 MK |
574 | each already existing filter program incurs an |
575 | overhead penalty of 4 instructions. | |
e9519f4f KC |
576 | .TP |
577 | .BR ESRCH | |
578 | Another thread caused a failure during thread sync, but its ID could not | |
579 | be determined. | |
580 | .SH VERSIONS | |
87aa1b2c | 581 | The |
dff53954 | 582 | .BR seccomp () |
87aa1b2c | 583 | system call first appeared in Linux 3.17. |
cbffd6ae | 584 | .\" FIXME . Add glibc version |
e9519f4f | 585 | .SH CONFORMING TO |
87aa1b2c | 586 | The |
dff53954 | 587 | .BR seccomp () |
87aa1b2c | 588 | system call is a nonstandard Linux extension. |
e9519f4f | 589 | .SH NOTES |
aea38298 MK |
590 | Rather than hand-coding seccomp filters as shown in the example below, |
591 | you may prefer to employ the | |
592 | .I libseccomp | |
593 | library, which provides a front-end for generating seccomp filters. | |
efeece04 | 594 | .PP |
1367a60a MK |
595 | The |
596 | .IR Seccomp | |
597 | field of the | |
598 | .IR /proc/[pid]/status | |
599 | file provides a method of viewing the seccomp mode of a process; see | |
600 | .BR proc (5). | |
efeece04 | 601 | .PP |
e9519f4f | 602 | .BR seccomp () |
50b4bc23 MK |
603 | provides a superset of the functionality provided by the |
604 | .BR prctl (2) | |
b95b411b | 605 | .BR PR_SET_SECCOMP |
50b4bc23 MK |
606 | operation (which does not support |
607 | .IR flags ). | |
efeece04 | 608 | .PP |
88f02af6 MK |
609 | Since Linux 4.4, the |
610 | .BR ptrace (2) | |
611 | .B PTRACE_SECCOMP_GET_FILTER | |
612 | operation can be used to dump a process's seccomp filters. | |
613 | .\" | |
cecc8c48 MK |
614 | .SS Seccomp-specific BPF details |
615 | Note the following BPF details specific to seccomp filters: | |
616 | .IP * 3 | |
617 | The | |
618 | .B BPF_H | |
619 | and | |
620 | .B BPF_B | |
621 | size modifiers are not supported: all operations must load and store | |
622 | (4-byte) words | |
623 | .RB ( BPF_W ). | |
624 | .IP * | |
625 | To access the contents of the | |
626 | .I seccomp_data | |
627 | buffer, use the | |
628 | .B BPF_ABS | |
629 | addressing mode modifier. | |
cecc8c48 MK |
630 | .IP * |
631 | The | |
632 | .B BPF_LEN | |
633 | addressing mode modifier yields an immediate mode operand | |
634 | whose value is the size of the | |
635 | .IR seccomp_data | |
636 | buffer. | |
e9519f4f | 637 | .SH EXAMPLE |
35e4506b MK |
638 | The program below accepts four or more arguments. |
639 | The first three arguments are a system call number, | |
640 | a numeric architecture identifier, and an error number. | |
641 | The program uses these values to construct a BPF filter | |
642 | that is used at run time to perform the following checks: | |
643 | .IP [1] 4 | |
644 | If the program is not running on the specified architecture, | |
645 | the BPF filter causes system calls to fail with the error | |
646 | .BR ENOSYS . | |
647 | .IP [2] | |
648 | If the program attempts to execute the system call with the specified number, | |
649 | the BPF filter causes the system call to fail, with | |
650 | .I errno | |
651 | being set to the specified error number. | |
652 | .PP | |
653 | The remaining command-line arguments specify | |
654 | the pathname and additional arguments of a program | |
655 | that the example program should attempt to execute using | |
71bb61ec | 656 | .BR execv (3) |
35e4506b MK |
657 | (a library function that employs the |
658 | .BR execve (2) | |
659 | system call). | |
660 | Some example runs of the program are shown below. | |
efeece04 | 661 | .PP |
35e4506b MK |
662 | First, we display the architecture that we are running on (x86-64) |
663 | and then construct a shell function that looks up system call | |
664 | numbers on this architecture: | |
efeece04 | 665 | .PP |
35e4506b | 666 | .in +4n |
b8302363 | 667 | .EX |
35e4506b MK |
668 | $ \fBuname -m\fP |
669 | x86_64 | |
670 | $ \fBsyscall_nr() { | |
671 | cat /usr/src/linux/arch/x86/syscalls/syscall_64.tbl | \\ | |
06865301 | 672 | awk '$2 != "x32" && $3 == "'$1'" { print $1 }' |
35e4506b | 673 | }\fP |
b8302363 | 674 | .EE |
e646a1ba | 675 | .in |
efeece04 | 676 | .PP |
35e4506b MK |
677 | When the BPF filter rejects a system call (case [2] above), |
678 | it causes the system call to fail with the error number | |
679 | specified on the command line. | |
680 | In the experiments shown here, we'll use error number 99: | |
efeece04 | 681 | .PP |
35e4506b | 682 | .in +4n |
b8302363 | 683 | .EX |
35e4506b MK |
684 | $ \fBerrno 99\fP |
685 | EADDRNOTAVAIL 99 Cannot assign requested address | |
b8302363 | 686 | .EE |
e646a1ba | 687 | .in |
efeece04 | 688 | .PP |
35e4506b MK |
689 | In the following example, we attempt to run the command |
690 | .BR whoami (1), | |
691 | but the BPF filter rejects the | |
692 | .BR execve (2) | |
693 | system call, so that the command is not even executed: | |
efeece04 | 694 | .PP |
35e4506b | 695 | .in +4n |
b8302363 | 696 | .EX |
35e4506b | 697 | $ \fBsyscall_nr execve\fP |
06865301 | 698 | 59 |
cbffd6ae MK |
699 | $ \fB./a.out\fP |
700 | Usage: ./a.out <syscall_nr> <arch> <errno> <prog> [<args>] | |
701 | Hint for <arch>: AUDIT_ARCH_I386: 0x40000003 | |
702 | AUDIT_ARCH_X86_64: 0xC000003E | |
35e4506b MK |
703 | $ \fB./a.out 59 0xC000003E 99 /bin/whoami\fP |
704 | execv: Cannot assign requested address | |
b8302363 | 705 | .EE |
e646a1ba | 706 | .in |
efeece04 | 707 | .PP |
35e4506b MK |
708 | In the next example, the BPF filter rejects the |
709 | .BR write (2) | |
710 | system call, so that, although it is successfully started, the | |
711 | .BR whoami (1) | |
712 | command is not able to write output: | |
efeece04 | 713 | .PP |
35e4506b | 714 | .in +4n |
b8302363 | 715 | .EX |
35e4506b MK |
716 | $ \fBsyscall_nr write\fP |
717 | 1 | |
718 | $ \fB./a.out 1 0xC000003E 99 /bin/whoami\fP | |
b8302363 | 719 | .EE |
e646a1ba | 720 | .in |
efeece04 | 721 | .PP |
35e4506b MK |
722 | In the final example, |
723 | the BPF filter rejects a system call that is not used by the | |
724 | .BR whoami (1) | |
725 | command, so it is able to successfully execute and produce output: | |
efeece04 | 726 | .PP |
35e4506b | 727 | .in +4n |
b8302363 | 728 | .EX |
35e4506b MK |
729 | $ \fBsyscall_nr preadv\fP |
730 | 295 | |
731 | $ \fB./a.out 295 0xC000003E 99 /bin/whoami\fP | |
732 | cecilia | |
b8302363 | 733 | .EE |
e646a1ba | 734 | .in |
35e4506b | 735 | .SS Program source |
e7d0bb47 | 736 | .EX |
#include <errno.h>
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <linux/audit.h>
#include <linux/filter.h>
#include <linux/seccomp.h>
#include <sys/prctl.h>
#include <sys/syscall.h>
746 | ||
b44088b4 JH |
747 | #define X32_SYSCALL_BIT 0x40000000 |
748 | ||
d06d25cc | 749 | static int |
69999632 | 750 | install_filter(int syscall_nr, int t_arch, int f_errno) |
e9519f4f | 751 | { |
b44088b4 | 752 | unsigned int upper_nr_limit = 0xffffffff; |
e9855ef2 MK |
753 | |
754 | /* Assume that AUDIT_ARCH_X86_64 means the normal x86-64 ABI */ | |
b44088b4 JH |
755 | if (t_arch == AUDIT_ARCH_X86_64) |
756 | upper_nr_limit = X32_SYSCALL_BIT - 1; | |
757 | ||
e9519f4f | 758 | struct sock_filter filter[] = { |
ef05ec71 | 759 | /* [0] Load architecture from 'seccomp_data' buffer into |
64267236 | 760 | accumulator */ |
06865301 | 761 | BPF_STMT(BPF_LD | BPF_W | BPF_ABS, |
e9519f4f | 762 | (offsetof(struct seccomp_data, arch))), |
d06d25cc | 763 | |
b44088b4 | 764 | /* [1] Jump forward 5 instructions if architecture does not |
ef05ec71 | 765 | match 't_arch' */ |
b44088b4 | 766 | BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, t_arch, 0, 5), |
d06d25cc | 767 | |
ef05ec71 | 768 | /* [2] Load system call number from 'seccomp_data' buffer into |
64267236 | 769 | accumulator */ |
06865301 | 770 | BPF_STMT(BPF_LD | BPF_W | BPF_ABS, |
e9519f4f | 771 | (offsetof(struct seccomp_data, nr))), |
d06d25cc | 772 | |
65cfc712 | 773 | /* [3] Check ABI - only needed for x86-64 in blacklist use |
0582770c MK |
774 | cases. Use JGT instead of checking against the bit |
775 | mask to avoid having to reload the syscall number. */ | |
b44088b4 JH |
776 | BPF_JUMP(BPF_JMP | BPF_JGT | BPF_K, upper_nr_limit, 3, 0), |
777 | ||
778 | /* [4] Jump forward 1 instruction if system call number | |
ef05ec71 | 779 | does not match 'syscall_nr' */ |
06865301 | 780 | BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, syscall_nr, 0, 1), |
d06d25cc | 781 | |
b44088b4 | 782 | /* [5] Matching architecture and system call: don't execute |
69999632 | 783 | the system call, and return 'f_errno' in 'errno' */ |
06865301 | 784 | BPF_STMT(BPF_RET | BPF_K, |
69999632 | 785 | SECCOMP_RET_ERRNO | (f_errno & SECCOMP_RET_DATA)), |
d06d25cc | 786 | |
b44088b4 | 787 | /* [6] Destination of system call number mismatch: allow other |
87aa1b2c | 788 | system calls */ |
06865301 | 789 | BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW), |
d06d25cc | 790 | |
5defc77d | 791 | /* [7] Destination of architecture mismatch: kill task */ |
06865301 | 792 | BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_KILL), |
e9519f4f | 793 | }; |
d06d25cc | 794 | |
e9519f4f | 795 | struct sock_fprog prog = { |
d06d25cc | 796 | .len = (unsigned short) (sizeof(filter) / sizeof(filter[0])), |
e9519f4f KC |
797 | .filter = filter, |
798 | }; | |
d06d25cc | 799 | |
e9519f4f KC |
800 | if (seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog)) { |
801 | perror("seccomp"); | |
d06d25cc | 802 | return 1; |
e9519f4f | 803 | } |
d06d25cc MK |
804 | |
805 | return 0; | |
e9519f4f KC |
806 | } |
807 | ||
/* Usage: <syscall_nr> <arch> <errno> <prog> [<args>]

   Sets the no_new_privs attribute, installs a filter built from the
   first three arguments via install_filter(), and then executes
   <prog> with the remaining arguments.  Exits with EXIT_FAILURE if
   the arguments are missing or if any step fails. */
int
main(int argc, char **argv)
{
    if (argc < 5) {
        fprintf(stderr, "Usage: "
                "%s <syscall_nr> <arch> <errno> <prog> [<args>]\\n"
                "Hint for <arch>: AUDIT_ARCH_I386: 0x%X\\n"
                " AUDIT_ARCH_X86_64: 0x%X\\n"
                "\\n", argv[0], AUDIT_ARCH_I386, AUDIT_ARCH_X86_64);
        exit(EXIT_FAILURE);
    }

    if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
        perror("prctl");
        exit(EXIT_FAILURE);
    }

    /* <arch> is parsed with strtoul(): a value such as 0xC000003E
       is larger than LONG_MAX where 'long' is 32 bits wide, and
       strtol() would clamp it to LONG_MAX there. */
    if (install_filter(strtol(argv[1], NULL, 0),
                       strtoul(argv[2], NULL, 0),
                       strtol(argv[3], NULL, 0)))
        exit(EXIT_FAILURE);

    /* execv() returns only on failure */
    execv(argv[4], &argv[4]);
    perror("execv");
    exit(EXIT_FAILURE);
}
e7d0bb47 | 834 | .EE |
e9519f4f | 835 | .SH SEE ALSO |
00eaa6aa | 836 | .BR bpf (2), |
e9519f4f KC |
837 | .BR prctl (2), |
838 | .BR ptrace (2), | |
3b4a59c4 | 839 | .BR sigaction (2), |
712551ea | 840 | .BR proc (5), |
e9519f4f KC |
841 | .BR signal (7), |
842 | .BR socket (7) | |
51f5698d | 843 | .PP |
ca6f43fc MK |
844 | Various pages from the |
845 | .I libseccomp | |
846 | library, including: | |
ccc70c81 | 847 | .BR scmp_sys_resolver (1), |
ca6f43fc MK |
848 | .BR seccomp_init (3), |
849 | .BR seccomp_load (3), | |
850 | .BR seccomp_rule_add (3), | |
851 | and | |
852 | .BR seccomp_export_bpf (3). | |
51f5698d | 853 | .PP |
e28e21fe MK |
854 | The kernel source files |
855 | .IR Documentation/networking/filter.txt | |
856 | and | |
28d96036 ES |
857 | .IR Documentation/userspace\-api/seccomp_filter.rst |
858 | .\" commit c061f33f35be0ccc80f4b8e0aea5dfd2ed7e01a3 | |
859 | (or | |
860 | .IR Documentation/prctl/seccomp_filter.txt | |
861 | before Linux 4.13). | |
51f5698d | 862 | .PP |
f74fd424 MK |
863 | McCanne, S. and Jacobson, V. (1992) |
864 | .IR "The BSD Packet Filter: A New Architecture for User-level Packet Capture" , | |
865 | Proceedings of the USENIX Winter 1993 Conference | |
5465ae95 | 866 | .UR http://www.tcpdump.org/papers/bpf\-usenix93.pdf |
f74fd424 | 867 | .UE |