]> git.ipfire.org Git - thirdparty/man-pages.git/blame - man2/seccomp.2
proc.5: ffix
[thirdparty/man-pages.git] / man2 / seccomp.2
CommitLineData
e9519f4f
KC
1.\" Copyright (C) 2014 Kees Cook <keescook@chromium.org>
2.\" and Copyright (C) 2012 Will Drewry <wad@chromium.org>
1445a0ff
MK
3.\" and Copyright (C) 2008, 2014,2017 Michael Kerrisk <mtk.manpages@gmail.com>
4.\" and Copyright (C) 2017 Tyler Hicks <tyhicks@canonical.com>
e9519f4f
KC
5.\"
6.\" %%%LICENSE_START(VERBATIM)
7.\" Permission is granted to make and distribute verbatim copies of this
8.\" manual provided the copyright notice and this permission notice are
9.\" preserved on all copies.
10.\"
11.\" Permission is granted to copy and distribute modified versions of this
12.\" manual under the conditions for verbatim copying, provided that the
13.\" entire resulting derived work is distributed under the terms of a
14.\" permission notice identical to this one.
15.\"
16.\" Since the Linux kernel and libraries are constantly changing, this
17.\" manual page may be incorrect or out-of-date. The author(s) assume no
18.\" responsibility for errors or omissions, or for damages resulting from
19.\" the use of the information contained herein. The author(s) may not
20.\" have taken the same level of care in the production of this manual,
21.\" which is licensed free of charge, as they might when working
22.\" professionally.
23.\"
24.\" Formatted or processed versions of this manual, if unaccompanied by
25.\" the source, must acknowledge the copyright and authors of this work.
26.\" %%%LICENSE_END
27.\"
9ba01802 28.TH SECCOMP 2 2019-03-06 "Linux" "Linux Programmer's Manual"
e9519f4f 29.SH NAME
50b4bc23 30seccomp \- operate on Secure Computing state of the process
e9519f4f
KC
31.SH SYNOPSIS
32.nf
33.B #include <linux/seccomp.h>
34.B #include <linux/filter.h>
35.B #include <linux/audit.h>
36.B #include <linux/signal.h>
37.B #include <sys/ptrace.h>
cbffd6ae
MK
38.\" Kees Cook noted: Anything that uses SECCOMP_RET_TRACE returns will
39.\" need <sys/ptrace.h>
dbfe9c70 40.PP
50b4bc23
MK
41.BI "int seccomp(unsigned int " operation ", unsigned int " flags \
42", void *" args );
e9519f4f
KC
43.fi
44.SH DESCRIPTION
45The
46.BR seccomp ()
47system call operates on the Secure Computing (seccomp) state of the
50b4bc23 48calling process.
efeece04 49.PP
e9519f4f
KC
50Currently, Linux supports the following
51.IR operation
52values:
53.TP
54.BR SECCOMP_SET_MODE_STRICT
cbffd6ae 55The only system calls that the calling thread is permitted to make are
e9519f4f
KC
56.BR read (2),
57.BR write (2),
85fbef74
MK
58.BR _exit (2)
59(but not
60.BR exit_group (2)),
e9519f4f
KC
61and
62.BR sigreturn (2).
63Other system calls result in the delivery of a
64.BR SIGKILL
cbffd6ae 65signal.
87aa1b2c 66Strict secure computing mode is useful for number-crunching
e9519f4f
KC
67applications that may need to execute untrusted byte code, perhaps
68obtained by reading from a pipe or socket.
efeece04 69.IP
bec56ee2
JH
70Note that although the calling thread can no longer call
71.BR sigprocmask (2),
72it can use
73.BR sigreturn (2)
74to block all signals apart from
75.BR SIGKILL
76and
77.BR SIGSTOP .
65be1b46 78This means that
bec56ee2 79.BR alarm (2)
65be1b46
MK
80(for example) is not sufficient for restricting the process's execution time.
81Instead, to reliably terminate the process,
82.BR SIGKILL
83must be used.
84This can be done by using
bec56ee2
JH
85.BR timer_create (2)
86with
87.BR SIGEV_SIGNAL
88and
65be1b46 89.IR sigev_signo
bec56ee2 90set to
65be1b46
MK
91.BR SIGKILL ,
92or by using
bec56ee2
JH
93.BR setrlimit (2)
94to set the hard limit for
95.BR RLIMIT_CPU .
efeece04 96.IP
e9519f4f
KC
97This operation is available only if the kernel is configured with
98.BR CONFIG_SECCOMP
99enabled.
efeece04 100.IP
e9519f4f
KC
101The value of
102.IR flags
103must be 0, and
104.IR args
105must be NULL.
efeece04 106.IP
50b4bc23 107This operation is functionally identical to the call:
efeece04 108.IP
f36293a2
MK
109.in +4n
110.EX
111prctl(PR_SET_SECCOMP, SECCOMP_MODE_STRICT);
112.EE
113.in
e9519f4f
KC
114.TP
115.BR SECCOMP_SET_MODE_FILTER
116The system calls allowed are defined by a pointer to a Berkeley Packet
117Filter (BPF) passed via
118.IR args .
cbffd6ae 119This argument is a pointer to a
e9519f4f
KC
120.IR "struct\ sock_fprog" ;
121it can be designed to filter arbitrary system calls and system call
50b4bc23 122arguments.
87aa1b2c
MK
123If the filter is invalid,
124.BR seccomp ()
125fails, returning
cbffd6ae 126.BR EINVAL
e9519f4f
KC
127in
128.IR errno .
efeece04 129.IP
e9519f4f 130If
2d5fb4bb 131.BR fork (2)
e9519f4f 132or
2d5fb4bb
MK
133.BR clone (2)
134is allowed by the filter, any child processes will be constrained to
cbffd6ae 135the same system call filters as the parent.
2d5fb4bb
MK
136If
137.BR execve (2)
cbffd6ae
MK
138is allowed,
139the existing filters will be preserved across a call to
2d5fb4bb 140.BR execve (2).
efeece04 141.IP
955efdbc
MK
142In order to use the
143.BR SECCOMP_SET_MODE_FILTER
400388b5 144operation, either the calling thread must have the
e9519f4f 145.BR CAP_SYS_ADMIN
be8f1272 146capability in its user namespace, or the thread must already have the
cbffd6ae
MK
147.I no_new_privs
148bit set.
149If that bit was not already set by an ancestor of this thread,
150the thread must make the following call:
efeece04 151.IP
f36293a2
MK
152.in +4n
153.EX
154prctl(PR_SET_NO_NEW_PRIVS, 1);
155.EE
156.in
efeece04 157.IP
955efdbc
MK
158Otherwise, the
159.BR SECCOMP_SET_MODE_FILTER
26cd31fd 160operation fails and returns
e9519f4f
KC
161.BR EACCES
162in
163.IR errno .
b8ffefd7
MK
164This requirement ensures that an unprivileged process cannot apply
165a malicious filter and then invoke a set-user-ID or
166other privileged program using
167.BR execve (2),
cbffd6ae
MK
168thus potentially compromising that program.
169(Such a malicious filter might, for example, cause an attempt to use
170.BR setuid (2)
58440555 171to set the caller's user IDs to nonzero values to instead
cbffd6ae
MK
172return 0 without actually making the system call.
173Thus, the program might be tricked into retaining superuser privileges
174in circumstances where it is possible to influence it to do
175dangerous things because it did not actually drop privileges.)
efeece04 176.IP
7d596c7b 177If
e9519f4f
KC
178.BR prctl (2)
179or
bf7bc8b8 180.BR seccomp ()
7d596c7b 181is allowed by the attached filter, further filters may be added.
e01ca3aa 182This will increase evaluation time, but allows for further reduction of
cbffd6ae 183the attack surface during execution of a thread.
efeece04 184.IP
50b4bc23
MK
185The
186.BR SECCOMP_SET_MODE_FILTER
187operation is available only if the kernel is configured with
e9519f4f
KC
188.BR CONFIG_SECCOMP_FILTER
189enabled.
efeece04 190.IP
e9519f4f
KC
191When
192.IR flags
50b4bc23 193is 0, this operation is functionally identical to the call:
efeece04 194.IP
f36293a2
MK
195.in +4n
196.EX
197prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, args);
198.EE
199.in
efeece04 200.IP
e9519f4f
KC
201The recognized
202.IR flags
203are:
204.RS
205.TP
206.BR SECCOMP_FILTER_FLAG_TSYNC
87aa1b2c 207When adding a new filter, synchronize all other threads of the calling
50b4bc23 208process to the same seccomp filter tree.
b8ffefd7
MK
209A "filter tree" is the ordered list of filters attached to a thread.
210(Attaching identical filters in separate
211.BR seccomp ()
212calls results in different filters from this perspective.)
efeece04 213.IP
b8ffefd7 214If any thread cannot synchronize to the same filter tree,
50b4bc23
MK
215the call will not attach the new seccomp filter,
216and will fail, returning the first thread ID found that cannot synchronize.
cbffd6ae 217Synchronization will fail if another thread in the same process is in
e9519f4f 218.BR SECCOMP_MODE_STRICT
50b4bc23
MK
219or if it has attached new seccomp filters to itself,
220diverging from the calling thread's filter tree.
17c56ad0
TH
221.TP
222.BR SECCOMP_FILTER_FLAG_LOG " (since Linux 4.14)"
223.\" commit e66a39977985b1e69e17c4042cb290768eca9b02
224All filter return actions except
225.BR SECCOMP_RET_ALLOW
1e94218c
MK
226should be logged.
227An administrator may override this filter flag by preventing specific
228actions from being logged via the
17c56ad0
TH
229.IR /proc/sys/kernel/seccomp/actions_logged
230file.
906e981b
MK
231.TP
232.BR SECCOMP_FILTER_FLAG_SPEC_ALLOW " (since Linux 4.17)"
233.\" commit 00a02d0c502a06d15e07b857f8ff921e3e402675
234Disable Speculative Store Bypass mitigation.
e9519f4f 235.RE
6d1728da
TH
236.TP
237.BR SECCOMP_GET_ACTION_AVAIL " (since Linux 4.14)"
238.\" commit d612b1fd8010d0d67b5287fe146b8b55bcbb8655
1e94218c
MK
239Test to see if an action is supported by the kernel.
240This operation is helpful to confirm that the kernel knows
241of a more recently added filter return action
242since the kernel treats all unknown actions as
0771269c 243.BR SECCOMP_RET_KILL_PROCESS .
6d1728da
TH
244.IP
245The value of
246.IR flags
247must be 0, and
248.IR args
249must be a pointer to an unsigned 32-bit filter return action.
93b9a9ee 250.SS Filters
e9519f4f
KC
251When adding filters via
252.BR SECCOMP_SET_MODE_FILTER ,
253.IR args
254points to a filter program:
efeece04 255.PP
e9519f4f 256.in +4n
b8302363 257.EX
e9519f4f 258struct sock_fprog {
1da13d15
MK
259 unsigned short len; /* Number of BPF instructions */
260 struct sock_filter *filter; /* Pointer to array of
261 BPF instructions */
e9519f4f 262};
b8302363 263.EE
e9519f4f 264.in
efeece04 265.PP
e9519f4f 266Each program must contain one or more BPF instructions:
efeece04 267.PP
e9519f4f 268.in +4n
b8302363 269.EX
1da13d15
MK
270struct sock_filter { /* Filter block */
271 __u16 code; /* Actual filter code */
272 __u8 jt; /* Jump true */
273 __u8 jf; /* Jump false */
274 __u32 k; /* Generic multiuse field */
e9519f4f 275};
b8302363 276.EE
e9519f4f 277.in
efeece04 278.PP
0c2e01b7
MK
279When executing the instructions, the BPF program operates on the
280system call information made available (i.e., use the
281.BR BPF_ABS
1a7e5113
MK
282addressing mode) as a (read-only)
283.\" Quoting Kees Cook:
284.\" If BPF even allows changing the data, it's not copied back to
285.\" the syscall when it runs. Anything wanting to do things like
3775c1ca 286.\" that would need to use ptrace to catch the call and directly
1a7e5113
MK
287.\" modify the registers before continuing with the call.
288buffer of the following form:
efeece04 289.PP
e9519f4f 290.in +4n
b8302363 291.EX
e9519f4f 292struct seccomp_data {
5b95c4ab 293 int nr; /* System call number */
a0a0c98b
MK
294 __u32 arch; /* AUDIT_ARCH_* value
295 (see <linux/audit.h>) */
e9519f4f 296 __u64 instruction_pointer; /* CPU instruction pointer */
1da13d15 297 __u64 args[6]; /* Up to 6 system call arguments */
e9519f4f 298};
b8302363 299.EE
e9519f4f 300.in
efeece04 301.PP
944a31e2 302Because numbering of system calls varies between architectures and
65cfc712 303some architectures (e.g., x86-64) allow user-space code to use
036b0619
MK
304the calling conventions of multiple architectures
305(and the convention being used may vary over the life of a process that uses
306.BR execve (2)
307to execute binaries that employ the different conventions),
308it is usually necessary to verify the value of the
b44088b4
JH
309.IR arch
310field.
efeece04 311.PP
b44088b4
JH
312It is strongly recommended to use a whitelisting approach whenever
313possible because such an approach is more robust and simple.
314A blacklist will have to be updated whenever a potentially
37daa840 315dangerous system call is added (or a dangerous flag or option if those
b44088b4
JH
316are blacklisted), and it is often possible to alter the
317representation of a value without altering its meaning, leading to
318a blacklist bypass.
5069617c
MK
319See also
320.IR Caveats
321below.
efeece04 322.PP
b44088b4
JH
323The
324.IR arch
37daa840 325field is not unique for all calling conventions.
65cfc712 326The x86-64 ABI and the x32 ABI both use
b44088b4
JH
327.BR AUDIT_ARCH_X86_64
328as
329.IR arch ,
37daa840
MK
330and they run on the same processors.
331Instead, the mask
b44088b4
JH
332.BR __X32_SYSCALL_BIT
333is used on the system call number to tell the two ABIs apart.
8cc4d071 334.\" As noted by Dave Drysdale in a note at the end of
3c5ab770
MK
335.\" https://lwn.net/Articles/604515/
336.\" One additional detail to point out for the x32 ABI case:
337.\" the syscall number gets a high bit set (__X32_SYSCALL_BIT),
338.\" to mark it as an x32 call.
339.\"
340.\" If x32 support is included in the kernel, then __SYSCALL_MASK
341.\" will have a value that is not all-ones, and this will trigger
342.\" an extra instruction in system_call to mask off the extra bit,
8cc4d071 343.\" so that the syscall table indexing still works.
efeece04 344.PP
b44088b4 345This means that in order to create a seccomp-based
65cfc712 346blacklist for system calls performed through the x86-64 ABI,
b44088b4
JH
347it is necessary to not only check that
348.IR arch
349equals
350.BR AUDIT_ARCH_X86_64 ,
7a79bb30 351but also to explicitly reject all system calls that contain
b44088b4
JH
352.BR __X32_SYSCALL_BIT
353in
354.IR nr .
efeece04 355.PP
712551ea
MK
356The
357.I instruction_pointer
358field provides the address of the machine-language instruction that
359performed the system call.
360This might be useful in conjunction with the use of
361.I /proc/[pid]/maps
362to perform checks based on which region (mapping) of the program
363made the system call.
364(Probably, it is wise to lock down the
365.BR mmap (2)
366and
367.BR mprotect (2)
368system calls to prevent the program from subverting such checks.)
efeece04 369.PP
b44088b4
JH
370When checking values from
371.IR args
372against a blacklist, keep in mind that arguments are often
37daa840
MK
373silently truncated before being processed, but after the seccomp check.
374For example, this happens if the i386 ABI is used on an
3c5ab770 375x86-64 kernel: although the kernel will normally not look beyond
b44088b4 376the 32 lowest bits of the arguments, the values of the full
37daa840 37764-bit registers will be present in the seccomp data.
65cfc712 378A less surprising example is that if the x86-64 ABI is used to perform
37daa840
MK
379a system call that takes an argument of type
380.IR int ,
381the more-significant half of the argument register is ignored by
382the system call, but visible in the seccomp data.
efeece04 383.PP
cbffd6ae
MK
384A seccomp filter returns a 32-bit value consisting of two parts:
385the most significant 16 bits
386(corresponding to the mask defined by the constant
0c438780 387.BR SECCOMP_RET_ACTION_FULL )
cbffd6ae
MK
388contain one of the "action" values listed below;
389the least significant 16 bits (defined by the constant
390.BR SECCOMP_RET_DATA )
391are "data" to be associated with this return value.
efeece04 392.PP
755a656a
MK
393If multiple filters exist, they are \fIall\fP executed,
394in reverse order of their addition to the filter tree\(emthat is,
395the most recently installed filter is executed first.
d74503a5
MK
396(Note that all filters will be called
397even if one of the earlier filters returns
398.BR SECCOMP_RET_KILL .
399This is done to simplify the kernel code and to provide a
400tiny speed-up in the execution of sets of filters by
401avoiding a check for this uncommon case.)
cb453c80
MK
402.\" From an Aug 2015 conversation with Kees Cook where I asked why *all*
403.\" filters are applied even if one of the early filters returns
404.\" SECCOMP_RET_KILL:
d74503a5
MK
405.\"
406.\" It's just because it would be an optimization that would only speed up
407.\" the RET_KILL case, but it's the uncommon one and the one that doesn't
408.\" benefit meaningfully from such a change (you need to kill the process
409.\" really quickly?). We would speed up killing a program at the (albeit
410.\" tiny) expense to all other filtered programs. Best to keep the filter
411.\" execution logic clear, simple, and as fast as possible for all
412.\" filters.
cbffd6ae 413The return value for the evaluation of a given system call is the first-seen
1d530819 414action value of highest precedence (along with its accompanying data)
cbffd6ae 415returned by execution of all of the filters.
efeece04 416.PP
cbffd6ae 417In decreasing order of precedence,
1d530819 418the action values that may be returned by a seccomp filter are:
e9519f4f 419.TP
51c58a6c
MK
420.BR SECCOMP_RET_KILL_PROCESS " (since Linux 4.14)"
421.\" commit 4d3b0b05aae9ee9ce0970dc4cc0fb3fad5e85945
422.\" commit 0466bdb99e8744bc9befa8d62a317f0fd7fd7421
423This value results in immediate termination of the process,
424with a core dump.
425The system call is not executed.
426By contrast with
427.BR SECCOMP_RET_KILL_THREAD
428below, all threads in the thread group are terminated.
429(For a discussion of thread groups, see the description of the
430.BR CLONE_THREAD
431flag in
432.BR clone (2).)
433.IP
434The process terminates
435.I "as though"
436killed by a
437.B SIGSYS
438signal.
439Even if a signal handler has been registered for
440.BR SIGSYS ,
441the handler will be ignored in this case and the process always terminates.
442To a parent process that is waiting on this process (using
443.BR waitpid (2)
444or similar), the returned
445.I wstatus
446will indicate that its child was terminated as though by a
447.BR SIGSYS
448signal.
449.TP
6aa0baa4 450.BR SECCOMP_RET_KILL_THREAD " (or " SECCOMP_RET_KILL )
5cfa0627
MK
451This value results in immediate termination of the thread
452that made the system call.
453The system call is not executed.
454Other threads in the same thread group will continue to execute.
455.IP
b9c6742b
MK
456The thread terminates
457.I "as though"
458killed by a
0a2a0df0 459.B SIGSYS
5cfa0627 460signal.
b9c6742b
MK
461See
462.BR SECCOMP_RET_KILL_PROCESS
463above.
efeece04 464.IP
089761d5 465.\" See these commits:
6aa0baa4
MK
466.\" seccomp: dump core when using SECCOMP_RET_KILL
467.\" (b25e67161c295c98acda92123b2dd1e7d8642901)
468.\" seccomp: Only dump core when single-threaded
469.\" (d7276e321ff8a53106a59c85ca46d03e34288893)
baaf65e8 470Before Linux 4.11,
6aa0baa4 471any process terminated in this way would not trigger a core dump
089761d5
MF
472(even though
473.B SIGSYS
474is documented in
475.BR signal (7)
baaf65e8
MK
476as having a default action of termination with a core dump).
477Since Linux 4.11,
6aa0baa4
MK
478a single-threaded process will dump core if terminated in this way.
479.IP
480With the addition of
481.BR SECCOMP_RET_KILL_PROCESS
482in Linux 4.14,
483.BR SECCOMP_RET_KILL_THREAD
484was added as a synonym for
485.BR SECCOMP_RET_KILL ,
486in order to more clearly distinguish the two actions.
e9519f4f
KC
487.TP
488.BR SECCOMP_RET_TRAP
b94c5c5a 489This value results in the kernel sending a thread-directed
e9519f4f 490.BR SIGSYS
b94c5c5a
MK
491signal to the triggering thread.
492(The system call is not executed.)
cbffd6ae
MK
493Various fields will be set in the
494.I siginfo_t
db99c064
MK
495structure (see
496.BR sigaction (2))
497associated with signal:
cf690e13
MK
498.RS
499.IP * 3
500.I si_signo
501will contain
502.BR SIGSYS .
503.IP *
db99c064 504.IR si_call_addr
cf690e13
MK
505will show the address of the system call instruction.
506.IP *
db99c064 507.IR si_syscall
e9519f4f 508and
db99c064 509.IR si_arch
87aa1b2c 510will indicate which system call was attempted.
cf690e13
MK
511.IP *
512.I si_code
cf690e13
MK
513will contain
514.BR SYS_SECCOMP .
515.IP *
516.I si_errno
517will contain the
518.BR SECCOMP_RET_DATA
519portion of the filter return value.
520.RE
521.IP
50b4bc23 522The program counter will be as though the system call happened
6dfb150c 523(i.e., the program counter will not point to the system call instruction).
50b4bc23 524The return value register will contain an architecture\-dependent value;
06865301 525if resuming execution, set it to something appropriate for the system call.
a79566fb 526(The architecture dependency is because replacing it with
e9519f4f
KC
527.BR ENOSYS
528could overwrite some useful information.)
e9519f4f
KC
529.TP
530.BR SECCOMP_RET_ERRNO
610082d9 531This value results in the
cbffd6ae
MK
532.B SECCOMP_RET_DATA
533portion of the filter's return value being passed to user space as the
e9519f4f 534.IR errno
cbffd6ae 535value without executing the system call.
e9519f4f
KC
536.TP
537.BR SECCOMP_RET_TRACE
87aa1b2c
MK
538When returned, this value will cause the kernel to attempt to notify a
539.BR ptrace (2)-based
540tracer prior to executing the system call.
50b4bc23 541If there is no tracer present,
3fbe3aa3
MK
542the system call is not executed and returns a failure status with
543.I errno
544set to
545.BR ENOSYS .
efeece04 546.IP
e9519f4f
KC
547A tracer will be notified if it requests
548.BR PTRACE_O_TRACESECCOMP
549using
550.IR ptrace(PTRACE_SETOPTIONS) .
551The tracer will be notified of a
552.BR PTRACE_EVENT_SECCOMP
553and the
554.BR SECCOMP_RET_DATA
cbffd6ae 555portion of the filter's return value will be available to the tracer via
e9519f4f 556.BR PTRACE_GETEVENTMSG .
efeece04 557.IP
50b4bc23
MK
558The tracer can skip the system call by changing the system call number
559to \-1.
560Alternatively, the tracer can change the system call
87aa1b2c 561requested by changing the system call to a valid system call number.
50b4bc23
MK
562If the tracer asks to skip the system call, then the system call will
563appear to return the value that the tracer puts in the return value register.
efeece04 564.IP
09481df3
JH
565.\" This was changed in ce6526e8afa4.
566.\" A related hole, using PTRACE_SYSCALL instead of SECCOMP_RET_TRACE, was
567.\" changed in arch-specific commits, e.g. 93e35efb8de4 for X86 and
568.\" 0f3912fd934c for ARM.
569Before kernel 4.8, the seccomp check will not be run again after the tracer is
570notified.
571(This means that, on older kernels, seccomp-based sandboxes
50b4bc23 572.B "must not"
87aa1b2c
MK
573allow use of
574.BR ptrace (2)\(emeven
575of other
50b4bc23 576sandboxed processes\(emwithout extreme care;
06865301 577ptracers can use this mechanism to escape from the seccomp sandbox.)
e9519f4f 578.TP
f04207f4
TH
579.BR SECCOMP_RET_LOG " (since Linux 4.14)"
580.\" commit 59f5cf44a38284eb9e76270c786fb6cc62ef8ac4
1e94218c
MK
581This value results in the system call being executed after
582the filter return action is logged.
583An administrator may override the logging of this action via
f04207f4
TH
584the
585.IR /proc/sys/kernel/seccomp/actions_logged
586file.
587.TP
e9519f4f 588.BR SECCOMP_RET_ALLOW
610082d9 589This value results in the system call being executed.
0771269c
MK
590.PP
591If an action value other than one of the above is specified,
592then the filter action is treated as either
593.BR SECCOMP_RET_KILL_PROCESS
594(since Linux 4.14)
595.\" commit 4d3b0b05aae9ee9ce0970dc4cc0fb3fad5e85945
596or
597.BR SECCOMP_RET_KILL_THREAD
598(in Linux 4.13 and earlier).
599.\"
2577dbba
TH
600.SS /proc interfaces
601The files in the directory
602.IR /proc/sys/kernel/seccomp
603provide additional seccomp information and configuration:
604.TP
605.IR actions_avail " (since Linux 4.14)"
606.\" commit 8e5f1ad116df6b0de65eac458d5e7c318d1c05af
1e94218c
MK
607A read-only ordered list of seccomp filter return actions in string form.
608The ordering, from left to right, is in decreasing order of precedence.
609The list represents the set of seccomp filter return actions
610supported by the kernel.
2577dbba
TH
611.TP
612.IR actions_logged " (since Linux 4.14)"
613.\" commit 0ddec0fc8900201c0897b87b762b7c420436662f
1e94218c
MK
614A read-write ordered list of seccomp filter return actions that
615are allowed to be logged.
616Writes to the file do not need to be in ordered form but reads from
2577dbba
TH
617the file will be ordered in the same way as the
618.IR actions_avail
619file.
620.IP
621It is important to note that the value of
622.IR actions_logged
1e94218c
MK
623does not prevent certain filter return actions from being logged when
624the audit subsystem is configured to audit a task.
625If the action is not found in the
2577dbba
TH
626.IR actions_logged
627file, the final decision on whether to audit the action for that task is
628ultimately left up to the audit subsystem to decide for all filter return
629actions other than
630.BR SECCOMP_RET_ALLOW .
631.IP
632The "allow" string is not accepted in the
633.IR actions_logged
634file as it is not possible to log
635.BR SECCOMP_RET_ALLOW
1e94218c 636actions.
865c9c81
MK
637Attempting to write "allow" to the file will fail with the error
638.BR EINVAL .
df5b5f9a
MK
639.\"
640.SS Audit logging of seccomp actions
641.\" commit 59f5cf44a38284eb9e76270c786fb6cc62ef8ac4
642Since Linux 4.14, the kernel provides the facility to log the
643actions returned by seccomp filters in the audit log.
644The kernel makes the decision to log an action based on
645the action type, whether or not the action is present in the
646.I actions_logged
647file, and whether kernel auditing is enabled
648(e.g., via the kernel boot option
649.IR audit=1 ).
650.\" or auditing could be enabled via the netlink API (AUDIT_SET)
651The rules are as follows:
652.IP * 3
653If the action is
654.BR SECCOMP_RET_ALLOW ,
655the action is not logged.
656.IP *
657Otherwise, if the action is either
658.BR SECCOMP_RET_KILL_PROCESS
659or
660.BR SECCOMP_RET_KILL_THREAD ,
c6688cd1 661and that action appears in the
df5b5f9a
MK
662.IR actions_logged
663file, the action is logged.
664.IP *
665Otherwise, if the filter has requested logging (the
666.BR SECCOMP_FILTER_FLAG_LOG
667flag)
c6688cd1 668and the action appears in the
df5b5f9a
MK
669.IR actions_logged
670file, the action is logged.
671.IP *
672Otherwise, if kernel auditing is enabled and the process is being audited
673.RB ( autrace (8)),
674the action is logged.
675.IP *
676Otherwise, the action is not logged.
e9519f4f
KC
677.SH RETURN VALUE
678On success,
679.BR seccomp ()
680returns 0.
681On error, if
682.BR SECCOMP_FILTER_FLAG_TSYNC
50b4bc23 683was used,
880bc4eb
MK
684the return value is the ID of the thread
685that caused the synchronization failure.
610082d9
MK
686(This ID is a kernel thread ID of the type returned by
687.BR clone (2)
688and
0e27eb0c 689.BR gettid (2).)
50b4bc23 690On other errors, \-1 is returned, and
e9519f4f
KC
691.IR errno
692is set to indicate the cause of the error.
693.SH ERRORS
694.BR seccomp ()
695can fail for the following reasons:
696.TP
7b10f505 697.BR EACCES
50b4bc23 698The caller did not have the
e9519f4f 699.BR CAP_SYS_ADMIN
be8f1272 700capability in its user namespace, or had not set
e9519f4f
KC
701.IR no_new_privs
702before using
703.BR SECCOMP_SET_MODE_FILTER .
704.TP
705.BR EFAULT
706.IR args
43b265ff 707was not a valid address.
e9519f4f
KC
708.TP
709.BR EINVAL
710.IR operation
1f5ad3c8
MK
711is unknown or is not supported by this kernel version or configuration.
712.TP
713.B EINVAL
714The specified
e9519f4f
KC
715.IR flags
716are invalid for the given
043ed1c6 717.IR operation .
29efefec
MK
718.TP
719.BR EINVAL
882cf566
MK
720The filter program
721included
722.BR BPF_ABS ,
723but the specified offset was not aligned to a 32-bit boundary or exceeded
724.IR "sizeof(struct\ seccomp_data)" .
882cf566
MK
725.TP
726.BR EINVAL
29efefec
MK
727.\" See kernel/seccomp.c::seccomp_may_assign_mode() in 3.18 sources
728A secure computing mode has already been set, and
729.I operation
730differs from the existing setting.
29efefec
MK
731.TP
732.BR EINVAL
29efefec
MK
733.I operation
734specified
735.BR SECCOMP_SET_MODE_FILTER ,
736but the filter program pointed to by
737.I args
738was not valid or the length of the filter program was zero or exceeded
739.B BPF_MAXINSNS
740(4096) instructions.
29efefec
MK
741.TP
742.BR ENOMEM
743Out of memory.
29efefec
MK
744.TP
745.BR ENOMEM
746.\" ENOMEM in kernel/seccomp.c::seccomp_attach_filter() in 3.18 sources
747The total length of all filter programs attached
748to the calling thread would exceed
749.B MAX_INSNS_PER_PATH
750(32768) instructions.
751Note that for the purposes of calculating this limit,
ce67ca23
MK
752each already existing filter program incurs an
753overhead penalty of 4 instructions.
e9519f4f 754.TP
6d1728da
TH
755.BR EOPNOTSUPP
756.I operation
757specified
758.BR SECCOMP_GET_ACTION_AVAIL ,
759but the kernel does not support the filter return action specified by
760.IR args .
761.TP
e9519f4f
KC
762.BR ESRCH
763Another thread caused a failure during thread sync, but its ID could not
764be determined.
765.SH VERSIONS
87aa1b2c 766The
dff53954 767.BR seccomp ()
87aa1b2c 768system call first appeared in Linux 3.17.
cbffd6ae 769.\" FIXME . Add glibc version
e9519f4f 770.SH CONFORMING TO
87aa1b2c 771The
dff53954 772.BR seccomp ()
87aa1b2c 773system call is a nonstandard Linux extension.
e9519f4f 774.SH NOTES
aea38298
MK
775Rather than hand-coding seccomp filters as shown in the example below,
776you may prefer to employ the
777.I libseccomp
778library, which provides a front-end for generating seccomp filters.
efeece04 779.PP
1367a60a
MK
780The
781.IR Seccomp
782field of the
783.IR /proc/[pid]/status
784file provides a method of viewing the seccomp mode of a process; see
785.BR proc (5).
efeece04 786.PP
e9519f4f 787.BR seccomp ()
50b4bc23
MK
788provides a superset of the functionality provided by the
789.BR prctl (2)
b95b411b 790.BR PR_SET_SECCOMP
50b4bc23
MK
791operation (which does not support
792.IR flags ).
efeece04 793.PP
88f02af6 794Since Linux 4.4, the
d0eae5b5 795.BR ptrace (2)
88f02af6
MK
796.B PTRACE_SECCOMP_GET_FILTER
797operation can be used to dump a process's seccomp filters.
798.\"
30c5868e
MK
799.SS Architecture support for seccomp BPF
800Architecture support for seccomp BPF filtering
801.\" Check by grepping for HAVE_ARCH_SECCOMP_FILTER in Kconfig files in
802.\" kernel source. Last checked in Linux 4.16-rc source.
803is available on the following architectures:
804.IP * 3
805x86-64, i386, x32 (since Linux 3.5)
806.PD 0
807.IP *
808ARM (since Linux 3.8)
809.IP *
810s390 (since Linux 3.8)
811.IP *
812MIPS (since Linux 3.16)
813.IP *
814ARM-64 (since Linux 3.19)
815.IP *
816PowerPC (since Linux 4.3)
817.IP *
818Tile (since Linux 4.3)
819.IP *
820PA-RISC (since Linux 4.6)
821.\" User mode Linux since Linux 4.6
822.PD
823.\"
5069617c
MK
824.SS Caveats
825There are various subtleties to consider when applying seccomp filters
826to a program, including the following:
827.IP * 3
828Some traditional system calls have user-space implementations in the
829.BR vdso (7)
830on many architectures.
831Notable examples include
832.BR clock_gettime (2),
833.BR gettimeofday (2),
834and
835.BR time (2).
836On such architectures,
837seccomp filtering for these system calls will have no effect.
42cfb355
MK
838(However, there are cases where the
839.BR vdso (7)
840implementations may fall back to invoking the true system call,
841in which case seccomp filters would see the system call.)
5069617c
MK
842.IP *
843Seccomp filtering is based on system call numbers.
844However, applications typically do not directly invoke system calls,
845but instead call wrapper functions in the C library which
846in turn invoke the system calls.
847Consequently, one must be aware of the following:
848.RS
849.IP \(bu 3
850The glibc wrappers for some traditional system calls may actually
851employ system calls with different names in the kernel.
852For example, the
853.BR exit (2)
854wrapper function actually employs the
855.BR exit_group (2)
856system call, and the
857.BR fork (2)
858wrapper function actually calls
859.BR clone (2).
860.IP \(bu
861The behavior of wrapper functions may vary across architectures,
862according to the range of system calls provided on those architectures.
863In other words, the same wrapper function may invoke
864different system calls on different architectures.
865.IP \(bu
866Finally, the behavior of wrapper functions can change across glibc versions.
867For example, in older versions, the glibc wrapper function for
868.BR open (2)
869invoked the system call of the same name,
870but starting in glibc 2.26, the implementation switched to calling
871.BR openat (2)
872on all architectures.
873.RE
874.PP
faec2136 875The consequence of the above points is that it may be necessary
5069617c
MK
876to filter for a system call other than might be expected.
877Various manual pages in Section 2 provide helpful details
878about the differences between wrapper functions and
879the underlying system calls in subsections entitled
880.IR "C library/kernel differences" .
881.PP
882Furthermore, note that the application of seccomp filters
883even risks causing bugs in an application,
884when the filters cause unexpected failures for legitimate operations
885that the application might need to perform.
886Such bugs may not easily be discovered when testing the seccomp
887filters if the bugs occur in rarely used application code paths.
5069617c 888.\"
cecc8c48
MK
889.SS Seccomp-specific BPF details
890Note the following BPF details specific to seccomp filters:
891.IP * 3
892The
893.B BPF_H
894and
895.B BPF_B
896size modifiers are not supported: all operations must load and store
897(4-byte) words
898.RB ( BPF_W ).
899.IP *
900To access the contents of the
901.I seccomp_data
902buffer, use the
903.B BPF_ABS
904addressing mode modifier.
cecc8c48
MK
905.IP *
906The
907.B BPF_LEN
908addressing mode modifier yields an immediate mode operand
909whose value is the size of the
910.IR seccomp_data
911buffer.
e9519f4f 912.SH EXAMPLE
35e4506b
MK
913The program below accepts four or more arguments.
914The first three arguments are a system call number,
915a numeric architecture identifier, and an error number.
916The program uses these values to construct a BPF filter
917that is used at run time to perform the following checks:
918.IP [1] 4
919If the program is not running on the specified architecture,
920the BPF filter causes the task to be killed when it makes a system call
921.RB ( SECCOMP_RET_KILL ).
922.IP [2]
923If the program attempts to execute the system call with the specified number,
924the BPF filter causes the system call to fail, with
925.I errno
926being set to the specified error number.
927.PP
928The remaining command-line arguments specify
929the pathname and additional arguments of a program
930that the example program should attempt to execute using
71bb61ec 931.BR execv (3)
35e4506b
MK
932(a library function that employs the
933.BR execve (2)
934system call).
935Some example runs of the program are shown below.
efeece04 936.PP
35e4506b
MK
937First, we display the architecture that we are running on (x86-64)
938and then construct a shell function that looks up system call
939numbers on this architecture:
efeece04 940.PP
35e4506b 941.in +4n
b8302363 942.EX
35e4506b
MK
943$ \fBuname -m\fP
944x86_64
945$ \fBsyscall_nr() {
d1a71985 946 cat /usr/src/linux/arch/x86/syscalls/syscall_64.tbl | \e
06865301 947 awk '$2 != "x32" && $3 == "'$1'" { print $1 }'
35e4506b 948}\fP
b8302363 949.EE
e646a1ba 950.in
efeece04 951.PP
35e4506b
MK
952When the BPF filter rejects a system call (case [2] above),
953it causes the system call to fail with the error number
954specified on the command line.
955In the experiments shown here, we'll use error number 99:
efeece04 956.PP
35e4506b 957.in +4n
b8302363 958.EX
35e4506b
MK
959$ \fBerrno 99\fP
960EADDRNOTAVAIL 99 Cannot assign requested address
b8302363 961.EE
e646a1ba 962.in
efeece04 963.PP
35e4506b
MK
964In the following example, we attempt to run the command
965.BR whoami (1),
966but the BPF filter rejects the
967.BR execve (2)
968system call, so that the command is not even executed:
efeece04 969.PP
35e4506b 970.in +4n
b8302363 971.EX
35e4506b 972$ \fBsyscall_nr execve\fP
06865301 97359
cbffd6ae
MK
974$ \fB./a.out\fP
975Usage: ./a.out <syscall_nr> <arch> <errno> <prog> [<args>]
976Hint for <arch>: AUDIT_ARCH_I386: 0x40000003
977 AUDIT_ARCH_X86_64: 0xC000003E
35e4506b
MK
978$ \fB./a.out 59 0xC000003E 99 /bin/whoami\fP
979execv: Cannot assign requested address
b8302363 980.EE
e646a1ba 981.in
efeece04 982.PP
35e4506b
MK
983In the next example, the BPF filter rejects the
984.BR write (2)
985system call, so that, although it is successfully started, the
986.BR whoami (1)
987command is not able to write output:
efeece04 988.PP
35e4506b 989.in +4n
b8302363 990.EX
35e4506b
MK
991$ \fBsyscall_nr write\fP
9921
993$ \fB./a.out 1 0xC000003E 99 /bin/whoami\fP
b8302363 994.EE
e646a1ba 995.in
efeece04 996.PP
35e4506b
MK
997In the final example,
998the BPF filter rejects a system call that is not used by the
999.BR whoami (1)
1000command, so it is able to successfully execute and produce output:
efeece04 1001.PP
35e4506b 1002.in +4n
b8302363 1003.EX
35e4506b
MK
1004$ \fBsyscall_nr preadv\fP
1005295
1006$ \fB./a.out 295 0xC000003E 99 /bin/whoami\fP
1007cecilia
b8302363 1008.EE
e646a1ba 1009.in
35e4506b 1010.SS Program source
e7d0bb47 1011.EX
e9519f4f
KC
#include <errno.h>
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <linux/audit.h>
#include <linux/filter.h>
#include <linux/seccomp.h>
#include <sys/prctl.h>
#include <sys/syscall.h>
1021
b44088b4
JH
#define X32_SYSCALL_BIT 0x40000000

/* Install a seccomp BPF filter for the calling thread.

   syscall_nr: number of the system call that is to be rejected
   t_arch:     AUDIT_ARCH_* value that the 'arch' field of the
               'seccomp_data' buffer must match
   f_errno:    error number to be returned in 'errno' when the
               rejected system call is attempted

   System calls other than 'syscall_nr' are allowed.  If the
   architecture does not match 't_arch' (or, when 't_arch' is
   AUDIT_ARCH_X86_64, the system call number is an x32 one),
   the filter kills the task.

   Returns 0 on success.  On failure, a diagnostic is printed with
   perror() and 1 is returned. */

static int
install_filter(int syscall_nr, int t_arch, int f_errno)
{
    unsigned int upper_nr_limit = 0xffffffff;

    /* Assume that AUDIT_ARCH_X86_64 means the normal x86-64 ABI
       (in the x32 ABI, all system calls have bit 30 set in the
       'nr' field, meaning the numbers are >= X32_SYSCALL_BIT) */
    if (t_arch == AUDIT_ARCH_X86_64)
        upper_nr_limit = X32_SYSCALL_BIT - 1;

    struct sock_filter filter[] = {
        /* [0] Load architecture from 'seccomp_data' buffer into
               accumulator */
        BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
                 (offsetof(struct seccomp_data, arch))),

        /* [1] Jump forward 5 instructions if architecture does not
               match 't_arch' */
        BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, t_arch, 0, 5),

        /* [2] Load system call number from 'seccomp_data' buffer into
               accumulator */
        BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
                 (offsetof(struct seccomp_data, nr))),

        /* [3] Check ABI - only needed for x86-64 in blacklist use
               cases.  Use BPF_JGT instead of checking against the bit
               mask to avoid having to reload the syscall number.
               Jump forward 3 instructions if the number is above
               'upper_nr_limit'. */
        BPF_JUMP(BPF_JMP | BPF_JGT | BPF_K, upper_nr_limit, 3, 0),

        /* [4] Jump forward 1 instruction if system call number
               does not match 'syscall_nr' */
        BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, syscall_nr, 0, 1),

        /* [5] Matching architecture and system call: don't execute
               the system call, and return 'f_errno' in 'errno' */
        BPF_STMT(BPF_RET | BPF_K,
                 SECCOMP_RET_ERRNO | (f_errno & SECCOMP_RET_DATA)),

        /* [6] Destination of system call number mismatch: allow other
               system calls */
        BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),

        /* [7] Destination of architecture mismatch (jump from [1])
               and of the ABI check (jump from [3]): kill task */
        BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_KILL),
    };

    struct sock_fprog prog = {
        .len = (unsigned short) (sizeof(filter) / sizeof(filter[0])),
        .filter = filter,
    };

    /* glibc provides no wrapper function for seccomp(), so the
       system call is invoked via syscall(2) */
    if (syscall(SYS_seccomp, SECCOMP_SET_MODE_FILTER, 0, &prog)) {
        perror("seccomp");
        return 1;
    }

    return 0;
}
1084
d06d25cc
MK
int
main(int argc, char **argv)
{
    /* At least four arguments are required: the system call number,
       the architecture identifier, the error number, and the
       pathname of the program to execute */
    if (argc < 5) {
        fprintf(stderr,
                "Usage: %s <syscall_nr> <arch> <errno> <prog> [<args>]\en"
                "Hint for <arch>: AUDIT_ARCH_I386: 0x%X\en"
                " AUDIT_ARCH_X86_64: 0x%X\en"
                "\en",
                argv[0], AUDIT_ARCH_I386, AUDIT_ARCH_X86_64);
        exit(EXIT_FAILURE);
    }

    /* Set the no_new_privs attribute before installing the filter */
    if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) != 0) {
        perror("prctl");
        exit(EXIT_FAILURE);
    }

    /* Base 0 lets the values be given in decimal, octal, or
       hexadecimal notation */
    long nr_arg = strtol(argv[1], NULL, 0);
    long arch_arg = strtol(argv[2], NULL, 0);
    long errno_arg = strtol(argv[3], NULL, 0);

    if (install_filter(nr_arg, arch_arg, errno_arg) != 0)
        exit(EXIT_FAILURE);

    /* Execute the target program with the filter in place; execv()
       returns only if it fails */
    execv(argv[4], &argv[4]);
    perror("execv");
    exit(EXIT_FAILURE);
}
e7d0bb47 1111.EE
e9519f4f 1112.SH SEE ALSO
2337d75e 1113.BR bpfc (1),
85940258 1114.BR strace (1),
00eaa6aa 1115.BR bpf (2),
e9519f4f
KC
1116.BR prctl (2),
1117.BR ptrace (2),
3b4a59c4 1118.BR sigaction (2),
712551ea 1119.BR proc (5),
e9519f4f
KC
1120.BR signal (7),
1121.BR socket (7)
51f5698d 1122.PP
ca6f43fc
MK
1123Various pages from the
1124.I libseccomp
1125library, including:
ccc70c81 1126.BR scmp_sys_resolver (1),
ca6f43fc
MK
1127.BR seccomp_init (3),
1128.BR seccomp_load (3),
1129.BR seccomp_rule_add (3),
1130and
1131.BR seccomp_export_bpf (3).
51f5698d 1132.PP
e28e21fe
MK
1133The kernel source files
1134.IR Documentation/networking/filter.txt
1135and
28d96036
ES
1136.IR Documentation/userspace\-api/seccomp_filter.rst
1137.\" commit c061f33f35be0ccc80f4b8e0aea5dfd2ed7e01a3
1138(or
1139.IR Documentation/prctl/seccomp_filter.txt
1140before Linux 4.13).
51f5698d 1141.PP
8fb01fde 1142McCanne, S.\& and Jacobson, V.\& (1992)
f74fd424
MK
1143.IR "The BSD Packet Filter: A New Architecture for User-level Packet Capture" ,
1144Proceedings of the USENIX Winter 1993 Conference
5465ae95 1145.UR http://www.tcpdump.org/papers/bpf\-usenix93.pdf
f74fd424 1146.UE