]> git.ipfire.org Git - thirdparty/man-pages.git/blame - man2/seccomp.2
access.2, delete_module.2, eventfd.2, fallocate.2, fcntl.2, getrandom.2, init_module...
[thirdparty/man-pages.git] / man2 / seccomp.2
CommitLineData
e9519f4f
KC
1.\" Copyright (C) 2014 Kees Cook <keescook@chromium.org>
2.\" and Copyright (C) 2012 Will Drewry <wad@chromium.org>
35e4506b 3.\" and Copyright (C) 2008, 2014 Michael Kerrisk <mtk.manpages@gmail.com>
e9519f4f
KC
4.\"
5.\" %%%LICENSE_START(VERBATIM)
6.\" Permission is granted to make and distribute verbatim copies of this
7.\" manual provided the copyright notice and this permission notice are
8.\" preserved on all copies.
9.\"
10.\" Permission is granted to copy and distribute modified versions of this
11.\" manual under the conditions for verbatim copying, provided that the
12.\" entire resulting derived work is distributed under the terms of a
13.\" permission notice identical to this one.
14.\"
15.\" Since the Linux kernel and libraries are constantly changing, this
16.\" manual page may be incorrect or out-of-date. The author(s) assume no
17.\" responsibility for errors or omissions, or for damages resulting from
18.\" the use of the information contained herein. The author(s) may not
19.\" have taken the same level of care in the production of this manual,
20.\" which is licensed free of charge, as they might when working
21.\" professionally.
22.\"
23.\" Formatted or processed versions of this manual, if unaccompanied by
24.\" the source, must acknowledge the copyright and authors of this work.
25.\" %%%LICENSE_END
26.\"
4b8c67d9 27.TH SECCOMP 2 2017-09-15 "Linux" "Linux Programmer's Manual"
e9519f4f 28.SH NAME
50b4bc23 29seccomp \- operate on Secure Computing state of the process
e9519f4f
KC
30.SH SYNOPSIS
31.nf
32.B #include <linux/seccomp.h>
33.B #include <linux/filter.h>
34.B #include <linux/audit.h>
35.B #include <linux/signal.h>
36.B #include <sys/ptrace.h>
cbffd6ae
MK
37.\" Kees Cook noted: Anything that uses SECCOMP_RET_TRACE returns will
38.\" need <sys/ptrace.h>
dbfe9c70 39.PP
50b4bc23
MK
40.BI "int seccomp(unsigned int " operation ", unsigned int " flags \
41", void *" args );
e9519f4f
KC
42.fi
43.SH DESCRIPTION
44The
45.BR seccomp ()
46system call operates on the Secure Computing (seccomp) state of the
50b4bc23 47calling process.
efeece04 48.PP
e9519f4f
KC
49Currently, Linux supports the following
50.IR operation
51values:
52.TP
53.BR SECCOMP_SET_MODE_STRICT
cbffd6ae 54The only system calls that the calling thread is permitted to make are
e9519f4f
KC
55.BR read (2),
56.BR write (2),
85fbef74
MK
57.BR _exit (2)
58(but not
59.BR exit_group (2)),
e9519f4f
KC
60and
61.BR sigreturn (2).
62Other system calls result in the delivery of a
63.BR SIGKILL
cbffd6ae 64signal.
87aa1b2c 65Strict secure computing mode is useful for number-crunching
e9519f4f
KC
66applications that may need to execute untrusted byte code, perhaps
67obtained by reading from a pipe or socket.
efeece04 68.IP
bec56ee2
JH
69Note that although the calling thread can no longer call
70.BR sigprocmask (2),
71it can use
72.BR sigreturn (2)
73to block all signals apart from
74.BR SIGKILL
75and
76.BR SIGSTOP .
65be1b46 77This means that
bec56ee2 78.BR alarm (2)
65be1b46
MK
79(for example) is not sufficient for restricting the process's execution time.
80Instead, to reliably terminate the process,
81.BR SIGKILL
82must be used.
83This can be done by using
bec56ee2
JH
84.BR timer_create (2)
85with
86.BR SIGEV_SIGNAL
87and
65be1b46 88.IR sigev_signo
bec56ee2 89set to
65be1b46
MK
90.BR SIGKILL ,
91or by using
bec56ee2
JH
92.BR setrlimit (2)
93to set the hard limit for
94.BR RLIMIT_CPU .
efeece04 95.IP
e9519f4f
KC
96This operation is available only if the kernel is configured with
97.BR CONFIG_SECCOMP
98enabled.
efeece04 99.IP
e9519f4f
KC
100The value of
101.IR flags
102must be 0, and
103.IR args
104must be NULL.
efeece04 105.IP
50b4bc23 106This operation is functionally identical to the call:
efeece04 107.IP
50b4bc23 108 prctl(PR_SET_SECCOMP, SECCOMP_MODE_STRICT);
e9519f4f
KC
109.TP
110.BR SECCOMP_SET_MODE_FILTER
111The system calls allowed are defined by a pointer to a Berkeley Packet
112Filter (BPF) passed via
113.IR args .
cbffd6ae 114This argument is a pointer to a
e9519f4f
KC
115.IR "struct\ sock_fprog" ;
116it can be designed to filter arbitrary system calls and system call
50b4bc23 117arguments.
87aa1b2c
MK
118If the filter is invalid,
119.BR seccomp ()
120fails, returning
cbffd6ae 121.BR EINVAL
e9519f4f
KC
122in
123.IR errno .
efeece04 124.IP
e9519f4f 125If
2d5fb4bb 126.BR fork (2)
e9519f4f 127or
2d5fb4bb
MK
128.BR clone (2)
129is allowed by the filter, any child processes will be constrained to
cbffd6ae 130the same system call filters as the parent.
2d5fb4bb
MK
131If
132.BR execve (2)
cbffd6ae
MK
133is allowed,
134the existing filters will be preserved across a call to
2d5fb4bb 135.BR execve (2).
efeece04 136.IP
955efdbc
MK
137In order to use the
138.BR SECCOMP_SET_MODE_FILTER
139operation, either the caller must have the
e9519f4f 140.BR CAP_SYS_ADMIN
be8f1272 141capability in its user namespace, or the thread must already have the
cbffd6ae
MK
142.I no_new_privs
143bit set.
144If that bit was not already set by an ancestor of this thread,
145the thread must make the following call:
efeece04 146.IP
955efdbc 147 prctl(PR_SET_NO_NEW_PRIVS, 1);
efeece04 148.IP
955efdbc
MK
149Otherwise, the
150.BR SECCOMP_SET_MODE_FILTER
26cd31fd 151operation fails and returns
e9519f4f
KC
152.BR EACCES
153in
154.IR errno .
b8ffefd7
MK
155This requirement ensures that an unprivileged process cannot apply
156a malicious filter and then invoke a set-user-ID or
157other privileged program using
158.BR execve (2),
cbffd6ae
MK
159thus potentially compromising that program.
160(Such a malicious filter might, for example, cause an attempt to use
161.BR setuid (2)
162to set the caller's user IDs to non-zero values to instead
163return 0 without actually making the system call.
164Thus, the program might be tricked into retaining superuser privileges
165in circumstances where it is possible to influence it to do
166dangerous things because it did not actually drop privileges.)
efeece04 167.IP
7d596c7b 168If
e9519f4f
KC
169.BR prctl (2)
170or
bf7bc8b8 171.BR seccomp ()
7d596c7b 172is allowed by the attached filter, further filters may be added.
e01ca3aa 173This will increase evaluation time, but allows for further reduction of
cbffd6ae 174the attack surface during execution of a thread.
efeece04 175.IP
50b4bc23
MK
176The
177.BR SECCOMP_SET_MODE_FILTER
178operation is available only if the kernel is configured with
e9519f4f
KC
179.BR CONFIG_SECCOMP_FILTER
180enabled.
efeece04 181.IP
e9519f4f
KC
182When
183.IR flags
50b4bc23 184is 0, this operation is functionally identical to the call:
efeece04 185.IP
50b4bc23 186 prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, args);
efeece04 187.IP
e9519f4f
KC
188The recognized
189.IR flags
190are:
191.RS
192.TP
193.BR SECCOMP_FILTER_FLAG_TSYNC
87aa1b2c 194When adding a new filter, synchronize all other threads of the calling
50b4bc23 195process to the same seccomp filter tree.
b8ffefd7
MK
196A "filter tree" is the ordered list of filters attached to a thread.
197(Attaching identical filters in separate
198.BR seccomp ()
199calls results in different filters from this perspective.)
efeece04 200.IP
b8ffefd7 201If any thread cannot synchronize to the same filter tree,
50b4bc23
MK
202the call will not attach the new seccomp filter,
203and will fail, returning the first thread ID found that cannot synchronize.
cbffd6ae 204Synchronization will fail if another thread in the same process is in
e9519f4f 205.BR SECCOMP_MODE_STRICT
50b4bc23
MK
206or if it has attached new seccomp filters to itself,
207diverging from the calling thread's filter tree.
e9519f4f 208.RE
93b9a9ee 209.SS Filters
e9519f4f
KC
210When adding filters via
211.BR SECCOMP_SET_MODE_FILTER ,
212.IR args
213points to a filter program:
efeece04 214.PP
e9519f4f 215.in +4n
b8302363 216.EX
e9519f4f 217struct sock_fprog {
1da13d15
MK
218 unsigned short len; /* Number of BPF instructions */
219 struct sock_filter *filter; /* Pointer to array of
220 BPF instructions */
e9519f4f 221};
b8302363 222.EE
e9519f4f 223.in
efeece04 224.PP
e9519f4f 225Each program must contain one or more BPF instructions:
efeece04 226.PP
e9519f4f 227.in +4n
b8302363 228.EX
1da13d15
MK
229struct sock_filter { /* Filter block */
230 __u16 code; /* Actual filter code */
231 __u8 jt; /* Jump true */
232 __u8 jf; /* Jump false */
233 __u32 k; /* Generic multiuse field */
e9519f4f 234};
b8302363 235.EE
e9519f4f 236.in
efeece04 237.PP
0c2e01b7
MK
238When executing the instructions, the BPF program operates on the
239system call information made available (i.e., use the
240.BR BPF_ABS
1a7e5113
MK
241addressing mode) as a (read-only)
242.\" Quoting Kees Cook:
243.\" If BPF even allows changing the data, it's not copied back to
244.\" the syscall when it runs. Anything wanting to do things like
3775c1ca 245.\" that would need to use ptrace to catch the call and directly
1a7e5113
MK
246.\" modify the registers before continuing with the call.
247buffer of the following form:
efeece04 248.PP
e9519f4f 249.in +4n
b8302363 250.EX
e9519f4f 251struct seccomp_data {
5b95c4ab 252 int nr; /* System call number */
a0a0c98b
MK
253 __u32 arch; /* AUDIT_ARCH_* value
254 (see <linux/audit.h>) */
e9519f4f 255 __u64 instruction_pointer; /* CPU instruction pointer */
1da13d15 256 __u64 args[6]; /* Up to 6 system call arguments */
e9519f4f 257};
b8302363 258.EE
e9519f4f 259.in
efeece04 260.PP
944a31e2 261Because numbering of system calls varies between architectures and
65cfc712 262some architectures (e.g., x86-64) allow user-space code to use
b44088b4
JH
263the calling conventions of multiple architectures, it is usually
264necessary to verify the value of the
265.IR arch
266field.
efeece04 267.PP
b44088b4
JH
268It is strongly recommended to use a whitelisting approach whenever
269possible because such an approach is more robust and simple.
270A blacklist will have to be updated whenever a potentially
37daa840 271dangerous system call is added (or a dangerous flag or option if those
b44088b4
JH
272are blacklisted), and it is often possible to alter the
273representation of a value without altering its meaning, leading to
274a blacklist bypass.
efeece04 275.PP
b44088b4
JH
276The
277.IR arch
37daa840 278field is not unique for all calling conventions.
65cfc712 279The x86-64 ABI and the x32 ABI both use
b44088b4
JH
280.BR AUDIT_ARCH_X86_64
281as
282.IR arch ,
37daa840
MK
283and they run on the same processors.
284Instead, the mask
b44088b4
JH
285.BR __X32_SYSCALL_BIT
286is used on the system call number to tell the two ABIs apart.
8cc4d071 287.\" As noted by Dave Drysdale in a note at the end of
3c5ab770
MK
288.\" https://lwn.net/Articles/604515/
289.\" One additional detail to point out for the x32 ABI case:
290.\" the syscall number gets a high bit set (__X32_SYSCALL_BIT),
291.\" to mark it as an x32 call.
292.\"
293.\" If x32 support is included in the kernel, then __SYSCALL_MASK
294.\" will have a value that is not all-ones, and this will trigger
295.\" an extra instruction in system_call to mask off the extra bit,
8cc4d071 296.\" so that the syscall table indexing still works.
efeece04 297.PP
b44088b4 298This means that in order to create a seccomp-based
65cfc712 299blacklist for system calls performed through the x86-64 ABI,
b44088b4
JH
300it is necessary to not only check that
301.IR arch
302equals
303.BR AUDIT_ARCH_X86_64 ,
7a79bb30 304but also to explicitly reject all system calls that contain
b44088b4
JH
305.BR __X32_SYSCALL_BIT
306in
307.IR nr .
efeece04 308.PP
712551ea
MK
309The
310.I instruction_pointer
311field provides the address of the machine-language instruction that
312performed the system call.
313This might be useful in conjunction with the use of
314.I /proc/[pid]/maps
315to perform checks based on which region (mapping) of the program
316made the system call.
317(Probably, it is wise to lock down the
318.BR mmap (2)
319and
320.BR mprotect (2)
321system calls to prevent the program from subverting such checks.)
efeece04 322.PP
b44088b4
JH
323When checking values from
324.IR args
325against a blacklist, keep in mind that arguments are often
37daa840
MK
326silently truncated before being processed, but after the seccomp check.
327For example, this happens if the i386 ABI is used on an
3c5ab770 328x86-64 kernel: although the kernel will normally not look beyond
b44088b4 329the 32 lowest bits of the arguments, the values of the full
37daa840 33064-bit registers will be present in the seccomp data.
65cfc712 331A less surprising example is that if the x86-64 ABI is used to perform
37daa840
MK
332a system call that takes an argument of type
333.IR int ,
334the more-significant half of the argument register is ignored by
335the system call, but visible in the seccomp data.
efeece04 336.PP
cbffd6ae
MK
337A seccomp filter returns a 32-bit value consisting of two parts:
338the most significant 16 bits
339(corresponding to the mask defined by the constant
340.BR SECCOMP_RET_ACTION )
341contain one of the "action" values listed below;
342the least significant 16 bits (defined by the constant
343.BR SECCOMP_RET_DATA )
344are "data" to be associated with this return value.
efeece04 345.PP
755a656a
MK
346If multiple filters exist, they are \fIall\fP executed,
347in reverse order of their addition to the filter tree\(emthat is,
348the most recently installed filter is executed first.
d74503a5
MK
349(Note that all filters will be called
350even if one of the earlier filters returns
351.BR SECCOMP_RET_KILL .
352This is done to simplify the kernel code and to provide a
353tiny speed-up in the execution of sets of filters by
354avoiding a check for this uncommon case.)
355.\" From an Aug 2015 conversation with Kees Cook where I asked why *all*
356.\" filters even if one of the early filters returns SECCOMP_RET_KILL:
357.\"
358.\" It's just because it would be an optimization that would only speed up
359.\" the RET_KILL case, but it's the uncommon one and the one that doesn't
360.\" benefit meaningfully from such a change (you need to kill the process
361.\" really quickly?). We would speed up killing a program at the (albeit
362.\" tiny) expense to all other filtered programs. Best to keep the filter
363.\" execution logic clear, simple, and as fast as possible for all
364.\" filters.
cbffd6ae
MK
365The return value for the evaluation of a given system call is the first-seen
366.BR SECCOMP_RET_ACTION
367value of highest precedence (along with its accompanying data)
368returned by execution of all of the filters.
efeece04 369.PP
cbffd6ae 370In decreasing order of precedence,
ec2bb3c0 371the values that may be returned by a seccomp filter are:
e9519f4f
KC
372.TP
373.BR SECCOMP_RET_KILL
5defc77d 374This value results in the task exiting immediately
0fc339b2 375.\" We really do mean "task" here, not "process"
610082d9 376without executing the system call.
5defc77d 377The task terminates as though killed by a
0a2a0df0
MK
378.B SIGSYS
379signal
380.RI ( not
381.BR SIGKILL ).
089761d5
MF
382Even if a signal handler has been registered and otherwise catches
383.BR SIGSYS ,
0fc339b2 384the handler will be ignored in this case and the task always terminates.
efeece04 385.IP
089761d5
MF
386.\" See these commits:
387.\" seccomp: dump core when using SECCOMP_RET_KILL (b25e67161c295c98acda92123b2dd1e7d8642901)
388.\" seccomp: Only dump core when single-threaded (d7276e321ff8a53106a59c85ca46d03e34288893)
baaf65e8
MK
389Before Linux 4.11,
390any process terminated this way would not trigger a core dump
089761d5
MF
391(even though
392.B SIGSYS
393is documented in
394.BR signal (7)
baaf65e8
MK
395as having a default action of termination with a core dump).
396Since Linux 4.11,
397single-threaded processes follow standard core dump behavior,
089761d5
MF
398but multithreaded processes still do not.
399There is no workaround currently for multithreaded processes.
e9519f4f
KC
400.TP
401.BR SECCOMP_RET_TRAP
610082d9 402This value results in the kernel sending a
e9519f4f 403.BR SIGSYS
cbffd6ae
MK
404signal to the triggering process without executing the system call.
405Various fields will be set in the
406.I siginfo_t
db99c064
MK
407structure (see
408.BR sigaction (2))
409associated with the signal:
cf690e13
MK
410.RS
411.IP * 3
412.I si_signo
413will contain
414.BR SIGSYS .
415.IP *
db99c064 416.IR si_call_addr
cf690e13
MK
417will show the address of the system call instruction.
418.IP *
db99c064 419.IR si_syscall
e9519f4f 420and
db99c064 421.IR si_arch
87aa1b2c 422will indicate which system call was attempted.
cf690e13
MK
423.IP *
424.I si_code
cf690e13
MK
425will contain
426.BR SYS_SECCOMP .
427.IP *
428.I si_errno
429will contain the
430.BR SECCOMP_RET_DATA
431portion of the filter return value.
432.RE
433.IP
50b4bc23 434The program counter will be as though the system call happened
87aa1b2c 435(i.e., it will not point to the system call instruction).
50b4bc23 436The return value register will contain an architecture\-dependent value;
06865301 437if resuming execution, set it to something appropriate for the system call.
a79566fb 438(The architecture dependency is because replacing it with
e9519f4f
KC
439.BR ENOSYS
440could overwrite some useful information.)
e9519f4f
KC
441.TP
442.BR SECCOMP_RET_ERRNO
610082d9 443This value results in the
cbffd6ae
MK
444.B SECCOMP_RET_DATA
445portion of the filter's return value being passed to user space as the
e9519f4f 446.IR errno
cbffd6ae 447value without executing the system call.
e9519f4f
KC
448.TP
449.BR SECCOMP_RET_TRACE
87aa1b2c
MK
450When returned, this value will cause the kernel to attempt to notify a
451.BR ptrace (2)-based
452tracer prior to executing the system call.
50b4bc23 453If there is no tracer present,
3fbe3aa3
MK
454the system call is not executed and returns a failure status with
455.I errno
456set to
457.BR ENOSYS .
efeece04 458.IP
e9519f4f
KC
459A tracer will be notified if it requests
460.BR PTRACE_O_TRACESECCOMP
461using
462.IR ptrace(PTRACE_SETOPTIONS) .
463The tracer will be notified of a
464.BR PTRACE_EVENT_SECCOMP
465and the
466.BR SECCOMP_RET_DATA
cbffd6ae 467portion of the filter's return value will be available to the tracer via
e9519f4f 468.BR PTRACE_GETEVENTMSG .
efeece04 469.IP
50b4bc23
MK
470The tracer can skip the system call by changing the system call number
471to \-1.
472Alternatively, the tracer can change the system call
87aa1b2c 473requested by changing the system call to a valid system call number.
50b4bc23
MK
474If the tracer asks to skip the system call, then the system call will
475appear to return the value that the tracer puts in the return value register.
efeece04 476.IP
09481df3
JH
477.\" This was changed in ce6526e8afa4.
478.\" A related hole, using PTRACE_SYSCALL instead of SECCOMP_RET_TRACE, was
479.\" changed in arch-specific commits, e.g. 93e35efb8de4 for X86 and
480.\" 0f3912fd934c for ARM.
481Before kernel 4.8, the seccomp check will not be run again after the tracer is
482notified.
483(This means that, on older kernels, seccomp-based sandboxes
50b4bc23 484.B "must not"
87aa1b2c
MK
485allow use of
486.BR ptrace (2)\(emeven
487of other
50b4bc23 488sandboxed processes\(emwithout extreme care;
06865301 489ptracers can use this mechanism to escape from the seccomp sandbox.)
e9519f4f
KC
490.TP
491.BR SECCOMP_RET_ALLOW
610082d9 492This value results in the system call being executed.
e9519f4f
KC
493.SH RETURN VALUE
494On success,
495.BR seccomp ()
496returns 0.
497On error, if
498.BR SECCOMP_FILTER_FLAG_TSYNC
50b4bc23 499was used,
880bc4eb
MK
500the return value is the ID of the thread
501that caused the synchronization failure.
610082d9
MK
502(This ID is a kernel thread ID of the type returned by
503.BR clone (2)
504and
0e27eb0c 505.BR gettid (2).)
50b4bc23 506On other errors, \-1 is returned, and
e9519f4f
KC
507.IR errno
508is set to indicate the cause of the error.
509.SH ERRORS
510.BR seccomp ()
511can fail for the following reasons:
512.TP
513.BR EACCES
50b4bc23 514The caller did not have the
e9519f4f 515.BR CAP_SYS_ADMIN
be8f1272 516capability in its user namespace, or had not set
e9519f4f
KC
517.IR no_new_privs
518before using
519.BR SECCOMP_SET_MODE_FILTER .
520.TP
521.BR EFAULT
522.IR args
43b265ff 523was not a valid address.
e9519f4f
KC
524.TP
525.BR EINVAL
526.IR operation
527is unknown; or
528.IR flags
529are invalid for the given
043ed1c6 530.IR operation .
29efefec
MK
531.TP
532.BR EINVAL
882cf566
MK
533.I args
534pointed to a filter program that included
535.BR BPF_ABS ,
536but the specified offset was not aligned to a 32-bit boundary or exceeded
537.IR "sizeof(struct\ seccomp_data)" .
882cf566
MK
538.TP
539.BR EINVAL
29efefec
MK
540.\" See kernel/seccomp.c::seccomp_may_assign_mode() in 3.18 sources
541A secure computing mode has already been set, and
542.I operation
543differs from the existing setting.
29efefec
MK
544.TP
545.BR EINVAL
546.\" See stub kernel/seccomp.c::seccomp_set_mode_filter() in 3.18 sources
547.I operation
548specified
549.BR SECCOMP_SET_MODE_FILTER ,
550but the kernel was not built with
551.B CONFIG_SECCOMP_FILTER
552enabled.
29efefec
MK
553.TP
554.BR EINVAL
555.I operation
556specified
557.BR SECCOMP_SET_MODE_FILTER ,
558but the filter program pointed to by
559.I args
560was not valid or the length of the filter program was zero or exceeded
561.B BPF_MAXINSNS
562(4096) instructions.
29efefec
MK
563.TP
564.BR ENOMEM
565Out of memory.
29efefec
MK
566.TP
567.BR ENOMEM
568.\" ENOMEM in kernel/seccomp.c::seccomp_attach_filter() in 3.18 sources
569The total length of all filter programs attached
570to the calling thread would exceed
571.B MAX_INSNS_PER_PATH
572(32768) instructions.
573Note that for the purposes of calculating this limit,
ce67ca23
MK
574each already existing filter program incurs an
575overhead penalty of 4 instructions.
e9519f4f
KC
576.TP
577.BR ESRCH
578Another thread caused a failure during thread sync, but its ID could not
579be determined.
580.SH VERSIONS
87aa1b2c 581The
dff53954 582.BR seccomp ()
87aa1b2c 583system call first appeared in Linux 3.17.
cbffd6ae 584.\" FIXME . Add glibc version
e9519f4f 585.SH CONFORMING TO
87aa1b2c 586The
dff53954 587.BR seccomp ()
87aa1b2c 588system call is a nonstandard Linux extension.
e9519f4f 589.SH NOTES
aea38298
MK
590Rather than hand-coding seccomp filters as shown in the example below,
591you may prefer to employ the
592.I libseccomp
593library, which provides a front-end for generating seccomp filters.
efeece04 594.PP
1367a60a
MK
595The
596.IR Seccomp
597field of the
598.IR /proc/[pid]/status
599file provides a method of viewing the seccomp mode of a process; see
600.BR proc (5).
efeece04 601.PP
e9519f4f 602.BR seccomp ()
50b4bc23
MK
603provides a superset of the functionality provided by the
604.BR prctl (2)
b95b411b 605.BR PR_SET_SECCOMP
50b4bc23
MK
606operation (which does not support
607.IR flags ).
efeece04 608.PP
88f02af6
MK
609Since Linux 4.4, the
610.BR ptrace (2)
611.B PTRACE_SECCOMP_GET_FILTER
612operation can be used to dump a process's seccomp filters.
613.\"
cecc8c48
MK
614.SS Seccomp-specific BPF details
615Note the following BPF details specific to seccomp filters:
616.IP * 3
617The
618.B BPF_H
619and
620.B BPF_B
621size modifiers are not supported: all operations must load and store
622(4-byte) words
623.RB ( BPF_W ).
624.IP *
625To access the contents of the
626.I seccomp_data
627buffer, use the
628.B BPF_ABS
629addressing mode modifier.
cecc8c48
MK
630.IP *
631The
632.B BPF_LEN
633addressing mode modifier yields an immediate mode operand
634whose value is the size of the
635.IR seccomp_data
636buffer.
e9519f4f 637.SH EXAMPLE
35e4506b
MK
638The program below accepts four or more arguments.
639The first three arguments are a system call number,
640a numeric architecture identifier, and an error number.
641The program uses these values to construct a BPF filter
642that is used at run time to perform the following checks:
643.IP [1] 4
644If the program is not running on the specified architecture,
645the BPF filter causes system calls to fail with the error
646.BR ENOSYS .
647.IP [2]
648If the program attempts to execute the system call with the specified number,
649the BPF filter causes the system call to fail, with
650.I errno
651being set to the specified error number.
652.PP
653The remaining command-line arguments specify
654the pathname and additional arguments of a program
655that the example program should attempt to execute using
71bb61ec 656.BR execv (3)
35e4506b
MK
657(a library function that employs the
658.BR execve (2)
659system call).
660Some example runs of the program are shown below.
efeece04 661.PP
35e4506b
MK
662First, we display the architecture that we are running on (x86-64)
663and then construct a shell function that looks up system call
664numbers on this architecture:
efeece04 665.PP
35e4506b 666.in +4n
b8302363 667.EX
35e4506b
MK
668$ \fBuname -m\fP
669x86_64
670$ \fBsyscall_nr() {
671 cat /usr/src/linux/arch/x86/syscalls/syscall_64.tbl | \\
06865301 672 awk '$2 != "x32" && $3 == "'$1'" { print $1 }'
35e4506b 673}\fP
b8302363 674.EE
e646a1ba 675.in
efeece04 676.PP
35e4506b
MK
677When the BPF filter rejects a system call (case [2] above),
678it causes the system call to fail with the error number
679specified on the command line.
680In the experiments shown here, we'll use error number 99:
efeece04 681.PP
35e4506b 682.in +4n
b8302363 683.EX
35e4506b
MK
684$ \fBerrno 99\fP
685EADDRNOTAVAIL 99 Cannot assign requested address
b8302363 686.EE
e646a1ba 687.in
efeece04 688.PP
35e4506b
MK
689In the following example, we attempt to run the command
690.BR whoami (1),
691but the BPF filter rejects the
692.BR execve (2)
693system call, so that the command is not even executed:
efeece04 694.PP
35e4506b 695.in +4n
b8302363 696.EX
35e4506b 697$ \fBsyscall_nr execve\fP
06865301 69859
cbffd6ae
MK
699$ \fB./a.out\fP
700Usage: ./a.out <syscall_nr> <arch> <errno> <prog> [<args>]
701Hint for <arch>: AUDIT_ARCH_I386: 0x40000003
702 AUDIT_ARCH_X86_64: 0xC000003E
35e4506b
MK
703$ \fB./a.out 59 0xC000003E 99 /bin/whoami\fP
704execv: Cannot assign requested address
b8302363 705.EE
e646a1ba 706.in
efeece04 707.PP
35e4506b
MK
708In the next example, the BPF filter rejects the
709.BR write (2)
710system call, so that, although it is successfully started, the
711.BR whoami (1)
712command is not able to write output:
efeece04 713.PP
35e4506b 714.in +4n
b8302363 715.EX
35e4506b
MK
716$ \fBsyscall_nr write\fP
7171
718$ \fB./a.out 1 0xC000003E 99 /bin/whoami\fP
b8302363 719.EE
e646a1ba 720.in
efeece04 721.PP
35e4506b
MK
722In the final example,
723the BPF filter rejects a system call that is not used by the
724.BR whoami (1)
725command, so it is able to successfully execute and produce output:
efeece04 726.PP
35e4506b 727.in +4n
b8302363 728.EX
35e4506b
MK
729$ \fBsyscall_nr preadv\fP
730295
731$ \fB./a.out 295 0xC000003E 99 /bin/whoami\fP
732cecilia
b8302363 733.EE
e646a1ba 734.in
35e4506b 735.SS Program source
e7d0bb47 736.EX
e9519f4f
KC
737#include <errno.h>
738#include <stddef.h>
739#include <stdio.h>
740#include <stdlib.h>
741#include <unistd.h>
742#include <linux/audit.h>
743#include <linux/filter.h>
744#include <linux/seccomp.h>
745#include <sys/prctl.h>
746
b44088b4
JH
747#define X32_SYSCALL_BIT 0x40000000
748
d06d25cc 749static int
69999632 750install_filter(int syscall_nr, int t_arch, int f_errno)
e9519f4f 751{
b44088b4 752 unsigned int upper_nr_limit = 0xffffffff;
e9855ef2
MK
753
754 /* Assume that AUDIT_ARCH_X86_64 means the normal x86-64 ABI */
b44088b4
JH
755 if (t_arch == AUDIT_ARCH_X86_64)
756 upper_nr_limit = X32_SYSCALL_BIT - 1;
757
e9519f4f 758 struct sock_filter filter[] = {
ef05ec71 759 /* [0] Load architecture from 'seccomp_data' buffer into
64267236 760 accumulator */
06865301 761 BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
e9519f4f 762 (offsetof(struct seccomp_data, arch))),
d06d25cc 763
b44088b4 764 /* [1] Jump forward 5 instructions if architecture does not
ef05ec71 765 match 't_arch' */
b44088b4 766 BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, t_arch, 0, 5),
d06d25cc 767
ef05ec71 768 /* [2] Load system call number from 'seccomp_data' buffer into
64267236 769 accumulator */
06865301 770 BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
e9519f4f 771 (offsetof(struct seccomp_data, nr))),
d06d25cc 772
65cfc712 773 /* [3] Check ABI - only needed for x86-64 in blacklist use
0582770c
MK
774 cases. Use JGT instead of checking against the bit
775 mask to avoid having to reload the syscall number. */
b44088b4
JH
776 BPF_JUMP(BPF_JMP | BPF_JGT | BPF_K, upper_nr_limit, 3, 0),
777
778 /* [4] Jump forward 1 instruction if system call number
ef05ec71 779 does not match 'syscall_nr' */
06865301 780 BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, syscall_nr, 0, 1),
d06d25cc 781
b44088b4 782 /* [5] Matching architecture and system call: don't execute
69999632 783 the system call, and return 'f_errno' in 'errno' */
06865301 784 BPF_STMT(BPF_RET | BPF_K,
69999632 785 SECCOMP_RET_ERRNO | (f_errno & SECCOMP_RET_DATA)),
d06d25cc 786
b44088b4 787 /* [6] Destination of system call number mismatch: allow other
87aa1b2c 788 system calls */
06865301 789 BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
d06d25cc 790
5defc77d 791 /* [7] Destination of architecture mismatch: kill task */
06865301 792 BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_KILL),
e9519f4f 793 };
d06d25cc 794
e9519f4f 795 struct sock_fprog prog = {
d06d25cc 796 .len = (unsigned short) (sizeof(filter) / sizeof(filter[0])),
e9519f4f
KC
797 .filter = filter,
798 };
d06d25cc 799
e9519f4f
KC
800 if (seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog)) {
801 perror("seccomp");
d06d25cc 802 return 1;
e9519f4f 803 }
d06d25cc
MK
804
805 return 0;
e9519f4f
KC
806}
807
d06d25cc
MK
808int
809main(int argc, char **argv)
e9519f4f
KC
810{
811 if (argc < 5) {
cbffd6ae
MK
812 fprintf(stderr, "Usage: "
813 "%s <syscall_nr> <arch> <errno> <prog> [<args>]\\n"
814 "Hint for <arch>: AUDIT_ARCH_I386: 0x%X\\n"
815 " AUDIT_ARCH_X86_64: 0x%X\\n"
816 "\\n", argv[0], AUDIT_ARCH_I386, AUDIT_ARCH_X86_64);
d06d25cc 817 exit(EXIT_FAILURE);
e9519f4f 818 }
d06d25cc 819
e9519f4f
KC
820 if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
821 perror("prctl");
d06d25cc 822 exit(EXIT_FAILURE);
e9519f4f 823 }
d06d25cc 824
e9519f4f
KC
825 if (install_filter(strtol(argv[1], NULL, 0),
826 strtol(argv[2], NULL, 0),
827 strtol(argv[3], NULL, 0)))
d06d25cc
MK
828 exit(EXIT_FAILURE);
829
e9519f4f
KC
830 execv(argv[4], &argv[4]);
831 perror("execv");
d06d25cc 832 exit(EXIT_FAILURE);
e9519f4f 833}
e7d0bb47 834.EE
e9519f4f 835.SH SEE ALSO
00eaa6aa 836.BR bpf (2),
e9519f4f
KC
837.BR prctl (2),
838.BR ptrace (2),
3b4a59c4 839.BR sigaction (2),
712551ea 840.BR proc (5),
e9519f4f
KC
841.BR signal (7),
842.BR socket (7)
51f5698d 843.PP
ca6f43fc
MK
844Various pages from the
845.I libseccomp
846library, including:
ccc70c81 847.BR scmp_sys_resolver (1),
ca6f43fc
MK
848.BR seccomp_init (3),
849.BR seccomp_load (3),
850.BR seccomp_rule_add (3),
851and
852.BR seccomp_export_bpf (3).
51f5698d 853.PP
e28e21fe
MK
854The kernel source files
855.IR Documentation/networking/filter.txt
856and
28d96036
ES
857.IR Documentation/userspace\-api/seccomp_filter.rst
858.\" commit c061f33f35be0ccc80f4b8e0aea5dfd2ed7e01a3
859(or
860.IR Documentation/prctl/seccomp_filter.txt
861before Linux 4.13).
51f5698d 862.PP
f74fd424
MK
863McCanne, S. and Jacobson, V. (1992)
864.IR "The BSD Packet Filter: A New Architecture for User-level Packet Capture" ,
865Proceedings of the USENIX Winter 1993 Conference
5465ae95 866.UR http://www.tcpdump.org/papers/bpf\-usenix93.pdf
f74fd424 867.UE