]> git.ipfire.org Git - thirdparty/man-pages.git/blame - man2/seccomp.2
libc.7: Add a note on why glibc 2.x uses the soname libc.so.6
[thirdparty/man-pages.git] / man2 / seccomp.2
CommitLineData
e9519f4f
KC
1.\" Copyright (C) 2014 Kees Cook <keescook@chromium.org>
2.\" and Copyright (C) 2012 Will Drewry <wad@chromium.org>
35e4506b 3.\" and Copyright (C) 2008, 2014 Michael Kerrisk <mtk.manpages@gmail.com>
e9519f4f
KC
4.\"
5.\" %%%LICENSE_START(VERBATIM)
6.\" Permission is granted to make and distribute verbatim copies of this
7.\" manual provided the copyright notice and this permission notice are
8.\" preserved on all copies.
9.\"
10.\" Permission is granted to copy and distribute modified versions of this
11.\" manual under the conditions for verbatim copying, provided that the
12.\" entire resulting derived work is distributed under the terms of a
13.\" permission notice identical to this one.
14.\"
15.\" Since the Linux kernel and libraries are constantly changing, this
16.\" manual page may be incorrect or out-of-date. The author(s) assume no
17.\" responsibility for errors or omissions, or for damages resulting from
18.\" the use of the information contained herein. The author(s) may not
19.\" have taken the same level of care in the production of this manual,
20.\" which is licensed free of charge, as they might when working
21.\" professionally.
22.\"
23.\" Formatted or processed versions of this manual, if unaccompanied by
24.\" the source, must acknowledge the copyright and authors of this work.
25.\" %%%LICENSE_END
26.\"
b8efb414 27.TH SECCOMP 2 2016-10-08 "Linux" "Linux Programmer's Manual"
e9519f4f 28.SH NAME
50b4bc23 29seccomp \- operate on Secure Computing state of the process
e9519f4f
KC
30.SH SYNOPSIS
31.nf
32.B #include <linux/seccomp.h>
33.B #include <linux/filter.h>
34.B #include <linux/audit.h>
35.B #include <linux/signal.h>
36.B #include <sys/ptrace.h>
cbffd6ae
MK
37.\" Kees Cook noted: Anything that uses SECCOMP_RET_TRACE returns will
38.\" need <sys/ptrace.h>
e9519f4f 39
50b4bc23
MK
40.BI "int seccomp(unsigned int " operation ", unsigned int " flags \
41", void *" args );
e9519f4f
KC
42.fi
43.SH DESCRIPTION
44The
45.BR seccomp ()
46system call operates on the Secure Computing (seccomp) state of the
50b4bc23 47calling process.
e9519f4f
KC
48
49Currently, Linux supports the following
50.IR operation
51values:
52.TP
53.BR SECCOMP_SET_MODE_STRICT
cbffd6ae 54The only system calls that the calling thread is permitted to make are
e9519f4f
KC
55.BR read (2),
56.BR write (2),
85fbef74
MK
57.BR _exit (2)
58(but not
59.BR exit_group (2)),
e9519f4f
KC
60and
61.BR sigreturn (2).
62Other system calls result in the delivery of a
63.BR SIGKILL
cbffd6ae 64signal.
87aa1b2c 65Strict secure computing mode is useful for number-crunching
e9519f4f
KC
66applications that may need to execute untrusted byte code, perhaps
67obtained by reading from a pipe or socket.
68
bec56ee2
JH
69Note that although the calling thread can no longer call
70.BR sigprocmask (2),
71it can use
72.BR sigreturn (2)
73to block all signals apart from
74.BR SIGKILL
75and
76.BR SIGSTOP .
65be1b46 77This means that
bec56ee2 78.BR alarm (2)
65be1b46
MK
79(for example) is not sufficient for restricting the process's execution time.
80Instead, to reliably terminate the process,
81.BR SIGKILL
82must be used.
83This can be done by using
bec56ee2
JH
84.BR timer_create (2)
85with
86.BR SIGEV_SIGNAL
87and
65be1b46 88.IR sigev_signo
bec56ee2 89set to
65be1b46
MK
90.BR SIGKILL ,
91or by using
bec56ee2
JH
92.BR setrlimit (2)
93to set the hard limit for
94.BR RLIMIT_CPU .
95
e9519f4f
KC
96This operation is available only if the kernel is configured with
97.BR CONFIG_SECCOMP
98enabled.
99
100The value of
101.IR flags
102must be 0, and
103.IR args
104must be NULL.
105
50b4bc23
MK
106This operation is functionally identical to the call:
107
108 prctl(PR_SET_SECCOMP, SECCOMP_MODE_STRICT);
e9519f4f
KC
109.TP
110.BR SECCOMP_SET_MODE_FILTER
111The system calls allowed are defined by a pointer to a Berkeley Packet
112Filter (BPF) passed via
113.IR args .
cbffd6ae 114This argument is a pointer to a
e9519f4f
KC
115.IR "struct\ sock_fprog" ;
116it can be designed to filter arbitrary system calls and system call
50b4bc23 117arguments.
87aa1b2c
MK
118If the filter is invalid,
119.BR seccomp ()
120fails, returning
cbffd6ae 121.BR EINVAL
e9519f4f
KC
122in
123.IR errno .
124
125If
2d5fb4bb 126.BR fork (2)
e9519f4f 127or
2d5fb4bb
MK
128.BR clone (2)
129is allowed by the filter, any child processes will be constrained to
cbffd6ae 130the same system call filters as the parent.
2d5fb4bb
MK
131If
132.BR execve (2)
cbffd6ae
MK
133is allowed,
134the existing filters will be preserved across a call to
2d5fb4bb 135.BR execve (2).
e9519f4f 136
955efdbc
MK
137In order to use the
138.BR SECCOMP_SET_MODE_FILTER
139operation, either the caller must have the
e9519f4f 140.BR CAP_SYS_ADMIN
be8f1272 141capability in its user namespace, or the thread must already have the
cbffd6ae
MK
142.I no_new_privs
143bit set.
144If that bit was not already set by an ancestor of this thread,
145the thread must make the following call:
955efdbc
MK
146
147 prctl(PR_SET_NO_NEW_PRIVS, 1);
148
149Otherwise, the
150.BR SECCOMP_SET_MODE_FILTER
151operation will fail and return
e9519f4f
KC
152.BR EACCES
153in
154.IR errno .
b8ffefd7
MK
155This requirement ensures that an unprivileged process cannot apply
156a malicious filter and then invoke a set-user-ID or
157other privileged program using
158.BR execve (2),
cbffd6ae
MK
159thus potentially compromising that program.
160(Such a malicious filter might, for example, cause an attempt to use
161.BR setuid (2)
162to set the caller's user IDs to non-zero values to instead
163return 0 without actually making the system call.
164Thus, the program might be tricked into retaining superuser privileges
165in circumstances where it is possible to influence it to do
166dangerous things because it did not actually drop privileges.)
e9519f4f 167
7d596c7b 168If
e9519f4f
KC
169.BR prctl (2)
170or
bf7bc8b8 171.BR seccomp ()
7d596c7b 172is allowed by the attached filter, further filters may be added.
e01ca3aa 173This will increase evaluation time, but allows for further reduction of
cbffd6ae 174the attack surface during execution of a thread.
e9519f4f 175
50b4bc23
MK
176The
177.BR SECCOMP_SET_MODE_FILTER
178operation is available only if the kernel is configured with
e9519f4f
KC
179.BR CONFIG_SECCOMP_FILTER
180enabled.
181
182When
183.IR flags
50b4bc23
MK
184is 0, this operation is functionally identical to the call:
185
186 prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, args);
e9519f4f
KC
187
188The recognized
189.IR flags
190are:
191.RS
192.TP
193.BR SECCOMP_FILTER_FLAG_TSYNC
87aa1b2c 194When adding a new filter, synchronize all other threads of the calling
50b4bc23 195process to the same seccomp filter tree.
b8ffefd7
MK
196A "filter tree" is the ordered list of filters attached to a thread.
197(Attaching identical filters in separate
198.BR seccomp ()
199calls results in different filters from this perspective.)
200
201If any thread cannot synchronize to the same filter tree,
50b4bc23
MK
202the call will not attach the new seccomp filter,
203and will fail, returning the first thread ID found that cannot synchronize.
cbffd6ae 204Synchronization will fail if another thread in the same process is in
e9519f4f 205.BR SECCOMP_MODE_STRICT
50b4bc23
MK
206or if it has attached new seccomp filters to itself,
207diverging from the calling thread's filter tree.
e9519f4f 208.RE
93b9a9ee 209.SS Filters
e9519f4f
KC
210When adding filters via
211.BR SECCOMP_SET_MODE_FILTER ,
212.IR args
213points to a filter program:
214
215.in +4n
216.nf
217struct sock_fprog {
1da13d15
MK
218 unsigned short len; /* Number of BPF instructions */
219 struct sock_filter *filter; /* Pointer to array of
220 BPF instructions */
e9519f4f
KC
221};
222.fi
223.in
224
225Each program must contain one or more BPF instructions:
226
227.in +4n
228.nf
1da13d15
MK
229struct sock_filter { /* Filter block */
230 __u16 code; /* Actual filter code */
231 __u8 jt; /* Jump true */
232 __u8 jf; /* Jump false */
233 __u32 k; /* Generic multiuse field */
e9519f4f
KC
234};
235.fi
236.in
237
0c2e01b7
MK
238When executing the instructions, the BPF program operates on the
239system call information made available (i.e., use the
240.BR BPF_ABS
1a7e5113
MK
241addressing mode) as a (read-only)
242.\" Quoting Kees Cook:
243.\" If BPF even allows changing the data, it's not copied back to
244.\" the syscall when it runs. Anything wanting to do things like
245.\" that would need to use ptrace to catch the call and directly
246.\" modify the registers before continuing with the call.
247buffer of the following form:
e9519f4f
KC
248
249.in +4n
250.nf
251struct seccomp_data {
5b95c4ab 252 int nr; /* System call number */
a0a0c98b
MK
253 __u32 arch; /* AUDIT_ARCH_* value
254 (see <linux/audit.h>) */
e9519f4f 255 __u64 instruction_pointer; /* CPU instruction pointer */
1da13d15 256 __u64 args[6]; /* Up to 6 system call arguments */
e9519f4f
KC
257};
258.fi
259.in
260
944a31e2 261Because numbering of system calls varies between architectures and
65cfc712 262some architectures (e.g., x86-64) allow user-space code to use
b44088b4
JH
263the calling conventions of multiple architectures, it is usually
264necessary to verify the value of the
265.IR arch
266field.
267
268It is strongly recommended to use a whitelisting approach whenever
269possible because such an approach is more robust and simple.
270A blacklist will have to be updated whenever a potentially
37daa840 271dangerous system call is added (or a dangerous flag or option if those
b44088b4
JH
272are blacklisted), and it is often possible to alter the
273representation of a value without altering its meaning, leading to
274a blacklist bypass.
275
276The
277.IR arch
37daa840 278field is not unique for all calling conventions.
65cfc712 279The x86-64 ABI and the x32 ABI both use
b44088b4
JH
280.BR AUDIT_ARCH_X86_64
281as
282.IR arch ,
37daa840
MK
283and they run on the same processors.
284Instead, the mask
b44088b4
JH
285.BR __X32_SYSCALL_BIT
286is used on the system call number to tell the two ABIs apart.
8cc4d071 287.\" As noted by Dave Drysdale in a note at the end of
3c5ab770
MK
288.\" https://lwn.net/Articles/604515/
289.\" One additional detail to point out for the x32 ABI case:
290.\" the syscall number gets a high bit set (__X32_SYSCALL_BIT),
291.\" to mark it as an x32 call.
292.\"
293.\" If x32 support is included in the kernel, then __SYSCALL_MASK
294.\" will have a value that is not all-ones, and this will trigger
295.\" an extra instruction in system_call to mask off the extra bit,
8cc4d071 296.\" so that the syscall table indexing still works.
3c5ab770 297
b44088b4 298This means that in order to create a seccomp-based
65cfc712 299blacklist for system calls performed through the x86-64 ABI,
b44088b4
JH
300it is necessary to not only check that
301.IR arch
302equals
303.BR AUDIT_ARCH_X86_64 ,
7a79bb30 304but also to explicitly reject all system calls that contain
b44088b4
JH
305.BR __X32_SYSCALL_BIT
306in
307.IR nr .
308
712551ea
MK
309The
310.I instruction_pointer
311field provides the address of the machine-language instruction that
312performed the system call.
313This might be useful in conjunction with the use of
314.I /proc/[pid]/maps
315to perform checks based on which region (mapping) of the program
316made the system call.
317(Probably, it is wise to lock down the
318.BR mmap (2)
319and
320.BR mprotect (2)
321system calls to prevent the program from subverting such checks.)
322
b44088b4
JH
323When checking values from
324.IR args
325against a blacklist, keep in mind that arguments are often
37daa840
MK
326silently truncated before being processed, but after the seccomp check.
327For example, this happens if the i386 ABI is used on an
3c5ab770 328x86-64 kernel: although the kernel will normally not look beyond
b44088b4 329the 32 lowest bits of the arguments, the values of the full
37daa840 33064-bit registers will be present in the seccomp data.
65cfc712 331A less surprising example is that if the x86-64 ABI is used to perform
37daa840
MK
332a system call that takes an argument of type
333.IR int ,
334the more-significant half of the argument register is ignored by
335the system call, but visible in the seccomp data.
b44088b4 336
cbffd6ae
MK
337A seccomp filter returns a 32-bit value consisting of two parts:
338the most significant 16 bits
339(corresponding to the mask defined by the constant
340.BR SECCOMP_RET_ACTION )
341contain one of the "action" values listed below;
342the least significant 16 bits (defined by the constant
343.BR SECCOMP_RET_DATA )
344are "data" to be associated with this return value.
345
755a656a
MK
346If multiple filters exist, they are \fIall\fP executed,
347in reverse order of their addition to the filter tree\(emthat is,
348the most recently installed filter is executed first.
d74503a5
MK
349(Note that all filters will be called
350even if one of the earlier filters returns
351.BR SECCOMP_RET_KILL .
352This is done to simplify the kernel code and to provide a
353tiny speed-up in the execution of sets of filters by
354avoiding a check for this uncommon case.)
355.\" From an Aug 2015 conversation with Kees Cook where I asked why *all*
356.\" filters even if one of the early filters returns SECCOMP_RET_KILL:
357.\"
358.\" It's just because it would be an optimization that would only speed up
359.\" the RET_KILL case, but it's the uncommon one and the one that doesn't
360.\" benefit meaningfully from such a change (you need to kill the process
361.\" really quickly?). We would speed up killing a program at the (albeit
362.\" tiny) expense to all other filtered programs. Best to keep the filter
363.\" execution logic clear, simple, and as fast as possible for all
364.\" filters.
cbffd6ae
MK
365The return value for the evaluation of a given system call is the first-seen
366.BR SECCOMP_RET_ACTION
367value of highest precedence (along with its accompanying data)
368returned by execution of all of the filters.
e9519f4f 369
cbffd6ae 370In decreasing order of precedence,
ec2bb3c0 371the values that may be returned by a seccomp filter are:
e9519f4f
KC
372.TP
373.BR SECCOMP_RET_KILL
610082d9
MK
374This value results in the process exiting immediately
375without executing the system call.
cbffd6ae 376The process terminates as though killed by a
0a2a0df0
MK
377.B SIGSYS
378signal
379.RI ( not
380.BR SIGKILL ).
e9519f4f
KC
381.TP
382.BR SECCOMP_RET_TRAP
610082d9 383This value results in the kernel sending a
e9519f4f 384.BR SIGSYS
cbffd6ae
MK
385signal to the triggering process without executing the system call.
386Various fields will be set in the
387.I siginfo_t
db99c064
MK
388structure (see
389.BR sigaction (2))
390associated with the signal:
cf690e13
MK
391.RS
392.IP * 3
393.I si_signo
394will contain
395.BR SIGSYS .
396.IP *
db99c064 397.IR si_call_addr
cf690e13
MK
398will show the address of the system call instruction.
399.IP *
db99c064 400.IR si_syscall
e9519f4f 401and
db99c064 402.IR si_arch
87aa1b2c 403will indicate which system call was attempted.
cf690e13
MK
404.IP *
405.I si_code
cf690e13
MK
406will contain
407.BR SYS_SECCOMP .
408.IP *
409.I si_errno
410will contain the
411.BR SECCOMP_RET_DATA
412portion of the filter return value.
413.RE
414.IP
50b4bc23 415The program counter will be as though the system call happened
87aa1b2c 416(i.e., it will not point to the system call instruction).
50b4bc23 417The return value register will contain an architecture\-dependent value;
06865301 418if resuming execution, set it to something appropriate for the system call.
a79566fb 419(The architecture dependency is because replacing it with
e9519f4f
KC
420.BR ENOSYS
421could overwrite some useful information.)
e9519f4f
KC
422.TP
423.BR SECCOMP_RET_ERRNO
610082d9 424This value results in the
cbffd6ae
MK
425.B SECCOMP_RET_DATA
426portion of the filter's return value being passed to user space as the
e9519f4f 427.IR errno
cbffd6ae 428value without executing the system call.
e9519f4f
KC
429.TP
430.BR SECCOMP_RET_TRACE
87aa1b2c
MK
431When returned, this value will cause the kernel to attempt to notify a
432.BR ptrace (2)-based
433tracer prior to executing the system call.
50b4bc23 434If there is no tracer present,
3fbe3aa3
MK
435the system call is not executed and returns a failure status with
436.I errno
437set to
438.BR ENOSYS .
e9519f4f
KC
439
440A tracer will be notified if it requests
441.BR PTRACE_O_TRACESECCOMP
442using
443.IR ptrace(PTRACE_SETOPTIONS) .
444The tracer will be notified of a
445.BR PTRACE_EVENT_SECCOMP
446and the
447.BR SECCOMP_RET_DATA
cbffd6ae 448portion of the filter's return value will be available to the tracer via
e9519f4f
KC
449.BR PTRACE_GETEVENTMSG .
450
50b4bc23
MK
451The tracer can skip the system call by changing the system call number
452to \-1.
453Alternatively, the tracer can change the system call
87aa1b2c 454requested by changing the system call to a valid system call number.
50b4bc23
MK
455If the tracer asks to skip the system call, then the system call will
456appear to return the value that the tracer puts in the return value register.
e9519f4f 457
09481df3
JH
458.\" This was changed in ce6526e8afa4.
459.\" A related hole, using PTRACE_SYSCALL instead of SECCOMP_RET_TRACE, was
460.\" changed in arch-specific commits, e.g. 93e35efb8de4 for X86 and
461.\" 0f3912fd934c for ARM.
462Before kernel 4.8, the seccomp check will not be run again after the tracer is
463notified.
464(This means that, on older kernels, seccomp-based sandboxes
50b4bc23 465.B "must not"
87aa1b2c
MK
466allow use of
467.BR ptrace (2)\(emeven
468of other
50b4bc23 469sandboxed processes\(emwithout extreme care;
06865301 470ptracers can use this mechanism to escape from the seccomp sandbox.)
e9519f4f
KC
471.TP
472.BR SECCOMP_RET_ALLOW
610082d9 473This value results in the system call being executed.
e9519f4f
KC
474.SH RETURN VALUE
475On success,
476.BR seccomp ()
477returns 0.
478On error, if
479.BR SECCOMP_FILTER_FLAG_TSYNC
50b4bc23 480was used,
880bc4eb
MK
481the return value is the ID of the thread
482that caused the synchronization failure.
610082d9
MK
483(This ID is a kernel thread ID of the type returned by
484.BR clone (2)
485and
0e27eb0c 486.BR gettid (2).)
50b4bc23 487On other errors, \-1 is returned, and
e9519f4f
KC
488.IR errno
489is set to indicate the cause of the error.
490.SH ERRORS
491.BR seccomp ()
492can fail for the following reasons:
493.TP
494.BR EACCES
50b4bc23 495The caller did not have the
e9519f4f 496.BR CAP_SYS_ADMIN
be8f1272 497capability in its user namespace, or had not set
e9519f4f
KC
498.IR no_new_privs
499before using
500.BR SECCOMP_SET_MODE_FILTER .
501.TP
502.BR EFAULT
503.IR args
43b265ff 504was not a valid address.
e9519f4f
KC
505.TP
506.BR EINVAL
507.IR operation
508is unknown; or
509.IR flags
510are invalid for the given
043ed1c6 511.IR operation .
29efefec
MK
512.TP
513.BR EINVAL
882cf566
MK
514.I operation
515included
516.BR BPF_ABS ,
517but the specified offset was not aligned to a 32-bit boundary or exceeded
518.IR "sizeof(struct\ seccomp_data)" .
882cf566
MK
519.TP
520.BR EINVAL
29efefec
MK
521.\" See kernel/seccomp.c::seccomp_may_assign_mode() in 3.18 sources
522A secure computing mode has already been set, and
523.I operation
524differs from the existing setting.
29efefec
MK
525.TP
526.BR EINVAL
527.\" See stub kernel/seccomp.c::seccomp_set_mode_filter() in 3.18 sources
528.I operation
529specified
530.BR SECCOMP_SET_MODE_FILTER ,
531but the kernel was not built with
532.B CONFIG_SECCOMP_FILTER
533enabled.
29efefec
MK
534.TP
535.BR EINVAL
536.I operation
537specified
538.BR SECCOMP_SET_MODE_FILTER ,
539but the filter program pointed to by
540.I args
541was not valid or the length of the filter program was zero or exceeded
542.B BPF_MAXINSNS
543(4096) instructions.
29efefec
MK
544.TP
545.BR ENOMEM
546Out of memory.
29efefec
MK
547.TP
548.BR ENOMEM
549.\" ENOMEM in kernel/seccomp.c::seccomp_attach_filter() in 3.18 sources
550The total length of all filter programs attached
551to the calling thread would exceed
552.B MAX_INSNS_PER_PATH
553(32768) instructions.
554Note that for the purposes of calculating this limit,
ce67ca23
MK
555each already existing filter program incurs an
556overhead penalty of 4 instructions.
e9519f4f
KC
557.TP
558.BR ESRCH
559Another thread caused a failure during thread sync, but its ID could not
560be determined.
561.SH VERSIONS
87aa1b2c 562The
dff53954 563.BR seccomp ()
87aa1b2c 564system call first appeared in Linux 3.17.
cbffd6ae 565.\" FIXME . Add glibc version
e9519f4f 566.SH CONFORMING TO
87aa1b2c 567The
dff53954 568.BR seccomp ()
87aa1b2c 569system call is a nonstandard Linux extension.
e9519f4f 570.SH NOTES
aea38298
MK
571Rather than hand-coding seccomp filters as shown in the example below,
572you may prefer to employ the
573.I libseccomp
574library, which provides a front-end for generating seccomp filters.
575
1367a60a
MK
576The
577.IR Seccomp
578field of the
579.IR /proc/[pid]/status
580file provides a method of viewing the seccomp mode of a process; see
581.BR proc (5).
582
e9519f4f 583.BR seccomp ()
50b4bc23
MK
584provides a superset of the functionality provided by the
585.BR prctl (2)
b95b411b 586.BR PR_SET_SECCOMP
50b4bc23
MK
587operation (which does not support
588.IR flags ).
88f02af6
MK
589
590Since Linux 4.4, the
591.BR ptrace (2)
592.B PTRACE_SECCOMP_GET_FILTER
593operation can be used to dump a process's seccomp filters.
594.\"
cecc8c48
MK
595.SS Seccomp-specific BPF details
596Note the following BPF details specific to seccomp filters:
597.IP * 3
598The
599.B BPF_H
600and
601.B BPF_B
602size modifiers are not supported: all operations must load and store
603(4-byte) words
604.RB ( BPF_W ).
605.IP *
606To access the contents of the
607.I seccomp_data
608buffer, use the
609.B BPF_ABS
610addressing mode modifier.
cecc8c48
MK
611.IP *
612The
613.B BPF_LEN
614addressing mode modifier yields an immediate mode operand
615whose value is the size of the
616.IR seccomp_data
617buffer.
e9519f4f 618.SH EXAMPLE
35e4506b
MK
619The program below accepts four or more arguments.
620The first three arguments are a system call number,
621a numeric architecture identifier, and an error number.
622The program uses these values to construct a BPF filter
623that is used at run time to perform the following checks:
624.IP [1] 4
625If the program is not running on the specified architecture,
626the BPF filter causes system calls to fail with the error
627.BR ENOSYS .
628.IP [2]
629If the program attempts to execute the system call with the specified number,
630the BPF filter causes the system call to fail, with
631.I errno
632being set to the specified error number.
633.PP
634The remaining command-line arguments specify
635the pathname and additional arguments of a program
636that the example program should attempt to execute using
71bb61ec 637.BR execv (3)
35e4506b
MK
638(a library function that employs the
639.BR execve (2)
640system call).
641Some example runs of the program are shown below.
642
643First, we display the architecture that we are running on (x86-64)
644and then construct a shell function that looks up system call
645numbers on this architecture:
646
647.nf
648.in +4n
649$ \fBuname -m\fP
650x86_64
651$ \fBsyscall_nr() {
652 cat /usr/src/linux/arch/x86/syscalls/syscall_64.tbl | \\
06865301 653 awk '$2 != "x32" && $3 == "'$1'" { print $1 }'
35e4506b
MK
654}\fP
655.in
656.fi
657
658When the BPF filter rejects a system call (case [2] above),
659it causes the system call to fail with the error number
660specified on the command line.
661In the experiments shown here, we'll use error number 99:
662
663.nf
664.in +4n
665$ \fBerrno 99\fP
666EADDRNOTAVAIL 99 Cannot assign requested address
667.in
668.fi
669
670In the following example, we attempt to run the command
671.BR whoami (1),
672but the BPF filter rejects the
673.BR execve (2)
674system call, so that the command is not even executed:
675
676.nf
677.in +4n
678$ \fBsyscall_nr execve\fP
06865301 67959
cbffd6ae
MK
680$ \fB./a.out\fP
681Usage: ./a.out <syscall_nr> <arch> <errno> <prog> [<args>]
682Hint for <arch>: AUDIT_ARCH_I386: 0x40000003
683 AUDIT_ARCH_X86_64: 0xC000003E
35e4506b
MK
684$ \fB./a.out 59 0xC000003E 99 /bin/whoami\fP
685execv: Cannot assign requested address
686.in
687.fi
688
689In the next example, the BPF filter rejects the
690.BR write (2)
691system call, so that, although it is successfully started, the
692.BR whoami (1)
693command is not able to write output:
694
695.nf
696.in +4n
697$ \fBsyscall_nr write\fP
6981
699$ \fB./a.out 1 0xC000003E 99 /bin/whoami\fP
700.in
701.fi
702
703In the final example,
704the BPF filter rejects a system call that is not used by the
705.BR whoami (1)
706command, so it is able to successfully execute and produce output:
707
708.nf
709.in +4n
710$ \fBsyscall_nr preadv\fP
711295
712$ \fB./a.out 295 0xC000003E 99 /bin/whoami\fP
713cecilia
714.in
715.fi
716.SS Program source
e9519f4f
KC
717.nf
718#include <errno.h>
719#include <stddef.h>
720#include <stdio.h>
721#include <stdlib.h>
722#include <unistd.h>
723#include <linux/audit.h>
724#include <linux/filter.h>
725#include <linux/seccomp.h>
726#include <sys/prctl.h>
727
b44088b4
JH
728#define X32_SYSCALL_BIT 0x40000000
729
d06d25cc 730static int
69999632 731install_filter(int syscall_nr, int t_arch, int f_errno)
e9519f4f 732{
b44088b4 733 unsigned int upper_nr_limit = 0xffffffff;
e9855ef2
MK
734
735 /* Assume that AUDIT_ARCH_X86_64 means the normal x86-64 ABI */
b44088b4
JH
736 if (t_arch == AUDIT_ARCH_X86_64)
737 upper_nr_limit = X32_SYSCALL_BIT - 1;
738
e9519f4f 739 struct sock_filter filter[] = {
ef05ec71 740 /* [0] Load architecture from 'seccomp_data' buffer into
64267236 741 accumulator */
06865301 742 BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
e9519f4f 743 (offsetof(struct seccomp_data, arch))),
d06d25cc 744
b44088b4 745 /* [1] Jump forward 5 instructions if architecture does not
ef05ec71 746 match 't_arch' */
b44088b4 747 BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, t_arch, 0, 5),
d06d25cc 748
ef05ec71 749 /* [2] Load system call number from 'seccomp_data' buffer into
64267236 750 accumulator */
06865301 751 BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
e9519f4f 752 (offsetof(struct seccomp_data, nr))),
d06d25cc 753
65cfc712 754 /* [3] Check ABI - only needed for x86-64 in blacklist use
0582770c
MK
755 cases. Use JGT instead of checking against the bit
756 mask to avoid having to reload the syscall number. */
b44088b4
JH
757 BPF_JUMP(BPF_JMP | BPF_JGT | BPF_K, upper_nr_limit, 3, 0),
758
759 /* [4] Jump forward 1 instruction if system call number
ef05ec71 760 does not match 'syscall_nr' */
06865301 761 BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, syscall_nr, 0, 1),
d06d25cc 762
b44088b4 763 /* [5] Matching architecture and system call: don't execute
69999632 764 the system call, and return 'f_errno' in 'errno' */
06865301 765 BPF_STMT(BPF_RET | BPF_K,
69999632 766 SECCOMP_RET_ERRNO | (f_errno & SECCOMP_RET_DATA)),
d06d25cc 767
b44088b4 768 /* [6] Destination of system call number mismatch: allow other
87aa1b2c 769 system calls */
06865301 770 BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
d06d25cc 771
b44088b4 772 /* [7] Destination of architecture mismatch: kill process */
06865301 773 BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_KILL),
e9519f4f 774 };
d06d25cc 775
e9519f4f 776 struct sock_fprog prog = {
d06d25cc 777 .len = (unsigned short) (sizeof(filter) / sizeof(filter[0])),
e9519f4f
KC
778 .filter = filter,
779 };
d06d25cc 780
e9519f4f
KC
781 if (seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog)) {
782 perror("seccomp");
d06d25cc 783 return 1;
e9519f4f 784 }
d06d25cc
MK
785
786 return 0;
e9519f4f
KC
787}
788
d06d25cc
MK
789int
790main(int argc, char **argv)
e9519f4f
KC
791{
792 if (argc < 5) {
cbffd6ae
MK
793 fprintf(stderr, "Usage: "
794 "%s <syscall_nr> <arch> <errno> <prog> [<args>]\\n"
795 "Hint for <arch>: AUDIT_ARCH_I386: 0x%X\\n"
796 " AUDIT_ARCH_X86_64: 0x%X\\n"
797 "\\n", argv[0], AUDIT_ARCH_I386, AUDIT_ARCH_X86_64);
d06d25cc 798 exit(EXIT_FAILURE);
e9519f4f 799 }
d06d25cc 800
e9519f4f
KC
801 if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
802 perror("prctl");
d06d25cc 803 exit(EXIT_FAILURE);
e9519f4f 804 }
d06d25cc 805
e9519f4f
KC
806 if (install_filter(strtol(argv[1], NULL, 0),
807 strtol(argv[2], NULL, 0),
808 strtol(argv[3], NULL, 0)))
d06d25cc
MK
809 exit(EXIT_FAILURE);
810
e9519f4f
KC
811 execv(argv[4], &argv[4]);
812 perror("execv");
d06d25cc 813 exit(EXIT_FAILURE);
e9519f4f
KC
814}
815.fi
816.SH SEE ALSO
00eaa6aa 817.BR bpf (2),
e9519f4f
KC
818.BR prctl (2),
819.BR ptrace (2),
3b4a59c4 820.BR sigaction (2),
712551ea 821.BR proc (5),
e9519f4f
KC
822.BR signal (7),
823.BR socket (7)
18abaf3e 824.sp
ca6f43fc
MK
825Various pages from the
826.I libseccomp
827library, including:
ccc70c81 828.BR scmp_sys_resolver (1),
ca6f43fc
MK
829.BR seccomp_init (3),
830.BR seccomp_load (3),
831.BR seccomp_rule_add (3),
832and
833.BR seccomp_export_bpf (3).
834.sp
e28e21fe
MK
835The kernel source files
836.IR Documentation/networking/filter.txt
837and
838.IR Documentation/prctl/seccomp_filter.txt .
f74fd424
MK
839.sp
840McCanne, S. and Jacobson, V. (1992)
841.IR "The BSD Packet Filter: A New Architecture for User-level Packet Capture" ,
842Proceedings of the USENIX Winter 1993 Conference
843.UR http://www.tcpdump.org/papers/bpf-usenix93.pdf
844.UE