gettid.2, open.2, seccomp.2, pthread_mutex_consistent.3, signal.7, xattr.7: Put SEE...

[thirdparty/man-pages.git] / man2 / seccomp.2
diff --git a/man2/seccomp.2 b/man2/seccomp.2

index 28eb572e05f534e7bf51a00c1d92c7c218984342..4707f86287eb74e989b92e4c4c3c656923d48cdf 100644 (file)
--- a/man2/seccomp.2
+++ b/man2/seccomp.2
@@ -1,6 +1,7 @@
  .\" Copyright (C) 2014 Kees Cook <keescook@chromium.org>
  .\" and Copyright (C) 2012 Will Drewry <wad@chromium.org>
-.\" and Copyright (C) 2008, 2014 Michael Kerrisk <mtk.manpages@gmail.com>
+.\" and Copyright (C) 2008, 2014, 2017 Michael Kerrisk <mtk.manpages@gmail.com>
+.\" and Copyright (C) 2017 Tyler Hicks <tyhicks@canonical.com>
  .\"
  .\" %%%LICENSE_START(VERBATIM)
  .\" Permission is granted to make and distribute verbatim copies of this
@@ -24,7 +25,7 @@
  .\" the source, must acknowledge the copyright and authors of this work.
  .\" %%%LICENSE_END
  .\"
-.TH SECCOMP 2 2017-09-15 "Linux" "Linux Programmer's Manual"
+.TH SECCOMP 2 2019-11-19 "Linux" "Linux Programmer's Manual"
  .SH NAME
  seccomp \- operate on Secure Computing state of the process
  .SH SYNOPSIS
@@ -105,7 +106,11 @@ must be NULL.
  .IP
  This operation is functionally identical to the call:
  .IP
-    prctl(PR_SET_SECCOMP, SECCOMP_MODE_STRICT);
+.in +4n
+.EX
+prctl(PR_SET_SECCOMP, SECCOMP_MODE_STRICT);
+.EE
+.in
  .TP
  .BR SECCOMP_SET_MODE_FILTER
  The system calls allowed are defined by a pointer to a Berkeley Packet
@@ -136,7 +141,7 @@ the existing filters will be preserved across a call to
  .IP
  In order to use the
  .BR SECCOMP_SET_MODE_FILTER
-operation, either the caller must have the
+operation, either the calling thread must have the
  .BR CAP_SYS_ADMIN
  capability in its user namespace, or the thread must already have the
  .I no_new_privs
@@ -144,7 +149,11 @@ bit set.
  If that bit was not already set by an ancestor of this thread,
  the thread must make the following call:
  .IP
-    prctl(PR_SET_NO_NEW_PRIVS, 1);
+.in +4n
+.EX
+prctl(PR_SET_NO_NEW_PRIVS, 1);
+.EE
+.in
  .IP
  Otherwise, the
  .BR SECCOMP_SET_MODE_FILTER
@@ -159,7 +168,7 @@ other privileged program using
  thus potentially compromising that program.
  (Such a malicious filter might, for example, cause an attempt to use
  .BR setuid (2)
-to set the caller's user IDs to non-zero values to instead
+to set the caller's user IDs to nonzero values to instead
  return 0 without actually making the system call.
  Thus, the program might be tricked into retaining superuser privileges
  in circumstances where it is possible to influence it to do
@@ -183,7 +192,11 @@ When
  .IR flags
  is 0, this operation is functionally identical to the call:
  .IP
-    prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, args);
+.in +4n
+.EX
+prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, args);
+.EE
+.in
  .IP
  The recognized
  .IR flags
@@ -210,15 +223,24 @@ diverging from the calling thread's filter tree.
  .\" commit e66a39977985b1e69e17c4042cb290768eca9b02
  All filter return actions except
  .BR SECCOMP_RET_ALLOW
-should be logged. An administrator may override this filter flag by preventing specific actions from being logged via the
+should be logged.
+An administrator may override this filter flag by preventing specific
+actions from being logged via the
  .IR /proc/sys/kernel/seccomp/actions_logged
  file.
+.TP
+.BR SECCOMP_FILTER_FLAG_SPEC_ALLOW " (since Linux 4.17)"
+.\" commit 00a02d0c502a06d15e07b857f8ff921e3e402675
+Disable Speculative Store Bypass mitigation.
  .RE
  .TP
  .BR SECCOMP_GET_ACTION_AVAIL " (since Linux 4.14)"
  .\" commit d612b1fd8010d0d67b5287fe146b8b55bcbb8655
-Test to see if an action is supported by the kernel. This operation is helpful to confirm that the kernel knows of a more recently added filter return action since the kernel treats all unknown actions as
-.BR SECCOMP_RET_KILL .
+Test to see if an action is supported by the kernel.
+This operation is helpful to confirm that the kernel knows
+of a more recently added filter return action
+since the kernel treats all unknown actions as
+.BR SECCOMP_RET_KILL_PROCESS .
  .IP
  The value of
  .IR flags
@@ -279,18 +301,24 @@ struct seccomp_data {
  .PP
  Because numbering of system calls varies between architectures and
  some architectures (e.g., x86-64) allow user-space code to use
-the calling conventions of multiple architectures, it is usually
-necessary to verify the value of the
+the calling conventions of multiple architectures
+(and the convention being used may vary over the life of a process that uses
+.BR execve (2)
+to execute binaries that employ the different conventions),
+it is usually necessary to verify the value of the
  .IR arch
  field.
  .PP
-It is strongly recommended to use a whitelisting approach whenever
+It is strongly recommended to use an allow-list approach whenever
  possible because such an approach is more robust and simple.
-A blacklist will have to be updated whenever a potentially
+A deny-list will have to be updated whenever a potentially
  dangerous system call is added (or a dangerous flag or option if those
-are blacklisted), and it is often possible to alter the
+are deny-listed), and it is often possible to alter the
  representation of a value without altering its meaning, leading to
-a blacklist bypass.
+a deny-list bypass.
+See also
+.IR Caveats
+below.
  .PP
  The
  .IR arch
@@ -315,7 +343,7 @@ is used on the system call number to tell the two ABIs apart.
  .\"     so that the syscall table indexing still works.
  .PP
  This means that in order to create a seccomp-based
-blacklist for system calls performed through the x86-64 ABI,
+deny-list for system calls performed through the x86-64 ABI,
  it is necessary to not only check that
  .IR arch
  equals
@@ -341,7 +369,7 @@ system calls to prevent the program from subverting such checks.)
  .PP
  When checking values from
  .IR args
-against a blacklist, keep in mind that arguments are often
+against a deny-list, keep in mind that arguments are often
  silently truncated before being processed, but after the seccomp check.
  For example, this happens if the i386 ABI is used on an
  x86-64 kernel: although the kernel will normally not look beyond
@@ -356,7 +384,7 @@ the system call, but visible in the seccomp data.
  A seccomp filter returns a 32-bit value consisting of two parts:
  the most significant 16 bits
  (corresponding to the mask defined by the constant
-.BR SECCOMP_RET_ACTION )
+.BR SECCOMP_RET_ACTION_FULL )
  contain one of the "action" values listed below;
  the least significant 16-bits (defined by the constant
  .BR SECCOMP_RET_DATA )
@@ -383,45 +411,85 @@ avoiding a check for this uncommon case.)
  .\"     execution logic clear, simple, and as fast as possible for all
  .\"     filters.
  The return value for the evaluation of a given system call is the first-seen
-.BR SECCOMP_RET_ACTION
-value of highest precedence (along with its accompanying data)
+action value of highest precedence (along with its accompanying data)
  returned by execution of all of the filters.
  .PP
  In decreasing order of precedence,
-the values that may be returned by a seccomp filter are:
+the action values that may be returned by a seccomp filter are:
  .TP
-.BR SECCOMP_RET_KILL
-This value results in the task exiting immediately
-.\" We really do man "task" here, not "process"
-without executing the system call.
-The task terminates as though killed by a
+.BR SECCOMP_RET_KILL_PROCESS " (since Linux 4.14)"
+.\" commit 4d3b0b05aae9ee9ce0970dc4cc0fb3fad5e85945
+.\" commit 0466bdb99e8744bc9befa8d62a317f0fd7fd7421
+This value results in immediate termination of the process,
+with a core dump.
+The system call is not executed.
+By contrast with
+.BR SECCOMP_RET_KILL_THREAD
+below, all threads in the thread group are terminated.
+(For a discussion of thread groups, see the description of the
+.BR CLONE_THREAD
+flag in
+.BR clone (2).)
+.IP
+The process terminates
+.I "as though"
+killed by a
  .B SIGSYS
-signal
-.RI ( not
-.BR SIGKILL ).
-Even if a signal handler has been registered and otherwise catches
+signal.
+Even if a signal handler has been registered for
  .BR SIGSYS ,
-the handler will be ignored in this case and the task always terminates.
+the handler will be ignored in this case and the process always terminates.
+To a parent process that is waiting on this process (using
+.BR waitpid (2)
+or similar), the returned
+.I wstatus
+will indicate that its child was terminated as though by a
+.BR SIGSYS
+signal.
+.TP
+.BR SECCOMP_RET_KILL_THREAD " (or " SECCOMP_RET_KILL )
+This value results in immediate termination of the thread
+that made the system call.
+The system call is not executed.
+Other threads in the same thread group will continue to execute.
+.IP
+The thread terminates
+.I "as though"
+killed by a
+.B SIGSYS
+signal.
+See
+.BR SECCOMP_RET_KILL_PROCESS
+above.
  .IP
  .\" See these commits:
-.\" seccomp: dump core when using SECCOMP_RET_KILL (b25e67161c295c98acda92123b2dd1e7d8642901)
-.\" seccomp: Only dump core when single-threaded (d7276e321ff8a53106a59c85ca46d03e34288893)
+.\" seccomp: dump core when using SECCOMP_RET_KILL
+.\"    (b25e67161c295c98acda92123b2dd1e7d8642901)
+.\" seccomp: Only dump core when single-threaded
+.\"    (d7276e321ff8a53106a59c85ca46d03e34288893)
  Before Linux 4.11,
-any process terminated this way would not trigger a coredump
+any process terminated in this way would not trigger a core dump
  (even though
  .B SIGSYS
  is documented in
  .BR signal (7)
  as having a default action of termination with a core dump).
  Since Linux 4.11,
-single threaded processes follow standard core dump behavior,
-but multithreaded processes still do not.
-There is no workaround currently for multithreaded processes.
+a single-threaded process will dump core if terminated in this way.
+.IP
+With the addition of
+.BR SECCOMP_RET_KILL_PROCESS
+in Linux 4.14,
+.BR SECCOMP_RET_KILL_THREAD
+was added as a synonym for
+.BR SECCOMP_RET_KILL ,
+in order to more clearly distinguish the two actions.
  .TP
  .BR SECCOMP_RET_TRAP
-This value results in the kernel sending a
+This value results in the kernel sending a thread-directed
  .BR SIGSYS
-signal to the triggering process without executing the system call.
+signal to the triggering thread.
+(The system call is not executed.)
  Various fields will be set in the
  .I siginfo_t
  structure (see
@@ -452,7 +520,7 @@ portion of the filter return value.
  .RE
  .IP
  The program counter will be as though the system call happened
-(i.e., it will not point to the system call instruction).
+(i.e., the program counter will not point to the system call instruction).
  The return value register will contain an architecture\-dependent value;
  if resuming execution, set it to something appropriate for the system call.
  (The architecture dependency is because replacing it with
@@ -508,8 +576,27 @@ of other
  sandboxed processes\(emwithout extreme care;
  ptracers can use this mechanism to escape from the seccomp sandbox.)
  .TP
+.BR SECCOMP_RET_LOG " (since Linux 4.14)"
+.\" commit 59f5cf44a38284eb9e76270c786fb6cc62ef8ac4
+This value results in the system call being executed after
+the filter return action is logged.
+An administrator may override the logging of this action via
+the
+.IR /proc/sys/kernel/seccomp/actions_logged
+file.
+.TP
  .BR SECCOMP_RET_ALLOW
  This value results in the system call being executed.
+.PP
+If an action value other than one of the above is specified,
+then the filter action is treated as either
+.BR SECCOMP_RET_KILL_PROCESS
+(since Linux 4.14)
+.\" commit 4d3b0b05aae9ee9ce0970dc4cc0fb3fad5e85945
+or
+.BR SECCOMP_RET_KILL_THREAD
+(in Linux 4.13 and earlier).
+.\"
  .SS /proc interfaces
  The files in the directory
  .IR /proc/sys/kernel/seccomp
@@ -517,22 +604,25 @@ provide additional seccomp information and configuration:
  .TP
  .IR actions_avail " (since Linux 4.14)"
  .\" commit 8e5f1ad116df6b0de65eac458d5e7c318d1c05af
-A read-only ordered list of seccomp filter return actions in string form. The
-ordering, from left-to-right, is in decreasing order of precedence. The list
-represents the set of seccomp filter return actions supported by the kernel.
+A read-only ordered list of seccomp filter return actions in string form.
+The ordering, from left to right, is in decreasing order of precedence.
+The list represents the set of seccomp filter return actions
+supported by the kernel.
  .TP
  .IR actions_logged " (since Linux 4.14)"
  .\" commit 0ddec0fc8900201c0897b87b762b7c420436662f
-A read-write ordered list of seccomp filter return actions that are allowed to
-be logged. Writes to the file do not need to be in ordered form but reads from
+A read-write ordered list of seccomp filter return actions that
+are allowed to be logged.
+Writes to the file do not need to be in ordered form but reads from
  the file will be ordered in the same way as the
  .IR actions_avail
  file.
  .IP
  It is important to note that the value of
  .IR actions_logged
-does not prevent certain filter return actions from being logged when the audit
-subsystem is configured to audit a task. If the action is not found in the
+does not prevent certain filter return actions from being logged when
+the audit subsystem is configured to audit a task.
+If the action is not found in the
  .IR actions_logged
  file, the final decision on whether to audit the action for that task is
  ultimately left up to the audit subsystem to decide for all filter return
@@ -543,9 +633,47 @@ The "allow" string is not accepted in the
  .IR actions_logged
  file as it is not possible to log
  .BR SECCOMP_RET_ALLOW
-actions. Attempting to write "allow" to the file will result in
-.BR EINVAL
-being returned.
+actions.
+Attempting to write "allow" to the file will fail with the error
+.BR EINVAL .
+.\"
+.SS Audit logging of seccomp actions
+.\" commit 59f5cf44a38284eb9e76270c786fb6cc62ef8ac4
+Since Linux 4.14, the kernel provides the facility to log the
+actions returned by seccomp filters in the audit log.
+The kernel makes the decision to log an action based on
+the action type, whether or not the action is present in the
+.I actions_logged
+file, and whether kernel auditing is enabled
+(e.g., via the kernel boot option
+.IR audit=1 ).
+.\" or auditing could be enabled via the netlink API (AUDIT_SET)
+The rules are as follows:
+.IP * 3
+If the action is
+.BR SECCOMP_RET_ALLOW ,
+the action is not logged.
+.IP *
+Otherwise, if the action is either
+.BR SECCOMP_RET_KILL_PROCESS
+or
+.BR SECCOMP_RET_KILL_THREAD ,
+and that action appears in the
+.IR actions_logged
+file, the action is logged.
+.IP *
+Otherwise, if the filter has requested logging (the
+.BR SECCOMP_FILTER_FLAG_LOG
+flag)
+and the action appears in the
+.IR actions_logged
+file, the action is logged.
+.IP *
+Otherwise, if kernel auditing is enabled and the process is being audited
+.RB ( autrace (8)),
+the action is logged.
+.IP *
+Otherwise, the action is not logged.
  .SH RETURN VALUE
  On success,
  .BR seccomp ()
@@ -566,7 +694,7 @@ is set to indicate the cause of the error.
  .BR seccomp ()
  can fail for the following reasons:
  .TP
-.BR EACCESS
+.BR EACCES
  The caller did not have the
  .BR CAP_SYS_ADMIN
  capability in its user namespace, or had not set
@@ -580,7 +708,10 @@ was not a valid address.
  .TP
  .BR EINVAL
  .IR operation
-is unknown; or
+is unknown or is not supported by this kernel version or configuration.
+.TP
+.B EINVAL
+The specified
  .IR flags
  are invalid for the given
  .IR operation .
@@ -599,15 +730,6 @@ A secure computing mode has already been set, and
  differs from the existing setting.
  .TP
  .BR EINVAL
-.\" See stub kernel/seccomp.c::seccomp_set_mode_filter() in 3.18 sources
-.I operation
-specified
-.BR SECCOMP_SET_MODE_FILTER ,
-but the kernel was not built with
-.B CONFIG_SECCOMP_FILTER
-enabled.
-.TP
-.BR EINVAL
  .I operation
  specified
  .BR SECCOMP_SET_MODE_FILTER ,
@@ -670,10 +792,100 @@ operation (which does not support
  .IR flags ).
  .PP
  Since Linux 4.4, the
-.BR prctl (2)
+.BR ptrace (2)
  .B PTRACE_SECCOMP_GET_FILTER
  operation can be used to dump a process's seccomp filters.
  .\"
+.SS Architecture support for seccomp BPF
+Architecture support for seccomp BPF filtering
+.\" Check by grepping for HAVE_ARCH_SECCOMP_FILTER in Kconfig files in
+.\" kernel source. Last checked in Linux 4.16-rc source.
+is available on the following architectures:
+.IP * 3
+x86-64, i386, x32 (since Linux 3.5)
+.PD 0
+.IP *
+ARM (since Linux 3.8)
+.IP *
+s390 (since Linux 3.8)
+.IP *
+MIPS (since Linux 3.16)
+.IP *
+ARM-64 (since Linux 3.19)
+.IP *
+PowerPC (since Linux 4.3)
+.IP *
+Tile (since Linux 4.3)
+.IP *
+PA-RISC (since Linux 4.6)
+.\" User mode Linux since Linux 4.6
+.PD
+.\"
+.SS Caveats
+There are various subtleties to consider when applying seccomp filters
+to a program, including the following:
+.IP * 3
+Some traditional system calls have user-space implementations in the
+.BR vdso (7)
+on many architectures.
+Notable examples include
+.BR clock_gettime (2),
+.BR gettimeofday (2),
+and
+.BR time (2).
+On such architectures,
+seccomp filtering for these system calls will have no effect.
+(However, there are cases where the
+.BR vdso (7)
+implementations may fall back to invoking the true system call,
+in which case seccomp filters would see the system call.)
+.IP *
+Seccomp filtering is based on system call numbers.
+However, applications typically do not directly invoke system calls,
+but instead call wrapper functions in the C library which
+in turn invoke the system calls.
+Consequently, one must be aware of the following:
+.RS
+.IP \(bu 3
+The glibc wrappers for some traditional system calls may actually
+employ system calls with different names in the kernel.
+For example, the
+.BR exit (2)
+wrapper function actually employs the
+.BR exit_group (2)
+system call, and the
+.BR fork (2)
+wrapper function actually calls
+.BR clone (2).
+.IP \(bu
+The behavior of wrapper functions may vary across architectures,
+according to the range of system calls provided on those architectures.
+In other words, the same wrapper function may invoke
+different system calls on different architectures.
+.IP \(bu
+Finally, the behavior of wrapper functions can change across glibc versions.
+For example, in older versions, the glibc wrapper function for
+.BR open (2)
+invoked the system call of the same name,
+but starting in glibc 2.26, the implementation switched to calling
+.BR openat (2)
+on all architectures.
+.RE
+.PP
+The consequence of the above points is that it may be necessary
+to filter for a system call other than the one that might be expected.
+Various manual pages in Section 2 provide helpful details
+about the differences between wrapper functions and
+the underlying system calls in subsections entitled
+.IR "C library/kernel differences" .
+.PP
+Furthermore, note that the application of seccomp filters
+even risks causing bugs in an application,
+when the filters cause unexpected failures for legitimate operations
+that the application might need to perform.
+Such bugs may not easily be discovered when testing the seccomp
+filters if the bugs occur in rarely used application code paths.
+.\"
  .SS Seccomp-specific BPF details
  Note the following BPF details specific to seccomp filters:
  .IP * 3
@@ -731,7 +943,7 @@ numbers on this architecture:
  $ \fBuname -m\fP
  x86_64
  $ \fBsyscall_nr() {
-    cat /usr/src/linux/arch/x86/syscalls/syscall_64.tbl | \\
+    cat /usr/src/linux/arch/x86/syscalls/syscall_64.tbl | \e
      awk '$2 != "x32" && $3 == "'$1'" { print $1 }'
  }\fP
  .EE
@@ -814,7 +1026,9 @@ install_filter(int syscall_nr, int t_arch, int f_errno)
  {
      unsigned int upper_nr_limit = 0xffffffff;
  
-    /* Assume that AUDIT_ARCH_X86_64 means the normal x86-64 ABI */
+    /* Assume that AUDIT_ARCH_X86_64 means the normal x86-64 ABI
+       (in the x32 ABI, all system calls have bit 30 set in the
+       'nr' field, meaning the numbers are >= X32_SYSCALL_BIT) */
      if (t_arch == AUDIT_ARCH_X86_64)
          upper_nr_limit = X32_SYSCALL_BIT - 1;
  
@@ -833,7 +1047,7 @@ install_filter(int syscall_nr, int t_arch, int f_errno)
          BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
                   (offsetof(struct seccomp_data, nr))),
  
-        /* [3] Check ABI - only needed for x86-64 in blacklist use
+        /* [3] Check ABI - only needed for x86-64 in deny-list use
                 cases.  Use BPF_JGT instead of checking against the bit
                 mask to avoid having to reload the syscall number. */
          BPF_JUMP(BPF_JMP | BPF_JGT | BPF_K, upper_nr_limit, 3, 0),
@@ -843,7 +1057,7 @@ install_filter(int syscall_nr, int t_arch, int f_errno)
          BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, syscall_nr, 0, 1),
  
          /* [5] Matching architecture and system call: don't execute
-              the system call, and return 'f_errno' in 'errno' */
+           the system call, and return 'f_errno' in 'errno' */
          BPF_STMT(BPF_RET | BPF_K,
                   SECCOMP_RET_ERRNO | (f_errno & SECCOMP_RET_DATA)),
  
@@ -873,10 +1087,10 @@ main(int argc, char **argv)
  {
      if (argc < 5) {
          fprintf(stderr, "Usage: "
-                "%s <syscall_nr> <arch> <errno> <prog> [<args>]\\n"
-                "Hint for <arch>: AUDIT_ARCH_I386: 0x%X\\n"
-                "                 AUDIT_ARCH_X86_64: 0x%X\\n"
-                "\\n", argv[0], AUDIT_ARCH_I386, AUDIT_ARCH_X86_64);
+                "%s <syscall_nr> <arch> <errno> <prog> [<args>]\en"
+                "Hint for <arch>: AUDIT_ARCH_I386: 0x%X\en"
+                "                 AUDIT_ARCH_X86_64: 0x%X\en"
+                "\en", argv[0], AUDIT_ARCH_I386, AUDIT_ARCH_X86_64);
          exit(EXIT_FAILURE);
      }
  
@@ -896,6 +1110,7 @@ main(int argc, char **argv)
  }
  .EE
  .SH SEE ALSO
+.BR bpfc (1),
  .BR strace (1),
  .BR bpf (2),
  .BR prctl (2),
@@ -909,11 +1124,11 @@ Various pages from the
  .I libseccomp
  library, including:
  .BR scmp_sys_resolver (1),
+.BR seccomp_export_bpf (3),
  .BR seccomp_init (3),
  .BR seccomp_load (3),
-.BR seccomp_rule_add (3),
  and
-.BR seccomp_export_bpf (3).
+.BR seccomp_rule_add (3).
  .PP
  The kernel source files
  .IR Documentation/networking/filter.txt
@@ -924,7 +1139,7 @@ and
  .IR Documentation/prctl/seccomp_filter.txt
  before Linux 4.13).
  .PP
-McCanne, S. and Jacobson, V. (1992)
+McCanne, S.\& and Jacobson, V.\& (1992)
  .IR "The BSD Packet Filter: A New Architecture for User-level Packet Capture" ,
  Proceedings of the USENIX Winter 1993 Conference
  .UR http://www.tcpdump.org/papers/bpf\-usenix93.pdf