]> git.ipfire.org Git - thirdparty/man-pages.git/blob - man2/seccomp_unotify.2
Many pages: Use correct letter case in page titles (TH)
[thirdparty/man-pages.git] / man2 / seccomp_unotify.2
1 .\" Copyright (C) 2020 Michael Kerrisk <mtk.manpages@gmail.com>
2 .\"
3 .\" SPDX-License-Identifier: Linux-man-pages-copyleft
4 .\"
5 .TH seccomp_unotify 2 (date) "Linux man-pages (unreleased)"
6 .SH NAME
7 seccomp_unotify \- Seccomp user-space notification mechanism
8 .SH LIBRARY
9 Standard C library
10 .RI ( libc ", " \-lc )
11 .SH SYNOPSIS
12 .nf
13 .B #include <linux/seccomp.h>
14 .B #include <linux/filter.h>
15 .B #include <linux/audit.h>
16 .PP
17 .BI "int seccomp(unsigned int " operation ", unsigned int " flags \
18 ", void *" args );
19 .PP
20 .B #include <sys/ioctl.h>
21 .PP
22 .BI "int ioctl(int " fd ", SECCOMP_IOCTL_NOTIF_RECV,"
23 .BI " struct seccomp_notif *" req );
24 .BI "int ioctl(int " fd ", SECCOMP_IOCTL_NOTIF_SEND,"
25 .BI " struct seccomp_notif_resp *" resp );
26 .BI "int ioctl(int " fd ", SECCOMP_IOCTL_NOTIF_ID_VALID, __u64 *" id );
27 .BI "int ioctl(int " fd ", SECCOMP_IOCTL_NOTIF_ADDFD,"
28 .BI " struct seccomp_notif_addfd *" addfd );
29 .fi
30 .SH DESCRIPTION
31 This page describes the user-space notification mechanism provided by the
32 Secure Computing (seccomp) facility.
33 As well as the use of the
34 .B SECCOMP_FILTER_FLAG_NEW_LISTENER
35 flag, the
36 .B SECCOMP_RET_USER_NOTIF
37 action value, and the
38 .B SECCOMP_GET_NOTIF_SIZES
39 operation described in
40 .BR seccomp (2),
41 this mechanism involves the use of a number of related
42 .BR ioctl (2)
43 operations (described below).
44 .\"
45 .SS Overview
46 In conventional usage of a seccomp filter,
47 the decision about how to treat a system call is made by the filter itself.
48 By contrast, the user-space notification mechanism allows
49 the seccomp filter to delegate
50 the handling of the system call to another user-space process.
51 Note that this mechanism is explicitly
52 .B not
53 intended as a method of implementing security policy; see NOTES.
54 .PP
55 In the discussion that follows,
56 the thread(s) on which the seccomp filter is installed is (are)
57 referred to as the
58 .IR target ,
59 and the process that is notified by the user-space notification
60 mechanism is referred to as the
61 .IR supervisor .
62 .PP
63 A suitably privileged supervisor can use the user-space notification
64 mechanism to perform actions on behalf of the target.
65 The advantage of the user-space notification mechanism is that
66 the supervisor will
67 usually be able to retrieve information about the target and the
68 performed system call that the seccomp filter itself cannot.
69 (A seccomp filter is limited in the information it can obtain and
70 the actions that it can perform because it
71 is running on a virtual machine inside the kernel.)
72 .PP
73 An overview of the steps performed by the target and the supervisor
74 is as follows:
75 .\"-------------------------------------
76 .IP (1) 5
77 The target establishes a seccomp filter in the usual manner,
78 but with two differences:
79 .RS
80 .IP \(bu 3
81 The
82 .BR seccomp (2)
83 .I flags
84 argument includes the flag
85 .BR SECCOMP_FILTER_FLAG_NEW_LISTENER .
86 Consequently, the return value of the (successful)
87 .BR seccomp (2)
88 call is a new "listening"
89 file descriptor that can be used to receive notifications.
90 Only one "listening" seccomp filter can be installed for a thread.
91 .\" FIXME
92 .\" Is the last sentence above correct?
93 .\"
94 .\" Kees Cook (25 Oct 2020) notes:
95 .\"
96 .\" I like this limitation, but I expect that it'll need to change in the
97 .\" future. Even with LSMs, we see the need for arbitrary stacking, and the
98 .\" idea of there being only 1 supervisor will eventually break down. Right
99 .\" now there is only 1 because only container managers are using this
100 .\" feature. But if some daemon starts using it to isolate some thread,
101 .\" suddenly it might break if a container manager is trying to listen to it
102 .\" too, etc. I expect it won't be needed soon, but I do think it'll change.
103 .\"
104 .IP \(bu
105 In cases where it is appropriate, the seccomp filter returns the action value
106 .BR SECCOMP_RET_USER_NOTIF .
107 This return value will trigger a notification event.
108 .RE
109 .\"-------------------------------------
110 .IP (2)
111 In order that the supervisor can obtain notifications
112 using the listening file descriptor,
113 (a duplicate of) that file descriptor must be passed from
114 the target to the supervisor.
115 One way in which this could be done is by passing the file descriptor
116 over a UNIX domain socket connection between the target and the supervisor
117 (using the
118 .B SCM_RIGHTS
119 ancillary message type described in
120 .BR unix (7)).
121 Another way to do this is through the use of
122 .BR pidfd_getfd (2).
123 .\" Jann Horn:
124 .\" Instead of using unix domain sockets to send the fd to the
125 .\" parent, I think you could also use clone3() with
126 .\" flags==CLONE_FILES|SIGCHLD, dup2() the seccomp fd to an fd
127 .\" that was reserved in the parent, call unshare(CLONE_FILES)
128 .\" in the child after setting up the seccomp fd, and wake
129 .\" up the parent with something like pthread_cond_signal()?
130 .\" I'm not sure whether that'd look better or worse in the
131 .\" end though, so maybe just ignore this comment.
132 .\"-------------------------------------
133 .IP (3)
134 The supervisor will receive notification events
135 on the listening file descriptor.
136 These events are returned as structures of type
137 .IR seccomp_notif .
138 Because this structure and its size may evolve over kernel versions,
139 the supervisor must first determine the size of this structure
140 using the
141 .BR seccomp (2)
142 .B SECCOMP_GET_NOTIF_SIZES
143 operation, which returns a structure of type
144 .IR seccomp_notif_sizes .
145 The supervisor allocates a buffer of size
146 .I seccomp_notif_sizes.seccomp_notif
147 bytes to receive notification events.
148 In addition, the supervisor allocates another buffer of size
149 .I seccomp_notif_sizes.seccomp_notif_resp
150 bytes for the response (a
151 .I struct seccomp_notif_resp
152 structure)
153 that it will provide to the kernel (and thus the target).
154 .\"-------------------------------------
155 .IP (4)
156 The target then performs its workload,
157 which includes system calls that will be controlled by the seccomp filter.
158 Whenever one of these system calls causes the filter to return the
159 .B SECCOMP_RET_USER_NOTIF
160 action value, the kernel does
161 .I not
162 (yet) execute the system call;
163 instead, execution of the target is temporarily blocked inside
164 the kernel (in a sleep state that is interruptible by signals)
165 and a notification event is generated on the listening file descriptor.
166 .\"-------------------------------------
167 .IP (5)
168 The supervisor can now repeatedly monitor the
169 listening file descriptor for
170 .BR SECCOMP_RET_USER_NOTIF -triggered
171 events.
172 To do this, the supervisor uses the
173 .B SECCOMP_IOCTL_NOTIF_RECV
174 .BR ioctl (2)
175 operation to read information about a notification event;
176 this operation blocks until an event is available.
177 The operation returns a
178 .I seccomp_notif
179 structure containing information about the system call
180 that is being attempted by the target.
181 (As described in NOTES,
182 the file descriptor can also be monitored with
183 .BR select (2),
184 .BR poll (2),
185 or
186 .BR epoll (7).)
187 .\" FIXME
188 .\" Christian Brauner:
189 .\"
190 .\" Do we support O_NONBLOCK with SECCOMP_IOCTL_NOTIF_RECV and if
191 .\" not should we?
192 .\"
193 .\" Michael Kerrisk:
194 .\"
195 .\" A quick test suggests that O_NONBLOCK has no effect on the blocking
196 .\" behavior of SECCOMP_IOCTL_NOTIF_RECV.
197 .
198 .\"-------------------------------------
199 .IP (6)
200 The
201 .I seccomp_notif
202 structure returned by the
203 .B SECCOMP_IOCTL_NOTIF_RECV
204 operation includes the same information (a
205 .I seccomp_data
206 structure) that was passed to the seccomp filter.
207 This information allows the supervisor to discover the system call number and
208 the arguments for the target's system call.
209 In addition, the notification event contains the ID of the thread
210 that triggered the notification and a unique cookie value that
211 is used in subsequent
212 .B SECCOMP_IOCTL_NOTIF_ID_VALID
213 and
214 .B SECCOMP_IOCTL_NOTIF_SEND
215 operations.
216 .IP
217 The information in the notification can be used to discover the
218 values of pointer arguments for the target's system call.
219 (This is something that can't be done from within a seccomp filter.)
220 One way in which the supervisor can do this is to open the corresponding
221 .IR /proc/ tid /mem
222 file (see
223 .BR proc (5))
224 and read bytes from the location that corresponds to one of
225 the pointer arguments whose value is supplied in the notification event.
226 .\" Tycho Andersen mentioned that there are alternatives to /proc/PID/mem,
227 .\" such as ptrace() and /proc/PID/map_files
228 (The supervisor must be careful to avoid
229 a race condition that can occur when doing this;
230 see the description of the
231 .B SECCOMP_IOCTL_NOTIF_ID_VALID
232 .BR ioctl (2)
233 operation below.)
234 In addition,
235 the supervisor can access other system information that is visible
236 in user space but which is not accessible from a seccomp filter.
237 .\"-------------------------------------
238 .IP (7)
239 Having obtained information as per the previous step,
240 the supervisor may then choose to perform an action in response
241 to the target's system call
242 (which, as noted above, is not executed when the seccomp filter returns the
243 .B SECCOMP_RET_USER_NOTIF
244 action value).
245 .IP
246 One example use case here relates to containers.
247 The target may be located inside a container where
248 it does not have sufficient capabilities to mount a filesystem
249 in the container's mount namespace.
250 However, the supervisor may be a more privileged process that
251 does have sufficient capabilities to perform the mount operation.
252 .\"-------------------------------------
253 .IP (8)
254 The supervisor then sends a response to the notification.
255 The information in this response is used by the kernel to construct
256 a return value for the target's system call and provide
257 a value that will be assigned to the
258 .I errno
259 variable of the target.
260 .IP
261 The response is sent using the
262 .B SECCOMP_IOCTL_NOTIF_SEND
263 .BR ioctl (2)
264 operation, which is used to transmit a
265 .I seccomp_notif_resp
266 structure to the kernel.
274 This structure must include the cookie value that the supervisor
275 obtained in the
276 .I seccomp_notif
277 structure returned by the
278 .B SECCOMP_IOCTL_NOTIF_RECV
279 operation;
280 the cookie allows the kernel to associate the response with the target.
281 .\"-------------------------------------
282 .IP (9)
283 Once the notification has been sent,
284 the system call in the target thread unblocks,
285 returning the information that was provided by the supervisor
286 in the notification response.
287 .\"-------------------------------------
288 .PP
289 As a variation on the last two steps,
290 the supervisor can send a response that tells the kernel that it
291 should execute the target thread's system call; see the discussion of
292 .BR SECCOMP_USER_NOTIF_FLAG_CONTINUE ,
293 below.
294 .\"
295 .SH IOCTL OPERATIONS
296 The following
297 .BR ioctl (2)
298 operations are supported by the seccomp user-space
299 notification file descriptor.
300 For each of these operations, the first (file descriptor) argument of
301 .BR ioctl (2)
302 is the listening file descriptor returned by a call to
303 .BR seccomp (2)
304 with the
305 .B SECCOMP_FILTER_FLAG_NEW_LISTENER
306 flag.
307 .\"
308 .SS SECCOMP_IOCTL_NOTIF_RECV
309 The
310 .B SECCOMP_IOCTL_NOTIF_RECV
311 operation (available since Linux 5.0) is used to obtain a user-space
312 notification event.
313 If no such event is currently pending,
314 the operation blocks until an event occurs.
315 The third
316 .BR ioctl (2)
317 argument is a pointer to a structure of the following form
318 which contains information about the event.
319 This structure must be zeroed out before the call.
320 .PP
321 .in +4n
322 .EX
323 struct seccomp_notif {
324 __u64 id; /* Cookie */
325 __u32 pid; /* TID of target thread */
326 __u32 flags; /* Currently unused (0) */
327 struct seccomp_data data; /* See seccomp(2) */
328 };
329 .EE
330 .in
331 .PP
332 The fields in this structure are as follows:
333 .TP
334 .I id
335 This is a cookie for the notification.
336 Each such cookie is guaranteed to be unique for the corresponding
337 seccomp filter.
338 .RS
339 .IP \(bu 3
340 The cookie can be used with the
341 .B SECCOMP_IOCTL_NOTIF_ID_VALID
342 .BR ioctl (2)
343 operation described below.
344 .IP \(bu
345 When returning a notification response to the kernel,
346 the supervisor must include the cookie value in the
347 .I seccomp_notif_resp
348 structure that is specified as the argument of the
349 .B SECCOMP_IOCTL_NOTIF_SEND
350 operation.
351 .RE
352 .TP
353 .I pid
354 This is the thread ID of the target thread that triggered
355 the notification event.
356 .TP
357 .I flags
358 This is a bit mask of flags providing further information on the event.
359 In the current implementation, this field is always zero.
360 .TP
361 .I data
362 This is a
363 .I seccomp_data
364 structure containing information about the system call that
365 triggered the notification.
366 This is the same structure that is passed to the seccomp filter.
367 See
368 .BR seccomp (2)
369 for details of this structure.
370 .PP
371 On success, this operation returns 0; on failure, \-1 is returned, and
372 .I errno
373 is set to indicate the cause of the error.
374 This operation can fail with the following errors:
375 .TP
376 .BR EINVAL " (since Linux 5.5)"
377 .\" commit 2882d53c9c6f3b8311d225062522f03772cf0179
378 The
379 .I seccomp_notif
380 structure that was passed to the call contained nonzero fields.
381 .TP
382 .B ENOENT
383 The target thread was killed by a signal as the notification information
384 was being generated,
385 or the target's (blocked) system call was interrupted by a signal handler.
386 .\" FIXME
387 .\" From my experiments,
388 .\" it appears that if a SECCOMP_IOCTL_NOTIF_RECV is done after
389 .\" the target thread terminates, then the ioctl() simply
390 .\" blocks (rather than returning an error to indicate that the
391 .\" target no longer exists).
392 .\"
393 .\" I found that surprising, and it required some contortions in
394 .\" the example program. It was not possible to code my SIGCHLD
395 .\" handler (which reaps the zombie when the worker/target
396 .\" terminates) to simply set a flag checked in the main
397 .\" handleNotifications() loop, since this created an
398 .\" unavoidable race where the child might terminate just after
399 .\" I had checked the flag, but before I blocked (forever!) in the
400 .\" SECCOMP_IOCTL_NOTIF_RECV operation. Instead, I had to code
401 .\" the signal handler to simply call _exit(2) in order to
402 .\" terminate the parent process (the supervisor).
403 .\"
404 .\" Is this expected behavior? It seems to me rather
405 .\" desirable that SECCOMP_IOCTL_NOTIF_RECV should give an error
406 .\" if the target has terminated.
407 .\"
408 .\" Jann posted a patch to rectify this, but there was no response
409 .\" (Lore link: https://bit.ly/3jvUBxk) to his question about fixing
410 .\" this issue. (I've tried building with the patch, but encountered
411 .\" an issue with the target process entering D state after a signal.)
412 .\"
413 .\" For now, this behavior is documented in BUGS.
414 .\"
415 .\" Kees Cook commented: Let's change [this] ASAP!
416 .\"
417 .SS SECCOMP_IOCTL_NOTIF_ID_VALID
418 The
419 .B SECCOMP_IOCTL_NOTIF_ID_VALID
420 operation (available since Linux 5.0) is used to check that a notification ID
421 returned by an earlier
422 .B SECCOMP_IOCTL_NOTIF_RECV
423 operation is still valid
424 (i.e., that the target still exists and its system call
425 is still blocked waiting for a response).
426 .PP
427 The third
428 .BR ioctl (2)
429 argument is a pointer to the cookie
430 .RI ( id )
431 returned by the
432 .B SECCOMP_IOCTL_NOTIF_RECV
433 operation.
434 .PP
435 This operation is necessary to avoid race conditions that can occur when the
436 .I pid
437 returned by the
438 .B SECCOMP_IOCTL_NOTIF_RECV
439 operation terminates, and that process ID is reused by another process.
440 An example of this kind of race is the following:
441 .IP (1) 5
442 A notification is generated on the listening file descriptor.
443 The returned
444 .I seccomp_notif
445 contains the TID of the target thread (in the
446 .I pid
447 field of the structure).
448 .IP (2)
449 The target terminates.
450 .IP (3)
451 Another thread or process is created on the system that by chance reuses the
452 TID that was freed when the target terminated.
453 .IP (4)
454 The supervisor
455 .BR open (2)s
456 the
457 .IR /proc/ tid /mem
458 file for the TID obtained in step 1, with the intention of (say)
459 inspecting the memory location(s) that contain the argument(s) of
460 the system call that triggered the notification in step 1.
461 .PP
462 In the above scenario, the risk is that the supervisor may try
463 to access the memory of a process other than the target.
464 This race can be avoided by following the call to
465 .BR open (2)
466 with a
467 .B SECCOMP_IOCTL_NOTIF_ID_VALID
468 operation to verify that the process that generated the notification
469 is still alive.
470 (Note that if the target terminates after the latter step,
471 a subsequent
472 .BR read (2)
473 from the file descriptor may return 0, indicating end of file.)
474 .\" Jann Horn:
475 .\" the PID can be reused, but the /proc/$pid directory is
476 .\" internally not associated with the numeric PID, but,
477 .\" conceptually speaking, with a specific incarnation of the
478 .\" PID, or something like that. (Actually, it is associated
479 .\" with the "struct pid", which is not reused, instead of the
480 .\" numeric PID.)
481 .PP
482 See NOTES for a discussion of other cases where
483 .B SECCOMP_IOCTL_NOTIF_ID_VALID
484 checks must be performed.
485 .PP
486 On success (i.e., the notification ID is still valid),
487 this operation returns 0.
488 On failure (i.e., the notification ID is no longer valid),
489 \-1 is returned, and
490 .I errno
491 is set to
492 .BR ENOENT .
493 .\"
494 .SS SECCOMP_IOCTL_NOTIF_SEND
495 The
496 .B SECCOMP_IOCTL_NOTIF_SEND
497 operation (available since Linux 5.0)
498 is used to send a notification response back to the kernel.
499 The third
500 .BR ioctl (2)
501 argument is a pointer to a structure of the following form:
502 .PP
503 .in +4n
504 .EX
505 struct seccomp_notif_resp {
506 __u64 id; /* Cookie value */
507 __s64 val; /* Success return value */
508 __s32 error; /* 0 (success) or negative error number */
509 __u32 flags; /* See below */
510 };
511 .EE
512 .in
513 .PP
514 The fields of this structure are as follows:
515 .TP
516 .I id
517 This is the cookie value that was obtained using the
518 .B SECCOMP_IOCTL_NOTIF_RECV
519 operation.
520 This cookie value allows the kernel to correctly associate this response
521 with the system call that triggered the user-space notification.
522 .TP
523 .I val
524 This is the value that will be used for a spoofed
525 success return for the target's system call; see below.
526 .TP
527 .I error
528 This is the value that will be used as the error number
529 .RI ( errno )
530 for a spoofed error return for the target's system call; see below.
531 .TP
532 .I flags
533 This is a bit mask that includes zero or more of the following flags:
534 .RS
535 .TP
536 .BR SECCOMP_USER_NOTIF_FLAG_CONTINUE " (since Linux 5.5)"
537 Tell the kernel to execute the target's system call.
538 .\" commit fb3c5386b382d4097476ce9647260fc89b34afdb
539 .RE
540 .PP
541 Two kinds of response are possible:
542 .IP \(bu 3
543 A response to the kernel telling it to execute the
544 target's system call.
545 In this case, the
546 .I flags
547 field includes
548 .B SECCOMP_USER_NOTIF_FLAG_CONTINUE
549 and the
550 .I error
551 and
552 .I val
553 fields must be zero.
554 .IP
555 This kind of response can be useful in cases where the supervisor needs
556 to do deeper analysis of the target's system call than is possible
557 from a seccomp filter (e.g., examining the values of pointer arguments),
558 and, having decided that the system call does not require emulation
559 by the supervisor, the supervisor wants the system call to
560 be executed normally in the target.
561 .IP
562 The
563 .B SECCOMP_USER_NOTIF_FLAG_CONTINUE
564 flag should be used with caution; see NOTES.
565 .IP \(bu
566 A spoofed return value for the target's system call.
567 In this case, the kernel does not execute the target's system call,
568 instead causing the system call to return a spoofed value as specified by
569 fields of the
570 .I seccomp_notif_resp
571 structure.
572 The supervisor should set the fields of this structure as follows:
573 .RS
574 .IP + 3
575 .I flags
576 does not contain
577 .BR SECCOMP_USER_NOTIF_FLAG_CONTINUE .
578 .IP +
579 .I error
580 is set either to 0 for a spoofed "success" return or to a negative
581 error number for a spoofed "failure" return.
582 In the former case, the kernel causes the target's system call
583 to return the value specified in the
584 .I val
585 field.
586 In the latter case, the kernel causes the target's system call
587 to return \-1, and
588 .I errno
589 is assigned the negated
590 .I error
591 value.
592 .IP +
593 .I val
594 is set to a value that will be used as the return value for a spoofed
595 "success" return for the target's system call.
596 The value in this field is ignored if the
597 .I error
598 field contains a nonzero value.
599 .\" FIXME
600 .\" Kees Cook suggested:
601 .\"
602 .\" Strictly speaking, this is architecture specific, but
603 .\" all architectures do it this way. Should seccomp enforce
604 .\" val == 0 when err != 0 ?
605 .\"
606 .\" Christian Brauner
607 .\"
608 .\" Feels like it should, at least for the SEND ioctl where we already
609 .\" verify that val and err are both 0 when CONTINUE is specified (as you
610 .\" pointed out correctly above).
611 .RE
612 .PP
613 On success, this operation returns 0; on failure, \-1 is returned, and
614 .I errno
615 is set to indicate the cause of the error.
616 This operation can fail with the following errors:
617 .TP
618 .B EINPROGRESS
619 A response to this notification has already been sent.
620 .TP
621 .B EINVAL
622 An invalid value was specified in the
623 .IR flags " field."
624 .TP
626 .B EINVAL
627 The
628 .I flags
629 field contained
630 .BR SECCOMP_USER_NOTIF_FLAG_CONTINUE ,
631 and the
632 .I error
633 or
634 .I val
635 field was not zero.
636 .TP
637 .B ENOENT
638 The blocked system call in the target
639 has been interrupted by a signal handler
640 or the target has terminated.
641 .\" Jann Horn notes:
642 .\" you could also get this [ENOENT] if a response has already
643 .\" been sent, instead of EINPROGRESS - the only difference is
644 .\" whether the target thread has picked up the response yet
645 .\"
646 .SS SECCOMP_IOCTL_NOTIF_ADDFD
647 The
648 .B SECCOMP_IOCTL_NOTIF_ADDFD
649 operation (available since Linux 5.9)
650 allows the supervisor to install a file descriptor
651 into the target's file descriptor table.
652 Much like the use of
653 .B SCM_RIGHTS
654 messages described in
655 .BR unix (7),
656 this operation is semantically equivalent to duplicating
657 a file descriptor from the supervisor's file descriptor table
658 into the target's file descriptor table.
659 .PP
660 The
661 .B SECCOMP_IOCTL_NOTIF_ADDFD
662 operation permits the supervisor to emulate a target system call (such as
663 .BR socket (2)
664 or
665 .BR openat (2))
666 that generates a file descriptor.
667 The supervisor can perform the system call that generates
668 the file descriptor (and associated open file description)
669 and then use this operation to allocate
670 a file descriptor that refers to the same open file description in the target.
671 (For an explanation of open file descriptions, see
672 .BR open (2).)
673 .PP
674 Once this operation has been performed,
675 the supervisor can close its copy of the file descriptor.
676 .PP
677 In the target,
678 the received file descriptor is subject to the same
679 Linux Security Module (LSM) checks as are applied to a file descriptor
680 that is received in an
681 .B SCM_RIGHTS
682 ancillary message.
683 If the file descriptor refers to a socket,
684 it inherits the cgroup version 1 network controller settings
685 .RI ( classid
686 and
687 .IR netprioidx )
688 of the target.
689 .PP
690 The third
691 .BR ioctl (2)
692 argument is a pointer to a structure of the following form:
693 .PP
694 .in +4n
695 .EX
696 struct seccomp_notif_addfd {
697 __u64 id; /* Cookie value */
698 __u32 flags; /* Flags */
699 __u32 srcfd; /* Local file descriptor number */
700 __u32 newfd; /* 0 or desired file descriptor
701 number in target */
702 __u32 newfd_flags; /* Flags to set on target file
703 descriptor */
704 };
705 .EE
706 .in
707 .PP
708 The fields in this structure are as follows:
709 .TP
710 .I id
711 This field should be set to the notification ID
712 (cookie value) that was obtained via
713 .BR SECCOMP_IOCTL_NOTIF_RECV .
714 .TP
715 .I flags
716 This field is a bit mask of flags that modify the behavior of the operation.
717 The following flags are supported:
718 .RS
719 .TP
720 .B SECCOMP_ADDFD_FLAG_SETFD
721 When allocating the file descriptor in the target,
722 use the file descriptor number specified in the
723 .I newfd
724 field.
725 .TP
726 .BR SECCOMP_ADDFD_FLAG_SEND " (since Linux 5.14)"
727 .\" commit 0ae71c7720e3ae3aabd2e8a072d27f7bd173d25c
728 Perform the equivalent of
729 .B SECCOMP_IOCTL_NOTIF_ADDFD
730 plus
731 .B SECCOMP_IOCTL_NOTIF_SEND
732 as an atomic operation.
733 On successful invocation, the target process's
734 .I errno
735 will be 0
736 and the return value will be the file descriptor number
737 that was allocated in the target.
738 If allocating the file descriptor in the target fails,
739 the target's system call continues to be blocked
740 until a successful response is sent.
741 .RE
742 .TP
743 .I srcfd
744 This field should be set to the number of the file descriptor
745 in the supervisor that is to be duplicated.
746 .TP
747 .I newfd
748 This field determines which file descriptor number is allocated in the target.
749 If the
750 .B SECCOMP_ADDFD_FLAG_SETFD
751 flag is set,
752 then this field specifies which file descriptor number should be allocated.
753 If this file descriptor number is already open in the target,
754 it is atomically closed and reused.
755 If the descriptor duplication fails due to an LSM check, or if
756 .I srcfd
757 is not a valid file descriptor,
758 the file descriptor
759 .I newfd
760 will not be closed in the target process.
761 .IP
762 If the
763 .B SECCOMP_ADDFD_FLAG_SETFD
764 flag is not set, then this field must be 0,
765 and the kernel allocates the lowest unused file descriptor number
766 in the target.
767 .TP
768 .I newfd_flags
769 This field is a bit mask specifying flags that should be set on
770 the file descriptor that is received in the target process.
771 Currently, only the following flag is implemented:
772 .RS
773 .TP
774 .B O_CLOEXEC
775 Set the close-on-exec flag on the received file descriptor.
776 .RE
777 .PP
778 On success, this
779 .BR ioctl (2)
780 call returns the number of the file descriptor that was allocated
781 in the target.
782 Assuming that the emulated system call is one that returns
783 a file descriptor as its function result (e.g.,
784 .BR socket (2)),
785 this value can be used as the return value
786 .RI ( resp.val )
787 that is supplied in the response that is subsequently sent with the
788 .B SECCOMP_IOCTL_NOTIF_SEND
789 operation.
790 .PP
791 On error, \-1 is returned and
792 .I errno
793 is set to indicate the cause of the error.
794 .PP
795 This operation can fail with the following errors:
796 .TP
797 .B EBADF
798 Allocating the file descriptor in the target would cause the target's
799 .B RLIMIT_NOFILE
800 limit to be exceeded (see
801 .BR getrlimit (2)).
802 .TP
803 .B EBUSY
804 If the flag
805 .B SECCOMP_ADDFD_FLAG_SEND
806 is used, this means the operation can't proceed until other
807 .B SECCOMP_IOCTL_NOTIF_ADDFD
808 requests are processed.
809 .TP
810 .B EINPROGRESS
811 The user-space notification specified in the
812 .I id
813 field exists but has not yet been fetched (by a
814 .BR SECCOMP_IOCTL_NOTIF_RECV )
815 or has already been responded to (by a
816 .BR SECCOMP_IOCTL_NOTIF_SEND ).
817 .TP
818 .B EINVAL
819 An invalid flag was specified in the
820 .I flags
821 or
822 .I newfd_flags
823 field, or the
824 .I newfd
825 field is nonzero and the
826 .B SECCOMP_ADDFD_FLAG_SETFD
827 flag was not specified in the
828 .I flags
829 field.
830 .TP
831 .B EMFILE
832 The file descriptor number specified in
833 .I newfd
834 exceeds the limit specified in
835 .IR /proc/sys/fs/nr_open .
836 .TP
837 .B ENOENT
838 The blocked system call in the target
839 has been interrupted by a signal handler
840 or the target has terminated.
841 .PP
842 Here is some sample code (with error handling omitted) that uses the
843 .B SECCOMP_IOCTL_NOTIF_ADDFD
844 operation (here, to emulate a call to
845 .BR openat (2)):
846 .PP
847 .EX
848 .in +4n
849 int fd, targetFd;
850
851 fd = openat(req->data.args[0], path, req->data.args[2],
852 req->data.args[3]);
853
854 struct seccomp_notif_addfd addfd;
855 addfd.id = req->id; /* Cookie from SECCOMP_IOCTL_NOTIF_RECV */
856 addfd.srcfd = fd;
857 addfd.newfd = 0;
858 addfd.flags = 0;
859 addfd.newfd_flags = O_CLOEXEC;
860
861 targetFd = ioctl(notifyFd, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd);
862
863 close(fd); /* No longer needed in supervisor */
864
865 struct seccomp_notif_resp *resp;
866 /* Code to allocate 'resp' omitted */
867 resp->id = req->id;
868 resp->error = 0; /* "Success" */
869 resp->val = targetFd;
870 resp->flags = 0;
871 ioctl(notifyFd, SECCOMP_IOCTL_NOTIF_SEND, resp);
872 .in
873 .EE
874 .SH NOTES
875 One example use case for the user-space notification
876 mechanism is to allow a container manager
877 (a process which is typically running with more privilege than
878 the processes inside the container)
879 to mount block devices or create device nodes for the container.
880 The mount use case provides an example of where the
881 .B SECCOMP_USER_NOTIF_FLAG_CONTINUE
882 flag
883 is useful.
884 Upon receiving a notification for the
885 .BR mount (2)
886 system call, the container manager (the "supervisor") can distinguish
887 a request to mount a block filesystem
888 (which would not be possible for a "target" process inside the container)
889 and mount that filesystem.
890 If, on the other hand, the container manager detects that the operation
891 could be performed by the process inside the container
892 (e.g., a mount of a
893 .BR tmpfs (5)
894 filesystem), it can notify the kernel that the target process's
895 .BR mount (2)
896 system call can continue.
897 .\"
898 .SS select()/poll()/epoll semantics
899 The file descriptor returned when
900 .BR seccomp (2)
901 is employed with the
902 .B SECCOMP_FILTER_FLAG_NEW_LISTENER
903 flag can be monitored using
904 .BR poll (2),
905 .BR epoll (7),
906 and
907 .BR select (2).
908 These interfaces indicate that the file descriptor is ready as follows:
909 .IP \(bu 3
910 When a notification is pending,
911 these interfaces indicate that the file descriptor is readable.
912 Following such an indication, a subsequent
913 .B SECCOMP_IOCTL_NOTIF_RECV
914 .BR ioctl (2)
915 will not block, returning either information about a notification
916 or else failing with the error
917 .B EINTR
918 if the target has been killed by a signal or its system call
919 has been interrupted by a signal handler.
920 .IP \(bu
921 After the notification has been received (i.e., by the
922 .B SECCOMP_IOCTL_NOTIF_RECV
923 .BR ioctl (2)
924 operation), these interfaces indicate that the file descriptor is writable,
925 meaning that a notification response can be sent using the
926 .B SECCOMP_IOCTL_NOTIF_SEND
927 .BR ioctl (2)
928 operation.
929 .IP \(bu
930 After the last thread using the filter has terminated and been reaped using
931 .BR waitpid (2)
932 (or similar),
933 the file descriptor indicates an end-of-file condition (readable in
934 .BR select (2);
935 .BR POLLHUP / EPOLLHUP
936 in
937 .BR poll (2)/
938 .BR epoll_wait (2)).
939 .SS Design goals; use of SECCOMP_USER_NOTIF_FLAG_CONTINUE
940 The intent of the user-space notification feature is
941 to allow system calls to be performed on behalf of the target.
942 The target's system call should either be handled by the supervisor or
943 allowed to continue normally in the kernel (where standard security
944 policies will be applied).
945 .PP
946 .BR "Note well" :
947 this mechanism must not be used to make security policy decisions
948 about the system call,
949 which would be inherently race-prone for reasons described next.
950 .PP
951 The
952 .B SECCOMP_USER_NOTIF_FLAG_CONTINUE
953 flag must be used with caution.
954 If set by the supervisor, the target's system call will continue.
955 However, there is a time-of-check, time-of-use race here,
956 since an attacker could exploit the interval of time where the target is
957 blocked waiting on the "continue" response to do things such as
958 rewriting the system call arguments.
959 .PP
960 Note furthermore that a user-space notifier can be bypassed if
961 the existing filters allow the use of
962 .BR seccomp (2)
963 or
964 .BR prctl (2)
965 to install a filter that returns an action value with a higher precedence than
966 .B SECCOMP_RET_USER_NOTIF
967 (see
968 .BR seccomp (2)).
969 .PP
970 It should thus be absolutely clear that the
971 seccomp user-space notification mechanism
972 .B cannot
973 be used to implement a security policy!
974 It should only ever be used in scenarios where a more privileged process
975 supervises the system calls of a less privileged target to
976 get around kernel-enforced security restrictions when
977 the supervisor deems this safe.
978 In other words,
979 in order to continue a system call, the supervisor should be sure that
980 another security mechanism or the kernel itself will sufficiently block
981 the system call if its arguments are rewritten to something unsafe.
982 .\"
983 .SS Caveats regarding the use of /proc/[tid]/mem
984 The discussion above noted the need to use the
985 .B SECCOMP_IOCTL_NOTIF_ID_VALID
986 .BR ioctl (2)
987 when opening the
988 .IR /proc/ tid /mem
989 file of the target
990 to avoid the possibility of accessing the memory of the wrong process
991 in the event that the target terminates and its ID
992 is recycled by another (unrelated) thread.
993 However, the use of this
994 .BR ioctl (2)
995 operation is also necessary in other situations,
996 as explained in the following paragraphs.
997 .PP
998 Consider the following scenario, where the supervisor
999 tries to read the pathname argument of a target's blocked
1000 .BR mount (2)
1001 system call:
1002 .IP (1) 5
1003 From one of its functions
1004 .RI ( func() ),
1005 the target calls
1006 .BR mount (2),
1007 which triggers a user-space notification and causes the target to block.
1008 .IP (2)
1009 The supervisor receives the notification, opens
1010 .IR /proc/ tid /mem ,
1011 and (successfully) performs the
1012 .B SECCOMP_IOCTL_NOTIF_ID_VALID
1013 check.
1014 .IP (3)
1015 The target receives a signal, which causes the
1016 .BR mount (2)
1017 to abort.
1018 .IP (4)
1019 The signal handler executes in the target, and returns.
1020 .IP (5)
1021 Upon return from the handler, the execution of
1022 .I func()
1023 resumes, and it returns (and perhaps other functions are called,
1024 overwriting the memory that had been used for the stack frame of
1025 .IR func() ).
1026 .IP (6)
1027 Using the address provided in the notification information,
1028 the supervisor reads from the target's memory location that used to
1029 contain the pathname.
1030 .IP (7)
1031 The supervisor now calls
1032 .BR mount (2)
1033 with some arbitrary bytes obtained in the previous step.
1034 .PP
1035 The conclusion from the above scenario is this:
1036 since the target's blocked system call may be interrupted by a signal handler,
1037 the supervisor must be written to expect that the
1038 target may abandon its system call at
1039 .B any
1040 time;
1041 in such an event, any information that the supervisor obtained from
1042 the target's memory must be considered invalid.
1043 .PP
1044 To prevent such scenarios,
1045 every read from the target's memory must be separated from use of
1046 the bytes so obtained by a
1047 .B SECCOMP_IOCTL_NOTIF_ID_VALID
1048 check.
1049 In the above example, the check would be placed between the two final steps.
1050 An example of such a check is shown in EXAMPLES.
1051 .PP
1052 Following on from the above, it should be clear that
1053 a write by the supervisor into the target's memory can
1054 .B never
1055 be considered safe.
1056 .\"
1057 .SS Caveats regarding blocking system calls
1058 Suppose that the target performs a blocking system call (e.g.,
1059 .BR accept (2))
1060 that the supervisor should handle.
1061 The supervisor might then in turn execute the same blocking system call.
1062 .PP
1063 In this scenario,
1064 it is important to note that if the target's system call is now
1065 interrupted by a signal, the supervisor is
1066 .I not
1067 informed of this.
1068 If the supervisor does not take suitable steps to
1069 actively discover that the target's system call has been canceled,
1070 various difficulties can occur.
1071 Taking the example of
1072 .BR accept (2),
1073 the supervisor might remain blocked in its
1074 .BR accept (2)
1075 holding a port number that the target
1076 (which, after the interruption by the signal handler,
1077 perhaps closed its listening socket) might expect to be able to reuse in a
1078 .BR bind (2)
1079 call.
1080 .PP
1081 Therefore, when the supervisor wishes to emulate a blocking system call,
1082 it must do so in such a way that it gets informed if the target's
1083 system call is interrupted by a signal handler.
1084 For example, if the supervisor itself executes the same
1085 blocking system call, then it could employ a separate thread
1086 that uses the
1087 .B SECCOMP_IOCTL_NOTIF_ID_VALID
1088 operation to check if the target is still blocked in its system call.
1089 Alternatively, in the
1090 .BR accept (2)
1091 example, the supervisor might use
1092 .BR poll (2)
1093 to monitor both the notification file descriptor
1094 (so as to discover when the target's
1095 .BR accept (2)
1096 call has been interrupted) and the listening file descriptor
1097 (so as to know when a connection is available).
1098 .PP
1099 If the target's system call is interrupted,
1100 the supervisor must take care to release resources (e.g., file descriptors)
1101 that it acquired on behalf of the target.
1102 .\"
1103 .SS Interaction with SA_RESTART signal handlers
1104 Consider the following scenario:
1105 .IP (1) 5
1106 The target process has used
1107 .BR sigaction (2)
1108 to install a signal handler with the
1109 .B SA_RESTART
1110 flag.
1111 .IP (2)
1112 The target has made a system call that triggered a seccomp
1113 user-space notification and the target is currently blocked
1114 until the supervisor sends a notification response.
1115 .IP (3)
1116 A signal is delivered to the target and the signal handler is executed.
1117 .IP (4)
1118 When (if) the supervisor attempts to send a notification response, the
1119 .B SECCOMP_IOCTL_NOTIF_SEND
1120 .BR ioctl (2)
1121 operation will fail with the
1122 .B ENOENT
1123 error.
1124 .PP
1125 In this scenario, the kernel will restart the target's system call.
1126 Consequently, the supervisor will receive another user-space notification.
1127 Thus, depending on how many times the blocked system call
1128 is interrupted by a signal handler,
1129 the supervisor may receive multiple notifications for
1130 the same instance of a system call in the target.
1131 .PP
1132 One oddity is that system call restarting as described in this scenario
1133 will occur even for the blocking system calls listed in
1134 .BR signal (7)
1135 that would
1136 .B never
1137 normally be restarted by the
1138 .B SA_RESTART
1139 flag.
1140 .\" FIXME
1141 .\" About the above, Kees Cook commented:
1142 .\"
1143 .\" Does this need fixing? I imagine the correct behavior for this case
1144 .\" would be a response to _SEND of EINPROGRESS and the target would see
1145 .\" EINTR normally?
1146 .\"
1147 .\" I mean, it's not like seccomp doesn't already expose weirdness with
1148 .\" syscall restarts. Not even arm64 compat agrees[3] with arm32 in this
1149 .\" regard. :(
1150 .
1151 .\" FIXME
1152 .\" Michael Kerrisk:
1153 .\" I wonder about the effect of this oddity for system calls that
1154 .\" are normally nonrestartable because they have timeouts. My
1155 .\" understanding is that the kernel doesn't restart those system
1156 .\" calls because it's impossible for the kernel to restart the call
1157 .\" with the right timeout value. I wonder what happens when those
1158 .\" system calls are restarted in the scenario we're discussing.)
1159 .PP
1160 Furthermore, if the supervisor response is a file descriptor
1161 added with
1162 .BR SECCOMP_IOCTL_NOTIF_ADDFD ,
1163 then the flag
1164 .B SECCOMP_ADDFD_FLAG_SEND
1165 can be used to atomically add the file descriptor and return that value,
1166 making sure no file descriptors are inadvertently leaked into the target.
1167 .SH BUGS
1168 If a
1169 .B SECCOMP_IOCTL_NOTIF_RECV
1170 .BR ioctl (2)
1171 operation
1172 .\" or a poll/epoll/select
1173 is performed after the target terminates, then the
1174 .BR ioctl (2)
1175 call simply blocks (rather than returning an error to indicate that the
1176 target no longer exists).
1177 .\" FIXME
1178 .\" Comment from Kees Cook:
1179 .\"
1180 .\" I want this fixed. It caused me no end of pain when building the
1181 .\" selftests, and ended up spawning my implementing a global test timeout
1182 .\" in kselftest. :P Before the usage counter refactor, there was no sane
1183 .\" way to deal with this, but now I think we're close.
1184 .\"
1185 .SH EXAMPLES
1186 The (somewhat contrived) program shown below demonstrates the use of
1187 the interfaces described in this page.
1188 The program creates a child process that serves as the "target" process.
1189 The child process installs a seccomp filter that returns the
1190 .B SECCOMP_RET_USER_NOTIF
1191 action value if a call is made to
1192 .BR mkdir (2).
1193 The child process then calls
1194 .BR mkdir (2)
1195 once for each of the supplied command-line arguments,
1196 and reports the result returned by the call.
1197 After processing all arguments, the child process terminates.
1198 .PP
1199 The parent process acts as the supervisor, listening for the notifications
1200 that are generated when the target process calls
1201 .BR mkdir (2).
1202 When such a notification occurs,
1203 the supervisor examines the memory of the target process (using
1204 .IR /proc/ pid /mem )
1205 to discover the pathname argument that was supplied to the
1206 .BR mkdir (2)
1207 call, and performs one of the following actions:
1208 .IP \(bu 3
1209 If the pathname begins with the prefix "/tmp/",
1210 then the supervisor attempts to create the specified directory,
1211 and then spoofs a return for the target process based on the return
1212 value of the supervisor's
1213 .BR mkdir (2)
1214 call.
1215 In the event that that call succeeds,
1216 the spoofed success return value is the length of the pathname.
1217 .IP \(bu
1218 If the pathname begins with "./" (i.e., it is a relative pathname),
1219 the supervisor sends a
1220 .B SECCOMP_USER_NOTIF_FLAG_CONTINUE
1221 response to the kernel to say that the kernel should execute
1222 the target process's
1223 .BR mkdir (2)
1224 call.
1225 .IP \(bu
1226 If the pathname begins with some other prefix,
1227 the supervisor spoofs an error return for the target process,
1228 so that the target process's
1229 .BR mkdir (2)
1230 call appears to fail with the error
1231 .B EOPNOTSUPP
1232 ("Operation not supported").
1233 Additionally, if the specified pathname is exactly "/bye",
1234 then the supervisor terminates.
1235 .PP
1236 This program can be used to demonstrate various aspects of the
1237 behavior of the seccomp user-space notification mechanism.
1238 To aid such demonstrations,
1239 the program logs various messages to show the operation
1240 of the target process (lines prefixed "T:") and the supervisor
1241 (indented lines prefixed "S:").
1242 .PP
1243 In the following example, the target attempts to create the directory
1244 .IR /tmp/x .
1245 Upon receiving the notification, the supervisor creates the directory on the
1246 target's behalf,
1247 and spoofs a success return to be received by the target process's
1248 .BR mkdir (2)
1249 call.
1250 .PP
1251 .in +4n
1252 .EX
1253 $ \fB./seccomp_unotify /tmp/x\fP
1254 T: PID = 23168
1255
1256 T: about to mkdir("/tmp/x")
1257 S: got notification (ID 0x17445c4a0f4e0e3c) for PID 23168
1258 S: executing: mkdir("/tmp/x", 0700)
1259 S: success! spoofed return = 6
1260 S: sending response (flags = 0; val = 6; error = 0)
1261 T: SUCCESS: mkdir(2) returned 6
1262
1263 T: terminating
1264 S: target has terminated; bye
1265 .EE
1266 .in
1267 .PP
1268 In the above output, note that the spoofed return value seen by the target
1269 process is 6 (the length of the pathname
1270 .IR /tmp/x ),
1271 whereas a normal
1272 .BR mkdir (2)
1273 call returns 0 on success.
1274 .PP
1275 In the next example, the target attempts to create a directory using the
1276 relative pathname
1277 .IR ./sub .
1278 Since this pathname starts with "./",
1279 the supervisor sends a
1280 .B SECCOMP_USER_NOTIF_FLAG_CONTINUE
1281 response to the kernel,
1282 and the kernel then (successfully) executes the target process's
1283 .BR mkdir (2)
1284 call.
1285 .PP
1286 .in +4n
1287 .EX
1288 $ \fB./seccomp_unotify ./sub\fP
1289 T: PID = 23204
1290
1291 T: about to mkdir("./sub")
1292 S: got notification (ID 0xddb16abe25b4c12) for PID 23204
1293 S: target can execute system call
1294 S: sending response (flags = 0x1; val = 0; error = 0)
1295 T: SUCCESS: mkdir(2) returned 0
1296
1297 T: terminating
1298 S: target has terminated; bye
1299 .EE
1300 .in
1301 .PP
1302 If the target process attempts to create a directory with
1303 a pathname that doesn't start with "./" and doesn't begin with the prefix
1304 "/tmp/", then the supervisor spoofs an error return
1305 .RB ( EOPNOTSUPP ,
1306 "Operation not supported")
1307 for the target's
1308 .BR mkdir (2)
1309 call (which is not executed):
1310 .PP
1311 .in +4n
1312 .EX
1313 $ \fB./seccomp_unotify /xxx\fP
1314 T: PID = 23178
1315
1316 T: about to mkdir("/xxx")
1317 S: got notification (ID 0xe7dc095d1c524e80) for PID 23178
1318 S: spoofing error response (Operation not supported)
1319 S: sending response (flags = 0; val = 0; error = \-95)
1320 T: ERROR: mkdir(2): Operation not supported
1321
1322 T: terminating
1323 S: target has terminated; bye
1324 .EE
1325 .in
1326 .PP
1327 In the next example,
1328 the target process attempts to create a directory with the pathname
1329 .IR /tmp/nosuchdir/b .
1330 Upon receiving the notification,
1331 the supervisor attempts to create that directory, but the
1332 .BR mkdir (2)
1333 call fails because the directory
1334 .I /tmp/nosuchdir
1335 does not exist.
1336 Consequently, the supervisor spoofs an error return that passes the error
1337 that it received back to the target process's
1338 .BR mkdir (2)
1339 call.
1340 .PP
1341 .in +4n
1342 .EX
1343 $ \fB./seccomp_unotify /tmp/nosuchdir/b\fP
1344 T: PID = 23199
1345
1346 T: about to mkdir("/tmp/nosuchdir/b")
1347 S: got notification (ID 0x8744454293506046) for PID 23199
1348 S: executing: mkdir("/tmp/nosuchdir/b", 0700)
1349 S: failure! (errno = 2; No such file or directory)
1350 S: sending response (flags = 0; val = 0; error = \-2)
1351 T: ERROR: mkdir(2): No such file or directory
1352
1353 T: terminating
1354 S: target has terminated; bye
1355 .EE
1356 .in
1357 .PP
1358 If the supervisor receives a notification and sees that the
1359 argument of the target's
1360 .BR mkdir (2)
1361 is the string "/bye", then (as well as spoofing an
1362 .B EOPNOTSUPP
1363 error), the supervisor terminates.
1364 If the target process subsequently executes another
1365 .BR mkdir (2)
1366 that triggers its seccomp filter to return the
1367 .B SECCOMP_RET_USER_NOTIF
1368 action value, then the kernel causes the target process's system call to
1369 fail with the error
1370 .B ENOSYS
1371 ("Function not implemented").
1372 This is demonstrated by the following example:
1373 .PP
1374 .in +4n
1375 .EX
1376 $ \fB./seccomp_unotify /bye /tmp/y\fP
1377 T: PID = 23185
1378
1379 T: about to mkdir("/bye")
1380 S: got notification (ID 0xa81236b1d2f7b0f4) for PID 23185
1381 S: spoofing error response (Operation not supported)
1382 S: sending response (flags = 0; val = 0; error = \-95)
1383 S: terminating **********
1384 T: ERROR: mkdir(2): Operation not supported
1385
1386 T: about to mkdir("/tmp/y")
1387 T: ERROR: mkdir(2): Function not implemented
1388
1389 T: terminating
1390 .EE
1391 .in
1392 .\"
1393 .SS Program source
1394 .\" SRC BEGIN (seccomp_unotify.c)
1395 .EX
1396 #define _GNU_SOURCE
1397 #include <err.h>
1398 #include <errno.h>
1399 #include <fcntl.h>
1400 #include <limits.h>
1401 #include <linux/audit.h>
1402 #include <linux/filter.h>
1403 #include <linux/seccomp.h>
1404 #include <signal.h>
1405 #include <stdbool.h>
1406 #include <stddef.h>
1407 #include <stdint.h>
1408 #include <stdio.h>
1409 #include <stdlib.h>
1410 #include <string.h>
1411 #include <sys/ioctl.h>
1412 #include <sys/prctl.h>
1413 #include <sys/socket.h>
1414 #include <sys/stat.h>
1415 #include <sys/syscall.h>
1416 #include <sys/types.h>
1417 #include <sys/un.h>
1418 #include <unistd.h>
1419
1420 #define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]))
1421
1422 /* Pass the file descriptor \(aqfd\(aq, as SCM_RIGHTS ancillary data, over the
1423 connected UNIX domain socket \(aqsockfd\(aq. Returns 0, or \-1 on error. */
1424
1425 static int
1426 sendfd(int sockfd, int fd)
1427 {
1428 int data;
1429 struct iovec iov;
1430 struct msghdr msgh;
1431 struct cmsghdr *cmsgp;
1432
1433 /* Allocate a char array of suitable size to hold the ancillary data.
1434 However, since this buffer is in reality a \(aqstruct cmsghdr\(aq, use a
1435 union to ensure that it is suitably aligned. */
1436 union {
1437 char buf[CMSG_SPACE(sizeof(int))];
1438 /* Space large enough to hold an \(aqint\(aq */
1439 struct cmsghdr align;
1440 } controlMsg;
1441
1442 /* The \(aqmsg_name\(aq field can be used to specify the address of the
1443 destination socket when sending a datagram. However, we do not
1444 need to use this field because \(aqsockfd\(aq is a connected socket. */
1445
1446 msgh.msg_name = NULL;
1447 msgh.msg_namelen = 0;
1448
1449 /* On Linux, we must transmit at least one byte of real data in
1450 order to send ancillary data. We transmit an arbitrary integer
1451 whose value is ignored by recvfd(). */
1452
1453 msgh.msg_iov = &iov;
1454 msgh.msg_iovlen = 1;
1455 iov.iov_base = &data;
1456 iov.iov_len = sizeof(int);
1457 data = 12345;
1458
1459 /* Point the \(aqmsghdr\(aq at the buffer that holds the ancillary data */
1460
1461 msgh.msg_control = controlMsg.buf;
1462 msgh.msg_controllen = sizeof(controlMsg.buf);
1463
1464 /* Build a single SCM_RIGHTS control message carrying \(aqfd\(aq */
1465
1466 cmsgp = CMSG_FIRSTHDR(&msgh);
1467 cmsgp\->cmsg_level = SOL_SOCKET;
1468 cmsgp\->cmsg_type = SCM_RIGHTS;
1469 cmsgp\->cmsg_len = CMSG_LEN(sizeof(int));
1470 memcpy(CMSG_DATA(cmsgp), &fd, sizeof(int));
1471
1472 /* Send the real data together with the ancillary data */
1473
1474 if (sendmsg(sockfd, &msgh, 0) == \-1)
1475 return \-1;
1476
1477 return 0;
1478 }
1479
1480 /* Receive a file descriptor on the connected UNIX domain socket \(aqsockfd\(aq.
1481 Returns it, or \-1 on error (EINVAL if the control message is invalid). */
1482
1483 static int
1484 recvfd(int sockfd)
1485 {
1486 int data, fd;
1487 ssize_t nr;
1488 struct iovec iov;
1489 struct msghdr msgh;
1490
1491 /* Allocate a char buffer for the ancillary data; the union ensures
1492 that it is suitably aligned for a \(aqstruct cmsghdr\(aq */
1493 union {
1494 char buf[CMSG_SPACE(sizeof(int))];
1495 struct cmsghdr align;
1496 } controlMsg;
1497 struct cmsghdr *cmsgp;
1498
1499 /* The \(aqmsg_name\(aq field can be used to obtain the address of the
1500 sending socket. However, we do not need this information. */
1501
1502 msgh.msg_name = NULL;
1503 msgh.msg_namelen = 0;
1504
1505 /* Specify buffer for receiving real data */
1506
1507 msgh.msg_iov = &iov;
1508 msgh.msg_iovlen = 1;
1509 iov.iov_base = &data; /* Real data is an \(aqint\(aq */
1510 iov.iov_len = sizeof(int);
1511
1512 /* Point the \(aqmsghdr\(aq at the buffer that receives the ancillary data */
1513
1514 msgh.msg_control = controlMsg.buf;
1515 msgh.msg_controllen = sizeof(controlMsg.buf);
1516
1517 /* Receive real plus ancillary data; real data is ignored */
1518
1519 nr = recvmsg(sockfd, &msgh, 0);
1520 if (nr == \-1)
1521 return \-1;
1522
1523 cmsgp = CMSG_FIRSTHDR(&msgh);
1524
1525 /* Check that the first control message is SCM_RIGHTS holding one \(aqint\(aq */
1526
1527 if (cmsgp == NULL
1528 || cmsgp\->cmsg_len != CMSG_LEN(sizeof(int))
1529 || cmsgp\->cmsg_level != SOL_SOCKET
1530 || cmsgp\->cmsg_type != SCM_RIGHTS)
1531 {
1532 errno = EINVAL;
1533 return \-1;
1534 }
1535
1536 /* Copy the received file descriptor out of the message and return it */
1537
1538 memcpy(&fd, CMSG_DATA(cmsgp), sizeof(int));
1539 return fd;
1540 }
1541
1542 static void
1543 sigchldHandler(int sig)
1544 { /* Announce that the target has terminated, then exit */
1545 char msg[] = "\etS: target has terminated; bye\en";
1546
1547 write(STDOUT_FILENO, msg, sizeof(msg) \- 1); /* Exclude null byte */
1548 _exit(EXIT_SUCCESS);
1549 }
1550
1551 static int
1552 seccomp(unsigned int operation, unsigned int flags, void *args)
1553 { /* Invoke the seccomp system call directly, via syscall(2) */
1554 return syscall(SYS_seccomp, operation, flags, args);
1555 }
1556
1557 /* x86\-64\-specific BPF boilerplate that checks the architecture and ABI:
1558 if the architecture is not x86\-64, or the call uses the x32 ABI, the
1559 process is killed (SECCOMP_RET_KILL_PROCESS). Otherwise, on completion
1560 of these instructions the accumulator contains the system call number. */
1561
1562 /* For the x32 ABI, all system call numbers have bit 30 set */
1563
1564 #define X32_SYSCALL_BIT 0x40000000
1565
1566 #define X86_64_CHECK_ARCH_AND_LOAD_SYSCALL_NR \e
1567 BPF_STMT(BPF_LD | BPF_W | BPF_ABS, \e
1568 (offsetof(struct seccomp_data, arch))), \e
1569 BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, AUDIT_ARCH_X86_64, 0, 2), \e
1570 BPF_STMT(BPF_LD | BPF_W | BPF_ABS, \e
1571 (offsetof(struct seccomp_data, nr))), \e
1572 BPF_JUMP(BPF_JMP | BPF_JGE | BPF_K, X32_SYSCALL_BIT, 0, 1), \e
1573 BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_KILL_PROCESS)
1574
1575 /* installNotifyFilter() installs a seccomp filter that generates
1576 user\-space notifications (SECCOMP_RET_USER_NOTIF) when the process
1577 calls mkdir(2); the filter allows all other system calls.
1578
1579 The function return value is a file descriptor from which the
1580 notifications can be fetched; on failure, the process is terminated. */
1581
1582 static int
1583 installNotifyFilter(void)
1584 {
1585 int notifyFd;
1586
1587 struct sock_filter filter[] = {
1588 X86_64_CHECK_ARCH_AND_LOAD_SYSCALL_NR,
1589
1590 /* mkdir() triggers notification to user\-space supervisor */
1591
1592 BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, SYS_mkdir, 0, 1),
1593 BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_USER_NOTIF),
1594
1595 /* Every other system call is allowed */
1596
1597 BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
1598 };
1599
1600 struct sock_fprog prog = {
1601 .len = ARRAY_SIZE(filter),
1602 .filter = filter,
1603 };
1604
1605 /* Install the filter with the SECCOMP_FILTER_FLAG_NEW_LISTENER flag;
1606 as a result, seccomp() returns a notification file descriptor. */
1607
1608 notifyFd = seccomp(SECCOMP_SET_MODE_FILTER,
1609 SECCOMP_FILTER_FLAG_NEW_LISTENER, &prog);
1610 if (notifyFd == \-1)
1611 err(EXIT_FAILURE, "seccomp\-install\-notify\-filter");
1612
1613 return notifyFd;
1614 }
1615
1616 /* Close both ends of a socketpair(); terminate if either close() fails */
1617
1618 static void
1619 closeSocketPair(int sockPair[2])
1620 {
1621 if (close(sockPair[0]) == \-1)
1622 err(EXIT_FAILURE, "closeSocketPair\-close\-0");
1623 if (close(sockPair[1]) == \-1)
1624 err(EXIT_FAILURE, "closeSocketPair\-close\-1");
1625 }
1626
1627 /* Implementation of the target process; create a child process that:
1628
1629 (1) installs a seccomp filter with the
1630 SECCOMP_FILTER_FLAG_NEW_LISTENER flag;
1631 (2) writes the seccomp notification file descriptor returned from
1632 the previous step onto the UNIX domain socket, \(aqsockPair[0]\(aq;
1633 (3) calls mkdir(2) for each element of \(aqargv\(aq.
1634
1635 The function return value in the parent is the PID of the child
1636 process; the child does not return from this function. */
1637
1638 static pid_t
1639 targetProcess(int sockPair[2], char *argv[])
1640 {
1641 int notifyFd, s;
1642 pid_t targetPid;
1643
1644 targetPid = fork();
1645
1646 if (targetPid == \-1)
1647 err(EXIT_FAILURE, "fork");
1648
1649 if (targetPid > 0) /* In parent, return PID of child */
1650 return targetPid;
1651
1652 /* Child falls through to here */
1653
1654 printf("T: PID = %ld\en", (long) getpid());
1655
1656 /* Set the no_new_privs attribute, then install the seccomp filter */
1657
1658 if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0))
1659 err(EXIT_FAILURE, "prctl");
1660
1661 notifyFd = installNotifyFilter();
1662
1663 /* Pass the notification file descriptor to the supervisor process
1664 over a UNIX domain socket */
1665
1666 if (sendfd(sockPair[0], notifyFd) == \-1)
1667 err(EXIT_FAILURE, "sendfd");
1668
1669 /* Notification and socket FDs are no longer needed in target */
1670
1671 if (close(notifyFd) == \-1)
1672 err(EXIT_FAILURE, "close\-target\-notify\-fd");
1673
1674 closeSocketPair(sockPair);
1675
1676 /* Call mkdir() for each element of \(aqargv\(aq, reporting each result */
1677
1678 for (char **ap = argv; *ap != NULL; ap++) {
1679 printf("\enT: about to mkdir(\e"%s\e")\en", *ap);
1680
1681 s = mkdir(*ap, 0700);
1682 if (s == \-1)
1683 perror("T: ERROR: mkdir(2)");
1684 else
1685 printf("T: SUCCESS: mkdir(2) returned %d\en", s);
1686 }
1687
1688 printf("\enT: terminating\en");
1689 exit(EXIT_SUCCESS);
1690 }
1691
1692 /* Check that the notification ID provided by a SECCOMP_IOCTL_NOTIF_RECV
1693 operation is still valid, returning true if so. The ID is no longer
1694 valid if the target has terminated or is no longer blocked in the system
1695 call that generated the notification (e.g., interrupted by a signal).
1696
1697 This check can be used when doing such things as accessing
1698 /proc/PID files of the target process in order to avoid TOCTOU race
1699 conditions where the PID that is returned by SECCOMP_IOCTL_NOTIF_RECV
1700 terminates and is reused by another process. */
1701
1702 static bool
1703 cookieIsValid(int notifyFd, uint64_t id)
1704 {
1705 return ioctl(notifyFd, SECCOMP_IOCTL_NOTIF_ID_VALID, &id) == 0;
1706 }
1707
1708 /* Access the memory of the target process in order to fetch the
1709 pathname referred to by the system call argument \(aqargNum\(aq in
1710 \(aqreq\->data.args[]\(aq. The pathname is returned in \(aqpath\(aq,
1711 a buffer of \(aqlen\(aq bytes allocated by the caller.
1712
1713 Returns true if a null\-terminated pathname was fetched while the
1714 notification ID stayed valid, and false otherwise; see comments below. */
1715
1716 static bool
1717 getTargetPathname(struct seccomp_notif *req, int notifyFd,
1718 int argNum, char *path, size_t len)
1719 {
1720 int procMemFd;
1721 char procMemPath[PATH_MAX];
1722 ssize_t nread;
1723
1724 snprintf(procMemPath, sizeof(procMemPath), "/proc/%d/mem", req\->pid);
1725
1726 procMemFd = open(procMemPath, O_RDONLY | O_CLOEXEC);
1727 if (procMemFd == \-1)
1728 return false;
1729
1730 /* Check that the process whose info we are accessing is still alive
1731 and blocked in the system call that caused the notification.
1732 If the SECCOMP_IOCTL_NOTIF_ID_VALID operation (performed in
1733 cookieIsValid()) succeeded, we know that the /proc/PID/mem file
1734 descriptor that we opened corresponded to the process for which we
1735 received a notification. If that process subsequently terminates,
1736 then read() on that file descriptor will return 0 (EOF). */
1737
1738 if (!cookieIsValid(notifyFd, req\->id)) {
1739 close(procMemFd);
1740 return false;
1741 }
1742
1743 /* Read up to \(aqlen\(aq bytes at the address given by the argument */
1744
1745 nread = pread(procMemFd, path, len, req\->data.args[argNum]);
1746
1747 close(procMemFd);
1748
1749 if (nread <= 0)
1750 return false;
1751
1752 /* Once again check that the notification ID is still valid. The
1753 case we are particularly concerned about here is that just
1754 before we fetched the pathname, the target\(aqs blocked system
1755 call was interrupted by a signal handler, and after the handler
1756 returned, the target carried on execution (past the interrupted
1757 system call). In that case, we have no guarantees about what we
1758 are reading, since the target\(aqs memory may have been arbitrarily
1759 changed by subsequent operations. */
1760
1761 if (!cookieIsValid(notifyFd, req\->id)) {
1762 perror("\etS: notification ID check failed!!!");
1763 return false;
1764 }
1765
1766 /* Even if the target\(aqs system call was not interrupted by a signal,
1767 we have no guarantees about what was in the memory of the target
1768 process. (The memory may have been modified by another thread, or
1769 even by an external attacking process.) We therefore treat the
1770 buffer returned by pread() as untrusted input. The buffer should
1771 contain a terminating null byte within the bytes read; if not,
1772 we return false so that the caller can fail the target\(aqs call. */
1773
1774 if (strnlen(path, nread) < nread)
1775 return true;
1776
1777 return false;
1778 }
1779
1780 /* Allocate buffers for the seccomp user\-space notification request and
1781 response structures, storing the kernel\(aqs structure sizes in \(aqsizes\(aq.
1782 The caller must free the buffers returned via \(aqreq\(aq and \(aqresp\(aq. */
1783
1784 static void
1785 allocSeccompNotifBuffers(struct seccomp_notif **req,
1786 struct seccomp_notif_resp **resp,
1787 struct seccomp_notif_sizes *sizes)
1788 {
1789 size_t resp_size;
1790
1791 /* Discover the sizes of the structures that are used to receive
1792 notifications and send notification responses, and allocate
1793 buffers of those sizes. */
1794
1795 if (seccomp(SECCOMP_GET_NOTIF_SIZES, 0, sizes) == \-1)
1796 err(EXIT_FAILURE, "seccomp\-SECCOMP_GET_NOTIF_SIZES");
1797
1798 *req = malloc(sizes\->seccomp_notif);
1799 if (*req == NULL)
1800 err(EXIT_FAILURE, "malloc\-seccomp_notif");
1801
1802 /* When allocating the response buffer, we must allow for the fact
1803 that the user\-space binary may have been built with user\-space
1804 headers where \(aqstruct seccomp_notif_resp\(aq is bigger than the
1805 response buffer expected by the (older) kernel. Therefore, we
1806 allocate a buffer that is the maximum of the two sizes. This
1807 ensures that if the supervisor places bytes into the response
1808 structure that are past the response size that the kernel expects,
1809 then the supervisor is not touching an invalid memory location. */
1810
1811 resp_size = sizes\->seccomp_notif_resp;
1812 if (sizeof(struct seccomp_notif_resp) > resp_size)
1813 resp_size = sizeof(struct seccomp_notif_resp);
1814
1815 *resp = malloc(resp_size);
1816 if (*resp == NULL) /* Test the new buffer, not the caller\(aqs pointer */
1817 err(EXIT_FAILURE, "malloc\-seccomp_notif_resp");
1818
1819 }
1820
1821 /* Handle notifications that arrive via the SECCOMP_RET_USER_NOTIF file
1822 descriptor, \(aqnotifyFd\(aq. This function never returns; it calls exit(). */
1823
1824 static void
1825 handleNotifications(int notifyFd)
1826 {
1827 bool pathOK;
1828 char path[PATH_MAX];
1829 struct seccomp_notif *req;
1830 struct seccomp_notif_resp *resp;
1831 struct seccomp_notif_sizes sizes;
1832
1833 allocSeccompNotifBuffers(&req, &resp, &sizes);
1834
1835 /* Loop handling notifications */
1836
1837 for (;;) {
1838
1839 /* Wait for next notification, returning info in \(aq*req\(aq */
1840
1841 memset(req, 0, sizes.seccomp_notif);
1842 if (ioctl(notifyFd, SECCOMP_IOCTL_NOTIF_RECV, req) == \-1) {
1843 if (errno == EINTR)
1844 continue;
1845 err(EXIT_FAILURE, "\etS: ioctl\-SECCOMP_IOCTL_NOTIF_RECV");
1846 }
1847
1848 printf("\etS: got notification (ID %#llx) for PID %d\en",
1849 req\->id, req\->pid);
1850
1851 /* The only system call that can generate a notification event
1852 is mkdir(2). Nevertheless, we check that the notified system
1853 call is indeed mkdir() as kind of future\-proofing of this
1854 code in case the seccomp filter is later modified to
1855 generate notifications for other system calls. */
1856
1857 if (req\->data.nr != SYS_mkdir) {
1858 printf("\etS: notification contained unexpected "
1859 "system call number; bye!!!\en");
1860 exit(EXIT_FAILURE);
1861 }
1862
1863 pathOK = getTargetPathname(req, notifyFd, 0, path, sizeof(path));
1864
1865 /* Prepopulate some fields of the response */
1866
1867 resp\->id = req\->id; /* Response includes notification ID */
1868 resp\->flags = 0;
1869 resp\->val = 0;
1870
1871 /* If getTargetPathname() failed, trigger an EINVAL error
1872 response (sending this response may yield an error if the
1873 failure occurred because the notification ID was no longer
1874 valid); if the directory is in /tmp, then create it on behalf
1875 of the target; if the pathname starts with \(aq.\(aq, tell the
1876 kernel to let the target process execute the mkdir();
1877 otherwise, give an error for a directory pathname in any other
1878 location. */
1879
1880 if (!pathOK) {
1881 resp\->error = \-EINVAL;
1882 printf("\etS: spoofing error for invalid pathname (%s)\en",
1883 strerror(\-resp\->error));
1884 } else if (strncmp(path, "/tmp/", strlen("/tmp/")) == 0) {
1885 printf("\etS: executing: mkdir(\e"%s\e", %#llo)\en",
1886 path, req\->data.args[1]);
1887
1888 if (mkdir(path, req\->data.args[1]) == 0) {
1889 resp\->error = 0; /* "Success" */
1890 resp\->val = strlen(path); /* Used as return value of
1891 mkdir() in target */
1892 printf("\etS: success! spoofed return = %lld\en",
1893 resp\->val);
1894 } else {
1895
1896 /* If mkdir() failed in the supervisor, pass the error
1897 back to the target */
1898
1899 resp\->error = \-errno;
1900 printf("\etS: failure! (errno = %d; %s)\en", errno,
1901 strerror(errno));
1902 }
1903 } else if (strncmp(path, "./", strlen("./")) == 0) {
1904 resp\->error = resp\->val = 0;
1905 resp\->flags = SECCOMP_USER_NOTIF_FLAG_CONTINUE;
1906 printf("\etS: target can execute system call\en");
1907 } else {
1908 resp\->error = \-EOPNOTSUPP;
1909 printf("\etS: spoofing error response (%s)\en",
1910 strerror(\-resp\->error));
1911 }
1912
1913 /* Send a response to the notification */
1914
1915 printf("\etS: sending response "
1916 "(flags = %#x; val = %lld; error = %d)\en",
1917 resp\->flags, resp\->val, resp\->error);
1918
1919 if (ioctl(notifyFd, SECCOMP_IOCTL_NOTIF_SEND, resp) == \-1) {
1920 if (errno == ENOENT)
1921 printf("\etS: response failed with ENOENT; "
1922 "perhaps target process\(aqs syscall was "
1923 "interrupted by a signal?\en");
1924 else
1925 perror("ioctl\-SECCOMP_IOCTL_NOTIF_SEND");
1926 }
1927
1928 /* If the pathname is just "/bye", then the supervisor breaks out
1929 of the loop and terminates. This allows us to see what happens
1930 if the target process makes further calls to mkdir(2). The
1931 buffer is examined only if the pathname was fetched successfully. */
1932 if (pathOK && strcmp(path, "/bye") == 0)
1933 break;
1934 }
1935
1936 free(req);
1937 free(resp);
1938 printf("\etS: terminating **********\en");
1939 exit(EXIT_FAILURE);
1940 }
1941
1942 /* Implementation of the supervisor process:
1943
1944 (1) obtains the notification file descriptor from \(aqsockPair[1]\(aq (exiting on failure);
1945 (2) closes the socket pair and handles notifications that arrive on that file descriptor. */
1946
1947 static void
1948 supervisor(int sockPair[2])
1949 {
1950 int notifyFd;
1951
1952 notifyFd = recvfd(sockPair[1]);
1953
1954 if (notifyFd == \-1)
1955 err(EXIT_FAILURE, "recvfd");
1956
1957 closeSocketPair(sockPair); /* We no longer need the socket pair */
1958
1959 handleNotifications(notifyFd); /* Loops until it calls exit() */
1960 }
1961 /* Set up the socket pair, target child, and SIGCHLD handler; run the supervisor */
1962 int
1963 main(int argc, char *argv[])
1964 {
1965 int sockPair[2];
1966 struct sigaction sa;
1967
1968 setbuf(stdout, NULL); /* Make stdout unbuffered */
1969
1970 if (argc < 2) {
1971 fprintf(stderr, "At least one pathname argument is required\en");
1972 exit(EXIT_FAILURE);
1973 }
1974
1975 /* Create a UNIX domain socket that is used to pass the seccomp
1976 notification file descriptor from the target process to the
1977 supervisor process. */
1978
1979 if (socketpair(AF_UNIX, SOCK_STREAM, 0, sockPair) == \-1)
1980 err(EXIT_FAILURE, "socketpair");
1981
1982 /* Create a child process\-\-the "target"\-\-that installs seccomp
1983 filtering. The target process writes the seccomp notification
1984 file descriptor onto \(aqsockPair[0]\(aq and then calls mkdir(2) for
1985 each directory in the command\-line arguments. */
1986
1987 (void) targetProcess(sockPair, &argv[optind]);
1988
1989 /* Catch SIGCHLD when the target terminates, so that the
1990 supervisor can also terminate. */
1991
1992 sa.sa_handler = sigchldHandler;
1993 sa.sa_flags = 0;
1994 sigemptyset(&sa.sa_mask);
1995 if (sigaction(SIGCHLD, &sa, NULL) == \-1)
1996 err(EXIT_FAILURE, "sigaction");
1997
1998 supervisor(sockPair);
1999
2000 exit(EXIT_SUCCESS); /* Reached only if supervisor() returns */
2001 }
2002 .EE
2003 .\" SRC END
2004 .SH SEE ALSO
2005 .BR ioctl (2),
2006 .BR pidfd_getfd (2),
2007 .BR pidfd_open (2),
2008 .BR seccomp (2)
2009 .PP
2010 A further example program can be found in the kernel source file
2011 .IR samples/seccomp/user\-trap.c .