]> git.ipfire.org Git - thirdparty/man-pages.git/blame - man2/clone.2
clone.2: wfix
[thirdparty/man-pages.git] / man2 / clone.2
CommitLineData
fea681da 1.\" Copyright (c) 1992 Drew Eckhardt <drew@cs.colorado.edu>, March 28, 1992
fb1fa92b 2.\" and Copyright (c) Michael Kerrisk, 2001, 2002, 2005, 2013, 2019
2297bf0e 3.\"
fd0fc519 4.\" %%%LICENSE_START(GPL_NOVERSION_ONELINE)
fea681da 5.\" May be distributed under the GNU General Public License.
fd0fc519 6.\" %%%LICENSE_END
dccaff1e 7.\"
fea681da
MK
8.\" Modified by Michael Haardt <michael@moria.de>
9.\" Modified 24 Jul 1993 by Rik Faith <faith@cs.unc.edu>
10.\" Modified 21 Aug 1994 by Michael Chastain <mec@shell.portal.com>:
11.\" New man page (copied from 'fork.2').
12.\" Modified 10 June 1995 by Andries Brouwer <aeb@cwi.nl>
13.\" Modified 25 April 1998 by Xavier Leroy <Xavier.Leroy@inria.fr>
14.\" Modified 26 Jun 2001 by Michael Kerrisk
15.\" Mostly upgraded to 2.4.x
16.\" Added prototype for sys_clone() plus description
17.\" Added CLONE_THREAD with a brief description of thread groups
c13182ef 18.\" Added CLONE_PARENT and revised entire page to remove ambiguity
fea681da
MK
19.\" between "calling process" and "parent process"
20.\" Added CLONE_PTRACE and CLONE_VFORK
21.\" Added EPERM and EINVAL error codes
fd8a5be4 22.\" Renamed "__clone" to "clone" (which is the prototype in <sched.h>)
fea681da 23.\" various other minor tidy ups and clarifications.
c11b1abf 24.\" Modified 26 Jun 2001 by Michael Kerrisk <mtk.manpages@gmail.com>
d9bfdb9c 25.\" Updated notes for 2.4.7+ behavior of CLONE_THREAD
c11b1abf 26.\" Modified 15 Oct 2002 by Michael Kerrisk <mtk.manpages@gmail.com>
fea681da
MK
27.\" Added description for CLONE_NEWNS, which was added in 2.4.19
28.\" Slightly rephrased, aeb.
29.\" Modified 1 Feb 2003 - added CLONE_SIGHAND restriction, aeb.
30.\" Modified 1 Jan 2004 - various updates, aeb
0967c11f 31.\" Modified 2004-09-10 - added CLONE_PARENT_SETTID etc. - aeb.
d9bfdb9c 32.\" 2005-04-12, mtk, noted the PID caching behavior of NPTL's getpid()
31830ef0 33.\" wrapper under BUGS.
fd8a5be4
MK
34.\" 2005-05-10, mtk, added CLONE_SYSVSEM, CLONE_UNTRACED, CLONE_STOPPED.
35.\" 2005-05-17, mtk, Substantially enhanced discussion of CLONE_THREAD.
4e836144 36.\" 2008-11-18, mtk, order CLONE_* flags alphabetically
82ee147a 37.\" 2008-11-18, mtk, document CLONE_NEWPID
43ce9dda 38.\" 2008-11-19, mtk, document CLONE_NEWUTS
667417b3 39.\" 2008-11-19, mtk, document CLONE_NEWIPC
cfdc761b 40.\" 2008-11-19, Jens Axboe, mtk, document CLONE_IO
fea681da 41.\"
a5409de9 42.TH CLONE 2 2019-11-19 "Linux" "Linux Programmer's Manual"
fea681da 43.SH NAME
a9e52b43 44clone, __clone2, clone3 \- create a child process
fea681da 45.SH SYNOPSIS
c10859eb 46.nf
81f10dad 47/* Prototype for the glibc wrapper function */
dbfe9c70 48.PP
4f71ba5d 49.B #define _GNU_SOURCE
fea681da 50.B #include <sched.h>
dbfe9c70 51.PP
81c2368f
MK
52.BI "int clone(int (*" "fn" ")(void *), void *" stack \
53", int " flags ", void *" "arg" ", ... "
54.BI " /* pid_t *" parent_tid ", void *" tls \
55", pid_t *" child_tid " */ );"
dbfe9c70 56.PP
faa0e55a
MK
57/* For the prototype of the raw clone() system call, see NOTES */
58.PP
75e28eba 59.BI "long clone3(struct clone_args *" cl_args ", size_t " size );
c10859eb 60.fi
faa0e55a
MK
61.PP
62.IR Note :
63There is not yet a glibc wrapper for
64.BR clone3 ();
65see NOTES.
fea681da 66.SH DESCRIPTION
faa0e55a 67These system calls
8dd6b0bc 68create a new ("child") process, in a manner similar to
fea681da 69.BR fork (2).
efeece04 70.PP
8dd6b0bc 71By contrast with
fea681da 72.BR fork (2),
225f5da8 73these system calls provide more precise control over what pieces of execution
10337567
MK
74context are shared between the calling process and the child process.
75For example, using these system calls, the caller can control whether
76or not the two processes share the virtual address space,
77the table of file descriptors, and the table of signal handlers.
225f5da8 78These system calls also allow the new child process to be placed
10337567
MK
79in separate
80.BR namespaces (7).
81.PP
82Note that in this manual
c13182ef 83page, "calling process" normally corresponds to "parent process".
a10c5a33 84But see the descriptions of
c13182ef 85.B CLONE_PARENT
a10c5a33
MK
86and
87.B CLONE_THREAD
10337567 88below.
efeece04 89.PP
faa0e55a
MK
90This page describes the following interfaces:
91.IP * 3
92The glibc
93.BR clone ()
94wrapper function and the underlying system call on which it is based.
95The main text describes the wrapper function;
96the differences for the raw system call
97are described toward the end of this page.
98.IP *
99The newer
100.BR clone3 ()
101system call.
5261b0fe
MK
102.PP
103In the remainder of this page, the terminology "the clone call" is used
324f6154 104when noting details that apply to all of these interfaces.
faa0e55a
MK
105.\"
106.SS The clone() wrapper function
107.PP
108When the child process is created with the
109.BR clone ()
110wrapper function,
7495cbc7
MK
111it commences execution by calling the function pointed to by the argument
112.IR fn .
fea681da 113(This differs from
c13182ef 114.BR fork (2),
fea681da 115where execution continues in the child from the point
c13182ef
MK
116of the
117.BR fork (2)
fea681da
MK
118call.)
119The
fea681da 120.I arg
7495cbc7
MK
121argument is passed as the argument of the function
122.IR fn .
efeece04 123.PP
c13182ef 124When the
fea681da 125.IR fn ( arg )
4ba17a6d 126function returns, the child process terminates.
c13182ef 127The integer returned by
fea681da 128.I fn
4ba17a6d 129is the exit status for the child process.
c13182ef 130The child process may also terminate explicitly by calling
fea681da
MK
131.BR exit (2)
132or after receiving a fatal signal.
efeece04 133.PP
fea681da 134The
81c2368f 135.I stack
c13182ef
MK
136argument specifies the location of the stack used by the child process.
137Since the child and calling process may share memory,
fea681da 138it is not possible for the child process to execute in the
c13182ef
MK
139same stack as the calling process.
140The calling process must therefore
fea681da
MK
141set up memory space for the child stack and pass a pointer to this
142space to
edcc65ff 143.BR clone ().
5fab2e7c 144Stacks grow downward on all processors that run Linux
fea681da 145(except the HP PA processors), so
81c2368f 146.I stack
fea681da
MK
147usually points to the topmost address of the memory space set up for
148the child stack.
faa0e55a
MK
149Note that
150.BR clone ()
151does not provide a means whereby the caller can inform the kernel of the
152size of the stack area.
153.PP
154The remaining arguments to
155.BR clone ()
156are discussed below.
157.\"
158.SS clone3()
159.PP
160The
161.BR clone3 ()
162system call provides a superset of the functionality of the older
163.BR clone ()
164interface.
165It also provides a number of API improvements, including:
166space for additional flags bits;
167cleaner separation in the use of various arguments;
168and the ability to specify the size of the child's stack area.
169.PP
170As with
171.BR fork (2),
172.BR clone3 ()
173returns in both the parent and the child.
174It returns 0 in the child process and returns the PID of the child
175in the parent.
176.PP
177The
178.I cl_args
179argument of
180.BR clone3 ()
181is a structure of the following form:
182.PP
183.in +4n
184.EX
185struct clone_args {
186 u64 flags; /* Flags bit mask */
187 u64 pidfd; /* Where to store PID file descriptor
be66dbc7 188 (\fIint *\fP) */
faa0e55a 189 u64 child_tid; /* Where to store child TID,
be66dbc7 190 in child's memory (\fIpid_t *\fP) */
faa0e55a
MK
191 u64 parent_tid; /* Where to store child TID,
192 in parent's memory (\fIpid_t *\fP) */
193 u64 exit_signal; /* Signal to deliver to parent on
194 child termination */
195 u64 stack; /* Pointer to lowest byte of stack */
196 u64 stack_size; /* Size of stack */
197 u64 tls; /* Location of new TLS */
f7d5e082
MK
198 u64 set_tid; /* Pointer to a \fIpid_t\fP array
199 (since Linux 5.5) */
200 u64 set_tid_size; /* Number of elements in \fIset_tid\fP
201 (since Linux 5.5) */
ed7c1377
MK
202 u64 cgroup; /* File descriptor for target cgroup
203 of child (since Linux 5.7) */
faa0e55a
MK
204};
205.EE
206.in
207.PP
208The
209.I size
210argument that is supplied to
211.BR clone3 ()
212should be initialized to the size of this structure.
213(The existence of the
214.I size
215argument permits future extensions to the
216.IR clone_args
217structure.)
218.PP
219The stack for the child process is specified via
220.IR cl_args.stack ,
221which points to the lowest byte of the stack area,
222and
223.IR cl_args.stack_size ,
224which specifies the size of the stack in bytes.
225In the case where the
226.BR CLONE_VM
227flag (see below) is specified, a stack must be explicitly allocated
228and specified.
229Otherwise, these two fields can be specified as NULL and 0,
230which causes the child to use the same stack area as the parent
231(in the child's own virtual address space).
232.PP
233The remaining fields in the
234.I cl_args
235argument are discussed below.
236.\"
237.SS Equivalence between clone() and clone3() arguments
238.PP
239Unlike the older
240.BR clone ()
241interface, where arguments are passed individually, in the newer
242.BR clone3 ()
243interface the arguments are packaged into the
244.I clone_args
245structure shown above.
246This structure allows for a superset of the information passed via the
247.BR clone ()
248arguments.
249.PP
250The following table shows the equivalence between the arguments of
251.BR clone ()
252and the fields in the
253.I clone_args
254argument supplied to
255.BR clone3 ():
256.RS
257.TS
258lb lb lb
259l l l
260li li l.
97883fae 261clone() clone3() Notes
faa0e55a 262 \fIcl_args\fP field
f5d5180f 263flags & ~0xff flags For most flags; details below
faa0e55a
MK
264parent_tid pidfd See CLONE_PIDFD
265child_tid child_tid See CLONE_CHILD_SETTID
266parent_tid parent_tid See CLONE_PARENT_SETTID
267flags & 0xff exit_signal
268stack stack
269\fP---\fP stack_size
270tls tls See CLONE_SETTLS
bf031aaa
AR
271\fP---\fP set_tid See below for details
272\fP---\fP set_tid_size
ed7c1377 273\fP---\fP cgroup See CLONE_INTO_CGROUP
faa0e55a
MK
274.TE
275.RE
5fbce8f2
MK
276.\"
277.SS The child termination signal
efeece04 278.PP
faa0e55a
MK
279When the child process terminates, a signal may be sent to the parent.
280The termination signal is specified in the low byte of
fea681da 281.I flags
faa0e55a
MK
282.RB ( clone ())
283or in
284.I cl_args.exit_signal
285.RB ( clone3 ()).
fd8a5be4 286If this signal is specified as anything other than
fea681da
MK
287.BR SIGCHLD ,
288then the parent process must specify the
c13182ef
MK
289.B __WALL
290or
fea681da 291.B __WCLONE
c13182ef
MK
292options when waiting for the child with
293.BR wait (2).
faa0e55a 294If no signal (i.e., zero) is specified, then the parent process is not signaled
fea681da 295when the child terminates.
5fbce8f2 296.\"
bf031aaa
AR
297.SS The set_tid array
298.PP
299By default, the kernel chooses the next sequential PID for the new
300process in each of the PID namespaces where it is present.
301When creating a process with
302.BR clone3 (),
303the
304.I set_tid
b386cee3
MK
305array (available since Linux 5.5)
306can be used to select specific PIDs for the process in some
bf031aaa 307or all of the PID namespaces where it is present.
ee8bb310 308If the PID of the newly created process should be set only for the current
bf031aaa
AR
309PID namespace or in the newly created PID namespace (if
310.I flags
311contains
312.BR CLONE_NEWPID )
313then the first element in the
314.I set_tid
315array has to be the desired PID and
316.I set_tid_size
317needs to be 1.
318.PP
319If the PID of the newly created process should have a certain value in
ee8bb310 320multiple PID namespaces, then the
bf031aaa 321.I set_tid
09007c4b
MK
322array can have multiple entries.
323The first entry defines the PID in the most
ee8bb310
MK
324deeply nested PID namespace and each of the following entries contains
325the PID in the
326corresponding ancestor PID namespace.
09007c4b 327The number of PID namespaces in which a PID
bf031aaa
AR
328should be set is defined by
329.I set_tid_size
330which cannot be larger than the number of currently nested PID namespaces.
331.PP
332To create a process with the following PIDs in a PID namespace hierarchy:
333.RS
334.TS
ee8bb310
MK
335lb lb lb
336l l l.
337PID NS level Requested PID Notes
3380 31496 Outermost PID namespace
bf031aaa 3391 42
ee8bb310 3402 7 Innermost PID namespace
bf031aaa
AR
341.TE
342.RE
343.PP
344Set the array to:
345.PP
ee8bb310 346.in +4n
bf031aaa 347.EX
ee8bb310
MK
348set_tid[0] = 7;
349set_tid[1] = 42;
350set_tid[2] = 31496;
351set_tid_size = 3;
bf031aaa 352.EE
ee8bb310 353.in
bf031aaa
AR
354.PP
355If only the PIDs in the two innermost PID namespaces
356need to be specified, set the array to:
357.PP
ee8bb310 358.in +4n
bf031aaa 359.EX
ee8bb310
MK
360set_tid[0] = 7;
361set_tid[1] = 42;
362set_tid_size = 2;
bf031aaa 363.EE
ee8bb310 364.in
bf031aaa
AR
365.PP
366The PID in the PID namespaces outside the two innermost PID namespaces
367will be selected the same way as any other PID is selected.
368.PP
369The
370.I set_tid
371feature requires
ee8bb310 372.BR CAP_SYS_ADMIN
bf031aaa
AR
373in all owning user namespaces of the target PID namespaces.
374.PP
ee8bb310
MK
375Callers may only choose a PID greater than 1 in a given PID namespace
376if an
377.BR init
378process (i.e., a process with PID 1) already exists in that namespace.
09007c4b 379Otherwise the PID
bf031aaa
AR
380entry for this PID namespace must be 1.
381.\"
16853a31 382.SS The flags mask
efeece04 383.PP
faa0e55a
MK
384Both
385.BR clone ()
386and
387.BR clone3 ()
388allow a flags bit mask that modifies their behavior
389and allows the caller to specify what is shared between the calling process
390and the child process.
5261b0fe
MK
391This bit mask\(emthe
392.I flags
393argument of
394.BR clone ()
395or the
396.I cl_args.flags
397field passed to
398.BR clone3 ()\(emis
16853a31
MK
399referred to as the
400.I flags
401mask in the remainder of this page.
402.PP
403The
404.I flags
405mask is specified as a bitwise-OR of zero or more of
406the constants listed below.
5261b0fe 407Except as noted below, these flags are available
faa0e55a
MK
408(and have the same effect) in both
409.BR clone ()
410and
411.BR clone3 ().
fea681da 412.TP
f5dbc7c8 413.BR CLONE_CHILD_CLEARTID " (since Linux 2.5.49)"
e2bf1234 414Clear (zero) the child thread ID at the location pointed to by
81c2368f 415.I child_tid
faa0e55a
MK
416.RB ( clone ())
417or
418.I cl_args.child_tid
419.RB ( clone3 ())
f5dbc7c8
MK
420in child memory when the child exits, and do a wakeup on the futex
421at that address.
422The address involved may be changed by the
423.BR set_tid_address (2)
424system call.
425This is used by threading libraries.
426.TP
427.BR CLONE_CHILD_SETTID " (since Linux 2.5.49)"
e2bf1234 428Store the child thread ID at the location pointed to by
81c2368f 429.I child_tid
faa0e55a
MK
430.RB ( clone ())
431or
432.I cl_args.child_tid
433.RB ( clone3 ())
8ef021ea 434in the child's memory.
5261b0fe 435The store operation completes before the clone call
6ab62ed8 436returns control to user space in the child process.
5261b0fe 437(Note that the store operation may not have completed before the clone call
6ab62ed8
MK
438returns in the parent process, which will be relevant if the
439.BR CLONE_VM
440flag is also employed.)
f5dbc7c8 441.TP
27f14b44
MK
442.BR CLONE_CLEAR_SIGHAND " (since Linux 5.5)"
443.\" commit b612e5df4587c934bd056bf05f4a1deca4de4f75
444By default, signal dispositions in the child thread are the same as
445in the parent.
446If this flag is specified,
447then all signals that are handled in the parent
448are reset to their default dispositions
449.RB ( SIG_DFL )
450in the child.
451.IP
452Specifying this flag together with
453.B CLONE_SIGHAND
454is nonsensical and disallowed.
455.TP
baa435c6
MK
456.BR CLONE_DETACHED " (historical)"
457For a while (during the Linux 2.5 development series)
458.\" added in 2.5.32; removed in 2.6.0-test4
459there was a
460.B CLONE_DETACHED
461flag,
462which caused the parent not to receive a signal when the child terminated.
463Ultimately, the effect of this flag was subsumed under the
464.BR CLONE_THREAD
465flag and by the time Linux 2.6.0 was released, this flag had no effect.
466Starting in Linux 2.6.2, the need to give this flag together with
467.B CLONE_THREAD
468disappeared.
469.IP
470This flag is still defined, but it is usually ignored when calling
471.BR clone ().
472However, see the description of
473.BR CLONE_PIDFD
474for some exceptions.
475.TP
1603d6a1 476.BR CLONE_FILES " (since Linux 2.0)"
fea681da 477If
f5dbc7c8
MK
478.B CLONE_FILES
479is set, the calling process and the child process share the same file
480descriptor table.
481Any file descriptor created by the calling process or by the child
482process is also valid in the other process.
483Similarly, if one of the processes closes a file descriptor,
484or changes its associated flags (using the
485.BR fcntl (2)
486.B F_SETFD
487operation), the other process is also affected.
8a76b19e
KE
488If a process sharing a file descriptor table calls
489.BR execve (2),
490its file descriptor table is duplicated (unshared).
efeece04 491.IP
fea681da 492If
f5dbc7c8
MK
493.B CLONE_FILES
494is not set, the child process inherits a copy of all file descriptors
5261b0fe 495opened in the calling process at the time of the clone call.
f5dbc7c8
MK
496Subsequent operations that open or close file descriptors,
497or change file descriptor flags,
498performed by either the calling
499process or the child process do not affect the other process.
db8ba2b4 500Note, however,
839d161f
MK
501that the duplicated file descriptors in the child refer to the same
502open file descriptions as the corresponding file descriptors
503in the calling process,
2433365b 504and thus share file offsets and file status flags (see
db8ba2b4 505.BR open (2)).
fea681da 506.TP
1603d6a1 507.BR CLONE_FS " (since Linux 2.0)"
fea681da
MK
508If
509.B CLONE_FS
9ee4a2b6 510is set, the caller and the child process share the same filesystem
c13182ef 511information.
9ee4a2b6 512This includes the root of the filesystem, the current
c13182ef
MK
513working directory, and the umask.
514Any call to
fea681da
MK
515.BR chroot (2),
516.BR chdir (2),
517or
518.BR umask (2)
edcc65ff 519performed by the calling process or the child process also affects the
fea681da 520other process.
efeece04 521.IP
c13182ef 522If
fea681da 523.B CLONE_FS
9ee4a2b6 524is not set, the child process works on a copy of the filesystem
5261b0fe 525information of the calling process at the time of the clone call.
fea681da
MK
526Calls to
527.BR chroot (2),
528.BR chdir (2),
4ba17a6d 529or
fea681da
MK
530.BR umask (2)
531performed later by one of the processes do not affect the other process.
fea681da 532.TP
edc1b9fc
MK
533.BR CLONE_INTO_CGROUP " (since Linux 5.7)"
534.\" commit ef2c41cf38a7559bbf91af42d5b6a4429db8fc68
535By default, a child process is placed in the same version 2
536cgroup as its parent.
537The
538.B CLONE_INTO_CGROUP
17d86030 539flag allows the child process to be created in a different version 2 cgroup.
edc1b9fc
MK
540(Note that
541.BR CLONE_INTO_CGROUP
542has effect only for version 2 cgroups.)
543.IP
544In order to place the child process in a different cgroup,
545the caller specifies
546.BR CLONE_INTO_CGROUP
547in
548.I cl_args.flags
549and passes a file descriptor that refers to a version 2 cgroup in the
550.I cl_args.cgroup
551field.
17d86030 552(This file descriptor can be obtained by opening a cgroup v2 directory
edc1b9fc
MK
553using either the
554.B O_RDONLY
555or the
556.B O_PATH
557flag.)
558Note that all of the usual restrictions (described in
559.BR cgroups (7))
560on placing a process into a version 2 cgroup apply.
561.IP
562Spawning a process into a cgroup different from the parent's cgroup
563makes it possible for a service manager to directly spawn new
564services into dedicated cgroups.
565This eliminates the accounting
566jitter that would be caused if the child process was first created in the
567same cgroup as the parent and then
568moved into the target cgroup.
569The
570.BR CLONE_INTO_CGROUP
571flag also allows the creation of
572frozen child processes by spawning them into a frozen cgroup.
573(See
574.BR cgroups (7)
575for a description of the freezer controller.)
576For threaded applications (or even thread implementations which
577make use of cgroups to limit individual threads), it is possible to
578establish a fixed cgroup layout before spawning each thread
579directly into its target cgroup.
580.TP
a4cc375e 581.BR CLONE_IO " (since Linux 2.6.25)"
11f27a1c
JA
582If
583.B CLONE_IO
584is set, then the new process shares an I/O context with
585the calling process.
586If this flag is not set, then (as with
587.BR fork (2))
588the new process has its own I/O context.
efeece04 589.IP
11f27a1c 590.\" The following based on text from Jens Axboe
d1f84ed7 591The I/O context is the I/O scope of the disk scheduler (i.e.,
11f27a1c
JA
592what the I/O scheduler uses to model scheduling of a process's I/O).
593If processes share the same I/O context,
594they are treated as one by the I/O scheduler.
595As a consequence, they get to share disk time.
596For some I/O schedulers,
597.\" the anticipatory and CFQ scheduler
598if two processes share an I/O context,
599they will be allowed to interleave their disk access.
600If several threads are doing I/O on behalf of the same process
601.RB ( aio_read (3),
602for instance), they should employ
603.BR CLONE_IO
604to get better I/O performance.
605.\" with CFQ and AS.
efeece04 606.IP
11f27a1c
JA
607If the kernel is not configured with the
608.B CONFIG_BLOCK
609option, this flag is a no-op.
610.TP
c5af0674
MK
611.BR CLONE_NEWCGROUP " (since Linux 4.6)"
612Create the process in a new cgroup namespace.
613If this flag is not set, then (as with
614.BR fork (2))
615the process is created in the same cgroup namespaces as the calling process.
efeece04 616.IP
c5af0674 617For further information on cgroup namespaces, see
b9fe4bc3 618.BR cgroup_namespaces (7).
efeece04 619.IP
c5af0674
MK
620Only a privileged process
621.RB ( CAP_SYS_ADMIN )
622can employ
623.BR CLONE_NEWCGROUP .
624.\"
625.TP
8722311b 626.BR CLONE_NEWIPC " (since Linux 2.6.19)"
667417b3
MK
627If
628.B CLONE_NEWIPC
629is set, then create the process in a new IPC namespace.
630If this flag is not set, then (as with
06b30458 631.BR fork (2)),
667417b3
MK
632the process is created in the same IPC namespace as
633the calling process.
efeece04 634.IP
981eda4a
MK
635For further information on IPC namespaces, see
636.BR ipc_namespaces (7).
637.IP
ab5dd83f
MK
638Only a privileged process
639.RB ( CAP_SYS_ADMIN )
640can employ
641.BR CLONE_NEWIPC .
667417b3
MK
642This flag can't be specified in conjunction with
643.BR CLONE_SYSVSEM .
644.TP
163bf178 645.BR CLONE_NEWNET " (since Linux 2.6.24)"
33a0ccb2 646(The implementation of this flag was completed only
9108d867 647by about kernel version 2.6.29.)
efeece04 648.IP
163bf178
MK
649If
650.B CLONE_NEWNET
651is set, then create the process in a new network namespace.
652If this flag is not set, then (as with
57ef8c39 653.BR fork (2))
163bf178
MK
654the process is created in the same network namespace as
655the calling process.
efeece04 656.IP
73680728 657For further information on network namespaces, see
40002795 658.BR network_namespaces (7).
efeece04 659.IP
ab5dd83f
MK
660Only a privileged process
661.RB ( CAP_SYS_ADMIN )
662can employ
663.BR CLONE_NEWNET .
163bf178 664.TP
c10859eb 665.BR CLONE_NEWNS " (since Linux 2.4.19)"
3dd2331c
MK
666If
667.B CLONE_NEWNS
668is set, the cloned child is started in a new mount namespace,
669initialized with a copy of the namespace of the parent.
670If
fea681da 671.B CLONE_NEWNS
3dd2331c 672is not set, the child lives in the same mount
4df2eb09 673namespace as the parent.
efeece04 674.IP
981eda4a
MK
675For further information on mount namespaces, see
676.BR namespaces (7)
677and
678.BR mount_namespaces (7).
679.IP
ab5dd83f
MK
680Only a privileged process
681.RB ( CAP_SYS_ADMIN )
682can employ
683.BR CLONE_NEWNS .
fea681da
MK
684It is not permitted to specify both
685.B CLONE_NEWNS
686and
687.B CLONE_FS
9219d208 688.\" See https://lwn.net/Articles/543273/
5261b0fe 689in the same clone call.
9d005472
MK
690.TP
691.BR CLONE_NEWPID " (since Linux 2.6.24)"
692.\" This explanation draws a lot of details from
693.\" http://lwn.net/Articles/259217/
694.\" Authors: Pavel Emelyanov <xemul@openvz.org>
695.\" and Kir Kolyshkin <kir@openvz.org>
696.\"
697.\" The primary kernel commit is 30e49c263e36341b60b735cbef5ca37912549264
698.\" Author: Pavel Emelyanov <xemul@openvz.org>
699If
700.B CLONE_NEWPID
701is set, then create the process in a new PID namespace.
702If this flag is not set, then (as with
703.BR fork (2))
704the process is created in the same PID namespace as
705the calling process.
efeece04 706.IP
9d005472 707For further information on PID namespaces, see
7e0e902b
MK
708.BR namespaces (7)
709and
39b3f005 710.BR pid_namespaces (7).
efeece04 711.IP
ab5dd83f
MK
712Only a privileged process
713.RB ( CAP_SYS_ADMIN )
714can employ
715.BR CLONE_NEWPID .
9d005472 716This flag can't be specified in conjunction with
f0007192
MK
717.BR CLONE_THREAD
718or
719.BR CLONE_PARENT .
70d21f17 720.TP
06b30458
MK
721.BR CLONE_NEWUSER
722(This flag first became meaningful for
723.BR clone ()
4d2b3ed7
MK
724in Linux 2.6.23,
725the current
11a38815 726.BR clone ()
4d2b3ed7
MK
727semantics were merged in Linux 3.5,
728and the final pieces to make the user namespaces completely usable were
729merged in Linux 3.8.)
efeece04 730.IP
70d21f17
EB
731If
732.B CLONE_NEWUSER
06b30458
MK
733is set, then create the process in a new user namespace.
734If this flag is not set, then (as with
57ef8c39 735.BR fork (2))
70d21f17 736the process is created in the same user namespace as the calling process.
efeece04 737.IP
981eda4a
MK
738For further information on user namespaces, see
739.BR namespaces (7)
740and
741.BR user_namespaces (7).
742.IP
fefbcba8
MK
743Before Linux 3.8, use of
744.BR CLONE_NEWUSER
745required that the caller have three capabilities:
746.BR CAP_SYS_ADMIN ,
747.BR CAP_SETUID ,
748and
749.BR CAP_SETGID .
750.\" Before Linux 2.6.29, it appears that only CAP_SYS_ADMIN was needed
06b30458 751Starting with Linux 3.8,
9d005472 752no privileges are needed to create a user namespace.
efeece04 753.IP
5e72cf7d
MK
754This flag can't be specified in conjunction with
755.BR CLONE_THREAD
756or
757.BR CLONE_PARENT .
758For security reasons,
759.\" commit e66eded8309ebf679d3d3c1f5820d1f2ca332c71
760.\" https://lwn.net/Articles/543273/
761.\" The fix actually went into 3.9 and into 3.8.3. However, user namespaces
762.\" were, for practical purposes, unusable in earlier 3.8.x because of the
ab3311aa 763.\" various filesystems that didn't support userns.
f0007192
MK
764.BR CLONE_NEWUSER
765cannot be specified in conjunction with
5e72cf7d 766.BR CLONE_FS .
82ee147a 767.TP
43ce9dda
MK
768.BR CLONE_NEWUTS " (since Linux 2.6.19)"
769If
770.B CLONE_NEWUTS
e1b11906
MK
771is set, then create the process in a new UTS namespace,
772whose identifiers are initialized by duplicating the identifiers
773from the UTS namespace of the calling process.
43ce9dda 774If this flag is not set, then (as with
57ef8c39 775.BR fork (2))
43ce9dda
MK
776the process is created in the same UTS namespace as
777the calling process.
efeece04 778.IP
981eda4a
MK
779For further information on UTS namespaces, see
780.BR uts_namespaces (7).
781.IP
ab5dd83f
MK
782Only a privileged process
783.RB ( CAP_SYS_ADMIN )
784can employ
785.BR CLONE_NEWUTS .
43ce9dda 786.TP
f5dbc7c8
MK
787.BR CLONE_PARENT " (since Linux 2.3.12)"
788If
789.B CLONE_PARENT
790is set, then the parent of the new child (as returned by
791.BR getppid (2))
792will be the same as that of the calling process.
efeece04 793.IP
f5dbc7c8
MK
794If
795.B CLONE_PARENT
796is not set, then (as with
797.BR fork (2))
798the child's parent is the calling process.
efeece04 799.IP
f5dbc7c8
MK
800Note that it is the parent process, as returned by
801.BR getppid (2),
802which is signaled when the child terminates, so that
803if
804.B CLONE_PARENT
805is set, then the parent of the calling process, rather than the
806calling process itself, will be signaled.
a17b9d28 807.IP
4269a6ab 808The
a17b9d28 809.B CLONE_PARENT
4269a6ab
MK
810flag can't be used in clone calls by the
811global init process (PID 1 in the initial PID namespace)
812and init processes in other PID namespaces.
813This restriction prevents the creation of multi-rooted process trees
814as well as the creation of unreapable zombies in the initial PID namespace.
f5dbc7c8
MK
815.TP
816.BR CLONE_PARENT_SETTID " (since Linux 2.5.49)"
e2bf1234 817Store the child thread ID at the location pointed to by
81c2368f 818.I parent_tid
faa0e55a
MK
819.RB ( clone ())
820or
d5d482ec 821.I cl_args.parent_tid
faa0e55a 822.RB ( clone3 ())
8ef021ea 823in the parent's memory.
f5dbc7c8
MK
824(In Linux 2.5.32-2.5.48 there was a flag
825.B CLONE_SETTID
826that did this.)
5261b0fe 827The store operation completes before the clone call
b5da2f91 828returns control to user space.
f5dbc7c8 829.TP
1c173eb3 830.BR CLONE_PID " (Linux 2.0 to 2.5.15)"
f5dbc7c8
MK
831If
832.B CLONE_PID
833is set, the child process is created with the same process ID as
834the calling process.
835This is good for hacking the system, but otherwise
836of not much use.
1c173eb3 837From Linux 2.3.21 onward, this flag could be
f5dbc7c8 838specified only by the system boot process (PID 0).
1c173eb3 839The flag disappeared completely from the kernel sources in Linux 2.5.16.
f5d5180f 840Subsequently, the kernel silently ignored this bit if it was specified in the
16853a31
MK
841.IR flags
842mask.
f5d5180f
MK
843Much later, the same bit was recycled for use as the
844.B CLONE_PIDFD
845flag.
f5dbc7c8 846.TP
9f938981 847.BR CLONE_PIDFD " (since Linux 5.2)"
4e98b074 848.\" commit b3e5838252665ee4cfa76b82bdf1198dca81e5be
faa0e55a
MK
849If this flag is specified,
850a PID file descriptor referring to the child process is allocated
851and placed at a specified location in the parent's memory.
b4ebffb2 852The close-on-exec flag is set on this new file descriptor.
34a975f8
MK
853PID file descriptors can be used for the purposes described in
854.BR pidfd_open (2).
faa0e55a
MK
855.RS
856.IP * 3
857When using
858.BR clone3 (),
859the PID file descriptor is placed at the location pointed to by
860.IR cl_args.pidfd .
861.IP *
862When using
863.BR clone (),
864the PID file descriptor is placed at the location pointed to by
865.IR parent_tid .
9f938981 866Since the
81c2368f 867.I parent_tid
b97cc7ae 868argument is used to return the PID file descriptor,
9f938981
CB
869.B CLONE_PIDFD
870cannot be used with
faa0e55a
MK
871.B CLONE_PARENT_SETTID
872when calling
873.BR clone ().
874.RE
9f938981
CB
875.IP
876It is currently not possible to use this flag together with
877.B CLONE_THREAD.
b97cc7ae 878This means that the process identified by the PID file descriptor
f6183e5b 879will always be a thread group leader.
9f938981 880.IP
baa435c6 881If the obsolete
9f938981 882.B CLONE_DETACHED
baa435c6
MK
883flag is specified alongside
884.BR CLONE_PIDFD
885when calling
886.BR clone (),
4e98b074 887an error is returned.
baa435c6
MK
888An error also results if
889.B CLONE_DETACHED
890is specified when calling
891.BR clone3 ().
892This error behavior ensures that the bit corresponding to
893.BR CLONE_DETACHED
894can be reused for further PID file descriptor features in the future.
9f938981 895.TP
1603d6a1 896.BR CLONE_PTRACE " (since Linux 2.2)"
f5dbc7c8
MK
897If
898.B CLONE_PTRACE
899is specified, and the calling process is being traced,
900then trace the child also (see
901.BR ptrace (2)).
902.TP
903.BR CLONE_SETTLS " (since Linux 2.5.32)"
dd6d3d2e 904The TLS (Thread Local Storage) descriptor is set to
81c2368f 905.IR tls .
efeece04 906.IP
dd6d3d2e 907The interpretation of
81c2368f 908.I tls
dd6d3d2e
KF
909and the resulting effect is architecture dependent.
910On x86,
81c2368f 911.I tls
dd6d3d2e 912is interpreted as a
2551f801 913.IR "struct user_desc\ *"
35bf8cb4 914(see
dd6d3d2e 915.BR set_thread_area (2)).
9ea5bc66 916On x86-64 it is the new value to be set for the %fs base register
35bf8cb4 917(see the
2551f801 918.B ARCH_SET_FS
dd6d3d2e
KF
919argument to
920.BR arch_prctl (2)).
921On architectures with a dedicated TLS register, it is the new value
922of that register.
f5d5180f
MK
923.IP
924Use of this flag requires detailed knowledge and generally it
925should not be used except in libraries implementing threading.
f5dbc7c8 926.TP
1603d6a1 927.BR CLONE_SIGHAND " (since Linux 2.0)"
fea681da
MK
928If
929.B CLONE_SIGHAND
314c8ff4 930is set, the calling process and the child process share the same table of
c13182ef
MK
931signal handlers.
932If the calling process or child process calls
fea681da 933.BR sigaction (2)
c13182ef
MK
934to change the behavior associated with a signal, the behavior is
935changed in the other process as well.
936However, the calling process and child
fea681da 937processes still have distinct signal masks and sets of pending
c13182ef 938signals.
4ba17a6d 939So, one of them may block or unblock signals using
fea681da
MK
940.BR sigprocmask (2)
941without affecting the other process.
efeece04 942.IP
fea681da
MK
943If
944.B CLONE_SIGHAND
945is not set, the child process inherits a copy of the signal handlers
5261b0fe 946of the calling process at the time of the clone call.
c13182ef 947Calls to
fea681da
MK
948.BR sigaction (2)
949performed later by one of the processes have no effect on the other
950process.
efeece04 951.IP
d6bec36e
MK
952Since Linux 2.6.0,
953.\" Precisely: Linux 2.6.0-test6
16853a31 954the
29546c24 955.I flags
16853a31 956mask must also include
29546c24
MK
957.B CLONE_VM
958if
959.B CLONE_SIGHAND
960is specified.
fea681da 961.TP
d6bec36e
MK
962.BR CLONE_STOPPED " (since Linux 2.6.0)"
963.\" Precisely: Linux 2.6.0-test2
a69b6bda
MK
964If
965.B CLONE_STOPPED
966is set, then the child is initially stopped (as though it was sent a
967.B SIGSTOP
968signal), and must be resumed by sending it a
969.B SIGCONT
970signal.
efeece04 971.IP
a60450a9
MK
972This flag was
973.I deprecated
974from Linux 2.6.25 onward,
975and was
976.I removed
28b44abc
MK
977altogether in Linux 2.6.38.
978Since then, the kernel silently ignores it without error.
a5a061ee 979.\" glibc 2.8 removed this defn from bits/sched.h
c5af0674
MK
980Starting with Linux 4.6, the same bit was reused for the
981.BR CLONE_NEWCGROUP
982flag.
a69b6bda 983.TP
f5dbc7c8 984.BR CLONE_SYSVSEM " (since Linux 2.5.10)"
fea681da 985If
f5dbc7c8
MK
986.B CLONE_SYSVSEM
987is set, then the child and the calling process share
5ada4b94
MK
988a single list of System V semaphore adjustment
989.RI ( semadj )
990values (see
f5dbc7c8 991.BR semop (2)).
5ada4b94
MK
992In this case, the shared list accumulates
993.I semadj
994values across all processes sharing the list,
995and semaphore adjustments are performed only when the last process
996that is sharing the list terminates (or ceases sharing the list using
997.BR unshare (2)).
f5d401dd 998If this flag is not set, then the child has a separate
5ada4b94
MK
999.I semadj
1000list that is initially empty.
fea681da 1001.TP
d6bec36e
MK
1002.BR CLONE_THREAD " (since Linux 2.4.0)"
1003.\" Precisely: Linux 2.4.0-test8
fea681da
MK
1004If
1005.B CLONE_THREAD
1006is set, the child is placed in the same thread group as the calling process.
fd8a5be4
MK
1007To make the remainder of the discussion of
1008.B CLONE_THREAD
1009more readable, the term "thread" is used to refer to the
1010processes within a thread group.
efeece04 1011.IP
fd8a5be4
MK
1012Thread groups were a feature added in Linux 2.4 to support the
1013POSIX threads notion of a set of threads that share a single PID.
1014Internally, this shared PID is the so-called
1015thread group identifier (TGID) for the thread group.
c13182ef 1016Since Linux 2.4, calls to
fea681da 1017.BR getpid (2)
fd8a5be4 1018return the TGID of the caller.
efeece04 1019.IP
fd8a5be4
MK
1020The threads within a group can be distinguished by their (system-wide)
1021unique thread IDs (TID).
1022A new thread's TID is available as the function result
5261b0fe 1023returned to the caller,
fd8a5be4
MK
1024and a thread can obtain
1025its own TID using
1026.BR gettid (2).
efeece04 1027.IP
5261b0fe 1028When a clone call is made without specifying
fd8a5be4
MK
1029.BR CLONE_THREAD ,
1030then the resulting thread is placed in a new thread group
1031whose TGID is the same as the thread's TID.
1032This thread is the
1033.I leader
1034of the new thread group.
efeece04 1035.IP
fd8a5be4
MK
1036A new thread created with
1037.B CLONE_THREAD
5261b0fe 1038has the same parent process as the process that made the clone call
c13182ef 1039(i.e., like
fd8a5be4
MK
1040.BR CLONE_PARENT ),
1041so that calls to
1042.BR getppid (2)
1043return the same value for all of the threads in a thread group.
1044When a
c13182ef 1045.B CLONE_THREAD
5261b0fe 1046thread terminates, the thread that created it is not sent a
fd8a5be4
MK
1047.B SIGCHLD
1048(or other termination) signal;
1049nor can the status of such a thread be obtained
1050using
1051.BR wait (2).
1052(The thread is said to be
1053.IR detached .)
efeece04 1054.IP
e2fbf61d
MK
1055After all of the threads in a thread group terminate,
1056the parent process of the thread group is sent a
fd8a5be4
MK
1057.B SIGCHLD
1058(or other termination) signal.
efeece04 1059.IP
fd8a5be4
MK
1060If any of the threads in a thread group performs an
1061.BR execve (2),
1062then all threads other than the thread group leader are terminated,
1063and the new program is executed in the thread group leader.
efeece04 1064.IP
f7110f60
MK
1065If one of the threads in a thread group creates a child using
1066.BR fork (2),
1067then any thread in the group can
1068.BR wait (2)
1069for that child.
efeece04 1070.IP
16853a31 1071Since Linux 2.5.35, the
fd8a5be4 1072.I flags
16853a31 1073mask must also include
fd8a5be4
MK
1074.B CLONE_SIGHAND
1075if
1076.B CLONE_THREAD
6fd69f33 1077is specified
d6bec36e
MK
1078(and note that, since Linux 2.6.0,
1079.\" Precisely: Linux 2.6.0-test6
6fd69f33
MK
1080.BR CLONE_SIGHAND
1081also requires
1082.BR CLONE_VM
1083to be included).
efeece04 1084.IP
e2fbf61d
MK
1085Signal dispositions and actions are process-wide:
1086if an unhandled signal is delivered to a thread, then
1087it will affect (terminate, stop, continue, be ignored in)
1088all members of the thread group.
efeece04 1089.IP
99408a60 1090Each thread has its own signal mask, as set by
f957eebd
MK
1091.BR sigprocmask (2).
1092.IP
1093A signal may be process-directed or thread-directed.
1094A process-directed signal is targeted at a thread group (i.e., a TGID),
1095and is delivered to an arbitrarily selected thread from among those
1096that are not blocking the signal.
ed4f87f0 1097A signal may be process-directed because it was generated by the kernel
f957eebd
MK
1098for reasons other than a hardware exception, or because it was sent using
1099.BR kill (2)
1100or
1101.BR sigqueue (3).
1102A thread-directed signal is targeted at (i.e., delivered to)
1103a specific thread.
1104A signal may be thread-directed because it was sent using
1105.BR tgkill (2)
1106or
1107.BR pthread_sigqueue (3),
1108or because the thread executed a machine language instruction that triggered
1109a hardware exception
1110(e.g., invalid memory access triggering
1111.BR SIGSEGV
1112or a floating-point exception triggering
1113.BR SIGFPE ).
1114.IP
99408a60
MK
1115A call to
1116.BR sigpending (2)
f957eebd
MK
1117returns a signal set that is the union of the pending process-directed
1118signals and the signals that are pending for the calling thread.
efeece04 1119.IP
475c2753 1120If a process-directed signal is delivered to a thread group,
e2fbf61d
MK
1121and the thread group has installed a handler for the signal, then
1122the handler will be invoked in exactly one, arbitrarily selected
1123member of the thread group that has not blocked the signal.
c13182ef 1124If multiple threads in a group are waiting to accept the same signal using
e2fbf61d
MK
1125.BR sigwaitinfo (2),
1126the kernel will arbitrarily select one of these threads
475c2753 1127to receive the signal.
a69b6bda 1128.TP
f5dbc7c8 1129.BR CLONE_UNTRACED " (since Linux 2.5.46)"
a69b6bda 1130If
f5dbc7c8
MK
1131.B CLONE_UNTRACED
1132is specified, then a tracing process cannot force
1133.B CLONE_PTRACE
1134on this child process.
fea681da 1135.TP
1603d6a1 1136.BR CLONE_VFORK " (since Linux 2.2)"
f5dbc7c8
MK
1137If
1138.B CLONE_VFORK
1139is set, the execution of the calling process is suspended
1140until the child releases its virtual memory
1141resources via a call to
1142.BR execve (2)
1143or
1144.BR _exit (2)
1145(as with
1146.BR vfork (2)).
efeece04 1147.IP
f5dbc7c8
MK
1148If
1149.B CLONE_VFORK
4b4a853a 1150is not set, then both the calling process and the child are schedulable
f5dbc7c8
MK
1151after the call, and an application should not rely on execution occurring
1152in any particular order.
fea681da 1153.TP
1603d6a1 1154.BR CLONE_VM " (since Linux 2.0)"
f5dbc7c8
MK
1155If
1156.B CLONE_VM
1157is set, the calling process and the child process run in the same memory
1158space.
1159In particular, memory writes performed by the calling process
1160or by the child process are also visible in the other process.
1161Moreover, any memory mapping or unmapping performed with
1162.BR mmap (2)
1163or
1164.BR munmap (2)
1165by the child or calling process also affects the other process.
efeece04 1166.IP
f5dbc7c8
MK
1167If
1168.B CLONE_VM
1169is not set, the child process runs in a separate copy of the memory
5261b0fe 1170space of the calling process at the time of the clone call.
f5dbc7c8
MK
1171Memory writes or file mappings/unmappings performed by one of the
1172processes do not affect the other, as with
1173.BR fork (2).
47297adb 1174.SH RETURN VALUE
0bfa087b
MK
1175.\" gettid(2) returns current->pid;
1176.\" getpid(2) returns current->tgid;
fea681da 1177On success, the thread ID of the child process is returned
c13182ef 1178in the caller's thread of execution.
84811e86 1179On failure, \-1 is returned
fea681da
MK
1180in the caller's context, no child process will be created, and
1181.I errno
1182will be set appropriately.
fea681da
MK
1183.SH ERRORS
1184.TP
1185.B EAGAIN
e1b6e186
MK
1186Too many processes are already running; see
1187.BR fork (2).
fea681da 1188.TP
6ba79da9
MK
1189.BR EBUSY " (" clone3 "() only)"
1190.B CLONE_INTO_CGROUP
1191was specified in
1192.IR cl_args.flags ,
1193but the file descriptor specified in
1194.IR cl_args.cgroup
1195refers to a version 2 cgroup in which a domain controller is enabled.
1196.TP
bf031aaa 1197.BR EEXIST " (" clone3 "() only)"
ee8bb310 1198One (or more) of the PIDs specified in
bf031aaa
AR
1199.I set_tid
1200already exists in the corresponding PID namespace.
1201.TP
fea681da 1202.B EINVAL
27f14b44
MK
1203Both
1204.B CLONE_SIGHAND
1205and
1206.B CLONE_CLEAR_SIGHAND
1207were specified in the
1208.I flags
1209mask.
1210.TP
1211.B EINVAL
fea681da 1212.B CLONE_SIGHAND
16853a31
MK
1213was specified in the
1214.I flags
1215mask, but
fea681da 1216.B CLONE_VM
2e8a7fb3 1217was not.
d6bec36e
MK
1218(Since Linux 2.6.0.)
1219.\" Precisely: Linux 2.6.0-test6
fea681da
MK
1220.TP
1221.B EINVAL
1222.B CLONE_THREAD
16853a31
MK
1223was specified in the
1224.I flags
1225mask, but
fea681da 1226.B CLONE_SIGHAND
6387216b
MK
1227was not.
1228(Since Linux 2.5.35.)
29546c24
MK
1229.\" .TP
1230.\" .B EINVAL
1231.\" Precisely one of
1232.\" .B CLONE_DETACHED
1233.\" and
1234.\" .B CLONE_THREAD
6387216b
MK
1235.\" was specified.
1236.\" (Since Linux 2.6.0-test6.)
fea681da
MK
1237.TP
1238.B EINVAL
d6868c69 1239.B CLONE_THREAD
16853a31
MK
1240was specified in the
1241.I flags
1242mask, but the current process previously called
d6868c69
JH
1243.BR unshare (2)
1244with the
1245.B CLONE_NEWPID
1246flag or used
1247.BR setns (2)
1248to reassociate itself with a PID namespace.
1249.TP
1250.B EINVAL
d34e5645 1251.\" commit e66eded8309ebf679d3d3c1f5820d1f2ca332c71
fea681da
MK
1252Both
1253.B CLONE_FS
1254and
1255.B CLONE_NEWNS
16853a31
MK
1256were specified in the
1257.IR flags
1258mask.
fea681da 1259.TP
d34e5645
MK
1260.BR EINVAL " (since Linux 3.9)"
1261Both
1262.B CLONE_NEWUSER
1263and
1264.B CLONE_FS
16853a31
MK
1265were specified in the
1266.IR flags
1267mask.
d34e5645 1268.TP
fea681da 1269.B EINVAL
82ee147a 1270Both
667417b3
MK
1271.B CLONE_NEWIPC
1272and
1273.B CLONE_SYSVSEM
16853a31
MK
1274were specified in the
1275.IR flags
1276mask.
667417b3
MK
1277.TP
1278.B EINVAL
f0007192 1279One (or both) of
82ee147a 1280.BR CLONE_NEWPID
f0007192
MK
1281or
1282.BR CLONE_NEWUSER
1283and one (or both) of
82ee147a 1284.BR CLONE_THREAD
f0007192
MK
1285or
1286.BR CLONE_PARENT
16853a31
MK
1287were specified in the
1288.IR flags
1289mask.
82ee147a 1290.TP
be479fdf
MK
1291.BR EINVAL " (since Linux 2.6.32)"
1292.\" commit 123be07b0b399670a7cc3d82fef0cb4f93ef885c
1293.BR CLONE_PARENT
1294was specified, and the caller is an init process.
1295.TP
82ee147a 1296.B EINVAL
d4748fad 1297Returned by the glibc
edcc65ff 1298.BR clone ()
d4748fad
MK
1299wrapper function when
1300.IR fn
1301or
81c2368f 1302.IR stack
d4748fad 1303is specified as NULL.
fea681da 1304.TP
28cad2c1 1305.B EINVAL
667417b3 1306.BR CLONE_NEWIPC
16853a31
MK
1307was specified in the
1308.IR flags
1309mask,
667417b3
MK
1310but the kernel was not configured with the
1311.B CONFIG_SYSVIPC
1312and
1313.BR CONFIG_IPC_NS
1314options.
1315.TP
1316.B EINVAL
163bf178 1317.BR CLONE_NEWNET
16853a31
MK
1318was specified in the
1319.IR flags
1320mask,
163bf178
MK
1321but the kernel was not configured with the
1322.B CONFIG_NET_NS
1323option.
1324.TP
1325.B EINVAL
28cad2c1 1326.BR CLONE_NEWPID
16853a31
MK
1327was specified in the
1328.IR flags
1329mask,
28cad2c1
MK
1330but the kernel was not configured with the
1331.B CONFIG_PID_NS
1332option.
1333.TP
43ce9dda 1334.B EINVAL
231d0bbe 1335.BR CLONE_NEWUSER
16853a31
MK
1336was specified in the
1337.IR flags
1338mask,
231d0bbe
MK
1339but the kernel was not configured with the
1340.B CONFIG_USER_NS
1341option.
1342.TP
1343.B EINVAL
43ce9dda 1344.BR CLONE_NEWUTS
16853a31
MK
1345was specified in the
1346.IR flags
1347mask,
43ce9dda 1348but the kernel was not configured with the
832fe8ea 1349.B CONFIG_UTS_NS
43ce9dda
MK
1350option.
1351.TP
c550a897 1352.B EINVAL
81c2368f 1353.I stack
c550a897
MK
1354is not aligned to a suitable boundary for this architecture.
1355For example, on aarch64,
81c2368f 1356.I stack
c550a897
MK
1357must be a multiple of 16.
1358.TP
bc03b116 1359.BR EINVAL " (" clone3 "() only)"
baa435c6
MK
1360.B CLONE_DETACHED
1361was specified in the
1362.I flags
1363mask.
1364.TP
bc03b116 1365.BR EINVAL " (" clone "() only)"
9f938981
CB
1366.B CLONE_PIDFD
1367was specified together with
16853a31
MK
1368.B CLONE_DETACHED
1369in the
1370.I flags
1371mask.
9f938981
CB
1372.TP
1373.B EINVAL
1374.B CLONE_PIDFD
1375was specified together with
16853a31
MK
1376.B CLONE_THREAD
1377in the
1378.I flags
1379mask.
9f938981 1380.TP
faa0e55a 1381.BR EINVAL " (" clone "() only)"
9f938981
CB
1382.B CLONE_PIDFD
1383was specified together with
16853a31
MK
1384.B CLONE_PARENT_SETTID
1385in the
1386.I flags
1387mask.
9f938981 1388.TP
bf031aaa
AR
1389.BR EINVAL " (" clone3 "() only)"
1390.I set_tid_size
ee8bb310 1391is greater than the number of nested PID namespaces.
bf031aaa
AR
1392.TP
1393.BR EINVAL " (" clone3 "() only)"
2a2b2a5d 1394One of the PIDs specified in
bf031aaa 1395.I set_tid
2a2b2a5d 1396was invalid.
bf031aaa 1397.TP
ba9ae75d
MK
1398.BR EINVAL " (AArch64 only, Linux 4.6 and earlier)"
1399.I stack
1400was not aligned to a 128-bit boundary.
1401.TP
fea681da
MK
1402.B ENOMEM
1403Cannot allocate sufficient memory to allocate a task structure for the
1404child, or to copy those parts of the caller's context that need to be
1405copied.
1406.TP
b20e22ae
MK
1407.BR ENOSPC " (since Linux 3.7)"
1408.\" commit f2302505775fd13ba93f034206f1e2a587017929
1409.B CLONE_NEWPID
16853a31
MK
1410was specified in the
1411.I flags
1412mask,
b20e22ae
MK
1413but the limit on the nesting depth of PID namespaces
1414would have been exceeded; see
1415.BR pid_namespaces (7).
1416.TP
b5742ecc
MK
1417.BR ENOSPC " (since Linux 4.9; beforehand " EUSERS )
1418.B CLONE_NEWUSER
16853a31
MK
1419was specified in the
1420.IR flags
1421mask, and the call would cause the limit on the number of
b5742ecc
MK
1422nested user namespaces to be exceeded.
1423See
1424.BR user_namespaces (7).
efeece04 1425.IP
b5742ecc
MK
1426From Linux 3.11 to Linux 4.8, the error diagnosed in this case was
1427.BR EUSERS .
1428.TP
2f7a331e 1429.BR ENOSPC " (since Linux 4.9)"
16853a31 1430One of the values in the
2f7a331e 1431.I flags
16853a31 1432mask specified the creation of a new user namespace,
2f7a331e
MK
1433but doing so would have caused the limit defined by the corresponding file in
1434.IR /proc/sys/user
1435to be exceeded.
1436For further details, see
1437.BR namespaces (7).
1438.TP
6ba79da9
MK
1439.BR EOPNOTSUPP " (" clone3 "() only)"
1440.B CLONE_INTO_CGROUP
1441was specified in
1442.IR cl_args.flags ,
1443but the file descriptor specified in
1444.IR cl_args.cgroup
1445refers to a version 2 cgroup that is in the
1446.IR "domain invalid"
1447state.
1448.TP
fea681da 1449.B EPERM
aa825b59 1450.BR CLONE_NEWCGROUP ,
667417b3 1451.BR CLONE_NEWIPC ,
163bf178 1452.BR CLONE_NEWNET ,
43ce9dda
MK
1453.BR CLONE_NEWNS ,
1454.BR CLONE_NEWPID ,
82ee147a 1455or
43ce9dda 1456.BR CLONE_NEWUTS
00b08db3 1457was specified by an unprivileged process (process without \fBCAP_SYS_ADMIN\fP).
fea681da
MK
1458.TP
1459.B EPERM
1460.B CLONE_PID
1461was specified by a process other than process 0.
1c173eb3 1462(This error occurs only on Linux 2.5.15 and earlier.)
365d292a
MK
1463.TP
1464.B EPERM
1465.BR CLONE_NEWUSER
16853a31
MK
1466was specified in the
1467.IR flags
1468mask,
365d292a
MK
1469but either the effective user ID or the effective group ID of the caller
1470does not have a mapping in the parent namespace (see
f58fb24f 1471.BR user_namespaces (7)).
6fd119e7 1472.TP
ac007938
MK
1473.BR EPERM " (since Linux 3.9)"
1474.\" commit 3151527ee007b73a0ebd296010f1c0454a919c7d
11a38815 1475.B CLONE_NEWUSER
16853a31 1476was specified in the
ac007938 1477.I flags
16853a31 1478mask and the caller is in a chroot environment
ac007938
MK
1479.\" FIXME What is the rationale for this restriction?
1480(i.e., the caller's root directory does not match the root directory
1481of the mount namespace in which it resides).
1482.TP
bf031aaa
AR
1483.BR EPERM " (" clone3 "() only)"
1484.I set_tid_size
1485was greater than zero, and the caller lacks the
1486.B CAP_SYS_ADMIN
1487capability in one or more of the user namespaces that own the
1488corresponding PID namespaces.
1489.TP
6717ee86
MK
1490.BR ERESTARTNOINTR " (since Linux 2.6.17)"
1491.\" commit 4a2c7a7837da1b91468e50426066d988050e4d56
1492System call was interrupted by a signal and will be restarted.
1493(This can be seen only during a trace.)
1494.TP
b5742ecc 1495.BR EUSERS " (Linux 3.11 to Linux 4.8)"
6fd119e7 1496.B CLONE_NEWUSER
16853a31
MK
1497was specified in the
1498.IR flags
1499mask,
b5742ecc
MK
1500and the limit on the number of nested user namespaces would be exceeded.
1501See the discussion of the
1502.BR ENOSPC
1503error above.
faa0e55a
MK
1504.SH VERSIONS
1505The
1506.BR clone3 ()
1507system call first appeared in Linux 5.3.
92b72224
MK
1508.\" There is no entry for
1509.\" .BR clone ()
1510.\" in libc5.
1511.\" glibc2 provides
1512.\" .BR clone ()
1513.\" as described in this manual page.
47297adb 1514.SH CONFORMING TO
faa0e55a
MK
1515These system calls
1516are Linux-specific and should not be used in programs
a1d5f77c 1517intended to be portable.
fea681da 1518.SH NOTES
673d16da
MK
1519.PP
1520One use of these system calls
1521is to implement threads: multiple flows of control in a program that
1522run concurrently in a shared address space.
1523.PP
1524Glibc does not provide a wrapper for
1525.BR clone3 ();
1526call it using
1527.BR syscall (2).
1528.PP
1529Note that the glibc
1530.BR clone ()
1531wrapper function makes some changes
1532in the memory pointed to by
1533.I stack
1534(changes required to set the stack up correctly for the child)
1535.I before
1536invoking the
1537.BR clone ()
1538system call.
1539So, in cases where
1540.BR clone ()
1541is used to recursively create children,
1542do not use the buffer employed for the parent's stack
1543as the stack of the child.
1544.PP
79bdcc4a
MK
1545The
1546.BR kcmp (2)
1547system call can be used to test whether two processes share various
49dba87f 1548resources such as a file descriptor table,
79bdcc4a 1549System V semaphore undo operations, or a virtual address space.
efeece04
MK
1550.PP
1551.PP
c471c363
MK
1552Handlers registered using
1553.BR pthread_atfork (3)
5261b0fe 1554are not executed during a clone call.
efeece04 1555.PP
ca8b1e32 1556In the Linux 2.4.x series,
fd8a5be4
MK
1557.B CLONE_THREAD
1558generally does not make the parent of the new thread the same
1559as the parent of the calling process.
1560However, for kernel versions 2.4.7 to 2.4.18 the
1561.B CLONE_THREAD
1562flag implied the
c13182ef 1563.B CLONE_PARENT
ca8b1e32 1564flag (as in Linux 2.6.0 and later).
efeece04 1565.PP
34ccb744 1566On i386,
a5a997ca
MK
1567.BR clone ()
1568should not be called through vsyscall, but directly through
1569.IR "int $0x80" .
673d16da
MK
1570.\"
1571.SS C library/kernel differences
1572The raw
1573.BR clone ()
1574system call corresponds more closely to
1575.BR fork (2)
1576in that execution in the child continues from the point of the
1577call.
1578As such, the
1579.I fn
1580and
1581.I arg
1582arguments of the
1583.BR clone ()
1584wrapper function are omitted.
1585.PP
1586In contrast to the glibc wrapper, the raw
1587.BR clone ()
1588system call accepts NULL as a
1589.I stack
1590argument (and
1591.BR clone3 ()
1592likewise allows
1593.I cl_args.stack
1594to be NULL).
1595In this case, the child uses a duplicate of the parent's stack.
1596(Copy-on-write semantics ensure that the child gets separate copies
1597of stack pages when either process modifies the stack.)
1598In this case, for correct operation, the
1599.B CLONE_VM
1600option should not be specified.
1601(If the child
1602.I shares
1603the parent's memory because of the use of the
1604.BR CLONE_VM
1605flag,
1606then no copy-on-write duplication occurs and chaos is likely to result.)
1607.PP
1608The order of the arguments also differs in the raw system call,
1609and there are variations in the arguments across architectures,
1610as detailed in the following paragraphs.
1611.PP
1612The raw system call interface on x86-64 and some other architectures
1613(including sh, tile, and alpha) is:
1614.PP
1615.in +4
1616.EX
1617.BI "long clone(unsigned long " flags ", void *" stack ,
1618.BI " int *" parent_tid ", int *" child_tid ,
1619.BI " unsigned long " tls );
1620.EE
1621.in
1622.PP
1623On x86-32, and several other common architectures
1624(including score, ARM, ARM 64, PA-RISC, arc, Power PC, xtensa,
1625and MIPS),
1626.\" CONFIG_CLONE_BACKWARDS
1627the order of the last two arguments is reversed:
1628.PP
1629.in +4
1630.EX
1631.BI "long clone(unsigned long " flags ", void *" stack ,
1632.BI " int *" parent_tid ", unsigned long " tls ,
1633.BI " int *" child_tid );
1634.EE
1635.in
1636.PP
1637On the cris and s390 architectures,
1638.\" CONFIG_CLONE_BACKWARDS2
1639the order of the first two arguments is reversed:
1640.PP
1641.in +4
1642.EX
1643.BI "long clone(void *" stack ", unsigned long " flags ,
1644.BI " int *" parent_tid ", int *" child_tid ,
1645.BI " unsigned long " tls );
1646.EE
1647.in
1648.PP
1649On the microblaze architecture,
1650.\" CONFIG_CLONE_BACKWARDS3
1651an additional argument is supplied:
1652.PP
1653.in +4
1654.EX
1655.BI "long clone(unsigned long " flags ", void *" stack ,
1656.BI " int " stack_size , "\fR /* Size of stack */"
1657.BI " int *" parent_tid ", int *" child_tid ,
1658.BI " unsigned long " tls );
1659.EE
1660.in
1661.\"
1662.SS blackfin, m68k, and sparc
1663.\" Mike Frysinger noted in a 2013 mail:
1664.\" these arches don't define __ARCH_WANT_SYS_CLONE:
1665.\" blackfin ia64 m68k sparc
1666The argument-passing conventions on
1667blackfin, m68k, and sparc are different from the descriptions above.
1668For details, see the kernel (and glibc) source.
1669.SS ia64
1670On ia64, a different interface is used:
1671.PP
1672.in +4
1673.EX
1674.BI "int __clone2(int (*" "fn" ")(void *), "
1675.BI " void *" stack_base ", size_t " stack_size ,
1676.BI " int " flags ", void *" "arg" ", ... "
1677.BI " /* pid_t *" parent_tid ", struct user_desc *" tls ,
1678.BI " pid_t *" child_tid " */ );"
1679.EE
1680.in
1681.PP
1682The prototype shown above is for the glibc wrapper function;
1683for the system call itself,
1684the prototype can be described as follows (it is identical to the
1685.BR clone ()
1686prototype on microblaze):
1687.PP
1688.in +4
1689.EX
1690.BI "long clone2(unsigned long " flags ", void *" stack_base ,
1691.BI " int " stack_size , "\fR /* Size of stack */"
1692.BI " int *" parent_tid ", int *" child_tid ,
1693.BI " unsigned long " tls );
1694.EE
1695.in
1696.PP
1697.BR __clone2 ()
1698operates in the same way as
1699.BR clone (),
1700except that
1701.I stack_base
1702points to the lowest address of the child's stack area,
1703and
1704.I stack_size
1705specifies the size of the stack pointed to by
1706.IR stack_base .
1707.SS Linux 2.4 and earlier
1708In Linux 2.4 and earlier,
1709.BR clone ()
1710does not take arguments
1711.IR parent_tid ,
1712.IR tls ,
1713and
1714.IR child_tid .
31830ef0 1715.SH BUGS
abcf3b1d
MK
1716GNU C library versions 2.3.4 up to and including 2.24
1717contained a wrapper function for
0bfa087b 1718.BR getpid (2)
abcf3b1d
MK
1719that performed caching of PIDs.
1720This caching relied on support in the glibc wrapper for
c60237c9 1721.BR clone (),
abcf3b1d
MK
1722but limitations in the implementation
1723meant that the cache was not up to date in some circumstances.
c60237c9 1724In particular,
abcf3b1d 1725if a signal was delivered to the child immediately after the
c60237c9
MK
1726.BR clone ()
1727call, then a call to
0b80cf56 1728.BR getpid (2)
abcf3b1d 1729in a handler for the signal could return the PID
c60237c9 1730of the calling process ("the parent"),
abcf3b1d 1731if the clone wrapper had not yet had a chance to update the PID
c60237c9
MK
1732cache in the child.
1733(This discussion ignores the case where the child was created using
9291ce36 1734.BR CLONE_THREAD ,
c60237c9 1735when
0b80cf56 1736.BR getpid (2)
c60237c9
MK
1737.I should
1738return the same value in the child and in the process that called
1739.BR clone (),
a1d48abb 1740since the caller and the child are in the same thread group.
e7d807b7 1741The stale-cache problem also does not occur if the
a1d48abb
JR
1742.I flags
1743argument includes
1744.BR CLONE_VM .)
abcf3b1d
MK
1745To get the truth, it was sometimes necessary to use code such as the following:
1746.PP
47f743f1
MK
1747.in +4n
1748.EX
1749#include <syscall.h>
31830ef0 1750
47f743f1 1751pid_t mypid;
31830ef0 1752
47f743f1
MK
1753mypid = syscall(SYS_getpid);
1754.EE
1755.in
c60237c9
MK
1756.\" See also the following bug reports
1757.\" https://bugzilla.redhat.com/show_bug.cgi?id=417521
1758.\" http://sourceware.org/bugzilla/show_bug.cgi?id=6910
abcf3b1d
MK
1759.PP
1760Because of the stale-cache problem, as well as other problems noted in
1761.BR getpid (2),
1762the PID caching feature was removed in glibc 2.25.
8c7b566c 1763.SH EXAMPLE
8c7b566c 1764The following program demonstrates the use of
9c13072a 1765.BR clone ()
8c7b566c
MK
1766to create a child process that executes in a separate UTS namespace.
1767The child changes the hostname in its UTS namespace.
1768Both parent and child then display the system hostname,
1769making it possible to see that the hostname
1770differs in the UTS namespaces of the parent and child.
1771For an example of the use of this program, see
1772.BR setns (2).
99c3a000
MK
1773.PP
1774Within the sample program, we allocate the memory that is to
1775be used for the child's stack using
1776.BR mmap (2)
1777rather than
1778.BR malloc (3)
1779for the following reasons:
1780.IP * 3
1781.BR mmap (2)
1782allocates a block of memory that starts on a page
1783boundary and is a multiple of the page size.
1784This is useful if we want to establish a guard page (a page with protection
1785.BR PROT_NONE )
1786at the end of the stack using
1787.BR mprotect (2).
1788.IP *
1789We can specify the
1790.BR MAP_STACK
1791flag to request a mapping that is suitable for a stack.
1792For the moment, this flag is a no-op on Linux,
1793but it exists and has effect on some other systems,
1794so we should include it for portability.
f30b7415 1795.SS Program source
e7d0bb47 1796.EX
8c7b566c
MK
1797#define _GNU_SOURCE
1798#include <sys/wait.h>
1799#include <sys/utsname.h>
1800#include <sched.h>
1801#include <string.h>
1802#include <stdio.h>
1803#include <stdlib.h>
1804#include <unistd.h>
99c3a000 1805#include <sys/mman.h>
8c7b566c 1806
d1a71985 1807#define errExit(msg) do { perror(msg); exit(EXIT_FAILURE); \e
8c7b566c
MK
1808 } while (0)
1809
1810static int /* Start function for cloned child */
1811childFunc(void *arg)
1812{
1813 struct utsname uts;
1814
1815 /* Change hostname in UTS namespace of child */
1816
1817 if (sethostname(arg, strlen(arg)) == \-1)
1818 errExit("sethostname");
1819
07d4e6ea 1820 /* Retrieve and display hostname */
8c7b566c
MK
1821
1822 if (uname(&uts) == \-1)
1823 errExit("uname");
d1a71985 1824 printf("uts.nodename in child: %s\en", uts.nodename);
8c7b566c
MK
1825
1826 /* Keep the namespace open for a while, by sleeping.
1827 This allows some experimentation\-\-for example, another
1828 process might join the namespace. */
9f1b9726 1829
8c7b566c
MK
1830 sleep(200);
1831
1832 return 0; /* Child terminates now */
1833}
1834
1835#define STACK_SIZE (1024 * 1024) /* Stack size for cloned child */
1836
1837int
1838main(int argc, char *argv[])
1839{
1840 char *stack; /* Start of stack buffer */
1841 char *stackTop; /* End of stack buffer */
1842 pid_t pid;
1843 struct utsname uts;
1844
1845 if (argc < 2) {
d1a71985 1846 fprintf(stderr, "Usage: %s <child\-hostname>\en", argv[0]);
8c7b566c
MK
1847 exit(EXIT_SUCCESS);
1848 }
1849
99c3a000 1850 /* Allocate memory to be used for the stack of the child */
8c7b566c 1851
99c3a000
MK
1852 stack = mmap(NULL, STACK_SIZE, PROT_READ | PROT_WRITE,
1853 MAP_PRIVATE | MAP_ANONYMOUS | MAP_STACK, \-1, 0);
8eea66b8 1854 if (stack == MAP_FAILED)
99c3a000
MK
1855 errExit("mmap");
1856
8c7b566c
MK
1857 stackTop = stack + STACK_SIZE; /* Assume stack grows downward */
1858
1859 /* Create child that has its own UTS namespace;
1860 child commences execution in childFunc() */
1861
1862 pid = clone(childFunc, stackTop, CLONE_NEWUTS | SIGCHLD, argv[1]);
1863 if (pid == \-1)
1864 errExit("clone");
d1a71985 1865 printf("clone() returned %ld\en", (long) pid);
8c7b566c
MK
1866
1867 /* Parent falls through to here */
1868
1869 sleep(1); /* Give child time to change its hostname */
1870
9f1b9726 1871 /* Display hostname in parent\(aqs UTS namespace. This will be
8c7b566c
MK
1872 different from hostname in child\(aqs UTS namespace. */
1873
1874 if (uname(&uts) == \-1)
1875 errExit("uname");
d1a71985 1876 printf("uts.nodename in parent: %s\en", uts.nodename);
8c7b566c
MK
1877
1878 if (waitpid(pid, NULL, 0) == \-1) /* Wait for child */
1879 errExit("waitpid");
d1a71985 1880 printf("child has terminated\en");
8c7b566c
MK
1881
1882 exit(EXIT_SUCCESS);
1883}
e7d0bb47 1884.EE
47297adb 1885.SH SEE ALSO
fea681da 1886.BR fork (2),
2b44301c 1887.BR futex (2),
fea681da
MK
1888.BR getpid (2),
1889.BR gettid (2),
6f8746e4 1890.BR kcmp (2),
99c3a000 1891.BR mmap (2),
d8837668 1892.BR pidfd_open (2),
f2d0bbf1 1893.BR set_thread_area (2),
2b44301c 1894.BR set_tid_address (2),
8403481f 1895.BR setns (2),
f2d0bbf1 1896.BR tkill (2),
5cc01e9c 1897.BR unshare (2),
fea681da 1898.BR wait (2),
3616b7c0 1899.BR capabilities (7),
41096af1 1900.BR namespaces (7),
3616b7c0 1901.BR pthreads (7)