]> git.ipfire.org Git - thirdparty/man-pages.git/blame - man2/clone.2
setns.2: A process can't join a new userns if it shares CLONE_FS attributes
[thirdparty/man-pages.git] / man2 / clone.2
CommitLineData
fea681da 1.\" Copyright (c) 1992 Drew Eckhardt <drew@cs.colorado.edu>, March 28, 1992
8c7b566c 2.\" and Copyright (c) Michael Kerrisk, 2001, 2002, 2005, 2013
2297bf0e 3.\"
fd0fc519 4.\" %%%LICENSE_START(GPL_NOVERSION_ONELINE)
fea681da 5.\" May be distributed under the GNU General Public License.
fd0fc519 6.\" %%%LICENSE_END
dccaff1e 7.\"
fea681da
MK
8.\" Modified by Michael Haardt <michael@moria.de>
9.\" Modified 24 Jul 1993 by Rik Faith <faith@cs.unc.edu>
10.\" Modified 21 Aug 1994 by Michael Chastain <mec@shell.portal.com>:
11.\" New man page (copied from 'fork.2').
12.\" Modified 10 June 1995 by Andries Brouwer <aeb@cwi.nl>
13.\" Modified 25 April 1998 by Xavier Leroy <Xavier.Leroy@inria.fr>
14.\" Modified 26 Jun 2001 by Michael Kerrisk
15.\" Mostly upgraded to 2.4.x
16.\" Added prototype for sys_clone() plus description
17.\" Added CLONE_THREAD with a brief description of thread groups
c13182ef 18.\" Added CLONE_PARENT and revised entire page to remove ambiguity
fea681da
MK
19.\" between "calling process" and "parent process"
20.\" Added CLONE_PTRACE and CLONE_VFORK
21.\" Added EPERM and EINVAL error codes
fd8a5be4 22.\" Renamed "__clone" to "clone" (which is the prototype in <sched.h>)
fea681da 23.\" various other minor tidy ups and clarifications.
c11b1abf 24.\" Modified 26 Jun 2001 by Michael Kerrisk <mtk.manpages@gmail.com>
d9bfdb9c 25.\" Updated notes for 2.4.7+ behavior of CLONE_THREAD
c11b1abf 26.\" Modified 15 Oct 2002 by Michael Kerrisk <mtk.manpages@gmail.com>
fea681da
MK
27.\" Added description for CLONE_NEWNS, which was added in 2.4.19
28.\" Slightly rephrased, aeb.
29.\" Modified 1 Feb 2003 - added CLONE_SIGHAND restriction, aeb.
30.\" Modified 1 Jan 2004 - various updates, aeb
0967c11f 31.\" Modified 2004-09-10 - added CLONE_PARENT_SETTID etc. - aeb.
d9bfdb9c 32.\" 2005-04-12, mtk, noted the PID caching behavior of NPTL's getpid()
31830ef0 33.\" wrapper under BUGS.
fd8a5be4
MK
34.\" 2005-05-10, mtk, added CLONE_SYSVSEM, CLONE_UNTRACED, CLONE_STOPPED.
35.\" 2005-05-17, mtk, Substantially enhanced discussion of CLONE_THREAD.
4e836144 36.\" 2008-11-18, mtk, order CLONE_* flags alphabetically
82ee147a 37.\" 2008-11-18, mtk, document CLONE_NEWPID
43ce9dda 38.\" 2008-11-19, mtk, document CLONE_NEWUTS
667417b3 39.\" 2008-11-19, mtk, document CLONE_NEWIPC
cfdc761b 40.\" 2008-11-19, Jens Axboe, mtk, document CLONE_IO
fea681da 41.\"
8980a500 42.TH CLONE 2 2014-08-19 "Linux" "Linux Programmer's Manual"
fea681da 43.SH NAME
9b0e0996 44clone, __clone2 \- create a child process
fea681da 45.SH SYNOPSIS
c10859eb 46.nf
81f10dad
MK
47/* Prototype for the glibc wrapper function */
48
fea681da 49.B #include <sched.h>
c10859eb 50
ff929e3b
MK
51.BI "int clone(int (*" "fn" ")(void *), void *" child_stack ,
52.BI " int " flags ", void *" "arg" ", ... "
d3dbc9b1 53.BI " /* pid_t *" ptid ", struct user_desc *" tls \
ff929e3b 54", pid_t *" ctid " */ );"
81f10dad 55
e585064b 56/* Prototype for the raw system call */
81f10dad
MK
57
58.BI "long clone(unsigned long " flags ", void *" child_stack ,
59.BI " void *" ptid ", void *" ctid ,
60.BI " struct pt_regs *" regs );
c10859eb 61.fi
e73b3103
MK
62.sp
63.in -4n
81f10dad 64Feature Test Macro Requirements for glibc wrapper function (see
e73b3103
MK
65.BR feature_test_macros (7)):
66.in
67.sp
68.BR clone ():
69.ad l
70.RS 4
71.PD 0
72.TP 4
73Since glibc 2.14:
74_GNU_SOURCE
75.TP 4
bd297db0 76.\" See http://sources.redhat.com/bugzilla/show_bug.cgi?id=4749
e73b3103
MK
77Before glibc 2.14:
78_BSD_SOURCE || _SVID_SOURCE
79 /* _GNU_SOURCE also suffices */
80.PD
81.RE
82.ad b
fea681da 83.SH DESCRIPTION
edcc65ff
MK
84.BR clone ()
85creates a new process, in a manner similar to
fea681da 86.BR fork (2).
81f10dad
MK
87
88This page describes both the glibc
e511ffb6 89.BR clone ()
e585064b 90wrapper function and the underlying system call on which it is based.
81f10dad 91The main text describes the wrapper function;
e585064b 92the differences for the raw system call
81f10dad 93are described toward the end of this page.
fea681da
MK
94
95Unlike
96.BR fork (2),
81f10dad
MK
97.BR clone ()
98allows the child process to share parts of its execution context with
fea681da 99the calling process, such as the memory space, the table of file
c13182ef
MK
100descriptors, and the table of signal handlers.
101(Note that on this manual
102page, "calling process" normally corresponds to "parent process".
103But see the description of
104.B CLONE_PARENT
fea681da
MK
105below.)
106
107The main use of
edcc65ff 108.BR clone ()
fea681da
MK
109is to implement threads: multiple threads of control in a program that
110run concurrently in a shared memory space.
111
112When the child process is created with
c13182ef 113.BR clone (),
fea681da 114it executes the function
c13182ef 115.IR fn ( arg ).
fea681da 116(This differs from
c13182ef 117.BR fork (2),
fea681da 118where execution continues in the child from the point
c13182ef
MK
119of the
120.BR fork (2)
fea681da
MK
121call.)
122The
123.I fn
124argument is a pointer to a function that is called by the child
125process at the beginning of its execution.
126The
127.I arg
128argument is passed to the
129.I fn
130function.
131
c13182ef 132When the
fea681da 133.IR fn ( arg )
c13182ef
MK
134function application returns, the child process terminates.
135The integer returned by
fea681da 136.I fn
c13182ef
MK
137is the exit code for the child process.
138The child process may also terminate explicitly by calling
fea681da
MK
139.BR exit (2)
140or after receiving a fatal signal.
141
142The
143.I child_stack
c13182ef
MK
144argument specifies the location of the stack used by the child process.
145Since the child and calling process may share memory,
fea681da 146it is not possible for the child process to execute in the
c13182ef
MK
147same stack as the calling process.
148The calling process must therefore
fea681da
MK
149set up memory space for the child stack and pass a pointer to this
150space to
edcc65ff 151.BR clone ().
5fab2e7c 152Stacks grow downward on all processors that run Linux
fea681da
MK
153(except the HP PA processors), so
154.I child_stack
155usually points to the topmost address of the memory space set up for
156the child stack.
157
158The low byte of
159.I flags
fd8a5be4
MK
160contains the number of the
161.I "termination signal"
162sent to the parent when the child dies.
163If this signal is specified as anything other than
fea681da
MK
164.BR SIGCHLD ,
165then the parent process must specify the
c13182ef
MK
166.B __WALL
167or
fea681da 168.B __WCLONE
c13182ef
MK
169options when waiting for the child with
170.BR wait (2).
fea681da
MK
171If no signal is specified, then the parent process is not signaled
172when the child terminates.
173
174.I flags
fd8a5be4
MK
175may also be bitwise-or'ed with zero or more of the following constants,
176in order to specify what is shared between the calling process
fea681da 177and the child process:
fea681da 178.TP
f5dbc7c8
MK
179.BR CLONE_CHILD_CLEARTID " (since Linux 2.5.49)"
180Erase child thread ID at location
d3dbc9b1 181.I ctid
f5dbc7c8
MK
182in child memory when the child exits, and do a wakeup on the futex
183at that address.
184The address involved may be changed by the
185.BR set_tid_address (2)
186system call.
187This is used by threading libraries.
188.TP
189.BR CLONE_CHILD_SETTID " (since Linux 2.5.49)"
190Store child thread ID at location
d3dbc9b1 191.I ctid
f5dbc7c8
MK
192in child memory.
193.TP
1603d6a1 194.BR CLONE_FILES " (since Linux 2.0)"
fea681da 195If
f5dbc7c8
MK
196.B CLONE_FILES
197is set, the calling process and the child process share the same file
198descriptor table.
199Any file descriptor created by the calling process or by the child
200process is also valid in the other process.
201Similarly, if one of the processes closes a file descriptor,
202or changes its associated flags (using the
203.BR fcntl (2)
204.B F_SETFD
205operation), the other process is also affected.
fea681da
MK
206
207If
f5dbc7c8
MK
208.B CLONE_FILES
209is not set, the child process inherits a copy of all file descriptors
210opened in the calling process at the time of
211.BR clone ().
212(The duplicated file descriptors in the child refer to the
213same open file descriptions (see
214.BR open (2))
215as the corresponding file descriptors in the calling process.)
216Subsequent operations that open or close file descriptors,
217or change file descriptor flags,
218performed by either the calling
219process or the child process do not affect the other process.
fea681da 220.TP
1603d6a1 221.BR CLONE_FS " (since Linux 2.0)"
fea681da
MK
222If
223.B CLONE_FS
9ee4a2b6 224is set, the caller and the child process share the same filesystem
c13182ef 225information.
9ee4a2b6 226This includes the root of the filesystem, the current
c13182ef
MK
227working directory, and the umask.
228Any call to
fea681da
MK
229.BR chroot (2),
230.BR chdir (2),
231or
232.BR umask (2)
edcc65ff 233performed by the calling process or the child process also affects the
fea681da
MK
234other process.
235
c13182ef 236If
fea681da 237.B CLONE_FS
9ee4a2b6 238is not set, the child process works on a copy of the filesystem
fea681da 239information of the calling process at the time of the
edcc65ff 240.BR clone ()
fea681da
MK
241call.
242Calls to
243.BR chroot (2),
244.BR chdir (2),
245or \fBumask\fP(2)
246performed later by one of the processes do not affect the other process.
fea681da 247.TP
a4cc375e 248.BR CLONE_IO " (since Linux 2.6.25)"
11f27a1c
JA
249If
250.B CLONE_IO
251is set, then the new process shares an I/O context with
252the calling process.
253If this flag is not set, then (as with
254.BR fork (2))
255the new process has its own I/O context.
256
257.\" The following based on text from Jens Axboe
a113945f 258The I/O context is the I/O scope of the disk scheduler (i.e.,
11f27a1c
JA
259what the I/O scheduler uses to model scheduling of a process's I/O).
260If processes share the same I/O context,
261they are treated as one by the I/O scheduler.
262As a consequence, they get to share disk time.
263For some I/O schedulers,
264.\" the anticipatory and CFQ scheduler
265if two processes share an I/O context,
266they will be allowed to interleave their disk access.
267If several threads are doing I/O on behalf of the same process
268.RB ( aio_read (3),
269for instance), they should employ
270.BR CLONE_IO
271to get better I/O performance.
272.\" with CFQ and AS.
273
274If the kernel is not configured with the
275.B CONFIG_BLOCK
276option, this flag is a no-op.
277.TP
8722311b 278.BR CLONE_NEWIPC " (since Linux 2.6.19)"
667417b3
MK
279If
280.B CLONE_NEWIPC
281is set, then create the process in a new IPC namespace.
282If this flag is not set, then (as with
06b30458 283.BR fork (2)),
667417b3
MK
284the process is created in the same IPC namespace as
285the calling process.
0236bea9 286This flag is intended for the implementation of containers.
667417b3 287
efbfd7ec 288An IPC namespace provides an isolated view of System\ V IPC objects (see
009a049e
MK
289.BR svipc (7))
290and (since Linux 2.6.30)
291.\" commit 7eafd7c74c3f2e67c27621b987b28397110d643f
292.\" https://lwn.net/Articles/312232/
293POSIX message queues
294(see
295.BR mq_overview (7)).
19911fa5
MK
296The common characteristic of these IPC mechanisms is that IPC
297objects are identified by mechanisms other than filesystem
298pathnames.
009a049e 299
c440fe01 300Objects created in an IPC namespace are visible to all other processes
667417b3
MK
301that are members of that namespace,
302but are not visible to processes in other IPC namespaces.
303
83c1f4b5 304When an IPC namespace is destroyed
009a049e 305(i.e., when the last process that is a member of the namespace terminates),
83c1f4b5
MK
306all IPC objects in the namespace are automatically destroyed.
307
ab5dd83f
MK
308Only a privileged process
309.RB ( CAP_SYS_ADMIN )
310can employ
311.BR CLONE_NEWIPC .
667417b3
MK
312This flag can't be specified in conjunction with
313.BR CLONE_SYSVSEM .
9343f8e7
MK
314
315For further information on IPC namespaces, see
316.BR namespaces (7).
667417b3 317.TP
163bf178 318.BR CLONE_NEWNET " (since Linux 2.6.24)"
33a0ccb2 319(The implementation of this flag was completed only
9108d867 320by about kernel version 2.6.29.)
163bf178
MK
321
322If
323.B CLONE_NEWNET
324is set, then create the process in a new network namespace.
325If this flag is not set, then (as with
57ef8c39 326.BR fork (2))
163bf178
MK
327the process is created in the same network namespace as
328the calling process.
329This flag is intended for the implementation of containers.
330
331A network namespace provides an isolated view of the networking stack
332(network device interfaces, IPv4 and IPv6 protocol stacks,
333IP routing tables, firewall rules, the
334.I /proc/net
335and
336.I /sys/class/net
337directory trees, sockets, etc.).
338A physical network device can live in exactly one
339network namespace.
340A virtual network device ("veth") pair provides a pipe-like abstraction
bea08fec 341.\" FIXME . Add pointer to veth(4) page when it is eventually completed
163bf178
MK
342that can be used to create tunnels between network namespaces,
343and can be used to create a bridge to a physical network device
344in another namespace.
345
bf032425
SH
346When a network namespace is freed
347(i.e., when the last process in the namespace terminates),
348its physical network devices are moved back to the
349initial network namespace (not to the parent of the process).
73680728
MK
350For further information on network namespaces, see
351.BR namespaces (7).
bf032425 352
ab5dd83f
MK
353Only a privileged process
354.RB ( CAP_SYS_ADMIN )
355can employ
356.BR CLONE_NEWNET .
163bf178 357.TP
c10859eb 358.BR CLONE_NEWNS " (since Linux 2.4.19)"
3dd2331c
MK
359If
360.B CLONE_NEWNS
361is set, the cloned child is started in a new mount namespace,
362initialized with a copy of the namespace of the parent.
363If
fea681da 364.B CLONE_NEWNS
3dd2331c 365is not set, the child lives in the same mount
4df2eb09 366namespace as the parent.
fea681da 367
3dd2331c
MK
368For further information on mount namespaces, see
369.BR namespaces (7).
fea681da 370
ab5dd83f
MK
371Only a privileged process
372.RB ( CAP_SYS_ADMIN )
373can employ
374.BR CLONE_NEWNS .
fea681da
MK
375It is not permitted to specify both
376.B CLONE_NEWNS
377and
378.B CLONE_FS
379in the same
e511ffb6 380.BR clone ()
fea681da 381call.
9d005472
MK
382.TP
383.BR CLONE_NEWPID " (since Linux 2.6.24)"
384.\" This explanation draws a lot of details from
385.\" http://lwn.net/Articles/259217/
386.\" Authors: Pavel Emelyanov <xemul@openvz.org>
387.\" and Kir Kolyshkin <kir@openvz.org>
388.\"
389.\" The primary kernel commit is 30e49c263e36341b60b735cbef5ca37912549264
390.\" Author: Pavel Emelyanov <xemul@openvz.org>
391If
392.B CLONE_NEWPID
393is set, then create the process in a new PID namespace.
394If this flag is not set, then (as with
395.BR fork (2))
396the process is created in the same PID namespace as
397the calling process.
398This flag is intended for the implementation of containers.
399
400For further information on PID namespaces, see
7e0e902b
MK
401.BR namespaces (7)
402and
403.BR pid_namespaces (7).
9d005472 404
ab5dd83f
MK
405Only a privileged process
406.RB ( CAP_SYS_ADMIN )
407can employ
408.BR CLONE_NEWPID .
9d005472 409This flag can't be specified in conjunction with
f0007192
MK
410.BR CLONE_THREAD
411or
412.BR CLONE_PARENT .
70d21f17 413.TP
06b30458
MK
414.BR CLONE_NEWUSER
415(This flag first became meaningful for
416.BR clone ()
4d2b3ed7
MK
417in Linux 2.6.23,
418the current
419.BR clone ()
420semantics were merged in Linux 3.5,
421and the final pieces to make the user namespaces completely usable were
422merged in Linux 3.8.)
423
70d21f17
EB
424If
425.B CLONE_NEWUSER
06b30458
MK
426is set, then create the process in a new user namespace.
427If this flag is not set, then (as with
57ef8c39 428.BR fork (2))
70d21f17
EB
429the process is created in the same user namespace as the calling process.
430
9d005472 431For further information on user namespaces, see
f58fb24f
MK
432.BR namespaces (7)
433and
434.BR user_namespaces (7).
06b30458 435
fefbcba8
MK
436Before Linux 3.8, use of
437.BR CLONE_NEWUSER
438required that the caller have three capabilities:
439.BR CAP_SYS_ADMIN ,
440.BR CAP_SETUID ,
441and
442.BR CAP_SETGID .
443.\" Before Linux 2.6.29, it appears that only CAP_SYS_ADMIN was needed
06b30458 444Starting with Linux 3.8,
9d005472 445no privileges are needed to create a user namespace.
f0007192 446
5e72cf7d
MK
447This flag can't be specified in conjunction with
448.BR CLONE_THREAD
449or
450.BR CLONE_PARENT .
451For security reasons,
452.\" commit e66eded8309ebf679d3d3c1f5820d1f2ca332c71
453.\" https://lwn.net/Articles/543273/
454.\" The fix actually went into 3.9 and into 3.8.3. However, user namespaces
455.\" were, for practical purposes, unusable in earlier 3.8.x because of the
456.\" various file systems that didn't support userns.
f0007192
MK
457.BR CLONE_NEWUSER
458cannot be specified in conjunction with
5e72cf7d
MK
459.BR CLONE_FS .
460
461For further information on user namespaces, see
462.BR user_namespaces (7).
82ee147a 463.TP
43ce9dda
MK
464.BR CLONE_NEWUTS " (since Linux 2.6.19)"
465If
466.B CLONE_NEWUTS
e1b11906
MK
467is set, then create the process in a new UTS namespace,
468whose identifiers are initialized by duplicating the identifiers
469from the UTS namespace of the calling process.
43ce9dda 470If this flag is not set, then (as with
57ef8c39 471.BR fork (2))
43ce9dda
MK
472the process is created in the same UTS namespace as
473the calling process.
0236bea9 474This flag is intended for the implementation of containers.
43ce9dda
MK
475
476A UTS namespace is the set of identifiers returned by
477.BR uname (2);
850905cf 478among these, the domain name and the hostname can be modified by
43ce9dda
MK
479.BR setdomainname (2)
480and
43ce9dda
MK
481.BR sethostname (2),
482respectively.
c440fe01
MK
483Changes made to the identifiers in a UTS namespace
484are visible to all other processes in the same namespace,
43ce9dda
MK
485but are not visible to processes in other UTS namespaces.
486
ab5dd83f
MK
487Only a privileged process
488.RB ( CAP_SYS_ADMIN )
489can employ
490.BR CLONE_NEWUTS .
9cc7ad66 491
83d9e9b2 492For further information on UTS namespaces, see
9cc7ad66 493.BR namespaces (7).
43ce9dda 494.TP
f5dbc7c8
MK
495.BR CLONE_PARENT " (since Linux 2.3.12)"
496If
497.B CLONE_PARENT
498is set, then the parent of the new child (as returned by
499.BR getppid (2))
500will be the same as that of the calling process.
501
502If
503.B CLONE_PARENT
504is not set, then (as with
505.BR fork (2))
506the child's parent is the calling process.
507
508Note that it is the parent process, as returned by
509.BR getppid (2),
510which is signaled when the child terminates, so that
511if
512.B CLONE_PARENT
513is set, then the parent of the calling process, rather than the
514calling process itself, will be signaled.
515.TP
516.BR CLONE_PARENT_SETTID " (since Linux 2.5.49)"
517Store child thread ID at location
d3dbc9b1 518.I ptid
f5dbc7c8
MK
519in parent and child memory.
520(In Linux 2.5.32-2.5.48 there was a flag
521.B CLONE_SETTID
522that did this.)
523.TP
524.BR CLONE_PID " (obsolete)"
525If
526.B CLONE_PID
527is set, the child process is created with the same process ID as
528the calling process.
529This is good for hacking the system, but otherwise
530of not much use.
531Since 2.3.21 this flag can be
532specified only by the system boot process (PID 0).
533It disappeared in Linux 2.5.16.
534.TP
1603d6a1 535.BR CLONE_PTRACE " (since Linux 2.2)"
f5dbc7c8
MK
536If
537.B CLONE_PTRACE
538is specified, and the calling process is being traced,
539then trace the child also (see
540.BR ptrace (2)).
541.TP
542.BR CLONE_SETTLS " (since Linux 2.5.32)"
543The
544.I tls
545argument is the new TLS (Thread Local Storage) descriptor.
546(See
547.BR set_thread_area (2).)
548.TP
1603d6a1 549.BR CLONE_SIGHAND " (since Linux 2.0)"
fea681da
MK
550If
551.B CLONE_SIGHAND
314c8ff4 552is set, the calling process and the child process share the same table of
c13182ef
MK
553signal handlers.
554If the calling process or child process calls
fea681da 555.BR sigaction (2)
c13182ef
MK
556to change the behavior associated with a signal, the behavior is
557changed in the other process as well.
558However, the calling process and child
fea681da 559processes still have distinct signal masks and sets of pending
c13182ef
MK
560signals.
561So, one of them may block or unblock some signals using
fea681da
MK
562.BR sigprocmask (2)
563without affecting the other process.
564
565If
566.B CLONE_SIGHAND
567is not set, the child process inherits a copy of the signal handlers
568of the calling process at the time
edcc65ff 569.BR clone ()
c13182ef
MK
570is called.
571Calls to
fea681da
MK
572.BR sigaction (2)
573performed later by one of the processes have no effect on the other
574process.
29546c24
MK
575
576Since Linux 2.6.0-test6,
577.I flags
578must also include
579.B CLONE_VM
580if
581.B CLONE_SIGHAND
582is specified.
fea681da 583.TP
a69b6bda
MK
584.BR CLONE_STOPPED " (since Linux 2.6.0-test2)"
585If
586.B CLONE_STOPPED
587is set, then the child is initially stopped (as though it was sent a
588.B SIGSTOP
589signal), and must be resumed by sending it a
590.B SIGCONT
591signal.
ef37eaf2 592
a60450a9
MK
593This flag was
594.I deprecated
595from Linux 2.6.25 onward,
596and was
597.I removed
598altogether in Linux 2.6.38.
a5a061ee 599.\" glibc 2.8 removed this defn from bits/sched.h
a69b6bda 600.TP
f5dbc7c8 601.BR CLONE_SYSVSEM " (since Linux 2.5.10)"
fea681da 602If
f5dbc7c8
MK
603.B CLONE_SYSVSEM
604is set, then the child and the calling process share
5ada4b94
MK
605a single list of System V semaphore adjustment
606.RI ( semadj )
607values (see
f5dbc7c8 608.BR semop (2)).
5ada4b94
MK
609In this case, the shared list accumulates
610.I semadj
611values across all processes sharing the list,
612and semaphore adjustments are performed only when the last process
613that is sharing the list terminates (or ceases sharing the list using
614.BR unshare (2)).
615If this flag is not set, then the child has a separate
616.I semadj
617list that is initially empty.
fea681da
MK
618.TP
619.BR CLONE_THREAD " (since Linux 2.4.0-test8)"
620If
621.B CLONE_THREAD
622is set, the child is placed in the same thread group as the calling process.
fd8a5be4
MK
623To make the remainder of the discussion of
624.B CLONE_THREAD
625more readable, the term "thread" is used to refer to the
626processes within a thread group.
fea681da 627
fd8a5be4
MK
628Thread groups were a feature added in Linux 2.4 to support the
629POSIX threads notion of a set of threads that share a single PID.
630Internally, this shared PID is the so-called
631thread group identifier (TGID) for the thread group.
c13182ef 632Since Linux 2.4, calls to
fea681da 633.BR getpid (2)
fd8a5be4
MK
634return the TGID of the caller.
635
636The threads within a group can be distinguished by their (system-wide)
637unique thread IDs (TID).
638A new thread's TID is available as the function result
639returned to the caller of
640.BR clone (),
641and a thread can obtain
642its own TID using
643.BR gettid (2).
644
c13182ef 645When a call is made to
fd8a5be4
MK
646.BR clone ()
647without specifying
648.BR CLONE_THREAD ,
649then the resulting thread is placed in a new thread group
650whose TGID is the same as the thread's TID.
651This thread is the
652.I leader
653of the new thread group.
654
655A new thread created with
656.B CLONE_THREAD
657has the same parent process as the caller of
658.BR clone ()
c13182ef 659(i.e., like
fd8a5be4
MK
660.BR CLONE_PARENT ),
661so that calls to
662.BR getppid (2)
663return the same value for all of the threads in a thread group.
664When a
c13182ef 665.B CLONE_THREAD
fd8a5be4
MK
666thread terminates, the thread that created it using
667.BR clone ()
668is not sent a
669.B SIGCHLD
670(or other termination) signal;
671nor can the status of such a thread be obtained
672using
673.BR wait (2).
674(The thread is said to be
675.IR detached .)
676
e2fbf61d
MK
677After all of the threads in a thread group terminate,
678the parent process of the thread group is sent a
fd8a5be4
MK
679.B SIGCHLD
680(or other termination) signal.
681
682If any of the threads in a thread group performs an
683.BR execve (2),
684then all threads other than the thread group leader are terminated,
685and the new program is executed in the thread group leader.
686
f7110f60
MK
687If one of the threads in a thread group creates a child using
688.BR fork (2),
689then any thread in the group can
690.BR wait (2)
691for that child.
692
edcc65ff 693Since Linux 2.5.35,
fd8a5be4
MK
694.I flags
695must also include
696.B CLONE_SIGHAND
697if
698.B CLONE_THREAD
6fd69f33
MK
699is specified
700(and note that, since Linux 2.6.0-test6,
701.BR CLONE_SIGHAND
702also requires
703.BR CLONE_VM
704to be included).
e2fbf61d
MK
705
706Signals may be sent to a thread group as a whole (i.e., a TGID) using
707.BR kill (2),
708or to a specific thread (i.e., TID) using
709.BR tgkill (2).
710
711Signal dispositions and actions are process-wide:
712if an unhandled signal is delivered to a thread, then
713it will affect (terminate, stop, continue, be ignored in)
714all members of the thread group.
715
99408a60 716Each thread has its own signal mask, as set by
e2fbf61d 717.BR sigprocmask (2),
82a06020 718but signals can be pending either: for the whole process
e2fbf61d
MK
719(i.e., deliverable to any member of the thread group),
720when sent with
82a06020 721.BR kill (2);
e2fbf61d
MK
722or for an individual thread, when sent with
723.BR tgkill (2).
99408a60
MK
724A call to
725.BR sigpending (2)
726returns a signal set that is the union of the signals pending for the
727whole process and the signals that are pending for the calling thread.
e2fbf61d 728
c13182ef 729If
e2fbf61d
MK
730.BR kill (2)
731is used to send a signal to a thread group,
732and the thread group has installed a handler for the signal, then
733the handler will be invoked in exactly one, arbitrarily selected
734member of the thread group that has not blocked the signal.
c13182ef 735If multiple threads in a group are waiting to accept the same signal using
e2fbf61d
MK
736.BR sigwaitinfo (2),
737the kernel will arbitrarily select one of these threads
c13182ef 738to receive a signal sent using
e2fbf61d 739.BR kill (2).
a69b6bda 740.TP
f5dbc7c8 741.BR CLONE_UNTRACED " (since Linux 2.5.46)"
a69b6bda 742If
f5dbc7c8
MK
743.B CLONE_UNTRACED
744is specified, then a tracing process cannot force
745.B CLONE_PTRACE
746on this child process.
fea681da 747.TP
1603d6a1 748.BR CLONE_VFORK " (since Linux 2.2)"
f5dbc7c8
MK
749If
750.B CLONE_VFORK
751is set, the execution of the calling process is suspended
752until the child releases its virtual memory
753resources via a call to
754.BR execve (2)
755or
756.BR _exit (2)
757(as with
758.BR vfork (2)).
759
760If
761.B CLONE_VFORK
4b4a853a 762is not set, then both the calling process and the child are schedulable
f5dbc7c8
MK
763after the call, and an application should not rely on execution occurring
764in any particular order.
fea681da 765.TP
1603d6a1 766.BR CLONE_VM " (since Linux 2.0)"
f5dbc7c8
MK
767If
768.B CLONE_VM
769is set, the calling process and the child process run in the same memory
770space.
771In particular, memory writes performed by the calling process
772or by the child process are also visible in the other process.
773Moreover, any memory mapping or unmapping performed with
774.BR mmap (2)
775or
776.BR munmap (2)
777by the child or calling process also affects the other process.
778
779If
780.B CLONE_VM
781is not set, the child process runs in a separate copy of the memory
782space of the calling process at the time of
783.BR clone ().
784Memory writes or file mappings/unmappings performed by one of the
785processes do not affect the other, as with
786.BR fork (2).
e8796f63 787.SS C library/kernel ABI differences
e585064b
MK
788The raw
789.BR clone ()
fea681da
MK
790system call corresponds more closely to
791.BR fork (2)
792in that execution in the child continues from the point of the
c13182ef 793call.
5add3af3
MK
794As such, the
795.I fn
c13182ef 796and
5add3af3
MK
797.I arg
798arguments of the
799.BR clone ()
800wrapper function are omitted.
801Furthermore, the argument order changes.
c787510f 802The raw system call interface on x86 and many other architectures is roughly:
5add3af3
MK
803.in +4
804.nf
805
806.BI "long clone(unsigned long " flags ", void *" child_stack ,
807.BI " void *" ptid ", void *" ctid ,
808.BI " struct pt_regs *" regs );
fea681da 809
5add3af3
MK
810.fi
811.in
e585064b 812Another difference for the raw system call is that the
fea681da 813.I child_stack
c13182ef 814argument may be zero, in which case copy-on-write semantics ensure that the
fea681da 815child gets separate copies of stack pages when either process modifies
c13182ef
MK
816the stack.
817In this case, for correct operation, the
fea681da
MK
818.B CLONE_VM
819option should not be specified.
c787510f 820
e585064b 821For some architectures, the order of the arguments for the system call
c787510f 822differs from that shown above.
7d2e6d74 823On the score, microblaze, ARM, ARM 64, PA-RISC, arc, Power PC, xtensa,
c787510f
MK
824and MIPS architectures,
825the order of the fourth and fifth arguments is reversed.
826On the cris and s390 architectures,
827the order of the first and second arguments is reversed.
251113d0
MK
828.SS blackfin, m68k, and sparc
829The argument-passing conventions on
04346be5 830blackfin, m68k, and sparc are different from the descriptions above.
251113d0 831For details, see the kernel (and glibc) source.
574c92b6 832.SS ia64
097a1f3b
MK
833On ia64, a different interface is used:
834.nf
835
836.BI "int __clone2(int (*" "fn" ")(void *), "
837.BI " void *" child_stack_base ", size_t " stack_size ,
838.BI " int " flags ", void *" "arg" ", ... "
839.BI " /* pid_t *" ptid ", struct user_desc *" tls \
840", pid_t *" ctid " */ );"
841.fi
842.PP
843The prototype shown above is for the glibc wrapper function;
844the raw system call interface has no
845.I fn
846or
847.I arg
848argument, and changes the order of the arguments so that
849.I flags
850is the first argument, and
851.I tls
852is the last argument.
853.PP
854.BR __clone2 ()
855operates in the same way as
856.BR clone (),
857except that
858.I child_stack_base
859points to the lowest address of the child's stack area,
860and
861.I stack_size
862specifies the size of the stack pointed to by
863.IR child_stack_base .
5add3af3 864.SS Linux 2.4 and earlier
577f9b62
MK
865In Linux 2.4 and earlier,
866.BR clone ()
867does not take arguments
868.IR ptid ,
869.IR tls ,
870and
130b2e49 871.IR ctid .
47297adb 872.SH RETURN VALUE
0bfa087b
MK
873.\" gettid(2) returns current->pid;
874.\" getpid(2) returns current->tgid;
fea681da 875On success, the thread ID of the child process is returned
c13182ef 876in the caller's thread of execution.
84811e86 877On failure, \-1 is returned
fea681da
MK
878in the caller's context, no child process will be created, and
879.I errno
880will be set appropriately.
fea681da
MK
881.SH ERRORS
882.TP
883.B EAGAIN
e1b6e186
MK
884Too many processes are already running; see
885.BR fork (2).
fea681da
MK
886.TP
887.B EINVAL
888.B CLONE_SIGHAND
889was specified, but
890.B CLONE_VM
2e8a7fb3
MK
891was not.
892(Since Linux 2.6.0-test6.)
fea681da
MK
893.TP
894.B EINVAL
895.B CLONE_THREAD
896was specified, but
897.B CLONE_SIGHAND
6387216b
MK
898was not.
899(Since Linux 2.5.35.)
29546c24
MK
900.\" .TP
901.\" .B EINVAL
902.\" Precisely one of
903.\" .B CLONE_DETACHED
904.\" and
905.\" .B CLONE_THREAD
6387216b
MK
906.\" was specified.
907.\" (Since Linux 2.6.0-test6.)
fea681da
MK
908.TP
909.B EINVAL
d34e5645 910.\" commit e66eded8309ebf679d3d3c1f5820d1f2ca332c71
fea681da
MK
911Both
912.B CLONE_FS
913and
914.B CLONE_NEWNS
915were specified in
916.IR flags .
917.TP
d34e5645
MK
918.BR EINVAL " (since Linux 3.9)"
919Both
920.B CLONE_NEWUSER
921and
922.B CLONE_FS
923were specified in
924.IR flags .
925.TP
fea681da 926.B EINVAL
82ee147a 927Both
667417b3
MK
928.B CLONE_NEWIPC
929and
930.B CLONE_SYSVSEM
931were specified in
932.IR flags .
933.TP
934.B EINVAL
f0007192 935One (or both) of
82ee147a 936.BR CLONE_NEWPID
f0007192
MK
937or
938.BR CLONE_NEWUSER
939and one (or both) of
82ee147a 940.BR CLONE_THREAD
f0007192
MK
941or
942.BR CLONE_PARENT
82ee147a
MK
943were specified in
944.IR flags .
945.TP
946.B EINVAL
c13182ef 947Returned by
edcc65ff 948.BR clone ()
c13182ef 949when a zero value is specified for
fea681da
MK
950.IR child_stack .
951.TP
28cad2c1 952.B EINVAL
667417b3
MK
953.BR CLONE_NEWIPC
954was specified in
955.IR flags ,
956but the kernel was not configured with the
957.B CONFIG_SYSVIPC
958and
959.BR CONFIG_IPC_NS
960options.
961.TP
962.B EINVAL
163bf178
MK
963.BR CLONE_NEWNET
964was specified in
965.IR flags ,
966but the kernel was not configured with the
967.B CONFIG_NET_NS
968option.
969.TP
970.B EINVAL
28cad2c1
MK
971.BR CLONE_NEWPID
972was specified in
973.IR flags ,
974but the kernel was not configured with the
975.B CONFIG_PID_NS
976option.
977.TP
43ce9dda
MK
978.B EINVAL
979.BR CLONE_NEWUTS
980was specified in
981.IR flags ,
982but the kernel was not configured with the
983.B CONFIG_UTS_NS
984option.
985.TP
fea681da
MK
986.B ENOMEM
987Insufficient memory is available to allocate a task structure for the
988child, or to copy those parts of the caller's context that need to be
989copied.
990.TP
991.B EPERM
667417b3 992.BR CLONE_NEWIPC ,
163bf178 993.BR CLONE_NEWNET ,
43ce9dda
MK
994.BR CLONE_NEWNS ,
995.BR CLONE_NEWPID ,
82ee147a 996or
43ce9dda 997.BR CLONE_NEWUTS
00b08db3 998was specified by an unprivileged process (process without \fBCAP_SYS_ADMIN\fP).
fea681da
MK
999.TP
1000.B EPERM
1001.B CLONE_PID
1002was specified by a process other than process 0.
365d292a
MK
1003.TP
1004.B EPERM
1005.BR CLONE_NEWUSER
1006was specified in
1007.IR flags ,
1008but either the effective user ID or the effective group ID of the caller
1009does not have a mapping in the parent namespace (see
f58fb24f 1010.BR user_namespaces (7)).
365d292a
MK
1011.SH VERSIONS
1012There is no entry for
1013.BR clone ()
1014in libc5.
1015glibc2 provides
1016.BR clone ()
1017as described in this manual page.
47297adb 1018.SH CONFORMING TO
a1d5f77c 1019.BR clone ()
e585064b 1020is Linux-specific and should not be used in programs
a1d5f77c 1021intended to be portable.
fea681da 1022.SH NOTES
fd8a5be4
MK
1023In the kernel 2.4.x series,
1024.B CLONE_THREAD
1025generally does not make the parent of the new thread the same
1026as the parent of the calling process.
1027However, for kernel versions 2.4.7 to 2.4.18 the
1028.B CLONE_THREAD
1029flag implied the
c13182ef 1030.B CLONE_PARENT
fd8a5be4 1031flag (as in kernel 2.6).
fea681da 1032
c13182ef
MK
1033For a while there was
1034.B CLONE_DETACHED
a5053dcb 1035(introduced in 2.5.32):
c13182ef 1036this flag indicated that the parent wanted no child-exit signal.
a5053dcb 1037In 2.6.2 the need to give this
c13182ef
MK
1038together with
1039.B CLONE_THREAD
a5053dcb
MK
1040disappeared.
1041This flag is still defined, but has no effect.
1042
34ccb744 1043On i386,
a5a997ca
MK
1044.BR clone ()
1045should not be called through vsyscall, but directly through
1046.IR "int $0x80" .
31830ef0
MK
1047.SH BUGS
1048Versions of the GNU C library that include the NPTL threading library
c13182ef 1049contain a wrapper function for
0bfa087b 1050.BR getpid (2)
31830ef0 1051that performs caching of PIDs.
c60237c9
MK
1052This caching relies on support in the glibc wrapper for
1053.BR clone (),
1054but as currently implemented,
1055the cache may not be up to date in some circumstances.
1056In particular,
1057if a signal is delivered to the child immediately after the
1058.BR clone ()
1059call, then a call to
0b80cf56 1060.BR getpid (2)
c60237c9
MK
1061in a handler for the signal may return the PID
1062of the calling process ("the parent"),
88619baf 1063if the clone wrapper has not yet had a chance to update the PID
c60237c9
MK
1064cache in the child.
1065(This discussion ignores the case where the child was created using
9291ce36 1066.BR CLONE_THREAD ,
c60237c9 1067when
0b80cf56 1068.BR getpid (2)
c60237c9
MK
1069.I should
1070return the same value in the child and in the process that called
1071.BR clone (),
a1d48abb 1072since the caller and the child are in the same thread group.
e7d807b7 1073The stale-cache problem also does not occur if the
a1d48abb
JR
1074.I flags
1075argument includes
1076.BR CLONE_VM .)
c60237c9 1077To get the truth, it may be necessary to use code such as the following:
31830ef0
MK
1078.nf
1079
1080 #include <syscall.h>
1081
1082 pid_t mypid;
1083
1084 mypid = syscall(SYS_getpid);
1085.fi
c60237c9
MK
1086.\" See also the following bug reports
1087.\" https://bugzilla.redhat.com/show_bug.cgi?id=417521
1088.\" http://sourceware.org/bugzilla/show_bug.cgi?id=6910
8c7b566c 1089.SH EXAMPLE
8c7b566c 1090The following program demonstrates the use of
9c13072a 1091.BR clone ()
8c7b566c
MK
1092to create a child process that executes in a separate UTS namespace.
1093The child changes the hostname in its UTS namespace.
1094Both parent and child then display the system hostname,
1095making it possible to see that the hostname
1096differs in the UTS namespaces of the parent and child.
1097For an example of the use of this program, see
1098.BR setns (2).
f30b7415 1099.SS Program source
8c7b566c
MK
1100.nf
1101#define _GNU_SOURCE
1102#include <sys/wait.h>
1103#include <sys/utsname.h>
1104#include <sched.h>
1105#include <string.h>
1106#include <stdio.h>
1107#include <stdlib.h>
1108#include <unistd.h>
1109
1110#define errExit(msg) do { perror(msg); exit(EXIT_FAILURE); \\
1111 } while (0) /* Report msg via perror(), then exit with failure status */
1112
1113static int /* Start function for cloned child: sets and prints the hostname, then sleeps */
1114childFunc(void *arg) /* arg: hostname string to set in the child */
1115{
1116 struct utsname uts;
1117
1118 /* Change hostname in UTS namespace of child; arg supplies the new name */
1119
1120 if (sethostname(arg, strlen(arg)) == \-1)
1121 errExit("sethostname");
1122
07d4e6ea 1123 /* Retrieve and display hostname as seen by the child */
8c7b566c
MK
1124
1125 if (uname(&uts) == \-1)
1126 errExit("uname");
1127 printf("uts.nodename in child: %s\\n", uts.nodename);
1128
1129 /* Keep the namespace open for a while, by sleeping.
1130 This allows some experimentation\-\-for example, another
1131 process might join the namespace. */
9f1b9726 1132
8c7b566c
MK
1133 sleep(200);
1134
1135 return 0; /* Child terminates now */
1136}
1137
1138#define STACK_SIZE (1024 * 1024) /* Stack size for cloned child, in bytes (1 MiB) */
1139
1140int
1141main(int argc, char *argv[]) /* argv[1]: hostname to set in the child */
1142{
1143 char *stack; /* Start of stack buffer */
1144 char *stackTop; /* End of stack buffer */
1145 pid_t pid; /* Value returned by clone() */
1146 struct utsname uts;
1147
1148 if (argc < 2) {
1149 fprintf(stderr, "Usage: %s <child\-hostname>\\n", argv[0]);
1150 exit(EXIT_FAILURE); /* A missing argument is an error */
1151 }
1152
1153 /* Allocate stack for child */
1154
1155 stack = malloc(STACK_SIZE);
1156 if (stack == NULL)
1157 errExit("malloc");
1158 stackTop = stack + STACK_SIZE; /* Assume stack grows downward */
1159
1160 /* Create child that has its own UTS namespace;
1161 child commences execution in childFunc() */
1162
1163 pid = clone(childFunc, stackTop, CLONE_NEWUTS | SIGCHLD, argv[1]);
1164 if (pid == \-1)
1165 errExit("clone");
1166 printf("clone() returned %ld\\n", (long) pid);
1167
1168 /* Parent falls through to here */
1169
1170 sleep(1); /* Give child time to change its hostname */
1171
9f1b9726 1172 /* Display hostname in parent\(aqs UTS namespace. This will be
8c7b566c
MK
1173 different from hostname in child\(aqs UTS namespace. */
1174
1175 if (uname(&uts) == \-1)
1176 errExit("uname");
1177 printf("uts.nodename in parent: %s\\n", uts.nodename);
1178
1179 if (waitpid(pid, NULL, 0) == \-1) /* Wait for child */
1180 errExit("waitpid");
1181 printf("child has terminated\\n");
1182
1183 exit(EXIT_SUCCESS);
1184}
1185.fi
47297adb 1186.SH SEE ALSO
fea681da 1187.BR fork (2),
2b44301c 1188.BR futex (2),
fea681da
MK
1189.BR getpid (2),
1190.BR gettid (2),
6f8746e4 1191.BR kcmp (2),
f2d0bbf1 1192.BR set_thread_area (2),
2b44301c 1193.BR set_tid_address (2),
8403481f 1194.BR setns (2),
f2d0bbf1 1195.BR tkill (2),
5cc01e9c 1196.BR unshare (2),
fea681da 1197.BR wait (2),
3616b7c0 1198.BR capabilities (7),
41096af1 1199.BR namespaces (7),
3616b7c0 1200.BR pthreads (7)