]> git.ipfire.org Git - thirdparty/man-pages.git/blame - man2/perf_event_open.2
perf_event_open.2: Document aux_{head,tail,offset,size} support
[thirdparty/man-pages.git] / man2 / perf_event_open.2
CommitLineData
f2b1d720
MK
1.\" Copyright (c) 2012, Vincent Weaver
2.\"
1dd72f9c 3.\" %%%LICENSE_START(GPLv2+_DOC_FULL)
f2b1d720
MK
4.\" This is free documentation; you can redistribute it and/or
5.\" modify it under the terms of the GNU General Public License as
6.\" published by the Free Software Foundation; either version 2 of
7.\" the License, or (at your option) any later version.
8.\"
9.\" The GNU General Public License's references to "object code"
10.\" and "executables" are to be interpreted as the output of any
11.\" document formatting or typesetting system, including
12.\" intermediate and printed output.
13.\"
14.\" This manual is distributed in the hope that it will be useful,
15.\" but WITHOUT ANY WARRANTY; without even the implied warranty of
16.\" MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17.\" GNU General Public License for more details.
18.\"
19.\" You should have received a copy of the GNU General Public
20.\" License along with this manual; if not, see
21.\" <http://www.gnu.org/licenses/>.
6a8d8745 22.\" %%%LICENSE_END
f2b1d720
MK
23.\"
24.\" This document is based on the perf_event.h header file, the
25.\" tools/perf/design.txt file, and a lot of bitter experience.
26.\"
5722c835 27.TH PERF_EVENT_OPEN 2 2015-07-23 "Linux" "Linux Programmer's Manual"
f2b1d720
MK
28.SH NAME
29perf_event_open \- set up performance monitoring
30.SH SYNOPSIS
31.nf
32.B #include <linux/perf_event.h>
33.B #include <linux/hw_breakpoint.h>
34.sp
35.BI "int perf_event_open(struct perf_event_attr *" attr ,
36.BI " pid_t " pid ", int " cpu ", int " group_fd ,
37.BI " unsigned long " flags );
38.fi
39
40.IR Note :
41There is no glibc wrapper for this system call; see NOTES.
42.SH DESCRIPTION
43Given a list of parameters,
44.BR perf_event_open ()
45returns a file descriptor, for use in subsequent system calls
46.RB ( read "(2), " mmap "(2), " prctl "(2), " fcntl "(2), etc.)."
47.PP
48A call to
49.BR perf_event_open ()
50creates a file descriptor that allows measuring performance
51information.
52Each file descriptor corresponds to one
53event that is measured; these can be grouped together
54to measure multiple events simultaneously.
55.PP
56Events can be enabled and disabled in two ways: via
57.BR ioctl (2)
58and via
0fe9e4b1 59.BR prctl (2).
f2b1d720
MK
60When an event is disabled it does not count or generate overflows but does
61continue to exist and maintain its count value.
62.PP
63Events come in two flavors: counting and sampled.
64A
65.I counting
66event is one that is used for counting the aggregate number of events
67that occur.
68In general, counting event results are gathered with a
69.BR read (2)
70call.
71A
72.I sampling
73event periodically writes measurements to a buffer that can then
74be accessed via
0fe9e4b1 75.BR mmap (2).
f2b1d720
MK
76.SS Arguments
77.P
f2b1d720 78The
a02a1737 79.I pid
f2b1d720 80and
a02a1737
VW
81.I cpu
82arguments allow specifying which process and CPU to monitor:
83.TP
f2d15dc9 84.BR "pid == 0" " and " "cpu == \-1"
ee7b0cbf 85This measures the calling process/thread on any CPU.
a02a1737 86.TP
f2d15dc9 87.BR "pid == 0" " and " "cpu >= 0"
ee7b0cbf 88This measures the calling process/thread only
a02a1737
VW
89when running on the specified CPU.
90.TP
f2d15dc9 91.BR "pid > 0" " and " "cpu == \-1"
a02a1737
VW
92This measures the specified process/thread on any CPU.
93.TP
f2d15dc9 94.BR "pid > 0" " and " "cpu >= 0"
a02a1737
VW
95This measures the specified process/thread only
96when running on the specified CPU.
97.TP
f2d15dc9 98.BR "pid == \-1" " and " "cpu >= 0"
a02a1737 99This measures all processes/threads on the specified CPU.
ce88f77b 100This requires
f2b1d720
MK
101.B CAP_SYS_ADMIN
102capability or a
103.I /proc/sys/kernel/perf_event_paranoid
104value of less than 1.
a02a1737 105.TP
ce88f77b 106.BR "pid == \-1" " and " "cpu == \-1"
a02a1737 107This setting is invalid and will return an error.
f2b1d720
MK
108.P
109The
110.I group_fd
111argument allows event groups to be created.
112An event group has one event which is the group leader.
113The leader is created first, with
114.IR group_fd " = \-1."
115The rest of the group members are created with subsequent
116.BR perf_event_open ()
117calls with
118.IR group_fd
bec6277e 119being set to the file descriptor of the group leader.
f2b1d720
MK
120(A single event on its own is created with
121.IR group_fd " = \-1"
122and is considered to be a group with only 1 member.)
33a0ccb2 123An event group is scheduled onto the CPU as a unit: it will
d1007d14 124be put onto the CPU only if all of the events in the group can be put onto
f2b1d720
MK
125the CPU.
126This means that the values of the member events can be
ce88f77b 127meaningfully compared\(emadded, divided (to get ratios), and so on\(emwith each
f2b1d720
MK
128other, since they have counted events for the same set of executed
129instructions.
130.P
131The
132.I flags
08e325e8 133argument is formed by ORing together zero or more of the following values:
f2b1d720 134.TP
60dafbc1
MK
135.BR PERF_FLAG_FD_CLOEXEC " (since Linux 3.14)"
136.\" commit a21b0b354d4ac39be691f51c53562e2c24443d9e
e9b1ab78
MK
137This flag enables the close-on-exec flag for the created
138event file descriptor,
139so that the file descriptor is automatically closed on
140.BR execve (2).
8bad22e5
MK
141Setting the close-on-exec flag at creation time, rather than later with
142.BR fcntl (2),
e9b1ab78
MK
143avoids potential race conditions where the calling thread invokes
144.BR perf_event_open ()
a61dba34
MK
145and
146.BR fcntl (2)
e9b1ab78
MK
147at the same time as another thread calls
148.BR fork (2)
149then
150.BR execve (2).
151.TP
f2b1d720 152.BR PERF_FLAG_FD_NO_GROUP
31266c04
VW
153This flag tells the event to ignore the
154.IR group_fd
155parameter except for the purpose of setting up output redirection
156using the
157.B PERF_FLAG_FD_OUTPUT
158flag.
f2b1d720 159.TP
3117263f 160.BR PERF_FLAG_FD_OUTPUT " (broken since Linux 2.6.35)"
747a6e7c 161.\" commit ac9721f3f54b27a16c7e1afb2481e7ee95a70318
31266c04
VW
162This flag re-routes the event's sampled output to instead
163be included in the mmap buffer of the event specified by
164.IR group_fd .
f2b1d720 165.TP
3117263f 166.BR PERF_FLAG_PID_CGROUP " (since Linux 2.6.39)"
60dafbc1 167.\" commit e5d1367f17ba6a6fed5fd8b74e4d5720923e0c25
f2b1d720
MK
168This flag activates per-container system-wide monitoring.
169A container
ce88f77b 170is an abstraction that isolates a set of resources for finer-grained
699893d8 171control (CPUs, memory, etc.).
f2b1d720
MK
172In this mode, the event is measured
173only if the thread running on the monitored CPU belongs to the designated
174container (cgroup).
175The cgroup is identified by passing a file descriptor
176opened on its directory in the cgroupfs filesystem.
177For instance, if the
178cgroup to monitor is called
179.IR test ,
180then a file descriptor opened on
181.I /dev/cgroup/test
182(assuming cgroupfs is mounted on
183.IR /dev/cgroup )
184must be passed as the
185.I pid
186parameter.
33a0ccb2 187cgroup monitoring is available only
f2b1d720
MK
188for system-wide events and may therefore require extra permissions.
189.P
190The
191.I perf_event_attr
192structure provides detailed configuration information
193for the event being created.
194
195.in +4n
196.nf
197struct perf_event_attr {
ce88f77b
MK
198 __u32 type; /* Type of event */
199 __u32 size; /* Size of attribute structure */
200 __u64 config; /* Type-specific configuration */
f2b1d720
MK
201
202 union {
203 __u64 sample_period; /* Period of sampling */
204 __u64 sample_freq; /* Frequency of sampling */
205 };
206
ce88f77b
MK
207 __u64 sample_type; /* Specifies values included in sample */
208 __u64 read_format; /* Specifies values returned in read */
209
210 __u64 disabled : 1, /* off by default */
211 inherit : 1, /* children inherit it */
212 pinned : 1, /* must always be on PMU */
213 exclusive : 1, /* only group on PMU */
214 exclude_user : 1, /* don't count user */
215 exclude_kernel : 1, /* don't count kernel */
216 exclude_hv : 1, /* don't count hypervisor */
217 exclude_idle : 1, /* don't count when idle */
218 mmap : 1, /* include mmap data */
219 comm : 1, /* include comm data */
220 freq : 1, /* use freq, not period */
221 inherit_stat : 1, /* per task counts */
222 enable_on_exec : 1, /* next exec enables */
223 task : 1, /* trace fork/exit */
224 watermark : 1, /* wakeup_watermark */
225 precise_ip : 2, /* skid constraint */
226 mmap_data : 1, /* non-exec mmap data */
227 sample_id_all : 1, /* sample_type all events */
228 exclude_host : 1, /* don't count in host */
229 exclude_guest : 1, /* don't count in guest */
230 exclude_callchain_kernel : 1,
231 /* exclude kernel callchains */
232 exclude_callchain_user : 1,
233 /* exclude user callchains */
9bfc542b 234 mmap2 : 1, /* include mmap with inode data */
49bc411c 235 comm_exec : 1, /* flag comm events that are due to exec */
6bd5186a
VW
236 use_clockid : 1, /* use clockid for time fields */
237
238 __reserved_1 : 38;
f2b1d720
MK
239
240 union {
241 __u32 wakeup_events; /* wakeup every n events */
7db515ef 242 __u32 wakeup_watermark; /* bytes before wakeup */
f2b1d720
MK
243 };
244
245 __u32 bp_type; /* breakpoint type */
246
247 union {
248 __u64 bp_addr; /* breakpoint address */
249 __u64 config1; /* extension of config */
250 };
251
252 union {
253 __u64 bp_len; /* breakpoint length */
254 __u64 config2; /* extension of config1 */
255 };
ce88f77b
MK
256 __u64 branch_sample_type; /* enum perf_branch_sample_type */
257 __u64 sample_regs_user; /* user regs to dump on samples */
258 __u32 sample_stack_user; /* size of stack to dump on
7db515ef 259 samples */
6bd5186a 260 __s32 clockid; /* clock to use for time fields */
f5281dfd 261 __u64 sample_regs_intr; /* regs to dump on samples */
f2b1d720
MK
262};
263.fi
264.in
265
266The fields of the
267.I perf_event_attr
268structure are described in more detail below:
f2b1d720
MK
269.TP
270.I type
271This field specifies the overall event type.
272It has one of the following values:
273.RS
274.TP
275.B PERF_TYPE_HARDWARE
276This indicates one of the "generalized" hardware events provided
277by the kernel.
278See the
279.I config
280field definition for more details.
281.TP
282.B PERF_TYPE_SOFTWARE
283This indicates one of the software-defined events provided by the kernel
284(even if no hardware support is available).
285.TP
286.B PERF_TYPE_TRACEPOINT
287This indicates a tracepoint
288provided by the kernel tracepoint infrastructure.
289.TP
290.B PERF_TYPE_HW_CACHE
291This indicates a hardware cache event.
292This has a special encoding, described in the
293.I config
294field definition.
295.TP
296.B PERF_TYPE_RAW
297This indicates a "raw" implementation-specific event in the
298.IR config " field."
299.TP
31c1f2b0 300.BR PERF_TYPE_BREAKPOINT " (since Linux 2.6.33)"
60dafbc1 301.\" commit 24f1e32c60c45c89a997c73395b69c8af6f0a84e
f2b1d720
MK
302This indicates a hardware breakpoint as provided by the CPU.
303Breakpoints can be read/write accesses to an address as well as
304execution of an instruction address.
305.TP
306.RB "dynamic PMU"
747a6e7c
VW
307Since Linux 2.6.38,
308.\" commit 2e80a82a49c4c7eca4e35734380f28298ba5db19
7db515ef 309.BR perf_event_open ()
f2b1d720
MK
310can support multiple PMUs.
311To enable this, a value exported by the kernel can be used in the
312.I type
313field to indicate which PMU to use.
314The value to use can be found in the sysfs filesystem:
315there is a subdirectory per PMU instance under
316.IR /sys/bus/event_source/devices .
7d182bb6 317In each subdirectory there is a
f2b1d720
MK
318.I type
319file whose content is an integer that can be used in the
320.I type
321field.
322For instance,
323.I /sys/bus/event_source/devices/cpu/type
324contains the value for the core CPU PMU, which is usually 4.
325.RE
f2b1d720
MK
326.TP
327.I "size"
328The size of the
329.I perf_event_attr
330structure for forward/backward compatibility.
331Set this using
332.I sizeof(struct perf_event_attr)
333to allow the kernel to see
334the struct size at the time of compilation.
335
336The related define
337.B PERF_ATTR_SIZE_VER0
338is set to 64; this was the size of the first published struct.
339.B PERF_ATTR_SIZE_VER1
340is 72, corresponding to the addition of breakpoints in Linux 2.6.33.
747a6e7c
VW
341.\" commit cb5d76999029ae7a517cb07dfa732c1b5a934fc2
342.\" this was added much later when PERF_ATTR_SIZE_VER2 happened
343.\" but the actual attr_size had increased in 2.6.33
f2b1d720
MK
344.B PERF_ATTR_SIZE_VER2
345is 80 corresponding to the addition of branch sampling in Linux 3.4.
747a6e7c 346.\" commit cb5d76999029ae7a517cb07dfa732c1b5a934fc2
d2a6be2f 347.B PERF_ATTR_SIZE_VER3
f2b1d720 348is 96 corresponding to the addition
7ede2f66
DP
349of
350.I sample_regs_user
351and
352.I sample_stack_user
353in Linux 3.7.
747a6e7c 354.\" commit 1659d129ed014b715b0b2120e6fd929bdd33ed03
f5281dfd
VW
355.B PERF_ATTR_SIZE_VER4
356is 104 corresponding to the addition of
357.I sample_regs_intr
358in Linux 3.19.
359.\" commit 60e2364e60e86e81bc6377f49779779e6120977f
f2b1d720
MK
360.TP
361.I "config"
362This specifies which event you want, in conjunction with
363the
364.I type
365field.
366The
367.IR config1 " and " config2
368fields are also taken into account in cases where 64 bits is not
369enough to fully specify the event.
370The encoding of these fields is event dependent.
371
f2b1d720
MK
372There are various ways to set the
373.I config
374field that are dependent on the value of the previously
375described
376.I type
377field.
378What follows are various possible settings for
379.I config
380separated out by
381.IR type .
382
383If
384.I type
385is
386.BR PERF_TYPE_HARDWARE ,
387we are measuring one of the generalized hardware CPU events.
388Not all of these are available on all platforms.
389Set
390.I config
391to one of the following:
392.RS 12
393.TP
394.B PERF_COUNT_HW_CPU_CYCLES
395Total cycles.
2b538c3e 396Be wary of what happens during CPU frequency scaling.
f2b1d720
MK
397.TP
398.B PERF_COUNT_HW_INSTRUCTIONS
399Retired instructions.
400Be careful, these can be affected by various
2b538c3e 401issues, most notably hardware interrupt counts.
f2b1d720
MK
402.TP
403.B PERF_COUNT_HW_CACHE_REFERENCES
404Cache accesses.
405Usually this indicates Last Level Cache accesses but this may
406vary depending on your CPU.
407This may include prefetches and coherency messages; again this
408depends on the design of your CPU.
409.TP
410.B PERF_COUNT_HW_CACHE_MISSES
411Cache misses.
412Usually this indicates Last Level Cache misses; this is intended to be
413used in conjunction with the
414.B PERF_COUNT_HW_CACHE_REFERENCES
415event to calculate cache miss rates.
416.TP
417.B PERF_COUNT_HW_BRANCH_INSTRUCTIONS
418Retired branch instructions.
747a6e7c 419Prior to Linux 2.6.35, this used
f2b1d720 420the wrong event on AMD processors.
747a6e7c 421.\" commit f287d332ce835f77a4f5077d2c0ef1e3f9ea42d2
f2b1d720
MK
422.TP
423.B PERF_COUNT_HW_BRANCH_MISSES
424Mispredicted branch instructions.
425.TP
426.B PERF_COUNT_HW_BUS_CYCLES
427Bus cycles, which can be different from total cycles.
428.TP
31c1f2b0 429.BR PERF_COUNT_HW_STALLED_CYCLES_FRONTEND " (since Linux 3.0)"
747a6e7c 430.\" commit 8f62242246351b5a4bc0c1f00c0c7003edea128a
f2b1d720
MK
431Stalled cycles during issue.
432.TP
31c1f2b0 433.BR PERF_COUNT_HW_STALLED_CYCLES_BACKEND " (since Linux 3.0)"
747a6e7c 434.\" commit 8f62242246351b5a4bc0c1f00c0c7003edea128a
f2b1d720
MK
435Stalled cycles during retirement.
436.TP
31c1f2b0 437.BR PERF_COUNT_HW_REF_CPU_CYCLES " (since Linux 3.3)"
60dafbc1 438.\" commit c37e17497e01fc0f5d2d6feb5723b210b3ab8890
f2b1d720
MK
439Total cycles; not affected by CPU frequency scaling.
440.RE
441.IP
442If
443.I type
444is
445.BR PERF_TYPE_SOFTWARE ,
446we are measuring software events provided by the kernel.
447Set
448.I config
449to one of the following:
450.RS 12
451.TP
452.B PERF_COUNT_SW_CPU_CLOCK
453This reports the CPU clock, a high-resolution per-CPU timer.
454.TP
455.B PERF_COUNT_SW_TASK_CLOCK
456This reports a clock count specific to the task that is running.
457.TP
458.B PERF_COUNT_SW_PAGE_FAULTS
459This reports the number of page faults.
460.TP
461.B PERF_COUNT_SW_CONTEXT_SWITCHES
462This counts context switches.
463Until Linux 2.6.34, these were all reported as user-space
464events; after that, they are reported as happening in the kernel.
747a6e7c 465.\" commit e49a5bd38159dfb1928fd25b173bc9de4bbadb21
f2b1d720
MK
466.TP
467.B PERF_COUNT_SW_CPU_MIGRATIONS
468This reports the number of times the process
469has migrated to a new CPU.
470.TP
471.B PERF_COUNT_SW_PAGE_FAULTS_MIN
472This counts the number of minor page faults.
473These did not require disk I/O to handle.
474.TP
475.B PERF_COUNT_SW_PAGE_FAULTS_MAJ
476This counts the number of major page faults.
477These required disk I/O to handle.
478.TP
31c1f2b0 479.BR PERF_COUNT_SW_ALIGNMENT_FAULTS " (since Linux 2.6.33)"
60dafbc1 480.\" commit f7d7986060b2890fc26db6ab5203efbd33aa2497
f2b1d720
MK
481This counts the number of alignment faults.
482These happen when unaligned memory accesses happen; the kernel
483can handle these but it reduces performance.
33a0ccb2 484This happens only on some architectures (never on x86).
f2b1d720 485.TP
31c1f2b0 486.BR PERF_COUNT_SW_EMULATION_FAULTS " (since Linux 2.6.33)"
60dafbc1 487.\" commit f7d7986060b2890fc26db6ab5203efbd33aa2497
f2b1d720
MK
488This counts the number of emulation faults.
489The kernel sometimes traps on unimplemented instructions
7db515ef 490and emulates them for user space.
f2b1d720 491This can negatively impact performance.
dab38455 492.TP
31c1f2b0 493.BR PERF_COUNT_SW_DUMMY " (since Linux 3.12)"
60dafbc1 494.\" commit fa0097ee690693006ab1aea6c01ad3c851b65c77
dab38455
VW
495This is a placeholder event that counts nothing.
496Informational sample record types such as mmap or comm
497must be associated with an active event.
498This dummy event allows gathering such records without requiring
499a counting event.
f2b1d720 500.RE
f2b1d720 501
f2b1d720
MK
502.RS
503If
504.I type
505is
506.BR PERF_TYPE_TRACEPOINT ,
507then we are measuring kernel tracepoints.
508The value to use in
509.I config
510can be obtained from under debugfs
511.I tracing/events/*/*/id
512if ftrace is enabled in the kernel.
f2b1d720 513.RE
1f22e274 514
f2b1d720
MK
515.RS
516If
517.I type
518is
519.BR PERF_TYPE_HW_CACHE ,
520then we are measuring a hardware CPU cache event.
521To calculate the appropriate
522.I config
523value use the following equation:
524.RS 4
525.nf
526
527 (perf_hw_cache_id) | (perf_hw_cache_op_id << 8) |
528 (perf_hw_cache_op_result_id << 16)
529.fi
530.P
531where
532.I perf_hw_cache_id
533is one of:
7db515ef 534.RS 4
f2b1d720
MK
535.TP
536.B PERF_COUNT_HW_CACHE_L1D
537for measuring Level 1 Data Cache
538.TP
539.B PERF_COUNT_HW_CACHE_L1I
540for measuring Level 1 Instruction Cache
541.TP
542.B PERF_COUNT_HW_CACHE_LL
543for measuring Last-Level Cache
544.TP
545.B PERF_COUNT_HW_CACHE_DTLB
546for measuring the Data TLB
547.TP
548.B PERF_COUNT_HW_CACHE_ITLB
549for measuring the Instruction TLB
550.TP
551.B PERF_COUNT_HW_CACHE_BPU
552for measuring the branch prediction unit
553.TP
5a69ce9c
MK
554.BR PERF_COUNT_HW_CACHE_NODE " (since Linux 3.1)"
555.\" commit 89d6c0b5bdbb1927775584dcf532d98b3efe1477
f2b1d720
MK
556for measuring local memory accesses
557.RE
f2b1d720
MK
558.P
559and
560.I perf_hw_cache_op_id
561is one of
7db515ef 562.RS 4
f2b1d720
MK
563.TP
564.B PERF_COUNT_HW_CACHE_OP_READ
565for read accesses
566.TP
567.B PERF_COUNT_HW_CACHE_OP_WRITE
568for write accesses
569.TP
570.B PERF_COUNT_HW_CACHE_OP_PREFETCH
571for prefetch accesses
572.RE
f2b1d720
MK
573.P
574and
575.I perf_hw_cache_op_result_id
576is one of
7db515ef 577.RS 4
f2b1d720
MK
578.TP
579.B PERF_COUNT_HW_CACHE_RESULT_ACCESS
580to measure accesses
581.TP
582.B PERF_COUNT_HW_CACHE_RESULT_MISS
583to measure misses
584.RE
585.RE
586
587If
588.I type
589is
590.BR PERF_TYPE_RAW ,
591then a custom "raw"
592.I config
593value is needed.
594Most CPUs support events that are not covered by the "generalized" events.
595These are implementation defined; see your CPU manual (for example
596the Intel Volume 3B documentation or the AMD BIOS and Kernel Developer
597Guide).
598The libpfm4 library can be used to translate from the name in the
599architectural manuals to the raw hex value
600.BR perf_event_open ()
601expects in this field.
602
603If
604.I type
605is
606.BR PERF_TYPE_BREAKPOINT ,
607then leave
608.I config
609set to zero.
610Its parameters are set in other places.
611.RE
612.TP
613.IR sample_period ", " sample_freq
21977c9d 614A "sampling" event is one that generates an overflow notification
f2b1d720
MK
615every N events, where N is given by
616.IR sample_period .
21977c9d 617A sampling event has
f2b1d720 618.IR sample_period " > 0."
21977c9d 619When an overflow occurs, requested data is recorded
f2b1d720
MK
620in the mmap buffer.
621The
622.I sample_type
21977c9d 623field controls what data is recorded on each overflow.
f2b1d720
MK
624
625.I sample_freq
626can be used if you wish to use frequency rather than period.
37bee118 627In this case, you set the
f2b1d720
MK
628.I freq
629flag.
630The kernel will adjust the sampling period
631to try to achieve the desired rate.
632The rate of adjustment is a
633timer tick.
f2b1d720
MK
634.TP
635.I "sample_type"
636The various bits in this field specify which values to include
637in the sample.
638They will be recorded in a ring-buffer,
ad73a2cc 639which is available to user space using
f2b1d720
MK
640.BR mmap (2).
641The order in which the values are saved in the
642sample is documented in the MMAP Layout subsection below;
643it is not the
644.I "enum perf_event_sample_format"
645order.
646.RS
647.TP
648.B PERF_SAMPLE_IP
649Records instruction pointer.
650.TP
651.B PERF_SAMPLE_TID
7db515ef 652Records the process and thread IDs.
f2b1d720
MK
653.TP
654.B PERF_SAMPLE_TIME
655Records a timestamp.
656.TP
657.B PERF_SAMPLE_ADDR
658Records an address, if applicable.
659.TP
660.B PERF_SAMPLE_READ
661Records counter values for all events in a group, not just the group leader.
662.TP
663.B PERF_SAMPLE_CALLCHAIN
664Records the callchain (stack backtrace).
665.TP
666.B PERF_SAMPLE_ID
667Records a unique ID for the opened event's group leader.
668.TP
669.B PERF_SAMPLE_CPU
670Records CPU number.
671.TP
672.B PERF_SAMPLE_PERIOD
673Records the current sampling period.
674.TP
675.B PERF_SAMPLE_STREAM_ID
676Records a unique ID for the opened event.
677Unlike
678.B PERF_SAMPLE_ID
679the actual ID is returned, not the group leader.
8859d3a9
DP
680This ID is the same as the one returned by
681.BR PERF_FORMAT_ID .
f2b1d720
MK
682.TP
683.B PERF_SAMPLE_RAW
684Records additional data, if applicable.
685Usually returned by tracepoint events.
686.TP
31c1f2b0 687.BR PERF_SAMPLE_BRANCH_STACK " (since Linux 3.4)"
60dafbc1 688.\" commit bce38cd53e5ddba9cb6d708c4ef3d04a4016ec7e
045bf4d3
VW
689This provides a record of recent branches, as provided
690by CPU branch sampling hardware (such as Intel Last Branch Record).
691Not all hardware supports this feature.
692
693See the
694.I branch_sample_type
695field for how to filter which branches are reported.
f2b1d720 696.TP
31c1f2b0 697.BR PERF_SAMPLE_REGS_USER " (since Linux 3.7)"
60dafbc1 698.\" commit 4018994f3d8785275ef0e7391b75c3462c029e56
d1007d14
VW
699Records the current user-level CPU register state
700(the values in the process before the kernel was called).
f2b1d720 701.TP
31c1f2b0 702.BR PERF_SAMPLE_STACK_USER " (since Linux 3.7)"
60dafbc1 703.\" commit c5ebcedb566ef17bda7b02686e0d658a7bb42ee7
d1007d14
VW
704Records the user-level stack, allowing stack unwinding.
705.TP
31c1f2b0 706.BR PERF_SAMPLE_WEIGHT " (since Linux 3.10)"
60dafbc1 707.\" commit c3feedf2aaf9ac8bad6f19f5d21e4ee0b4b87e9c
d1007d14 708Records a hardware-provided weight value that expresses how
51700fd7 709costly the sampled event was.
d1007d14
VW
710This allows the hardware to highlight expensive events in
711a profile.
712.TP
31c1f2b0 713.BR PERF_SAMPLE_DATA_SRC " (since Linux 3.10)"
60dafbc1 714.\" commit d6be9ad6c960f43800a6f118932bc8a5a4eadcd1
d1007d14
VW
715Records the data source: where in the memory hierarchy
716the data associated with the sampled instruction came from.
6170255e 717This is available only if the underlying hardware
d1007d14 718supports this feature.
7480dabb 719.TP
31c1f2b0 720.BR PERF_SAMPLE_IDENTIFIER " (since Linux 3.12)"
60dafbc1 721.\" commit ff3d527cebc1fa3707c617bfe9e74f53fcfb0955
8859d3a9
DP
722Places the
723.B SAMPLE_ID
724value in a fixed position in the record,
7480dabb
VW
725either at the beginning (for sample events) or at the end
726(if a non-sample event).
727
728This was necessary because a sample stream may have
729records from various different event sources with different
730.I sample_type
731settings.
e9bd9b2c 732Parsing the event stream properly was not possible because the
8859d3a9
DP
733format of the record was needed to find
734.BR SAMPLE_ID ,
735but
27f52b52 736the format could not be found without knowing what
7480dabb
VW
737event the sample belonged to (causing a circular
738dependency).
739
e41c36b2 740The
7480dabb
VW
741.B PERF_SAMPLE_IDENTIFIER
742setting makes the event stream always parsable
8859d3a9
DP
743by putting
744.B SAMPLE_ID
745in a fixed location, even though
746it means having duplicate
747.B SAMPLE_ID
748values in records.
1e043959 749.TP
60dafbc1
MK
750.BR PERF_SAMPLE_TRANSACTION " (since Linux 3.13)"
751.\" commit fdfbbd07e91f8fe387140776f3fd94605f0c89e5
84fc2a6e 752Records reasons for transactional memory abort events
1e043959
VW
753(for example, from Intel TSX transactional memory support).
754
755The
756.I precise_ip
b3f39642 757setting must be greater than 0 and a transactional memory abort
1e043959 758event must be measured or no values will be recorded.
84fc2a6e
MK
759Also note that some perf_event measurements, such as sampled
760cycle counting, may cause extraneous aborts (by causing an
1e043959 761interrupt during a transaction).
f5281dfd
VW
762.TP
763.BR PERF_SAMPLE_REGS_INTR " (since Linux 3.19)"
764.\" commit 60e2364e60e86e81bc6377f49779779e6120977f
765Records a subset of the current CPU register state
766as specified by
767.IR sample_regs_intr .
768Unlike
769.B PERF_SAMPLE_REGS_USER
770the register values will return kernel register
771state if the overflow happened while kernel
772code is running.
773If the CPU supports hardware sampling of
774register state (e.g., PEBS on Intel x86) and
775.I precise_ip
776is set higher than zero then the register
777values returned are those captured by
778hardware at the time of the sampled
779instruction's retirement.
f2b1d720 780.RE
f2b1d720
MK
781.TP
782.IR "read_format"
783This field specifies the format of the data returned by
784.BR read (2)
785on a
7db515ef 786.BR perf_event_open ()
f2b1d720
MK
787file descriptor.
788.RS
789.TP
790.B PERF_FORMAT_TOTAL_TIME_ENABLED
7ede2f66
DP
791Adds the 64-bit
792.I time_enabled
793field.
f2b1d720
MK
794This can be used to calculate estimated totals if
795the PMU is overcommitted and multiplexing is happening.
796.TP
797.B PERF_FORMAT_TOTAL_TIME_RUNNING
7ede2f66
DP
798Adds the 64-bit
799.I time_running
800field.
f2b1d720 801This can be used to calculate estimated totals if
3d1ee497 802the PMU is overcommitted and multiplexing is happening.
f2b1d720
MK
803.TP
804.B PERF_FORMAT_ID
805Adds a 64-bit unique value that corresponds to the event group.
806.TP
807.B PERF_FORMAT_GROUP
808Allows all counter values in an event group to be read with one read.
809.RE
f2b1d720
MK
810.TP
811.IR "disabled"
812The
813.I disabled
814bit specifies whether the counter starts out disabled or enabled.
815If disabled, the event can later be enabled by
816.BR ioctl (2),
817.BR prctl (2),
818or
819.IR enable_on_exec .
406650db
VW
820
821When creating an event group, typically the group leader is initialized
822with
823.I disabled
824set to 1 and any child events are initialized with
825.I disabled
826set to 0.
827Despite
828.I disabled
829being 0, the child events will not start until the group leader
830is enabled.
f2b1d720
MK
831.TP
832.IR "inherit"
833The
834.I inherit
835bit specifies that this counter should count events of child
836tasks as well as the task specified.
33a0ccb2 837This applies only to new children, not to any existing children at
f2b1d720
MK
838the time the counter is created (nor to any new children of
839existing children).
840
841Inherit does not work for some combinations of
842.IR read_format s,
843such as
844.BR PERF_FORMAT_GROUP .
f2b1d720
MK
845.TP
846.IR "pinned"
847The
848.I pinned
849bit specifies that the counter should always be on the CPU if at all
850possible.
33a0ccb2 851It applies only to hardware counters and only to group leaders.
f2b1d720
MK
852If a pinned counter cannot be put onto the CPU (e.g., because there are
853not enough hardware counters or because of a conflict with some other
854event), then the counter goes into an 'error' state, where reads
855return end-of-file (i.e.,
856.BR read (2)
857returns 0) until the counter is subsequently enabled or disabled.
f2b1d720
MK
858.TP
859.IR "exclusive"
860The
861.I exclusive
862bit specifies that when this counter's group is on the CPU,
863it should be the only group using the CPU's counters.
864In the future this may allow monitoring programs to
865support PMU features that need to run alone so that they do not
866disrupt other hardware counters.
bea10c8c
VW
867
868Note that many unexpected situations may prevent events with the
869.I exclusive
d3532647 870bit set from ever running.
bea10c8c 871This includes any users running a system-wide
d3532647 872measurement as well as any kernel use of the performance counters
bea10c8c 873(including the commonly enabled NMI Watchdog Timer interface).
f2b1d720
MK
874.TP
875.IR "exclude_user"
ad73a2cc 876If this bit is set, the count excludes events that happen in user space.
f2b1d720
MK
877.TP
878.IR "exclude_kernel"
879If this bit is set, the count excludes events that happen in kernel space.
f2b1d720
MK
880.TP
881.IR "exclude_hv"
882If this bit is set, the count excludes events that happen in the
883hypervisor.
884This is mainly for PMUs that have built-in support for handling this
885(such as POWER).
886Extra support is needed for handling hypervisor measurements on most
887machines.
f2b1d720
MK
888.TP
889.IR "exclude_idle"
890If set, don't count when the CPU is idle.
f2b1d720
MK
891.TP
892.IR "mmap"
893The
894.I mmap
75ee11e5 895bit enables generation of
cd7c700a 896.B PERF_RECORD_MMAP
75ee11e5
VW
897samples for every
898.BR mmap (2)
899call that has
cd7c700a 900.B PROT_EXEC
75ee11e5
VW
901set.
902This allows tools to notice new executable code being mapped into
903a program (dynamic shared libraries for example)
904so that addresses can be mapped back to the original code.
f2b1d720
MK
905.TP
906.IR "comm"
907The
908.I comm
909bit enables tracking of process command name as modified by the
cd7c700a 910.BR execve (2)
f2b1d720 911and
cd7c700a 912.BR prctl (PR_SET_NAME)
49bc411c
VW
913system calls as well as writing to
914.IR /proc/self/comm .
790ee6d6 915If the
49bc411c 916.I comm_exec
790ee6d6 917flag is also successfully set (possible since Linux 3.16),
747a6e7c 918.\" commit 82b897782d10fcc4930c9d4a15b175348fdd2871
49bc411c
VW
919then the misc flag
920.B PERF_RECORD_MISC_COMM_EXEC
921can be used to differentiate the
922.BR execve (2)
923case from the others.
f2b1d720
MK
924.TP
925.IR "freq"
926If this bit is set, then
927.I sample_freq
928not
929.I sample_period
930is used when setting up the sampling interval.
f2b1d720
MK
931.TP
932.IR "inherit_stat"
933This bit enables saving of event counts on context switch for
934inherited tasks.
33a0ccb2 935This is meaningful only if the
f2b1d720
MK
936.I inherit
937field is set.
f2b1d720
MK
938.TP
939.IR "enable_on_exec"
940If this bit is set, a counter is automatically
941enabled after a call to
942.BR exec (2).
f2b1d720
MK
943.TP
944.IR "task"
945If this bit is set, then
946fork/exit notifications are included in the ring buffer.
f2b1d720
MK
947.TP
948.IR "watermark"
21977c9d 949If set, have an overflow notification happen when we cross the
f2b1d720
MK
950.I wakeup_watermark
951boundary.
21977c9d 952Otherwise, overflow notifications happen after
f2b1d720
MK
953.I wakeup_events
954samples.
f2b1d720 955.TP
31c1f2b0 956.IR "precise_ip" " (since Linux 2.6.35)"
747a6e7c 957.\" commit ab608344bcbde4f55ec4cd911b686b0ce3eae076
f2b1d720
MK
958This controls the amount of skid.
959Skid is how many instructions
960execute between an event of interest happening and the kernel
961being able to stop and record the event.
962Smaller skid is
963better and allows more accurate reporting of which events
964correspond to which instructions, but hardware is often limited
965with how small this can be.
966
967The values of this are the following:
968.RS
969.TP
9700 -
971.B SAMPLE_IP
2b538c3e 972can have arbitrary skid.
f2b1d720
MK
973.TP
9741 -
975.B SAMPLE_IP
2b538c3e 976must have constant skid.
f2b1d720
MK
977.TP
9782 -
979.B SAMPLE_IP
2b538c3e 980requested to have 0 skid.
f2b1d720
MK
981.TP
9823 -
983.B SAMPLE_IP
984must have 0 skid.
985See also
986.BR PERF_RECORD_MISC_EXACT_IP .
987.RE
f2b1d720 988.TP
31c1f2b0 989.IR "mmap_data" " (since Linux 2.6.36)"
747a6e7c 990.\" commit 3af9e859281bda7eb7c20b51879cf43aa788ac2e
f2b1d720
MK
991The counterpart of the
992.I mmap
75ee11e5
VW
993field.
994This enables generation of
cd7c700a 995.B PERF_RECORD_MMAP
75ee11e5
VW
996samples for
997.BR mmap (2)
998calls that do not have
cd7c700a 999.B PROT_EXEC
75ee11e5 1000set (for example data and SysV shared memory).
f2b1d720 1001.TP
31c1f2b0 1002.IR "sample_id_all" " (since Linux 2.6.38)"
747a6e7c 1003.\" commit c980d1091810df13f21aabbce545fd98f545bbf7
7480dabb 1004If set, then TID, TIME, ID, STREAM_ID, and CPU can
f2b1d720
MK
1005additionally be included in
1006.RB non- PERF_RECORD_SAMPLE s
1007if the corresponding
1008.I sample_type
1009is selected.
7480dabb 1010
e9bd9b2c 1011If
7480dabb 1012.B PERF_SAMPLE_IDENTIFIER
37bee118 1013is specified, then an additional ID value is included
7480dabb
VW
1014as the last value to ease parsing the record stream.
1015This may lead to the
e9bd9b2c 1016.I id
7480dabb
VW
1017value appearing twice.
1018
1019The layout is described by this pseudo-structure:
1020.in +4n
1021.nf
1022struct sample_id {
1023 { u32 pid, tid; } /* if PERF_SAMPLE_TID set */
1024 { u64 time; } /* if PERF_SAMPLE_TIME set */
1025 { u64 id; } /* if PERF_SAMPLE_ID set */
1026 { u64 stream_id;} /* if PERF_SAMPLE_STREAM_ID set */
1027 { u32 cpu, res; } /* if PERF_SAMPLE_CPU set */
1028 { u64 id; } /* if PERF_SAMPLE_IDENTIFIER set */
1029};
1030.fi
f2b1d720 1031.TP
31c1f2b0 1032.IR "exclude_host" " (since Linux 3.2)"
747a6e7c 1033.\" commit a240f76165e6255384d4bdb8139895fac7988799
e38fb93e
VW
1034When conducting measurements that include processes running
1035VM instances (i.e. have executed a
1036.I KVM_RUN
1037.BR ioctl (2)
1038) only measure events happening inside a guest instance.
1039This is only meaningful outside the guests; this setting does
1040not change counts gathered inside of a guest.
1041Currently this functionality is x86 only.
f2b1d720 1042.TP
31c1f2b0 1043.IR "exclude_guest" " (since Linux 3.2)"
747a6e7c 1044.\" commit a240f76165e6255384d4bdb8139895fac7988799
e38fb93e
VW
1045When conducting measurements that include processes running
1046VM instances (i.e. have executed a
1047.I KVM_RUN
1048.BR ioctl (2)
1049) do not measure events happening inside guest instances.
1050This is only meaningful outside the guests; this setting does
1051not change counts gathered inside of a guest.
1052Currently this functionality is x86 only.
f2b1d720 1053.TP
31c1f2b0 1054.IR "exclude_callchain_kernel" " (since Linux 3.7)"
747a6e7c 1055.\" commit d077526485d5c9b12fe85d0b2b3b7041e6bc5f91
f2b1d720 1056Do not include kernel callchains.
f2b1d720 1057.TP
31c1f2b0 1058.IR "exclude_callchain_user" " (since Linux 3.7)"
747a6e7c 1059.\" commit d077526485d5c9b12fe85d0b2b3b7041e6bc5f91
f2b1d720 1060Do not include user callchains.
f2b1d720 1061.TP
9bfc542b 1062.IR "mmap2" " (since Linux 3.16)"
747a6e7c
VW
1063.\" commit 13d7a2410fa637f450a29ecb515ac318ee40c741
1064.\" This is tricky; was committed during 3.12 development
1065.\" but right before release was disabled.
1066.\" So while you could select mmap2 starting with 3.12
1067.\" it did not work until 3.16
1068.\" commit a5a5ba72843dd05f991184d6cb9a4471acce1005
9bfc542b
VW
1069Generate an extended executable mmap record that contains enough
1070additional information to uniquely identify shared mappings.
1071The
1072.I mmap
1073flag must also be set for this to work.
1074.TP
49bc411c 1075.IR "comm_exec" " (since Linux 3.16)"
747a6e7c 1076.\" commit 82b897782d10fcc4930c9d4a15b175348fdd2871
5ab35ae5 1077This is purely a feature-detection flag; it does not change
49bc411c 1078kernel behavior.
5ab35ae5 1079If this flag can successfully be set, then, when
49bc411c 1080.I comm
5ab35ae5 1081is enabled, the
49bc411c
VW
1082.B PERF_RECORD_MISC_COMM_EXEC
1083flag will be set in the
1084.I misc
1085field of a comm record header if the rename event being
1086reported was caused by a call to
1087.BR exec (2).
1088This allows tools to distinguish between the various
1089types of process renaming.
1090.TP
6bd5186a
VW
1091.IR "use_clockid" " (since Linux 4.1)"
1092.\" commit 34f439278cef7b1177f8ce24f9fc81dfc6221d3b
1093This allows selecting which internal Linux clock to use
1094when generating timestamps via the
1095.I clockid
1096field.
1097This can make it easier to correlate perf sample times with
1098timestamps generated by other tools.
1099.TP
f2b1d720
MK
1100.IR "wakeup_events" ", " "wakeup_watermark"
1101This union sets how many samples
1102.RI ( wakeup_events )
1103or bytes
1104.RI ( wakeup_watermark )
21977c9d 1105happen before an overflow notification happens.
f2b1d720
MK
1106Which one is used is selected by the
1107.I watermark
cb8a928f 1108bit flag.
751c0f1a
VW
1109
1110.I wakeup_events
6170255e 1111counts only
751c0f1a 1112.B PERF_RECORD_SAMPLE
51700fd7 1113record types.
21977c9d 1114To receive overflow notification for all
751c0f1a 1115.B PERF_RECORD
21977c9d 1116types choose watermark and set
751c0f1a
VW
1117.I wakeup_watermark
1118to 1.
21977c9d
VW
1119
1120Prior to Linux 3.0 setting
747a6e7c 1121.\" commit f506b3dc0ec454a16d40cab9ee5d75435b39dc50
21977c9d
VW
1122.I wakeup_events
1123to 0 resulted in no overflow notifications;
1124more recent kernels treat 0 the same as 1.
f2b1d720 1125.TP
31c1f2b0 1126.IR "bp_type" " (since Linux 2.6.33)"
747a6e7c 1127.\" commit 24f1e32c60c45c89a997c73395b69c8af6f0a84e
f2b1d720
MK
1128This chooses the breakpoint type.
1129It is one of:
1130.RS
1131.TP
1132.BR HW_BREAKPOINT_EMPTY
2b538c3e 1133No breakpoint.
f2b1d720
MK
1134.TP
1135.BR HW_BREAKPOINT_R
2b538c3e 1136Count when we read the memory location.
f2b1d720
MK
1137.TP
1138.BR HW_BREAKPOINT_W
2b538c3e 1139Count when we write the memory location.
f2b1d720
MK
1140.TP
1141.BR HW_BREAKPOINT_RW
2b538c3e 1142Count when we read or write the memory location.
f2b1d720
MK
1143.TP
1144.BR HW_BREAKPOINT_X
2b538c3e 1145Count when we execute code at the memory location.
f2b1d720 1146.LP
7db515ef 1147The values can be combined via a bitwise or, but the
f2b1d720
MK
1148combination of
1149.B HW_BREAKPOINT_R
1150or
1151.B HW_BREAKPOINT_W
1152with
1153.B HW_BREAKPOINT_X
1154is not allowed.
1155.RE
f2b1d720 1156.TP
31c1f2b0 1157.IR "bp_addr" " (since Linux 2.6.33)"
747a6e7c 1158.\" commit 24f1e32c60c45c89a997c73395b69c8af6f0a84e
f2b1d720
MK
1159.I bp_addr
1160is the address of the breakpoint.
1161For execution breakpoints this is the memory address of the instruction
1162of interest; for read and write breakpoints it is the memory address
1163of the memory location of interest.
f2b1d720 1164.TP
31c1f2b0 1165.IR "config1" " (since Linux 2.6.39)"
747a6e7c 1166.\" commit a7e3ed1e470116c9d12c2f778431a481a6be8ab6
f2b1d720
MK
1167.I config1
1168is used for setting events that need an extra register or otherwise
1169do not fit in the regular config field.
1170Raw OFFCORE_EVENTS on Nehalem/Westmere/SandyBridge use this field
1171on 3.3 and later kernels.
f2b1d720 1172.TP
31c1f2b0 1173.IR "bp_len" " (since Linux 2.6.33)"
747a6e7c 1174.\" commit 24f1e32c60c45c89a997c73395b69c8af6f0a84e
f2b1d720
MK
1175.I bp_len
1176is the length of the breakpoint being measured if
1177.I type
1178is
1179.BR PERF_TYPE_BREAKPOINT .
1180Options are
1181.BR HW_BREAKPOINT_LEN_1 ,
1182.BR HW_BREAKPOINT_LEN_2 ,
1183.BR HW_BREAKPOINT_LEN_4 ,
1184.BR HW_BREAKPOINT_LEN_8 .
1185For an execution breakpoint, set this to
1186.IR sizeof(long) .
f2b1d720 1187.TP
31c1f2b0 1188.IR "config2" " (since Linux 2.6.39)"
747a6e7c 1189.\" commit a7e3ed1e470116c9d12c2f778431a481a6be8ab6
f2b1d720
MK
1190
1191.I config2
1192is a further extension of the
1193.I config1
1194field.
f2b1d720 1195.TP
31c1f2b0 1196.IR "branch_sample_type" " (since Linux 3.4)"
747a6e7c 1197.\" commit bce38cd53e5ddba9cb6d708c4ef3d04a4016ec7e
8a94e783 1198If
045bf4d3
VW
1199.B PERF_SAMPLE_BRANCH_STACK
1200is enabled, then this specifies what branches to include
1201in the branch record.
e3c9782b
VW
1202
1203The first part of the value is the privilege level, which
1204is a combination of one of the following values.
045bf4d3
VW
1205If the user does not set privilege level explicitly, the kernel
1206will use the event's privilege level.
1207Event and branch privilege levels do not have to match.
f2b1d720
MK
1208.RS
1209.TP
1210.B PERF_SAMPLE_BRANCH_USER
33d6e2c7 1211Branch target is in user space.
f2b1d720
MK
1212.TP
1213.B PERF_SAMPLE_BRANCH_KERNEL
33d6e2c7 1214Branch target is in kernel space.
f2b1d720
MK
1215.TP
1216.B PERF_SAMPLE_BRANCH_HV
33d6e2c7 1217Branch target is in hypervisor.
e3c9782b
VW
1218.TP
1219.B PERF_SAMPLE_BRANCH_PLM_ALL
1220A convenience value that is the three preceding values ORed together.
1221
1222.P
1223In addition to the privilege value, at least one of the
1224following bits must be set.
1225
f2b1d720
MK
1226.TP
1227.B PERF_SAMPLE_BRANCH_ANY
33d6e2c7 1228Any branch type.
f2b1d720
MK
1229.TP
1230.B PERF_SAMPLE_BRANCH_ANY_CALL
33d6e2c7 1231Any call branch.
f2b1d720
MK
1232.TP
1233.B PERF_SAMPLE_BRANCH_ANY_RETURN
33d6e2c7 1234Any return branch.
f2b1d720 1235.TP
e3c9782b 1236.B PERF_SAMPLE_BRANCH_IND_CALL
33d6e2c7 1237Indirect calls.
f2b1d720 1238.TP
aea60aad 1239.BR PERF_SAMPLE_BRANCH_COND " (since Linux 3.16)"
60dafbc1 1240.\" commit bac52139f0b7ab31330e98fd87fc5a2664951050
aea60aad
VW
1241Conditional branches.
1242.TP
31c1f2b0 1243.BR PERF_SAMPLE_BRANCH_ABORT_TX " (since Linux 3.11)"
60dafbc1 1244.\" commit 135c5612c460f89657c4698fe2ea753f6f667963
33d6e2c7 1245Transactional memory aborts.
e3c9782b 1246.TP
31c1f2b0 1247.BR PERF_SAMPLE_BRANCH_IN_TX " (since Linux 3.11)"
60dafbc1 1248.\" commit 135c5612c460f89657c4698fe2ea753f6f667963
33d6e2c7 1249Branch in transactional memory transaction.
e3c9782b 1250.TP
31c1f2b0 1251.BR PERF_SAMPLE_BRANCH_NO_TX " (since Linux 3.11)"
60dafbc1 1252.\" commit 135c5612c460f89657c4698fe2ea753f6f667963
33d6e2c7 1253Branch not in transactional memory transaction.
bb7e6ff0
VW
1254.BR PERF_SAMPLE_BRANCH_CALL_STACK " (since Linux 4.1)"
1255.\" commit 2c44b1936bb3b135a3fac8b3493394d42e51cf70
1256Branch is part of a hardware generated call stack.
1257This requires hardware support, currently only found
1258on Intel x86 Haswell or newer.
f2b1d720 1259.RE
e3c9782b 1260
f2b1d720 1261.TP
31c1f2b0 1262.IR "sample_regs_user" " (since Linux 3.7)"
747a6e7c 1263.\" commit 4018994f3d8785275ef0e7391b75c3462c029e56
4651e412 1264This bit mask defines the set of user CPU registers to dump on samples.
76c637e1 1265The layout of the register mask is architecture-specific and
d1007d14
VW
1266described in the kernel header
1267.IR arch/ARCH/include/uapi/asm/perf_regs.h .
f2b1d720 1268.TP
31c1f2b0 1269.IR "sample_stack_user" " (since Linux 3.7)"
747a6e7c 1270.\" commit c5ebcedb566ef17bda7b02686e0d658a7bb42ee7
d1007d14
VW
1271This defines the size of the user stack to dump if
1272.B PERF_SAMPLE_STACK_USER
1273is specified.
6bd5186a
VW
1274.TP
1275.IR "clockid" " (since Linux 4.1)"
1276.\" commit 34f439278cef7b1177f8ce24f9fc81dfc6221d3b
1277If
1278.I use_clockid
1279is set, then this field selects which internal Linux timer to
1280use for timestamps.
1281The available timers are defined in
1282.IR linux/time.h ,
1283with
1284.BR CLOCK_MONOTONIC , CLOCK_MONOTONIC_RAW , CLOCK_REALTIME ,
1285.BR CLOCK_BOOTTIME ", and " CLOCK_TAI
1286currently supported.
73d8cece 1287.SS Reading results
f2b1d720 1288Once a
7db515ef 1289.BR perf_event_open ()
3d1ee497 1290file descriptor has been opened, the values
f2b1d720
MK
1291of the events can be read from the file descriptor.
1292The values that are there are specified by the
1293.I read_format
7db515ef
MK
1294field in the
1295.I attr
1296structure at open time.
f2b1d720
MK
1297
1298If you attempt to read into a buffer that is not big enough to hold the
1299data,
1300.B ENOSPC
1301is returned.
1302
1303Here is the layout of the data returned by a read:
e525b89f 1304.IP * 2
f2b1d720
MK
1305If
1306.B PERF_FORMAT_GROUP
1307was specified to allow reading all events in a group at once:
1308
1309.in +4n
1310.nf
1311struct read_format {
e525b89f
MK
1312 u64 nr; /* The number of events */
1313 u64 time_enabled; /* if PERF_FORMAT_TOTAL_TIME_ENABLED */
1314 u64 time_running; /* if PERF_FORMAT_TOTAL_TIME_RUNNING */
e307112d 1315 struct {
e525b89f
MK
1316 u64 value; /* The value of the event */
1317 u64 id; /* if PERF_FORMAT_ID */
f2b1d720
MK
1318 } values[nr];
1319};
1320.fi
1321.in
e525b89f 1322.IP *
f2b1d720
MK
1323If
1324.B PERF_FORMAT_GROUP
1325was
1326.I not
e525b89f 1327specified:
f2b1d720
MK
1328
1329.in +4n
1330.nf
1331struct read_format {
1332 u64 value; /* The value of the event */
1333 u64 time_enabled; /* if PERF_FORMAT_TOTAL_TIME_ENABLED */
1334 u64 time_running; /* if PERF_FORMAT_TOTAL_TIME_RUNNING */
1335 u64 id; /* if PERF_FORMAT_ID */
1336};
1337.fi
1338.in
e525b89f
MK
1339.PP
1340The values read are as follows:
f2b1d720
MK
1341.TP
1342.I nr
1343The number of events in this file descriptor.
1344Only available if
1345.B PERF_FORMAT_GROUP
1346was specified.
f2b1d720
MK
1347.TP
1348.IR time_enabled ", " time_running
1349Total time the event was enabled and running.
1350Normally these are the same.
37bee118
MK
1351If more events are started
1352than available counter slots on the PMU, then multiplexing
33a0ccb2 1353happens and events run only part of the time.
37bee118 1354In that case, the
f2b1d720
MK
1355.I time_enabled
1356and
1357.I time_running
1358values can be used to scale an estimated value for the count.
f2b1d720
MK
1359.TP
1360.I value
1361An unsigned 64-bit value containing the counter result.
f2b1d720
MK
1362.TP
1363.I id
6170255e 1364A globally unique value for this particular event, only present if
f2b1d720 1365.B PERF_FORMAT_ID
e525b89f
MK
1366was specified in
1367.IR read_format .
73d8cece 1368.SS MMAP layout
f2b1d720 1369When using
7db515ef 1370.BR perf_event_open ()
f2b1d720
MK
1371in sampled mode, asynchronous events
1372(like counter overflow or
1373.B PROT_EXEC
1374mmap tracking)
1375are logged into a ring-buffer.
1376This ring-buffer is created and accessed through
1377.BR mmap (2).
1378
1379The mmap size should be 1+2^n pages, where the first page is a
1380metadata page
e525b89f 1381.RI ( "struct perf_event_mmap_page" )
f2b1d720
MK
1382that contains various
1383bits of information such as where the ring-buffer head is.
1384
1385Before kernel 2.6.39, there is a bug that means you must allocate a mmap
1386ring buffer when sampling even if you do not plan to access it.
1387
1388The structure of the first metadata mmap page is as follows:
1389
1390.in +4n
1391.nf
1392struct perf_event_mmap_page {
ce88f77b
MK
1393 __u32 version; /* version number of this structure */
1394 __u32 compat_version; /* lowest version this is compat with */
1395 __u32 lock; /* seqlock for synchronization */
1396 __u32 index; /* hardware counter identifier */
1397 __s64 offset; /* add to hardware counter value */
1398 __u64 time_enabled; /* time event active */
1399 __u64 time_running; /* time event on CPU */
f2b1d720
MK
1400 union {
1401 __u64 capabilities;
135cba8b 1402 struct {
ce88f77b
MK
1403 __u64 cap_usr_time / cap_usr_rdpmc / cap_bit0 : 1,
1404 cap_bit0_is_deprecated : 1,
1405 cap_user_rdpmc : 1,
1406 cap_user_time : 1,
1407 cap_user_time_zero : 1,
135cba8b 1408 };
f2b1d720 1409 };
ce88f77b
MK
1410 __u16 pmc_width;
1411 __u16 time_shift;
1412 __u32 time_mult;
1413 __u64 time_offset;
1414 __u64 __reserved[120]; /* Pad to 1k */
1415 __u64 data_head; /* head in the data section */
1416 __u64 data_tail; /* user-space written tail */
21d9849a
VW
1417 __u64 data_offset; /* where the buffer starts */
1418 __u64 data_size; /* data buffer size */
4e47c6e5
VW
1419 __u64 aux_head;
1420 __u64 aux_tail;
1421 __u64 aux_offset;
1422 __u64 aux_size;
21d9849a 1423
f2b1d720
MK
1424}
1425.fi
1426.in
1427
ce88f77b 1428The following list describes the fields in the
f2b1d720 1429.I perf_event_mmap_page
e525b89f 1430structure in more detail:
f2b1d720
MK
1431.TP
1432.I version
1433Version number of this structure.
f2b1d720
MK
1434.TP
1435.I compat_version
1436The lowest version this is compatible with.
f2b1d720
MK
1437.TP
1438.I lock
1439A seqlock for synchronization.
f2b1d720
MK
1440.TP
1441.I index
1442A unique hardware counter identifier.
f2b1d720
MK
1443.TP
1444.I offset
135cba8b
VW
1445When using rdpmc for reads this offset value
1446must be added to the one returned by rdpmc to get
1447the current total event count.
f2b1d720
MK
1448.TP
1449.I time_enabled
1450Time the event was active.
f2b1d720
MK
1451.TP
1452.I time_running
1453Time the event was running.
f2b1d720 1454.TP
31c1f2b0 1455.IR cap_usr_time " / " cap_usr_rdpmc " / " cap_bit0 " (since Linux 3.4)"
747a6e7c 1456.\" commit c7206205d00ab375839bd6c7ddb247d600693c09
e9bd9b2c 1457There was a bug in the definition of
f2b1d720 1458.I cap_usr_time
135cba8b
VW
1459and
1460.I cap_usr_rdpmc
1461from Linux 3.4 until Linux 3.11.
1462Both bits were defined to point to the same location, so it was
e9bd9b2c 1463impossible to know if
135cba8b
VW
1464.I cap_usr_time
1465or
1466.I cap_usr_rdpmc
1467were actually set.
1468
4010bc07 1469Starting with Linux 3.12, these are renamed to
747a6e7c 1470.\" commit fa7315871046b9a4c48627905691dbde57e51033
135cba8b 1471.I cap_bit0
e41c36b2 1472and you should use the
135cba8b
VW
1473.I cap_user_time
1474and
1475.I cap_user_rdpmc
1476fields instead.
1477
f2b1d720 1478.TP
31c1f2b0 1479.IR cap_bit0_is_deprecated " (since Linux 3.12)"
747a6e7c 1480.\" commit fa7315871046b9a4c48627905691dbde57e51033
37bee118 1481If set, this bit indicates that the kernel supports
135cba8b
VW
1482the properly separated
1483.I cap_user_time
1484and
1485.I cap_user_rdpmc
1486bits.
1487
1488If not-set, it indicates an older kernel where
1489.I cap_usr_time
1490and
f2b1d720 1491.I cap_usr_rdpmc
135cba8b
VW
1492map to the same bit and thus both features should
1493be used with caution.
1494
1495.TP
31c1f2b0 1496.IR cap_user_rdpmc " (since Linux 3.12)"
747a6e7c 1497.\" commit fa7315871046b9a4c48627905691dbde57e51033
f2b1d720
MK
1498If the hardware supports user-space read of performance counters
1499without syscall (this is the "rdpmc" instruction on x86), then
1500the following code can be used to do a read:
1501
1502.in +4n
1503.nf
1504u32 seq, time_mult, time_shift, idx, width;
1505u64 count, enabled, running;
1506u64 cyc, time_offset;
f2b1d720
MK
1507
1508do {
1509 seq = pc\->lock;
1510 barrier();
1511 enabled = pc\->time_enabled;
1512 running = pc\->time_running;
1513
1514 if (pc\->cap_usr_time && enabled != running) {
1515 cyc = rdtsc();
1516 time_offset = pc\->time_offset;
1517 time_mult = pc\->time_mult;
1518 time_shift = pc\->time_shift;
1519 }
1520
1521 idx = pc\->index;
1522 count = pc\->offset;
1523
1524 if (pc\->cap_usr_rdpmc && idx) {
1525 width = pc\->pmc_width;
135cba8b 1526 count += rdpmc(idx \- 1);
f2b1d720
MK
1527 }
1528
1529 barrier();
1530} while (pc\->lock != seq);
1531.fi
1532.in
f2b1d720 1533.TP
cc19ea28 1534.IR cap_user_time " (since Linux 3.12)"
747a6e7c 1535.\" commit fa7315871046b9a4c48627905691dbde57e51033
7d182bb6 1536This bit indicates the hardware has a constant, nonstop
135cba8b
VW
1537timestamp counter (TSC on x86).
1538.TP
31c1f2b0 1539.IR cap_user_time_zero " (since Linux 3.12)"
747a6e7c 1540.\" commit fa7315871046b9a4c48627905691dbde57e51033
135cba8b
VW
1541Indicates the presence of
1542.I time_zero
1543which allows mapping timestamp values to
1544the hardware clock.
1545.TP
f2b1d720
MK
1546.I pmc_width
1547If
1548.IR cap_usr_rdpmc ,
1549this field provides the bit-width of the value
1550read using the rdpmc or equivalent instruction.
1551This can be used to sign extend the result like:
1552
1553.in +4n
1554.nf
1555pmc <<= 64 \- pmc_width;
1556pmc >>= 64 \- pmc_width; // signed shift right
1557count += pmc;
1558.fi
1559.in
f2b1d720
MK
1560.TP
1561.IR time_shift ", " time_mult ", " time_offset
1562
1563If
1564.IR cap_usr_time ,
1565these fields can be used to compute the time
7db515ef 1566delta since time_enabled (in nanoseconds) using rdtsc or similar.
f2b1d720
MK
1567.nf
1568
1569 u64 quot, rem;
1570 u64 delta;
1571 quot = (cyc >> time_shift);
1572 rem = cyc & ((1 << time_shift) \- 1);
1573 delta = time_offset + quot * time_mult +
1574 ((rem * time_mult) >> time_shift);
1575.fi
1576
7db515ef
MK
1577Where
1578.IR time_offset ,
1579.IR time_mult ,
1580.IR time_shift ,
1581and
1582.IR cyc
1583are read in the
f2b1d720
MK
1584seqcount loop described above.
1585This delta can then be added to
1586enabled and possibly running (if idx), improving the scaling:
1587.nf
1588
1589 enabled += delta;
1590 if (idx)
1591 running += delta;
1592 quot = count / running;
1593 rem = count % running;
1594 count = quot * enabled + (rem * enabled) / running;
1595.fi
f2b1d720 1596.TP
31c1f2b0 1597.IR time_zero " (since Linux 3.12)"
747a6e7c 1598.\" commit fa7315871046b9a4c48627905691dbde57e51033
135cba8b 1599
e9bd9b2c 1600If
135cba8b 1601.I cap_user_time_zero
37bee118 1602is set, then the hardware clock (the TSC timestamp counter on x86)
135cba8b
VW
1603can be calculated from the
1604.IR time_zero ", " time_mult ", and " time_shift " values:"
ce88f77b 1605
135cba8b
VW
1606.nf
1607 time = timestamp - time_zero;
1608 quot = time / time_mult;
1609 rem = time % time_mult;
1610 cyc = (quot << time_shift) + (rem << time_shift) / time_mult;
1611.fi
ce88f77b 1612
135cba8b 1613And vice versa:
ce88f77b 1614
135cba8b
VW
1615.nf
1616 quot = cyc >> time_shift;
1617 rem = cyc & ((1 << time_shift) - 1);
1618 timestamp = time_zero + quot * time_mult +
1619 ((rem * time_mult) >> time_shift);
1620.fi
1621.TP
f2b1d720
MK
1622.I data_head
1623This points to the head of the data section.
7db515ef
MK
1624The value continuously increases; it does not wrap.
1625The value needs to be manually wrapped by the size of the mmap buffer
f2b1d720
MK
1626before accessing the samples.
1627
ce88f77b
MK
1628On SMP-capable platforms, after reading the
1629.I data_head
1630value,
ad73a2cc 1631user space should issue an rmb().
f2b1d720 1632.TP
fecd584f 1633.I data_tail
f2b1d720
MK
1634When the mapping is
1635.BR PROT_WRITE ,
7db515ef
MK
1636the
1637.I data_tail
1638value should be written by user space to reflect the last read data.
31020de9 1639In this case, the kernel will not overwrite unread data.
21d9849a
VW
1640.TP
1641.IR data_offset " (since Linux 4.1)"
1642.\" commit e8c6deac69629c0cb97c3d3272f8631ef17f8f0f
1643Contains the offset of the location in the mmap buffer
1644where perf sample data begins.
1645.TP
1646.IR data_size " (since Linux 4.1)"
1647.\" commit e8c6deac69629c0cb97c3d3272f8631ef17f8f0f
1648Contains the size of the perf sample region within
1649the mmap buffer.
4e47c6e5
VW
1650.TP
1651.IR aux_head ", " aux_tail ", " aux_offset ", " aux_size " (since Linux 4.1)"
1652.\" commit 45bfb2e50471abbbfd83d40d28c986078b0d24ff
1653The AUX region allows mmaping a separate sample buffer for high
1654bandwidth data streams (separate from the main perf sample buffer).
1655An example of a high bandwidth stream is instruction tracing support,
1656as is found in newer Intel processors.
1657
1658To set up an AUX area, first
1659.I aux_offset
1660needs to be set with an offset greater than
1661.IR data_offset + data_size
1662and
1663.I aux_size
1664needs to be set to the desired buffer size.
1665The desired offset and size must be page aligned, and the size
1666must be a power of two.
1667These values are then passed to mmap in order to map the AUX buffer.
1668Pages in the AUX buffer are included as part of the user mlock
1669rlimit as well as the
1670.I perf_event_mlock_kb
1671allowance.
1672
1673The
1674.IR aux_head " and " aux_tail
1675ring buffer pointers have the same behavior and ordering
1676rules as the previously described
1677.IR data_head " and " data_tail .
e525b89f 1678.PP
f2b1d720
MK
1679The following 2^n ring-buffer pages have the layout described below.
1680
1681If
1682.I perf_event_attr.sample_id_all
1683is set, then all event types will
1684have the sample_type selected fields related to where/when (identity)
1685an event took place (TID, TIME, ID, CPU, STREAM_ID) described in
1686.B PERF_RECORD_SAMPLE
1687below, it will be stashed just after the
7db515ef
MK
1688.I perf_event_header
1689and the fields already present for the existing
3d1ee497 1690fields, that is, at the end of the payload.
f2b1d720
MK
1691That way a newer perf.data
1692file will be supported by older perf tools, with these new optional
1693fields being ignored.
1694
1695The mmap values start with a header:
1696
1697.in +4n
1698.nf
1699struct perf_event_header {
1700 __u32 type;
1701 __u16 misc;
1702 __u16 size;
1703};
1704.fi
1705.in
1706
1707Below, we describe the
1708.I perf_event_header
1709fields in more detail.
4047bc6c
MK
1710For ease of reading,
1711the fields with shorter descriptions are presented first.
1712.TP
1713.I size
1714This indicates the size of the record.
1715.TP
1716.I misc
1717The
1718.I misc
1719field contains additional information about the sample.
1720
1721The CPU mode can be determined from this value by masking with
1722.B PERF_RECORD_MISC_CPUMODE_MASK
1723and looking for one of the following (note these are not
1724bit masks, only one can be set at a time):
1725.RS
1726.TP
1727.B PERF_RECORD_MISC_CPUMODE_UNKNOWN
1728Unknown CPU mode.
1729.TP
1730.B PERF_RECORD_MISC_KERNEL
1731Sample happened in the kernel.
1732.TP
1733.B PERF_RECORD_MISC_USER
1734Sample happened in user code.
1735.TP
1736.B PERF_RECORD_MISC_HYPERVISOR
1737Sample happened in the hypervisor.
1738.TP
747a6e7c 1739.BR PERF_RECORD_MISC_GUEST_KERNEL " (since Linux 2.6.35)"
60dafbc1 1740.\" commit 39447b386c846bbf1c56f6403c5282837486200f
4047bc6c
MK
1741Sample happened in the guest kernel.
1742.TP
747a6e7c 1743.BR PERF_RECORD_MISC_GUEST_USER " (since Linux 2.6.35)"
60dafbc1 1744.\" commit 39447b386c846bbf1c56f6403c5282837486200f
4047bc6c
MK
1745Sample happened in guest user code.
1746.RE
1747
1748.RS
1749In addition, one of the following bits can be set:
1750.TP
60dafbc1
MK
1751.BR PERF_RECORD_MISC_MMAP_DATA " (since Linux 3.10)"
1752.\" commit 2fe85427e3bf65d791700d065132772fc26e4d75
4047bc6c
MK
1753This is set when the mapping is not executable;
1754otherwise the mapping is executable.
1755.TP
60dafbc1
MK
1756.BR PERF_RECORD_MISC_COMM_EXEC " (since Linux 3.16)"
1757.\" commit 82b897782d10fcc4930c9d4a15b175348fdd2871
49bc411c
VW
1758This is set for a
1759.B PERF_RECORD_COMM
1760record on kernels more recent than Linux 3.16
1761if a process name change was caused by an
1762.BR exec (2)
1763system call.
1764It is an alias for
1765.B PERF_RECORD_MISC_MMAP_DATA
1766since the two values would not be set in the same record.
1767.TP
4047bc6c
MK
1768.B PERF_RECORD_MISC_EXACT_IP
1769This indicates that the content of
1770.B PERF_SAMPLE_IP
1771points
1772to the actual instruction that triggered the event.
1773See also
1774.IR perf_event_attr.precise_ip .
1775.TP
60dafbc1
MK
1776.BR PERF_RECORD_MISC_EXT_RESERVED " (since Linux 2.6.35)"
1777.\" commit 1676b8a077c352085d52578fb4f29350b58b6e74
4047bc6c
MK
1778This indicates there is extended data available (currently not used).
1779.RE
f2b1d720
MK
1780.TP
1781.I type
1782The
1783.I type
1784value is one of the below.
1785The values in the corresponding record (that follows the header)
1786depend on the
1787.I type
1788selected as shown.
7480dabb 1789
f2b1d720 1790.RS
7db515ef 1791.TP 4
f2b1d720
MK
1792.B PERF_RECORD_MMAP
1793The MMAP events record the
1794.B PROT_EXEC
1795mappings so that we can correlate
ad73a2cc 1796user-space IPs to code.
f2b1d720
MK
1797They have the following structure:
1798
1799.in +4n
1800.nf
1801struct {
1802 struct perf_event_header header;
1803 u32 pid, tid;
1804 u64 addr;
1805 u64 len;
1806 u64 pgoff;
1807 char filename[];
1808};
1809.fi
1810.in
9bfc542b
VW
1811.RS
1812.TP
1813.I pid
3a058284 1814is the process ID.
9bfc542b
VW
1815.TP
1816.I tid
3a058284 1817is the thread ID.
9bfc542b
VW
1818.TP
1819.I addr
1820is the address of the allocated memory.
1821.I len
1822is the length of the allocated memory.
1823.I pgoff
1824is the page offset of the allocated memory.
1825.I filename
1826is a string describing the backing of the allocated memory.
1827.RE
f2b1d720
MK
1828.TP
1829.B PERF_RECORD_LOST
1830This record indicates when events are lost.
1831
1832.in +4n
1833.nf
1834struct {
1835 struct perf_event_header header;
1836 u64 id;
1837 u64 lost;
7480dabb 1838 struct sample_id sample_id;
f2b1d720
MK
1839};
1840.fi
1841.in
f2b1d720
MK
1842.RS
1843.TP
1844.I id
1845is the unique event ID for the samples that were lost.
1846.TP
1847.I lost
1848is the number of events that were lost.
1849.RE
f2b1d720
MK
1850.TP
1851.B PERF_RECORD_COMM
1852This record indicates a change in the process name.
1853
1854.in +4n
1855.nf
1856struct {
1857 struct perf_event_header header;
5ab35ae5
MK
1858 u32 pid;
1859 u32 tid;
f2b1d720 1860 char comm[];
7480dabb 1861 struct sample_id sample_id;
f2b1d720
MK
1862};
1863.fi
1864.in
49bc411c
VW
1865.RS
1866.TP
1867.I pid
5ab35ae5 1868is the process ID.
49bc411c
VW
1869.TP
1870.I tid
5ab35ae5 1871is the thread ID.
49bc411c
VW
1872.TP
1873.I comm
1874is a string containing the new name of the process.
1875.RE
f2b1d720
MK
1876.TP
1877.B PERF_RECORD_EXIT
1878This record indicates a process exit event.
1879
1880.in +4n
1881.nf
1882struct {
1883 struct perf_event_header header;
1884 u32 pid, ppid;
1885 u32 tid, ptid;
1886 u64 time;
7480dabb 1887 struct sample_id sample_id;
f2b1d720
MK
1888};
1889.fi
1890.in
f2b1d720
MK
1891.TP
1892.BR PERF_RECORD_THROTTLE ", " PERF_RECORD_UNTHROTTLE
1893This record indicates a throttle/unthrottle event.
1894
1895.in +4n
1896.nf
1897struct {
1898 struct perf_event_header header;
1899 u64 time;
1900 u64 id;
1901 u64 stream_id;
7480dabb 1902 struct sample_id sample_id;
f2b1d720
MK
1903};
1904.fi
1905.in
f2b1d720
MK
1906.TP
1907.B PERF_RECORD_FORK
1908This record indicates a fork event.
1909
1910.in +4n
1911.nf
1912struct {
1913 struct perf_event_header header;
1914 u32 pid, ppid;
1915 u32 tid, ptid;
1916 u64 time;
7480dabb 1917 struct sample_id sample_id;
f2b1d720
MK
1918};
1919.fi
1920.in
f2b1d720
MK
1921.TP
1922.B PERF_RECORD_READ
1923This record indicates a read event.
1924
1925.in +4n
1926.nf
1927struct {
1928 struct perf_event_header header;
1929 u32 pid, tid;
1930 struct read_format values;
7480dabb 1931 struct sample_id sample_id;
f2b1d720
MK
1932};
1933.fi
1934.in
f2b1d720
MK
1935.TP
1936.B PERF_RECORD_SAMPLE
1937This record indicates a sample.
1938
1939.in +4n
1940.nf
1941struct {
1942 struct perf_event_header header;
7480dabb 1943 u64 sample_id; /* if PERF_SAMPLE_IDENTIFIER */
7db515ef
MK
1944 u64 ip; /* if PERF_SAMPLE_IP */
1945 u32 pid, tid; /* if PERF_SAMPLE_TID */
1946 u64 time; /* if PERF_SAMPLE_TIME */
1947 u64 addr; /* if PERF_SAMPLE_ADDR */
1948 u64 id; /* if PERF_SAMPLE_ID */
1949 u64 stream_id; /* if PERF_SAMPLE_STREAM_ID */
1950 u32 cpu, res; /* if PERF_SAMPLE_CPU */
1951 u64 period; /* if PERF_SAMPLE_PERIOD */
f2b1d720 1952 struct read_format v; /* if PERF_SAMPLE_READ */
7db515ef
MK
1953 u64 nr; /* if PERF_SAMPLE_CALLCHAIN */
1954 u64 ips[nr]; /* if PERF_SAMPLE_CALLCHAIN */
1955 u32 size; /* if PERF_SAMPLE_RAW */
1956 char data[size]; /* if PERF_SAMPLE_RAW */
1957 u64 bnr; /* if PERF_SAMPLE_BRANCH_STACK */
1958 struct perf_branch_entry lbr[bnr];
1959 /* if PERF_SAMPLE_BRANCH_STACK */
1960 u64 abi; /* if PERF_SAMPLE_REGS_USER */
1961 u64 regs[weight(mask)];
1962 /* if PERF_SAMPLE_REGS_USER */
1963 u64 size; /* if PERF_SAMPLE_STACK_USER */
1964 char data[size]; /* if PERF_SAMPLE_STACK_USER */
1965 u64 dyn_size; /* if PERF_SAMPLE_STACK_USER */
d1007d14
VW
1966 u64 weight; /* if PERF_SAMPLE_WEIGHT */
1967 u64 data_src; /* if PERF_SAMPLE_DATA_SRC */
1e043959 1968 u64 transaction;/* if PERF_SAMPLE_TRANSACTION */
f5281dfd
VW
1969 u64 abi; /* if PERF_SAMPLE_REGS_INTR */
1970 u64 regs[weight(mask)];
1971 /* if PERF_SAMPLE_REGS_INTR */
f2b1d720
MK
1972};
1973.fi
4047bc6c
MK
1974.RS 4
1975.TP 4
7480dabb
VW
1976.I sample_id
1977If
1978.B PERF_SAMPLE_IDENTIFIER
1979is enabled, a 64-bit unique ID is included.
e9bd9b2c 1980This is a duplication of the
7480dabb
VW
1981.B PERF_SAMPLE_ID
1982.I id
1983value, but included at the beginning of the sample
1984so parsers can easily obtain the value.
1985.TP
f2b1d720 1986.I ip
7db515ef
MK
1987If
1988.B PERF_SAMPLE_IP
1989is enabled, then a 64-bit instruction
f2b1d720 1990pointer value is included.
f2b1d720 1991.TP
7db515ef
MK
1992.IR pid ", " tid
1993If
1994.B PERF_SAMPLE_TID
1995is enabled, then a 32-bit process ID
1996and 32-bit thread ID are included.
f2b1d720
MK
1997.TP
1998.I time
7db515ef
MK
1999If
2000.B PERF_SAMPLE_TIME
2001is enabled, then a 64-bit timestamp
f2b1d720
MK
2002is included.
2003This is obtained via local_clock() which is a hardware timestamp
2004if available and the jiffies value if not.
f2b1d720
MK
2005.TP
2006.I addr
7db515ef
MK
2007If
2008.B PERF_SAMPLE_ADDR
2009is enabled, then a 64-bit address is included.
f2b1d720
MK
2010This is usually the address of a tracepoint,
2011breakpoint, or software event; otherwise the value is 0.
f2b1d720
MK
2012.TP
2013.I id
7db515ef
MK
2014If
2015.B PERF_SAMPLE_ID
2016is enabled, a 64-bit unique ID is included.
f2b1d720 2017If the event is a member of an event group, the group leader ID is returned.
7db515ef
MK
2018This ID is the same as the one returned by
2019.BR PERF_FORMAT_ID .
f2b1d720
MK
2020.TP
2021.I stream_id
7db515ef
MK
2022If
2023.B PERF_SAMPLE_STREAM_ID
2024is enabled, a 64-bit unique ID is included.
f2b1d720
MK
2025Unlike
2026.B PERF_SAMPLE_ID
2027the actual ID is returned, not the ID of the group leader.
7db515ef
MK
2028This ID is the same as the one returned by
2029.BR PERF_FORMAT_ID .
f2b1d720 2030.TP
7db515ef
MK
2031.IR cpu ", " res
2032If
2033.B PERF_SAMPLE_CPU
2034is enabled, this is a 32-bit value indicating
f2b1d720
MK
2035which CPU was being used, in addition to a reserved (unused)
203632-bit value.
f2b1d720
MK
2037.TP
2038.I period
7db515ef
MK
2039If
2040.B PERF_SAMPLE_PERIOD
2041is enabled, a 64-bit value indicating
f2b1d720 2042the current sampling period is written.
f2b1d720
MK
2043.TP
2044.I v
7db515ef
MK
2045If
2046.B PERF_SAMPLE_READ
2047is enabled, a structure of type read_format
f2b1d720
MK
2048is included which has values for all events in the event group.
2049The values included depend on the
2050.I read_format
7db515ef
MK
2051value used at
2052.BR perf_event_open ()
2053time.
f2b1d720 2054.TP
7db515ef
MK
2055.IR nr ", " ips[nr]
2056If
2057.B PERF_SAMPLE_CALLCHAIN
2058is enabled, then a 64-bit number is included
f2b1d720 2059which indicates how many 64-bit instruction pointers will
7db515ef
MK
2060follow.
2061This is the current callchain.
f2b1d720 2062.TP
7ede2f66 2063.IR size ", " data[size]
7db515ef
MK
2064If
2065.B PERF_SAMPLE_RAW
2066is enabled, then a 32-bit value indicating size
f2b1d720
MK
2067is included followed by an array of 8-bit values of length size.
2068The values are padded with 0 to have 64-bit alignment.
2069
2070This RAW record data is opaque with respect to the ABI.
2071The ABI doesn't make any promises with respect to the stability
2072of its content; it may vary depending
2073on event, hardware, and kernel version.
f2b1d720 2074.TP
7db515ef
MK
2075.IR bnr ", " lbr[bnr]
2076If
2077.B PERF_SAMPLE_BRANCH_STACK
2078is enabled, then a 64-bit value indicating
2079the number of records is included, followed by
2080.I bnr
2081.I perf_branch_entry
045bf4d3
VW
2082structures which each include the fields:
2083.RS
2084.TP
2085.I from
2b538c3e 2086This indicates the source instruction (may not be a branch).
045bf4d3
VW
2087.TP
2088.I to
2b538c3e 2089The branch target.
045bf4d3
VW
2090.TP
2091.I mispred
2b538c3e 2092The branch target was mispredicted.
045bf4d3
VW
2093.TP
2094.I predicted
2b538c3e 2095The branch target was predicted.
e3c9782b 2096.TP
31c1f2b0 2097.IR in_tx " (since Linux 3.11)"
747a6e7c 2098.\" commit 135c5612c460f89657c4698fe2ea753f6f667963
2b538c3e 2099The branch was in a transactional memory transaction.
e3c9782b 2100.TP
31c1f2b0 2101.IR abort " (since Linux 3.11)"
747a6e7c 2102.\" commit 135c5612c460f89657c4698fe2ea753f6f667963
2b538c3e 2103The branch was in an aborted transactional memory transaction.
e3c9782b
VW
2104
2105.P
045bf4d3
VW
2106The entries are from most to least recent, so the first entry
2107has the most recent branch.
2108
8a94e783
MK
2109Support for
2110.I mispred
2111and
2112.I predicted
baf7029b 2113is optional; if not supported, both
045bf4d3
VW
2114values will be 0.
2115
e3c9782b
VW
2116The type of branches recorded is specified by the
2117.I branch_sample_type
2118field.
2119.RE
2120
f2b1d720 2121.TP
7db515ef
MK
2122.IR abi ", " regs[weight(mask)]
2123If
2124.B PERF_SAMPLE_REGS_USER
d1007d14 2125is enabled, then the user CPU registers are recorded.
f2b1d720
MK
2126
2127The
2128.I abi
2129field is one of
2130.BR PERF_SAMPLE_REGS_ABI_NONE ", " PERF_SAMPLE_REGS_ABI_32 " or "
7db515ef 2131.BR PERF_SAMPLE_REGS_ABI_64 .
d1007d14
VW
2132
2133The
2134.I regs
2135field is an array of the CPU registers that were specified by
2136the
2137.I sample_regs_user
2138attr field.
2139The number of values is the number of bits set in the
51700fd7 2140.I sample_regs_user
4651e412 2141bit mask.
f2b1d720 2142.TP
7db515ef
MK
2143.IR size ", " data[size] ", " dyn_size
2144If
2145.B PERF_SAMPLE_STACK_USER
02ca78a0
VW
2146is enabled, then the user stack is recorded.
2147This can be used to generate stack backtraces.
d1007d14
VW
2148.I size
2149is the size requested by the user in
02ca78a0 2150.I sample_stack_user
d1007d14
VW
2151or else the maximum record size.
2152.I data
02ca78a0
VW
2153is the stack data (a raw dump of the memory pointed to by the
2154stack pointer at the time of sampling).
d1007d14
VW
2155.I dyn_size
2156is the amount of data actually dumped (can be less than
460e3d7a 2157.IR size ).
d1007d14 2158.TP
51700fd7 2159.I weight
d1007d14
VW
2160If
2161.B PERF_SAMPLE_WEIGHT
7de4a1e3 2162is enabled, then a 64-bit value provided by the hardware
d1007d14
VW
2163is recorded that indicates how costly the event was.
2164This allows expensive events to stand out more clearly
2165in profiles.
2166.TP
2167.I data_src
51700fd7 2168If
d1007d14 2169.B PERF_SAMPLE_DATA_SRC
7de4a1e3 2170is enabled, then a 64-bit value is recorded that is made up of
d1007d14
VW
2171the following fields:
2172.RS
2b538c3e 2173.TP 4
d1007d14 2174.I mem_op
2b538c3e
MK
2175Type of opcode, a bitwise combination of:
2176
2177.PD 0
2178.RS
2179.TP 24
d1007d14 2180.B PERF_MEM_OP_NA
2b538c3e
MK
2181Not available
2182.TP
d1007d14 2183.B PERF_MEM_OP_LOAD
2b538c3e
MK
2184Load instruction
2185.TP
d1007d14 2186.B PERF_MEM_OP_STORE
2b538c3e
MK
2187Store instruction
2188.TP
d1007d14 2189.B PERF_MEM_OP_PFETCH
2b538c3e
MK
2190Prefetch
2191.TP
d1007d14 2192.B PERF_MEM_OP_EXEC
2b538c3e
MK
2193Executable code
2194.RE
2195.PD
d1007d14
VW
2196.TP
2197.I mem_lvl
bc9d90b5 2198Memory hierarchy level hit or miss, a bitwise combination of
ef4f4031 2199the following, shifted left by
bc9d90b5 2200.BR PERF_MEM_LVL_SHIFT :
2b538c3e
MK
2201
2202.PD 0
2203.RS
2204.TP 24
d1007d14 2205.B PERF_MEM_LVL_NA
2b538c3e
MK
2206Not available
2207.TP
d1007d14 2208.B PERF_MEM_LVL_HIT
2b538c3e
MK
2209Hit
2210.TP
d1007d14 2211.B PERF_MEM_LVL_MISS
2b538c3e
MK
2212Miss
2213.TP
d1007d14 2214.B PERF_MEM_LVL_L1
2b538c3e
MK
2215Level 1 cache
2216.TP
d1007d14 2217.B PERF_MEM_LVL_LFB
2b538c3e
MK
2218Line fill buffer
2219.TP
d1007d14 2220.B PERF_MEM_LVL_L2
2b538c3e
MK
2221Level 2 cache
2222.TP
d1007d14 2223.B PERF_MEM_LVL_L3
2b538c3e
MK
2224Level 3 cache
2225.TP
d1007d14 2226.B PERF_MEM_LVL_LOC_RAM
2b538c3e
MK
2227Local DRAM
2228.TP
d1007d14 2229.B PERF_MEM_LVL_REM_RAM1
2b538c3e
MK
2230Remote DRAM 1 hop
2231.TP
d1007d14 2232.B PERF_MEM_LVL_REM_RAM2
2b538c3e
MK
2233Remote DRAM 2 hops
2234.TP
d1007d14 2235.B PERF_MEM_LVL_REM_CCE1
2b538c3e
MK
2236Remote cache 1 hop
2237.TP
d1007d14 2238.B PERF_MEM_LVL_REM_CCE2
2b538c3e
MK
2239Remote cache 2 hops
2240.TP
d1007d14 2241.B PERF_MEM_LVL_IO
2b538c3e
MK
2242I/O memory
2243.TP
d1007d14 2244.B PERF_MEM_LVL_UNC
2b538c3e
MK
2245Uncached memory
2246.RE
2247.PD
d1007d14
VW
2248.TP
2249.I mem_snoop
bc9d90b5
VW
2250Snoop mode, a bitwise combination of the following, shifted left by
2251.BR PERF_MEM_SNOOP_SHIFT :
2b538c3e
MK
2252
2253.PD 0
2254.RS
2255.TP 24
d1007d14 2256.B PERF_MEM_SNOOP_NA
2b538c3e
MK
2257Not available
2258.TP
d1007d14 2259.B PERF_MEM_SNOOP_NONE
2b538c3e
MK
2260No snoop
2261.TP
d1007d14 2262.B PERF_MEM_SNOOP_HIT
2b538c3e
MK
2263Snoop hit
2264.TP
d1007d14 2265.B PERF_MEM_SNOOP_MISS
2b538c3e
MK
2266Snoop miss
2267.TP
d1007d14 2268.B PERF_MEM_SNOOP_HITM
2b538c3e
MK
2269Snoop hit modified
2270.RE
2271.PD
d1007d14
VW
2272.TP
2273.I mem_lock
bc9d90b5
VW
2274Lock instruction, a bitwise combination of the following, shifted left by
2275.BR PERF_MEM_LOCK_SHIFT :
2b538c3e
MK
2276
2277.PD 0
2278.RS
2279.TP 24
d1007d14 2280.B PERF_MEM_LOCK_NA
2b538c3e
MK
2281Not available
2282.TP
d1007d14 2283.B PERF_MEM_LOCK_LOCKED
2b538c3e
MK
2284Locked transaction
2285.RE
2286.PD
d1007d14
VW
2287.TP
2288.I mem_dtlb
bc9d90b5
VW
2289TLB access hit or miss, a bitwise combination of the following, shifted
2290left by
2291.BR PERF_MEM_TLB_SHIFT :
2b538c3e
MK
2292
2293.PD 0
2294.RS
2295.TP 24
d1007d14 2296.B PERF_MEM_TLB_NA
2b538c3e
MK
2297Not available
2298.TP
d1007d14 2299.B PERF_MEM_TLB_HIT
2b538c3e
MK
2300Hit
2301.TP
d1007d14 2302.B PERF_MEM_TLB_MISS
2b538c3e
MK
2303Miss
2304.TP
d1007d14 2305.B PERF_MEM_TLB_L1
2b538c3e
MK
2306Level 1 TLB
2307.TP
d1007d14 2308.B PERF_MEM_TLB_L2
2b538c3e
MK
2309Level 2 TLB
2310.TP
d1007d14 2311.B PERF_MEM_TLB_WK
2b538c3e
MK
2312Hardware walker
2313.TP
d1007d14 2314.B PERF_MEM_TLB_OS
2b538c3e
MK
2315OS fault handler
2316.RE
2317.PD
d1007d14 2318.RE
1e043959
VW
2319.TP
2320.I transaction
2321If the
2322.B PERF_SAMPLE_TRANSACTION
37bee118 2323flag is set, then a 64-bit field is recorded describing
1e043959
VW
2324the sources of any transactional memory aborts.
2325
2326The field is a bitwise combination of the following values:
2327.RS
2328.TP
2329.B PERF_TXN_ELISION
b3f39642 2330Abort from an elision type transaction (Intel-CPU-specific).
1e043959
VW
2331.TP
2332.B PERF_TXN_TRANSACTION
b3f39642 2333Abort from a generic transaction.
1e043959
VW
2334.TP
2335.B PERF_TXN_SYNC
b3f39642 2336Synchronous abort (related to the reported instruction).
1e043959
VW
2337.TP
2338.B PERF_TXN_ASYNC
b3f39642 2339Asynchronous abort (not related to the reported instruction).
1e043959
VW
2340.TP
2341.B PERF_TXN_RETRY
053a3e08 2342Retryable abort (retrying the transaction may have succeeded).
1e043959
VW
2343.TP
2344.B PERF_TXN_CONFLICT
b3f39642 2345Abort due to memory conflicts with other threads.
1e043959
VW
2346.TP
2347.B PERF_TXN_CAPACITY_WRITE
b3f39642 2348Abort due to write capacity overflow.
1e043959
VW
2349.TP
2350.B PERF_TXN_CAPACITY_READ
b3f39642 2351Abort due to read capacity overflow.
1e043959 2352.RE
b3f39642
MK
2353.IP
2354In addition, a user-specified abort code can be obtained from
2355the high 32 bits of the field by shifting right by
1e043959
VW
2356.B PERF_TXN_ABORT_SHIFT
2357and masking with
2358.BR PERF_TXN_ABORT_MASK .
f5281dfd
VW
2359.TP
2360.IR abi ", " regs[weight(mask)]
2361If
2362.B PERF_SAMPLE_REGS_INTR
2363is enabled, then the CPU registers at the time of the interrupt are recorded.
2364
2365The
2366.I abi
2367field is one of
2368.BR PERF_SAMPLE_REGS_ABI_NONE ", " PERF_SAMPLE_REGS_ABI_32 " or "
2369.BR PERF_SAMPLE_REGS_ABI_64 .
2370
2371The
2372.I regs
2373field is an array of the CPU registers that were specified by
2374the
2375.I sample_regs_intr
2376attr field.
2377The number of values is the number of bits set in the
2378.I sample_regs_intr
2379bit mask.
f2b1d720 2380.RE
9bfc542b
VW
2381.TP
2382.B PERF_RECORD_MMAP2
2383This record includes extended information on
2384.BR mmap (2)
2385calls returning executable mappings.
2386The format is similar to that of the
2387.B PERF_RECORD_MMAP
3a058284 2388record, but includes extra values that allow uniquely identifying
9bfc542b 2389shared mappings.
3a058284 2390
9bfc542b
VW
2391.in +4n
2392.nf
2393struct {
2394 struct perf_event_header header;
3a058284
MK
2395 u32 pid;
2396 u32 tid;
9bfc542b
VW
2397 u64 addr;
2398 u64 len;
2399 u64 pgoff;
2400 u32 maj;
2401 u32 min;
2402 u64 ino;
2403 u64 ino_generation;
3a058284
MK
2404 u32 prot;
2405 u32 flags;
9bfc542b
VW
2406 char filename[];
2407 struct sample_id sample_id;
2408};
2409.fi
2410.RS
2411.TP
2412.I pid
3a058284 2413is the process ID.
9bfc542b
VW
2414.TP
2415.I tid
3a058284 2416is the thread ID.
9bfc542b
VW
2417.TP
2418.I addr
2419is the address of the allocated memory.
2420.TP
2421.I len
2422is the length of the allocated memory.
2423.TP
2424.I pgoff
2425is the page offset of the allocated memory.
2426.TP
2427.I maj
3a058284 2428is the major ID of the underlying device.
9bfc542b
VW
2429.TP
2430.I min
3a058284 2431is the minor ID of the underlying device.
9bfc542b
VW
2432.TP
2433.I ino
3a058284 2434is the inode number.
9bfc542b
VW
2435.TP
2436.I ino_generation
2437is the inode generation.
2438.TP
2439.I prot
2440is the protection information.
2441.TP
2442.I flags
2443is the flags information.
2444.TP
2445.I filename
2446is a string describing the backing of the allocated memory.
2447.RE
f2b1d720 2448.RE
21977c9d
VW
2449.SS Overflow handling
2450Events can be set to notify when a threshold is crossed,
2451indicating an overflow.
2452Overflow conditions can be captured by monitoring the
2453event file descriptor with
f2b1d720
MK
2454.BR poll (2),
2455.BR select (2),
21977c9d
VW
2456or
2457.BR epoll (2).
2458Alternatively, a SIGIO signal handler can be created and
2459the event configured with
2460.BR fcntl (2)
2461to generate SIGIO signals.
f2b1d720 2462
6170255e 2463Overflows are generated only by sampling events
f2b1d720 2464.RI ( sample_period
7d182bb6 2465must have a nonzero value).
f2b1d720 2466
21977c9d 2467There are two ways to generate overflow notifications.
f2b1d720
MK
2468
2469The first is to set a
2470.I wakeup_events
2471or
2472.I wakeup_watermark
21977c9d 2473value that will trigger if a certain number of samples
f2b1d720 2474or bytes have been written to the mmap ring buffer.
21977c9d 2475In this case
7db515ef 2476.B POLL_IN
21977c9d 2477is indicated.
f2b1d720
MK
2478
2479The other way is by use of the
7db515ef 2480.B PERF_EVENT_IOC_REFRESH
f2b1d720
MK
2481ioctl.
2482This ioctl adds to a counter that decrements each time the event overflows.
21977c9d 2483When nonzero,
7db515ef 2484.B POLL_IN
21977c9d
VW
2485is indicated, but
2486once the counter reaches 0
7db515ef 2487.B POLL_HUP
21977c9d 2488is indicated and
f2b1d720
MK
2489the underlying event is disabled.
2490
50e4319c
VW
2491Refreshing an event group leader refreshes all siblings and
2492refreshing with a parameter of 0 currently enables infinite
2493refreshes;
2494these behaviors are unsupported and should not be relied on.
2495.\" See https://lkml.org/lkml/2011/5/24/337
2496
4010bc07 2497Starting with Linux 3.18,
747a6e7c 2498.\" commit 179033b3e064d2cd3f5f9945e76b0a0f0fbf4883
21977c9d
VW
2499.B POLL_HUP
2500is indicated if the event being monitored is attached to a different
2501process and that process exits.
73d8cece 2502.SS rdpmc instruction
f2b1d720 2503Starting with Linux 3.4 on x86, you can use the
747a6e7c 2504.\" commit c7206205d00ab375839bd6c7ddb247d600693c09
f2b1d720
MK
2505.I rdpmc
2506instruction to get low-latency reads without having to enter the kernel.
2507Note that using
2508.I rdpmc
2509is not necessarily faster than other methods for reading event values.
2510
2511Support for this can be detected with the
2512.I cap_usr_rdpmc
2513field in the mmap page; documentation on how
2514to calculate event values can be found in that section.
73d8cece 2515.SS perf_event ioctl calls
f2b1d720
MK
2516.PP
2517Various ioctls act on
7db515ef 2518.BR perf_event_open ()
ce88f77b 2519file descriptors:
f2b1d720
MK
2520.TP
2521.B PERF_EVENT_IOC_ENABLE
ce88f77b 2522This enables the individual event or event group specified by the
7db515ef 2523file descriptor argument.
f2b1d720 2524
51700fd7 2525If the
8cc8b90d 2526.B PERF_IOC_FLAG_GROUP
51700fd7 2527bit is set in the ioctl argument, then all events in a group are
dbc01ecd
VW
2528enabled, even if the event specified is not the group leader
2529(but see BUGS).
f2b1d720
MK
2530.TP
2531.B PERF_EVENT_IOC_DISABLE
ce88f77b 2532This disables the individual counter or event group specified by the
7db515ef 2533file descriptor argument.
f2b1d720
MK
2534
2535Enabling or disabling the leader of a group enables or disables the
2536entire group; that is, while the group leader is disabled, none of the
2537counters in the group will count.
33a0ccb2
MK
2538Enabling or disabling a member of a group other than the leader
2539affects only that counter; disabling a non-leader
f2b1d720
MK
2540stops that counter from counting but doesn't affect any other counter.
2541
51700fd7 2542If the
8cc8b90d 2543.B PERF_IOC_FLAG_GROUP
51700fd7 2544bit is set in the ioctl argument, then all events in a group are
dbc01ecd
VW
2545disabled, even if the event specified is not the group leader
2546(but see BUGS).
f2b1d720
MK
2547.TP
2548.B PERF_EVENT_IOC_REFRESH
2549Non-inherited overflow counters can use this
2550to enable a counter for a number of overflows specified by the argument,
2551after which it is disabled.
2552Subsequent calls of this ioctl add the argument value to the current
2553count.
21977c9d 2554An overflow notification with
7db515ef
MK
2555.B POLL_IN
2556set will happen on each overflow until the
21977c9d
VW
2557count reaches 0; when that happens a notification with
2558.B POLL_HUP
7db515ef 2559set is sent and the event is disabled.
f2b1d720 2560Using an argument of 0 is considered undefined behavior.
f2b1d720
MK
2561.TP
2562.B PERF_EVENT_IOC_RESET
36127c0e 2563Reset the event count specified by the
6061d29f 2564file descriptor argument to zero.
33a0ccb2 2565This resets only the counts; there is no way to reset the
f2b1d720
MK
2566multiplexing
2567.I time_enabled
2568or
2569.I time_running
2570values.
f2b1d720 2571
51700fd7 2572If the
8cc8b90d 2573.B PERF_IOC_FLAG_GROUP
51700fd7 2574bit is set in the ioctl argument, then all events in a group are
dbc01ecd
VW
2575reset, even if the event specified is not the group leader
2576(but see BUGS).
f2b1d720
MK
2577.TP
2578.B PERF_EVENT_IOC_PERIOD
e6cf5694 2579This updates the overflow period for the event.
3f118a29 2580
747a6e7c
VW
2581Since Linux 3.7 (on ARM)
2582.\" commit 3581fe0ef37ce12ac7a4f74831168352ae848edc
2583and Linux 3.14 (all other architectures),
2584.\" commit bad7192b842c83e580747ca57104dd51fe08c223
3f118a29 2585the new period takes effect immediately.
ed81fdd9 2586On older kernels, the new period did not take effect until
3f118a29 2587after the next overflow.
f2b1d720
MK
2588
2589The argument is a pointer to a 64-bit value containing the
2590desired new period.
e6cf5694 2591
747a6e7c
VW
2592Prior to Linux 2.6.36
2593.\" commit ad0cf3478de8677f720ee06393b3147819568d6a
2594this ioctl always failed due to a bug
e6cf5694
VW
2595in the kernel.
2596
f2b1d720
MK
2597.TP
2598.B PERF_EVENT_IOC_SET_OUTPUT
2599This tells the kernel to report event notifications to the specified
2600file descriptor rather than the default one.
2601The file descriptors must all be on the same CPU.
2602
2603The argument specifies the desired file descriptor, or \-1 if
2604output should be ignored.
f2b1d720 2605.TP
31c1f2b0 2606.BR PERF_EVENT_IOC_SET_FILTER " (since Linux 2.6.33)"
60dafbc1 2607.\" commit 6fb2915df7f0747d9044da9dbff5b46dc2e20830
f2b1d720
MK
2608This adds an ftrace filter to this event.
2609
2610The argument is a pointer to the desired ftrace filter.
a0dcc8dd 2611.TP
31c1f2b0 2612.BR PERF_EVENT_IOC_ID " (since Linux 3.12)"
60dafbc1 2613.\" commit cf4957f17f2a89984915ea808876d9c82225b862
bec6277e 2614This returns the event ID value for the given event file descriptor.
a0dcc8dd
VW
2615
2616The argument is a pointer to a 64-bit unsigned integer
2617to hold the result.
b0f7b411
VW
2618.TP
2619.BR PERF_EVENT_IOC_SET_BPF " (since Linux 4.1)"
2620.\" commit 2541517c32be2531e0da59dfd7efc1ce844644f5
2621This allows attaching a Berkeley Packet Filter (BPF)
2622program to an existing kprobe tracepoint event.
2623You need
2624.B CAP_SYS_ADMIN
2625privileges to use this ioctl.
2626
2627The argument is a BPF program file descriptor that was created by
2628a previous
2629.BR bpf (2)
2630system call.
73d8cece 2631.SS Using prctl
f2b1d720
MK
2632A process can enable or disable all the event groups that are
2633attached to it using the
2634.BR prctl (2)
2635.B PR_TASK_PERF_EVENTS_ENABLE
2636and
2637.B PR_TASK_PERF_EVENTS_DISABLE
2638operations.
ee7b0cbf 2639This applies to all counters on the calling process, whether created by
f2b1d720
MK
2640this process or by another, and does not affect any counters that this
2641process has created on other processes.
33a0ccb2 2642It enables or disables only
f2b1d720 2643the group leaders, not any other members in the groups.
f2b1d720 2644.SS perf_event related configuration files
7db515ef
MK
2645Files in
2646.I /proc/sys/kernel/
7db515ef 2647.RS 4
f2b1d720 2648.TP
7db515ef 2649.I /proc/sys/kernel/perf_event_paranoid
f2b1d720
MK
2650
2651The
2652.I perf_event_paranoid
2653file can be set to restrict access to the performance counters.
2b538c3e
MK
2654.RS
2655.IP 2 4
6170255e 2656allow only user-space measurements.
2b538c3e
MK
2657.IP 1
2658allow both kernel and user measurements (default).
2659.IP 0
2660allow access to CPU-specific data but not raw tracepoint samples.
2661.IP \-1
2662no restrictions.
2663.RE
2664.IP
f2b1d720
MK
2665The existence of the
2666.I perf_event_paranoid
2667file is the official method for determining if a kernel supports
7db515ef 2668.BR perf_event_open ().
f2b1d720
MK
2669.TP
2670.I /proc/sys/kernel/perf_event_max_sample_rate
2671
7db515ef
MK
2672This sets the maximum sample rate.
2673Setting this too high can allow
f2b1d720 2674users to sample at a rate that impacts overall machine performance
7db515ef
MK
2675and potentially lock up the machine.
2676The default value is
f2b1d720 2677100000 (samples per second).
f2b1d720
MK
2678.TP
2679.I /proc/sys/kernel/perf_event_mlock_kb
2680
ce88f77b
MK
2681Maximum amount of memory (in kilobytes) an unprivileged user can
2682.BR mlock (2).
f2b1d720 2683The default is 516 (kB).
e30dc77f 2684
f2b1d720 2685.RE
7db515ef
MK
2686Files in
2687.I /sys/bus/event_source/devices/
7db515ef 2688.RS 4
ce88f77b 2689Since Linux 2.6.34, the kernel supports having multiple PMUs
f2b1d720
MK
2690available for monitoring.
2691Information on how to program these PMUs can be found under
2692.IR /sys/bus/event_source/devices/ .
2693Each subdirectory corresponds to a different PMU.
f2b1d720 2694.TP
31c1f2b0 2695.IR /sys/bus/event_source/devices/*/type " (since Linux 2.6.38)"
747a6e7c 2696.\" commit abe43400579d5de0078c2d3a760e6598e183f871
f2b1d720
MK
2697This contains an integer that can be used in the
2698.I type
ce88f77b
MK
2699field of
2700.I perf_event_attr
2701to indicate that you wish to use this PMU.
f2b1d720 2702.TP
31c1f2b0 2703.IR /sys/bus/event_source/devices/*/rdpmc " (since Linux 3.4)"
747a6e7c 2704.\" commit 0c9d42ed4cee2aa1dfc3a260b741baae8615744f
8a94e783 2705If this file is 1, then direct user-space access to the
e30dc77f
VW
2706performance counter registers is allowed via the rdpmc instruction.
2707This can be disabled by echoing 0 to the file.
f2b1d720 2708.TP
31c1f2b0 2709.IR /sys/bus/event_source/devices/*/format/ " (since Linux 3.4)"
747a6e7c 2710.\" commit 641cc938815dfd09f8fa1ec72deb814f0938ac33
7d182bb6
MK
2711This subdirectory contains information on the architecture-specific
2712subfields available for programming the various
f2b1d720 2713.I config
ce88f77b
MK
2714fields in the
2715.I perf_event_attr
2716struct.
e30dc77f
VW
2717
2718The content of each file is the name of the config field, followed
2719by a colon, followed by a series of integer bit ranges separated by
2720commas.
8a94e783 2721For example, the file
e30dc77f
VW
2722.I event
2723may contain the value
2724.I config1:1,6-10,44
2725which indicates that event is an attribute that occupies bits 1, 6-10, and 44
ce88f77b
MK
2726of
2727.IR perf_event_attr::config1 .
e30dc77f 2728.TP
31c1f2b0 2729.IR /sys/bus/event_source/devices/*/events/ " (since Linux 3.4)"
747a6e7c 2730.\" commit 641cc938815dfd09f8fa1ec72deb814f0938ac33
7d182bb6 2731This subdirectory contains files with predefined events.
f2b1d720 2732The contents are strings describing the event settings
e30dc77f 2733expressed in terms of the fields found in the previously mentioned
f2b1d720
MK
2734.I ./format/
2735directory.
2736These are not necessarily complete lists of all events supported by
2737a PMU, but usually a subset of events deemed useful or interesting.
e30dc77f
VW
2738
2739The content of each file is a list of attribute names
8a94e783
MK
2740separated by commas.
2741Each entry has an optional value (either hex or decimal).
37bee118 2742If no value is specified, then it is assumed to be a single-bit
e30dc77f
VW
2743field with a value of 1.
2744An example entry may look like this:
699893d8 2745.IR event=0x2,inv,ldlat=3 .
f2b1d720
MK
2746.TP
2747.I /sys/bus/event_source/devices/*/uevent
e30dc77f
VW
2748This file is the standard kernel device interface
2749for injecting hotplug events.
2750.TP
31c1f2b0 2751.IR /sys/bus/event_source/devices/*/cpumask " (since Linux 3.7)"
747a6e7c 2752.\" commit 314d9f63f385096580e9e2a06eaa0745d92fe4ac
699893d8
DP
2753The
2754.I cpumask
2755file contains a comma-separated list of integers that
2756indicate a representative CPU number for each socket (package)
e30dc77f
VW
2757on the motherboard.
2758This is needed when setting up uncore or northbridge events, as
2759those PMUs present socket-wide events.
f2b1d720 2760.RE
47297adb 2761.SH RETURN VALUE
f2b1d720
MK
2762.BR perf_event_open ()
2763returns the new file descriptor, or \-1 if an error occurred
2764(in which case,
2765.I errno
2766is set appropriately).
2767.SH ERRORS
d8b7d950
VW
2768The errors returned by
2769.BR perf_event_open ()
2770can be inconsistent, and may
2771vary across processor architectures and performance monitoring units.
f2b1d720 2772.TP
82b09254 2773.B E2BIG
ce88f77b
MK
2774Returned if the
2775.I perf_event_attr
82b09254
VW
2776.I size
2777value is too small
2778(smaller than
2779.BR PERF_ATTR_SIZE_VER0 ),
2780too big (larger than the page size),
2781or larger than the kernel supports and the extra bytes are not zero.
2782When
2783.B E2BIG
ce88f77b
MK
2784is returned, the
2785.I perf_event_attr
e9bd9b2c 2786.I size
d6af98f8 2787field is overwritten by the kernel to be the size of the structure
82b09254
VW
2788it was expecting.
2789.TP
d8b7d950 2790.B EACCES
27f0af8e
VW
2791Returned when the requested event requires
2792.B CAP_SYS_ADMIN
2793permissions (or a more permissive perf_event paranoid setting).
2794Some common cases where an unprivileged process
2795may encounter this error:
2796attaching to a process owned by a different user;
2b23ecbd
MK
2797monitoring all processes on a given CPU (i.e., specifying the
2798.I pid
2799argument as \-1);
079928f3 2800and not setting
accec051 2801.I exclude_kernel
079928f3 2802when the paranoid setting requires it.
d8b7d950
VW
2803.TP
2804.B EBADF
2805Returned if the
2806.I group_fd
accec051
MK
2807file descriptor is not valid, or, if
2808.B PERF_FLAG_PID_CGROUP
2809is set,
d8b7d950
VW
2810the cgroup file descriptor in
2811.I pid
2812is not valid.
2813.TP
2814.B EFAULT
2815Returned if the
2816.I attr
2817pointer points at an invalid memory address.
2818.TP
f2b1d720 2819.B EINVAL
d8b7d950
VW
2820Returned if the specified event is invalid.
2821There are many possible reasons for this.
2822A nonexhaustive list:
2823.I sample_freq
accec051 2824is higher than the maximum setting;
d8b7d950
VW
2825the
2826.I cpu
accec051 2827to monitor does not exist;
d8b7d950 2828.I read_format
accec051 2829is out of range;
d8b7d950 2830.I sample_type
accec051 2831is out of range;
d8b7d950
VW
2832the
2833.I flags
accec051 2834value is out of range;
d8b7d950
VW
2835.I exclusive
2836or
2837.I pinned
accec051 2838set and the event is not a group leader;
d8b7d950
VW
2839the event
2840.I config
accec051
MK
2841values are out of range or set reserved bits;
2842the generic event selected is not supported; or
d8b7d950
VW
2843there is not enough room to add the selected event.
2844.TP
2845.B EMFILE
2846Each opened event uses one file descriptor.
2847If a large number of events are opened, the per-process open file
2848descriptor limit (often 1024) will be hit and no more
2849events can be created.
2850.TP
2851.B ENODEV
2852Returned when the event involves a feature not supported
accec051 2853by the current CPU.
d8b7d950
VW
2854.TP
2855.B ENOENT
2856Returned if the
2857.I type
2858setting is not valid.
accec051 2859This error is also returned for
d8b7d950 2860some unsupported generic events.
f2b1d720
MK
2861.TP
2862.B ENOSPC
2863Prior to Linux 3.3, if there was not enough room for the event,
747a6e7c 2864.\" commit aa2bc1ade59003a379ffc485d6da2d92ea3370a6
f2b1d720
MK
2865.B ENOSPC
2866was returned.
accec051 2867In Linux 3.3, this was changed to
f2b1d720
MK
2868.BR EINVAL .
2869.B ENOSPC
d8b7d950 2870is still returned if you try to add more breakpoint events
accec051 2871than supported by the hardware.
d8b7d950
VW
2872.TP
2873.B ENOSYS
2874Returned if
2875.B PERF_SAMPLE_STACK_USER
2876is set in
2877.I sample_type
2878and it is not supported by hardware.
2879.TP
2880.B EOPNOTSUPP
2881Returned if an event requiring a specific hardware feature is
2882requested but there is no hardware support.
2883This includes requesting low-skid events if not supported,
2884branch tracing if it is not available, sampling if no PMU
2885interrupt is available, and branch stacks for software events.
2886.TP
2887.B EPERM
27f0af8e
VW
2888Returned on many (but not all) architectures when an unsupported
2889.IR exclude_hv ", " exclude_idle ", " exclude_user ", or " exclude_kernel
2890setting is specified.
2891
2892It can also happen, as with
2893.BR EACCES ,
2894when the requested event requires
2895.B CAP_SYS_ADMIN
2896permissions (or a more permissive perf_event paranoid setting).
2897This includes setting a breakpoint on a kernel address,
2898and (since Linux 3.13) setting a kernel function-trace tracepoint.
747a6e7c 2899.\" commit a4e95fc2cbb31d70a65beffeaf8773f881328c34
d8b7d950
VW
2900.TP
2901.B ESRCH
2902Returned if attempting to attach to a process that does not exist.
f2b1d720 2903.SH VERSION
f2b1d720
MK
2904.BR perf_event_open ()
2905was introduced in Linux 2.6.31 but was called
747a6e7c 2906.\" commit 0793a61d4df8daeac6492dbf8d2f3e5713caae5e
ffd4dec0 2907.BR perf_counter_open ().
f2b1d720 2908It was renamed in Linux 2.6.32.
747a6e7c 2909.\" commit cdd6c482c9ff9c55475ee7392ec8f672eddb7be6
f2b1d720 2910.SH CONFORMING TO
7db515ef
MK
2911This
2912.BR perf_event_open ()
2913system call is Linux-specific
f2b1d720 2914and should not be used in programs intended to be portable.
f2b1d720
MK
2915.SH NOTES
2916Glibc does not provide a wrapper for this system call; call it using
2917.BR syscall (2).
7db515ef 2918See the example below.
f2b1d720
MK
2919
2920The official way of knowing if
7db515ef 2921.BR perf_event_open ()
f2b1d720
MK
2922support is enabled is checking
2923for the existence of the file
7db515ef 2924.IR /proc/sys/kernel/perf_event_paranoid .
f2b1d720 2925.SH BUGS
f2b1d720
MK
2926The
2927.B F_SETOWN_EX
2928option to
7db515ef 2929.BR fcntl (2)
f2b1d720
MK
2930is needed to properly get overflow signals in threads.
2931This was introduced in Linux 2.6.32.
747a6e7c 2932.\" commit ba0a6c9f6fceed11c6a99e8326f0477fe383e6b5
f2b1d720 2933
747a6e7c
VW
2934Prior to Linux 2.6.33 (at least for x86),
2935.\" commit b690081d4d3f6a23541493f1682835c3cd5c54a1
2936the kernel did not check
f2b1d720
MK
2937if events could be scheduled together until read time.
2938The same happens on all known kernels if the NMI watchdog is enabled.
2939This means that, to see if a given set of events works, you have to call
2940.BR perf_event_open (),
2941start the events, and then read them before you know for sure that you
2942can get valid measurements.
2943
ce88f77b 2944Prior to Linux 2.6.34, event constraints were not enforced by the kernel.
f2b1d720
MK
2945In that case, some events would silently return "0" if the kernel
2946scheduled them in an improper counter slot.
747a6e7c 2947.\" FIXME: cannot find a kernel commit for this one
f2b1d720 2948
ce88f77b 2949Prior to Linux 2.6.34, there was a bug when multiplexing where the
f2b1d720 2950wrong results could be returned.
747a6e7c 2951.\" commit 45e16a6834b6af098702e5ea6c9a40de42ff77d8
f2b1d720
MK
2952
2953From Linux 2.6.35 to Linux 2.6.39, the kernel can quickly crash if
2954"inherit" is enabled and many threads are started.
747a6e7c 2955.\" commit 38b435b16c36b0d863efcf3f07b34a6fac9873fd
f2b1d720
MK
2956
2957Prior to Linux 2.6.35,
747a6e7c 2958.\" commit 050735b08ca8a016bbace4445fa025b88fee770b
f2b1d720
MK
2959.B PERF_FORMAT_GROUP
2960did not work with attached processes.
2961
f2b1d720
MK
2962There is a bug in the kernel code between
2963Linux 2.6.36 and Linux 3.0 that ignores the
2964"watermark" field and acts as if wakeup_events
2965was chosen if the union has a
7d182bb6 2966nonzero value in it.
747a6e7c 2967.\" commit 4ec8363dfc1451f8c8f86825731fe712798ada02
f2b1d720 2968
8a94e783 2969From Linux 2.6.31 to Linux 3.4, the
dbc01ecd
VW
2970.B PERF_IOC_FLAG_GROUP
2971ioctl argument was broken and would repeatedly operate
2972on the event specified rather than iterating across
2973all sibling events in a group.
747a6e7c 2974.\" commit 724b6daa13e100067c30cfc4d1ad06629609dc4e
dbc01ecd 2975
7205b8df 2976From Linux 3.4 to Linux 3.11, the mmap
747a6e7c 2977.\" commit fa7315871046b9a4c48627905691dbde57e51033
135cba8b
VW
2978.I cap_usr_rdpmc
2979and
2980.I cap_usr_time
2981bits mapped to the same location.
2982Code should migrate to the new
2983.I cap_user_rdpmc
2984and
2985.I cap_user_time
2986fields instead.
2987
7db515ef
MK
2988Always double-check your results!
2989Various generalized events have had wrong values.
f2b1d720
MK
2990For example, retired branches measured
2991the wrong thing on AMD machines until Linux 2.6.35.
747a6e7c 2992.\" commit f287d332ce835f77a4f5077d2c0ef1e3f9ea42d2
f2b1d720
MK
2993.SH EXAMPLE
2994The following is a short example that measures the total
7db515ef
MK
2995instruction count of a call to
2996.BR printf (3).
f2b1d720
MK
2997.nf
2998
2999#include <stdlib.h>
3000#include <stdio.h>
3001#include <unistd.h>
3002#include <string.h>
3003#include <sys/ioctl.h>
3004#include <linux/perf_event.h>
3005#include <asm/unistd.h>
3006
571767ca 3007static long
7db515ef
MK
3008perf_event_open(struct perf_event_attr *hw_event, pid_t pid,
3009 int cpu, int group_fd, unsigned long flags)
f2b1d720
MK
3010{
3011 int ret;
3012
7db515ef
MK
3013 ret = syscall(__NR_perf_event_open, hw_event, pid, cpu,
3014 group_fd, flags);
f2b1d720
MK
3015 return ret;
3016}
3017
f2b1d720
MK
3018int
3019main(int argc, char **argv)
3020{
f2b1d720
MK
3021 struct perf_event_attr pe;
3022 long long count;
3023 int fd;
3024
3025 memset(&pe, 0, sizeof(struct perf_event_attr));
3026 pe.type = PERF_TYPE_HARDWARE;
3027 pe.size = sizeof(struct perf_event_attr);
3028 pe.config = PERF_COUNT_HW_INSTRUCTIONS;
3029 pe.disabled = 1;
3030 pe.exclude_kernel = 1;
3031 pe.exclude_hv = 1;
3032
3033 fd = perf_event_open(&pe, 0, \-1, \-1, 0);
7db515ef 3034 if (fd == \-1) {
f2b1d720 3035 fprintf(stderr, "Error opening leader %llx\\n", pe.config);
7db515ef 3036 exit(EXIT_FAILURE);
f2b1d720
MK
3037 }
3038
3039 ioctl(fd, PERF_EVENT_IOC_RESET, 0);
3040 ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
3041
3042 printf("Measuring instruction count for this printf\\n");
3043
3044 ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);
3045 read(fd, &count, sizeof(long long));
3046
3047 printf("Used %lld instructions\\n", count);
3048
3049 close(fd);
3050}
3051.fi
47297adb 3052.SH SEE ALSO
f2b1d720
MK
3053.BR fcntl (2),
3054.BR mmap (2),
3055.BR open (2),
3056.BR prctl (2),
3057.BR read (2)