]> git.ipfire.org Git - thirdparty/man-pages.git/blame - man2/perf_event_open.2
fanotify_init.2: wfix
[thirdparty/man-pages.git] / man2 / perf_event_open.2
CommitLineData
f2b1d720
MK
1.\" Copyright (c) 2012, Vincent Weaver
2.\"
1dd72f9c 3.\" %%%LICENSE_START(GPLv2+_DOC_FULL)
f2b1d720
MK
4.\" This is free documentation; you can redistribute it and/or
5.\" modify it under the terms of the GNU General Public License as
6.\" published by the Free Software Foundation; either version 2 of
7.\" the License, or (at your option) any later version.
8.\"
9.\" The GNU General Public License's references to "object code"
10.\" and "executables" are to be interpreted as the output of any
11.\" document formatting or typesetting system, including
12.\" intermediate and printed output.
13.\"
14.\" This manual is distributed in the hope that it will be useful,
15.\" but WITHOUT ANY WARRANTY; without even the implied warranty of
16.\" MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17.\" GNU General Public License for more details.
18.\"
19.\" You should have received a copy of the GNU General Public
20.\" License along with this manual; if not, see
21.\" <http://www.gnu.org/licenses/>.
6a8d8745 22.\" %%%LICENSE_END
f2b1d720
MK
23.\"
24.\" This document is based on the perf_event.h header file, the
25.\" tools/perf/design.txt file, and a lot of bitter experience.
26.\"
4b8c67d9 27.TH PERF_EVENT_OPEN 2 2017-09-15 "Linux" "Linux Programmer's Manual"
f2b1d720
MK
28.SH NAME
29perf_event_open \- set up performance monitoring
30.SH SYNOPSIS
31.nf
32.B #include <linux/perf_event.h>
33.B #include <linux/hw_breakpoint.h>
68e4db0a 34.PP
f2b1d720
MK
35.BI "int perf_event_open(struct perf_event_attr *" attr ,
36.BI " pid_t " pid ", int " cpu ", int " group_fd ,
37.BI " unsigned long " flags );
38.fi
dbfe9c70 39.PP
f2b1d720
MK
40.IR Note :
41There is no glibc wrapper for this system call; see NOTES.
42.SH DESCRIPTION
43Given a list of parameters,
44.BR perf_event_open ()
45returns a file descriptor, for use in subsequent system calls
46.RB ( read "(2), " mmap "(2), " prctl "(2), " fcntl "(2), etc.)."
47.PP
48A call to
49.BR perf_event_open ()
50creates a file descriptor that allows measuring performance
51information.
52Each file descriptor corresponds to one
53event that is measured; these can be grouped together
54to measure multiple events simultaneously.
55.PP
56Events can be enabled and disabled in two ways: via
57.BR ioctl (2)
58and via
0fe9e4b1 59.BR prctl (2).
f2b1d720
MK
60When an event is disabled it does not count or generate overflows but does
61continue to exist and maintain its count value.
62.PP
63Events come in two flavors: counting and sampled.
64A
65.I counting
66event is one that is used for counting the aggregate number of events
67that occur.
68In general, counting event results are gathered with a
69.BR read (2)
70call.
71A
72.I sampling
73event periodically writes measurements to a buffer that can then
74be accessed via
0fe9e4b1 75.BR mmap (2).
f2b1d720 76.SS Arguments
11ac5b51 77.PP
f2b1d720 78The
a02a1737 79.I pid
f2b1d720 80and
a02a1737
VW
81.I cpu
82arguments allow specifying which process and CPU to monitor:
83.TP
f2d15dc9 84.BR "pid == 0" " and " "cpu == \-1"
ee7b0cbf 85This measures the calling process/thread on any CPU.
a02a1737 86.TP
f2d15dc9 87.BR "pid == 0" " and " "cpu >= 0"
ee7b0cbf 88This measures the calling process/thread only
a02a1737
VW
89when running on the specified CPU.
90.TP
f2d15dc9 91.BR "pid > 0" " and " "cpu == \-1"
a02a1737
VW
92This measures the specified process/thread on any CPU.
93.TP
f2d15dc9 94.BR "pid > 0" " and " "cpu >= 0"
a02a1737
VW
95This measures the specified process/thread only
96when running on the specified CPU.
97.TP
f2d15dc9 98.BR "pid == \-1" " and " "cpu >= 0"
a02a1737 99This measures all processes/threads on the specified CPU.
ce88f77b 100This requires
f2b1d720
MK
101.B CAP_SYS_ADMIN
102capability or a
103.I /proc/sys/kernel/perf_event_paranoid
104value of less than 1.
a02a1737 105.TP
ce88f77b 106.BR "pid == \-1" " and " "cpu == \-1"
a02a1737 107This setting is invalid and will return an error.
11ac5b51 108.PP
13ec13dc
MK
109When
110.I pid
111is greater than zero, permission to perform this system call
112is governed by a ptrace access mode
113.B PTRACE_MODE_READ_REALCREDS
114check; see
115.BR ptrace (2).
efeece04 116.PP
f2b1d720
MK
117The
118.I group_fd
119argument allows event groups to be created.
120An event group has one event which is the group leader.
121The leader is created first, with
122.IR group_fd " = \-1."
123The rest of the group members are created with subsequent
124.BR perf_event_open ()
125calls with
126.IR group_fd
bec6277e 127being set to the file descriptor of the group leader.
f2b1d720
MK
128(A single event on its own is created with
129.IR group_fd " = \-1"
130and is considered to be a group with only 1 member.)
33a0ccb2 131An event group is scheduled onto the CPU as a unit: it will
d1007d14 132be put onto the CPU only if all of the events in the group can be put onto
f2b1d720
MK
133the CPU.
134This means that the values of the member events can be
ce88f77b 135meaningfully compared\(emadded, divided (to get ratios), and so on\(emwith each
f2b1d720
MK
136other, since they have counted events for the same set of executed
137instructions.
11ac5b51 138.PP
f2b1d720
MK
139The
140.I flags
08e325e8 141argument is formed by ORing together zero or more of the following values:
f2b1d720 142.TP
60dafbc1
MK
143.BR PERF_FLAG_FD_CLOEXEC " (since Linux 3.14)"
144.\" commit a21b0b354d4ac39be691f51c53562e2c24443d9e
e9b1ab78
MK
145This flag enables the close-on-exec flag for the created
146event file descriptor,
147so that the file descriptor is automatically closed on
148.BR execve (2).
8bad22e5
MK
149Setting the close-on-exec flag at creation time, rather than later with
150.BR fcntl (2),
e9b1ab78
MK
151avoids potential race conditions where the calling thread invokes
152.BR perf_event_open ()
a61dba34
MK
153and
154.BR fcntl (2)
e9b1ab78
MK
155at the same time as another thread calls
156.BR fork (2)
157then
158.BR execve (2).
159.TP
f2b1d720 160.BR PERF_FLAG_FD_NO_GROUP
31266c04
VW
161This flag tells the event to ignore the
162.IR group_fd
163parameter except for the purpose of setting up output redirection
164using the
165.B PERF_FLAG_FD_OUTPUT
166flag.
f2b1d720 167.TP
3117263f 168.BR PERF_FLAG_FD_OUTPUT " (broken since Linux 2.6.35)"
747a6e7c 169.\" commit ac9721f3f54b27a16c7e1afb2481e7ee95a70318
31266c04
VW
170This flag re-routes the event's sampled output to instead
171be included in the mmap buffer of the event specified by
172.IR group_fd .
f2b1d720 173.TP
3117263f 174.BR PERF_FLAG_PID_CGROUP " (since Linux 2.6.39)"
60dafbc1 175.\" commit e5d1367f17ba6a6fed5fd8b74e4d5720923e0c25
f2b1d720
MK
176This flag activates per-container system-wide monitoring.
177A container
ce88f77b 178is an abstraction that isolates a set of resources for finer-grained
699893d8 179control (CPUs, memory, etc.).
f2b1d720
MK
180In this mode, the event is measured
181only if the thread running on the monitored CPU belongs to the designated
182container (cgroup).
183The cgroup is identified by passing a file descriptor
184opened on its directory in the cgroupfs filesystem.
185For instance, if the
186cgroup to monitor is called
187.IR test ,
188then a file descriptor opened on
189.I /dev/cgroup/test
190(assuming cgroupfs is mounted on
191.IR /dev/cgroup )
192must be passed as the
193.I pid
194parameter.
33a0ccb2 195cgroup monitoring is available only
f2b1d720 196for system-wide events and may therefore require extra permissions.
11ac5b51 197.PP
f2b1d720
MK
198The
199.I perf_event_attr
200structure provides detailed configuration information
201for the event being created.
efeece04 202.PP
f2b1d720 203.in +4n
b8302363 204.EX
f2b1d720 205struct perf_event_attr {
da8bd8a4
MK
206 __u32 type; /* Type of event */
207 __u32 size; /* Size of attribute structure */
208 __u64 config; /* Type-specific configuration */
f2b1d720
MK
209
210 union {
211 __u64 sample_period; /* Period of sampling */
212 __u64 sample_freq; /* Frequency of sampling */
213 };
214
ce88f77b
MK
215 __u64 sample_type; /* Specifies values included in sample */
216 __u64 read_format; /* Specifies values returned in read */
217
218 __u64 disabled : 1, /* off by default */
219 inherit : 1, /* children inherit it */
220 pinned : 1, /* must always be on PMU */
221 exclusive : 1, /* only group on PMU */
222 exclude_user : 1, /* don't count user */
223 exclude_kernel : 1, /* don't count kernel */
224 exclude_hv : 1, /* don't count hypervisor */
225 exclude_idle : 1, /* don't count when idle */
226 mmap : 1, /* include mmap data */
227 comm : 1, /* include comm data */
228 freq : 1, /* use freq, not period */
229 inherit_stat : 1, /* per task counts */
230 enable_on_exec : 1, /* next exec enables */
231 task : 1, /* trace fork/exit */
232 watermark : 1, /* wakeup_watermark */
233 precise_ip : 2, /* skid constraint */
234 mmap_data : 1, /* non-exec mmap data */
235 sample_id_all : 1, /* sample_type all events */
236 exclude_host : 1, /* don't count in host */
237 exclude_guest : 1, /* don't count in guest */
238 exclude_callchain_kernel : 1,
239 /* exclude kernel callchains */
240 exclude_callchain_user : 1,
241 /* exclude user callchains */
9bfc542b 242 mmap2 : 1, /* include mmap with inode data */
dc9ec146
MK
243 comm_exec : 1, /* flag comm events that are
244 due to exec */
6bd5186a 245 use_clockid : 1, /* use clockid for time fields */
9277a75d 246 context_switch : 1, /* context switch data */
6bd5186a 247
9277a75d 248 __reserved_1 : 37;
f2b1d720
MK
249
250 union {
251 __u32 wakeup_events; /* wakeup every n events */
7db515ef 252 __u32 wakeup_watermark; /* bytes before wakeup */
f2b1d720
MK
253 };
254
255 __u32 bp_type; /* breakpoint type */
256
257 union {
258 __u64 bp_addr; /* breakpoint address */
259 __u64 config1; /* extension of config */
260 };
261
262 union {
263 __u64 bp_len; /* breakpoint length */
264 __u64 config2; /* extension of config1 */
265 };
ce88f77b
MK
266 __u64 branch_sample_type; /* enum perf_branch_sample_type */
267 __u64 sample_regs_user; /* user regs to dump on samples */
268 __u32 sample_stack_user; /* size of stack to dump on
7db515ef 269 samples */
6bd5186a 270 __s32 clockid; /* clock to use for time fields */
f5281dfd 271 __u64 sample_regs_intr; /* regs to dump on samples */
cdc52f4a 272 __u32 aux_watermark; /* aux bytes before wakeup */
fd133d5d
VW
273 __u16 sample_max_stack; /* max frames in callchain */
274 __u16 __reserved_2; /* align to u64 */
cdc52f4a 275
f2b1d720 276};
b8302363 277.EE
f2b1d720 278.in
efeece04 279.PP
f2b1d720
MK
280The fields of the
281.I perf_event_attr
282structure are described in more detail below:
f2b1d720
MK
283.TP
284.I type
285This field specifies the overall event type.
286It has one of the following values:
287.RS
288.TP
289.B PERF_TYPE_HARDWARE
290This indicates one of the "generalized" hardware events provided
291by the kernel.
292See the
293.I config
294field definition for more details.
295.TP
296.B PERF_TYPE_SOFTWARE
297This indicates one of the software-defined events provided by the kernel
298(even if no hardware support is available).
299.TP
300.B PERF_TYPE_TRACEPOINT
301This indicates a tracepoint
302provided by the kernel tracepoint infrastructure.
303.TP
304.B PERF_TYPE_HW_CACHE
305This indicates a hardware cache event.
306This has a special encoding, described in the
307.I config
308field definition.
309.TP
310.B PERF_TYPE_RAW
311This indicates a "raw" implementation-specific event in the
312.IR config " field."
313.TP
31c1f2b0 314.BR PERF_TYPE_BREAKPOINT " (since Linux 2.6.33)"
60dafbc1 315.\" commit 24f1e32c60c45c89a997c73395b69c8af6f0a84e
f2b1d720
MK
316This indicates a hardware breakpoint as provided by the CPU.
317Breakpoints can be read/write accesses to an address as well as
318execution of an instruction address.
319.TP
320.RB "dynamic PMU"
747a6e7c
VW
321Since Linux 2.6.38,
322.\" commit 2e80a82a49c4c7eca4e35734380f28298ba5db19
7db515ef 323.BR perf_event_open ()
f2b1d720
MK
324can support multiple PMUs.
325To enable this, a value exported by the kernel can be used in the
326.I type
327field to indicate which PMU to use.
328The value to use can be found in the sysfs filesystem:
329there is a subdirectory per PMU instance under
330.IR /sys/bus/event_source/devices .
7d182bb6 331In each subdirectory there is a
f2b1d720
MK
332.I type
333file whose content is an integer that can be used in the
334.I type
335field.
336For instance,
337.I /sys/bus/event_source/devices/cpu/type
338contains the value for the core CPU PMU, which is usually 4.
339.RE
f2b1d720
MK
340.TP
341.I "size"
342The size of the
343.I perf_event_attr
344structure for forward/backward compatibility.
345Set this using
346.I sizeof(struct perf_event_attr)
347to allow the kernel to see
348the struct size at the time of compilation.
efeece04 349.IP
f2b1d720
MK
350The related define
351.B PERF_ATTR_SIZE_VER0
352is set to 64; this was the size of the first published struct.
353.B PERF_ATTR_SIZE_VER1
354is 72, corresponding to the addition of breakpoints in Linux 2.6.33.
747a6e7c
VW
355.\" commit cb5d76999029ae7a517cb07dfa732c1b5a934fc2
356.\" this was added much later when PERF_ATTR_SIZE_VER2 happened
357.\" but the actual attr_size had increased in 2.6.33
f2b1d720
MK
358.B PERF_ATTR_SIZE_VER2
359is 80 corresponding to the addition of branch sampling in Linux 3.4.
747a6e7c 360.\" commit cb5d76999029ae7a517cb07dfa732c1b5a934fc2
d2a6be2f 361.B PERF_ATTR_SIZE_VER3
f2b1d720 362is 96 corresponding to the addition
7ede2f66
DP
363of
364.I sample_regs_user
365and
366.I sample_stack_user
367in Linux 3.7.
747a6e7c 368.\" commit 1659d129ed014b715b0b2120e6fd929bdd33ed03
f5281dfd
VW
369.B PERF_ATTR_SIZE_VER4
370is 104 corresponding to the addition of
371.I sample_regs_intr
372in Linux 3.19.
373.\" commit 60e2364e60e86e81bc6377f49779779e6120977f
cdc52f4a
VW
374.B PERF_ATTR_SIZE_VER5
375is 112 corresponding to the addition of
2050c098 376.I aux_watermark
cdc52f4a
VW
377in Linux 4.1.
378.\" commit 1a5941312414c71dece6717da9a0fa1303127afa
f2b1d720
MK
379.TP
380.I "config"
381This specifies which event you want, in conjunction with
382the
383.I type
384field.
385The
386.IR config1 " and " config2
387fields are also taken into account in cases where 64 bits is not
388enough to fully specify the event.
389The encoding of these fields is event dependent.
efeece04 390.IP
f2b1d720
MK
391There are various ways to set the
392.I config
393field that are dependent on the value of the previously
394described
395.I type
396field.
397What follows are various possible settings for
398.I config
399separated out by
400.IR type .
efeece04 401.IP
f2b1d720
MK
402If
403.I type
404is
405.BR PERF_TYPE_HARDWARE ,
406we are measuring one of the generalized hardware CPU events.
407Not all of these are available on all platforms.
408Set
409.I config
410to one of the following:
411.RS 12
412.TP
413.B PERF_COUNT_HW_CPU_CYCLES
414Total cycles.
2b538c3e 415Be wary of what happens during CPU frequency scaling.
f2b1d720
MK
416.TP
417.B PERF_COUNT_HW_INSTRUCTIONS
418Retired instructions.
419Be careful, these can be affected by various
2b538c3e 420issues, most notably hardware interrupt counts.
f2b1d720
MK
421.TP
422.B PERF_COUNT_HW_CACHE_REFERENCES
423Cache accesses.
424Usually this indicates Last Level Cache accesses but this may
425vary depending on your CPU.
426This may include prefetches and coherency messages; again this
427depends on the design of your CPU.
428.TP
429.B PERF_COUNT_HW_CACHE_MISSES
430Cache misses.
431Usually this indicates Last Level Cache misses; this is intended to be
432used in conjunction with the
433.B PERF_COUNT_HW_CACHE_REFERENCES
434event to calculate cache miss rates.
435.TP
436.B PERF_COUNT_HW_BRANCH_INSTRUCTIONS
437Retired branch instructions.
747a6e7c 438Prior to Linux 2.6.35, this used
f2b1d720 439the wrong event on AMD processors.
747a6e7c 440.\" commit f287d332ce835f77a4f5077d2c0ef1e3f9ea42d2
f2b1d720
MK
441.TP
442.B PERF_COUNT_HW_BRANCH_MISSES
443Mispredicted branch instructions.
444.TP
445.B PERF_COUNT_HW_BUS_CYCLES
446Bus cycles, which can be different from total cycles.
447.TP
31c1f2b0 448.BR PERF_COUNT_HW_STALLED_CYCLES_FRONTEND " (since Linux 3.0)"
747a6e7c 449.\" commit 8f62242246351b5a4bc0c1f00c0c7003edea128a
f2b1d720
MK
450Stalled cycles during issue.
451.TP
31c1f2b0 452.BR PERF_COUNT_HW_STALLED_CYCLES_BACKEND " (since Linux 3.0)"
747a6e7c 453.\" commit 8f62242246351b5a4bc0c1f00c0c7003edea128a
f2b1d720
MK
454Stalled cycles during retirement.
455.TP
31c1f2b0 456.BR PERF_COUNT_HW_REF_CPU_CYCLES " (since Linux 3.3)"
60dafbc1 457.\" commit c37e17497e01fc0f5d2d6feb5723b210b3ab8890
f2b1d720
MK
458Total cycles; not affected by CPU frequency scaling.
459.RE
460.IP
461If
462.I type
463is
464.BR PERF_TYPE_SOFTWARE ,
465we are measuring software events provided by the kernel.
466Set
467.I config
468to one of the following:
469.RS 12
470.TP
471.B PERF_COUNT_SW_CPU_CLOCK
472This reports the CPU clock, a high-resolution per-CPU timer.
473.TP
474.B PERF_COUNT_SW_TASK_CLOCK
475This reports a clock count specific to the task that is running.
476.TP
477.B PERF_COUNT_SW_PAGE_FAULTS
478This reports the number of page faults.
479.TP
480.B PERF_COUNT_SW_CONTEXT_SWITCHES
481This counts context switches.
482Until Linux 2.6.34, these were all reported as user-space
483events; after that, they are reported as happening in the kernel.
747a6e7c 484.\" commit e49a5bd38159dfb1928fd25b173bc9de4bbadb21
f2b1d720
MK
485.TP
486.B PERF_COUNT_SW_CPU_MIGRATIONS
487This reports the number of times the process
488has migrated to a new CPU.
489.TP
490.B PERF_COUNT_SW_PAGE_FAULTS_MIN
491This counts the number of minor page faults.
492These did not require disk I/O to handle.
493.TP
494.B PERF_COUNT_SW_PAGE_FAULTS_MAJ
495This counts the number of major page faults.
496These required disk I/O to handle.
497.TP
31c1f2b0 498.BR PERF_COUNT_SW_ALIGNMENT_FAULTS " (since Linux 2.6.33)"
60dafbc1 499.\" commit f7d7986060b2890fc26db6ab5203efbd33aa2497
f2b1d720
MK
500This counts the number of alignment faults.
501These happen when unaligned memory accesses happen; the kernel
502can handle these but it reduces performance.
33a0ccb2 503This happens only on some architectures (never on x86).
f2b1d720 504.TP
31c1f2b0 505.BR PERF_COUNT_SW_EMULATION_FAULTS " (since Linux 2.6.33)"
60dafbc1 506.\" commit f7d7986060b2890fc26db6ab5203efbd33aa2497
f2b1d720
MK
507This counts the number of emulation faults.
508The kernel sometimes traps on unimplemented instructions
7db515ef 509and emulates them for user space.
f2b1d720 510This can negatively impact performance.
dab38455 511.TP
31c1f2b0 512.BR PERF_COUNT_SW_DUMMY " (since Linux 3.12)"
60dafbc1 513.\" commit fa0097ee690693006ab1aea6c01ad3c851b65c77
dab38455
VW
514This is a placeholder event that counts nothing.
515Informational sample record types such as mmap or comm
516must be associated with an active event.
517This dummy event allows gathering such records without requiring
518a counting event.
f2b1d720 519.RE
efeece04 520.PP
f2b1d720
MK
521.RS
522If
523.I type
524is
525.BR PERF_TYPE_TRACEPOINT ,
526then we are measuring kernel tracepoints.
527The value to use in
528.I config
529can be obtained from under debugfs
530.I tracing/events/*/*/id
531if ftrace is enabled in the kernel.
f2b1d720 532.RE
efeece04 533.PP
f2b1d720
MK
534.RS
535If
536.I type
537is
538.BR PERF_TYPE_HW_CACHE ,
539then we are measuring a hardware CPU cache event.
540To calculate the appropriate
541.I config
542value use the following equation:
408731d4 543.PP
f2b1d720
MK
544.RS 4
545.nf
f2b1d720
MK
546 (perf_hw_cache_id) | (perf_hw_cache_op_id << 8) |
547 (perf_hw_cache_op_result_id << 16)
548.fi
11ac5b51 549.PP
f2b1d720
MK
550where
551.I perf_hw_cache_id
552is one of:
7db515ef 553.RS 4
f2b1d720
MK
554.TP
555.B PERF_COUNT_HW_CACHE_L1D
556for measuring Level 1 Data Cache
557.TP
558.B PERF_COUNT_HW_CACHE_L1I
559for measuring Level 1 Instruction Cache
560.TP
561.B PERF_COUNT_HW_CACHE_LL
562for measuring Last-Level Cache
563.TP
564.B PERF_COUNT_HW_CACHE_DTLB
565for measuring the Data TLB
566.TP
567.B PERF_COUNT_HW_CACHE_ITLB
568for measuring the Instruction TLB
569.TP
570.B PERF_COUNT_HW_CACHE_BPU
571for measuring the branch prediction unit
572.TP
5a69ce9c
MK
573.BR PERF_COUNT_HW_CACHE_NODE " (since Linux 3.1)"
574.\" commit 89d6c0b5bdbb1927775584dcf532d98b3efe1477
f2b1d720
MK
575for measuring local memory accesses
576.RE
11ac5b51 577.PP
f2b1d720
MK
578and
579.I perf_hw_cache_op_id
4af27572 580is one of:
7db515ef 581.RS 4
f2b1d720
MK
582.TP
583.B PERF_COUNT_HW_CACHE_OP_READ
584for read accesses
585.TP
586.B PERF_COUNT_HW_CACHE_OP_WRITE
587for write accesses
588.TP
589.B PERF_COUNT_HW_CACHE_OP_PREFETCH
590for prefetch accesses
591.RE
11ac5b51 592.PP
f2b1d720
MK
593and
594.I perf_hw_cache_op_result_id
4af27572 595is one of:
7db515ef 596.RS 4
f2b1d720
MK
597.TP
598.B PERF_COUNT_HW_CACHE_RESULT_ACCESS
599to measure accesses
600.TP
601.B PERF_COUNT_HW_CACHE_RESULT_MISS
602to measure misses
603.RE
604.RE
efeece04 605.PP
f2b1d720
MK
606If
607.I type
608is
609.BR PERF_TYPE_RAW ,
610then a custom "raw"
611.I config
612value is needed.
613Most CPUs support events that are not covered by the "generalized" events.
614These are implementation defined; see your CPU manual (for example
615the Intel Volume 3B documentation or the AMD BIOS and Kernel Developer
616Guide).
617The libpfm4 library can be used to translate from the name in the
618architectural manuals to the raw hex value
619.BR perf_event_open ()
620expects in this field.
efeece04 621.PP
f2b1d720
MK
622If
623.I type
624is
625.BR PERF_TYPE_BREAKPOINT ,
626then leave
627.I config
628set to zero.
629Its parameters are set in other places.
630.RE
631.TP
632.IR sample_period ", " sample_freq
21977c9d 633A "sampling" event is one that generates an overflow notification
f2b1d720
MK
634every N events, where N is given by
635.IR sample_period .
21977c9d 636A sampling event has
f2b1d720 637.IR sample_period " > 0."
21977c9d 638When an overflow occurs, requested data is recorded
f2b1d720
MK
639in the mmap buffer.
640The
641.I sample_type
21977c9d 642field controls what data is recorded on each overflow.
efeece04 643.IP
f2b1d720
MK
644.I sample_freq
645can be used if you wish to use frequency rather than period.
37bee118 646In this case, you set the
f2b1d720
MK
647.I freq
648flag.
649The kernel will adjust the sampling period
650to try and achieve the desired rate.
651The rate of adjustment is a
652timer tick.
f2b1d720
MK
653.TP
654.I "sample_type"
655The various bits in this field specify which values to include
656in the sample.
657They will be recorded in a ring-buffer,
ad73a2cc 658which is available to user space using
f2b1d720
MK
659.BR mmap (2).
660The order in which the values are saved in the
661sample is documented in the MMAP Layout subsection below;
662it is not the
663.I "enum perf_event_sample_format"
664order.
665.RS
666.TP
667.B PERF_SAMPLE_IP
668Records instruction pointer.
669.TP
670.B PERF_SAMPLE_TID
7db515ef 671Records the process and thread IDs.
f2b1d720
MK
672.TP
673.B PERF_SAMPLE_TIME
674Records a timestamp.
675.TP
676.B PERF_SAMPLE_ADDR
677Records an address, if applicable.
678.TP
679.B PERF_SAMPLE_READ
680Records counter values for all events in a group, not just the group leader.
681.TP
682.B PERF_SAMPLE_CALLCHAIN
683Records the callchain (stack backtrace).
684.TP
685.B PERF_SAMPLE_ID
686Records a unique ID for the opened event's group leader.
687.TP
688.B PERF_SAMPLE_CPU
689Records CPU number.
690.TP
691.B PERF_SAMPLE_PERIOD
692Records the current sampling period.
693.TP
694.B PERF_SAMPLE_STREAM_ID
695Records a unique ID for the opened event.
696Unlike
697.B PERF_SAMPLE_ID
698the actual ID is returned, not that of the group leader.
8859d3a9
DP
699This ID is the same as the one returned by
700.BR PERF_FORMAT_ID .
f2b1d720
MK
701.TP
702.B PERF_SAMPLE_RAW
703Records additional data, if applicable.
704Usually returned by tracepoint events.
705.TP
31c1f2b0 706.BR PERF_SAMPLE_BRANCH_STACK " (since Linux 3.4)"
60dafbc1 707.\" commit bce38cd53e5ddba9cb6d708c4ef3d04a4016ec7e
045bf4d3
VW
708This provides a record of recent branches, as provided
709by CPU branch sampling hardware (such as Intel Last Branch Record).
710Not all hardware supports this feature.
efeece04 711.IP
045bf4d3
VW
712See the
713.I branch_sample_type
714field for how to filter which branches are reported.
f2b1d720 715.TP
31c1f2b0 716.BR PERF_SAMPLE_REGS_USER " (since Linux 3.7)"
60dafbc1 717.\" commit 4018994f3d8785275ef0e7391b75c3462c029e56
d1007d14
VW
718Records the current user-level CPU register state
719(the values in the process before the kernel was called).
f2b1d720 720.TP
31c1f2b0 721.BR PERF_SAMPLE_STACK_USER " (since Linux 3.7)"
60dafbc1 722.\" commit c5ebcedb566ef17bda7b02686e0d658a7bb42ee7
d1007d14
VW
723Records the user level stack, allowing stack unwinding.
724.TP
31c1f2b0 725.BR PERF_SAMPLE_WEIGHT " (since Linux 3.10)"
60dafbc1 726.\" commit c3feedf2aaf9ac8bad6f19f5d21e4ee0b4b87e9c
d1007d14 727Records a hardware provided weight value that expresses how
51700fd7 728costly the sampled event was.
d1007d14
VW
729This allows the hardware to highlight expensive events in
730a profile.
731.TP
31c1f2b0 732.BR PERF_SAMPLE_DATA_SRC " (since Linux 3.10)"
60dafbc1 733.\" commit d6be9ad6c960f43800a6f118932bc8a5a4eadcd1
d1007d14
VW
734Records the data source: where in the memory hierarchy
735the data associated with the sampled instruction came from.
6170255e 736This is available only if the underlying hardware
d1007d14 737supports this feature.
7480dabb 738.TP
31c1f2b0 739.BR PERF_SAMPLE_IDENTIFIER " (since Linux 3.12)"
60dafbc1 740.\" commit ff3d527cebc1fa3707c617bfe9e74f53fcfb0955
8859d3a9
DP
741Places the
742.B SAMPLE_ID
743value in a fixed position in the record,
7480dabb
VW
744either at the beginning (for sample events) or at the end
745(if a non-sample event).
efeece04 746.IP
7480dabb
VW
747This was necessary because a sample stream may have
748records from various different event sources with different
749.I sample_type
750settings.
e9bd9b2c 751Parsing the event stream properly was not possible because the
8859d3a9
DP
752format of the record was needed to find
753.BR SAMPLE_ID ,
754but
27f52b52 755the format could not be found without knowing what
7480dabb
VW
756event the sample belonged to (causing a circular
757dependency).
efeece04 758.IP
e41c36b2 759The
7480dabb
VW
760.B PERF_SAMPLE_IDENTIFIER
761setting makes the event stream always parsable
8859d3a9
DP
762by putting
763.B SAMPLE_ID
764in a fixed location, even though
765it means having duplicate
766.B SAMPLE_ID
767values in records.
1e043959 768.TP
60dafbc1
MK
769.BR PERF_SAMPLE_TRANSACTION " (since Linux 3.13)"
770.\" commit fdfbbd07e91f8fe387140776f3fd94605f0c89e5
84fc2a6e 771Records reasons for transactional memory abort events
1e043959 772(for example, from Intel TSX transactional memory support).
efeece04 773.IP
1e043959
VW
774The
775.I precise_ip
b3f39642 776setting must be greater than 0 and a transactional memory abort
1e043959 777event must be measured or no values will be recorded.
84fc2a6e
MK
778Also note that some perf_event measurements, such as sampled
779cycle counting, may cause extraneous aborts (by causing an
1e043959 780interrupt during a transaction).
f5281dfd
VW
781.TP
782.BR PERF_SAMPLE_REGS_INTR " (since Linux 3.19)"
783.\" commit 60e2364e60e86e81bc6377f49779779e6120977f
784Records a subset of the current CPU register state
785as specified by
786.IR sample_regs_intr .
787Unlike
788.B PERF_SAMPLE_REGS_USER
789the register values will return kernel register
790state if the overflow happened while kernel
791code is running.
792If the CPU supports hardware sampling of
b01ae37b 793register state (e.g., PEBS on Intel x86) and
f5281dfd
VW
794.I precise_ip
795is set higher than zero then the register
796values returned are those captured by
797hardware at the time of the sampled
798instruction's retirement.
f2b1d720 799.RE
f2b1d720
MK
800.TP
801.IR "read_format"
802This field specifies the format of the data returned by
803.BR read (2)
804on a
7db515ef 805.BR perf_event_open ()
f2b1d720
MK
806file descriptor.
807.RS
808.TP
809.B PERF_FORMAT_TOTAL_TIME_ENABLED
7ede2f66
DP
810Adds the 64-bit
811.I time_enabled
812field.
f2b1d720
MK
813This can be used to calculate estimated totals if
814the PMU is overcommitted and multiplexing is happening.
815.TP
816.B PERF_FORMAT_TOTAL_TIME_RUNNING
7ede2f66
DP
817Adds the 64-bit
818.I time_running
819field.
f2b1d720 820This can be used to calculate estimated totals if
3d1ee497 821the PMU is overcommitted and multiplexing is happening.
f2b1d720
MK
822.TP
823.B PERF_FORMAT_ID
824Adds a 64-bit unique value that corresponds to the event group.
825.TP
826.B PERF_FORMAT_GROUP
827Allows all counter values in an event group to be read with one read.
828.RE
f2b1d720
MK
829.TP
830.IR "disabled"
831The
832.I disabled
833bit specifies whether the counter starts out disabled or enabled.
834If disabled, the event can later be enabled by
835.BR ioctl (2),
836.BR prctl (2),
837or
838.IR enable_on_exec .
efeece04 839.IP
406650db
VW
840When creating an event group, typically the group leader is initialized
841with
842.I disabled
843set to 1 and any child events are initialized with
844.I disabled
845set to 0.
846Despite
847.I disabled
848being 0, the child events will not start until the group leader
849is enabled.
f2b1d720
MK
850.TP
851.IR "inherit"
852The
853.I inherit
854bit specifies that this counter should count events of child
855tasks as well as the task specified.
33a0ccb2 856This applies only to new children, not to any existing children at
f2b1d720
MK
857the time the counter is created (nor to any new children of
858existing children).
efeece04 859.IP
f2b1d720 860Inherit does not work for some combinations of
4b3a5f01
MK
861.IR read_format
862values, such as
f2b1d720 863.BR PERF_FORMAT_GROUP .
f2b1d720
MK
864.TP
865.IR "pinned"
866The
867.I pinned
868bit specifies that the counter should always be on the CPU if at all
869possible.
33a0ccb2 870It applies only to hardware counters and only to group leaders.
f2b1d720
MK
871If a pinned counter cannot be put onto the CPU (e.g., because there are
872not enough hardware counters or because of a conflict with some other
873event), then the counter goes into an 'error' state, where reads
874return end-of-file (i.e.,
875.BR read (2)
876returns 0) until the counter is subsequently enabled or disabled.
f2b1d720
MK
877.TP
878.IR "exclusive"
879The
880.I exclusive
881bit specifies that when this counter's group is on the CPU,
882it should be the only group using the CPU's counters.
883In the future this may allow monitoring programs to
884support PMU features that need to run alone so that they do not
885disrupt other hardware counters.
efeece04 886.IP
bea10c8c
VW
887Note that many unexpected situations may prevent events with the
888.I exclusive
d3532647 889bit set from ever running.
bea10c8c 890This includes any users running a system-wide
d3532647 891measurement as well as any kernel use of the performance counters
bea10c8c 892(including the commonly enabled NMI Watchdog Timer interface).
f2b1d720
MK
893.TP
894.IR "exclude_user"
ad73a2cc 895If this bit is set, the count excludes events that happen in user space.
f2b1d720
MK
896.TP
897.IR "exclude_kernel"
edb3e316 898If this bit is set, the count excludes events that happen in kernel space.
f2b1d720
MK
899.TP
900.IR "exclude_hv"
901If this bit is set, the count excludes events that happen in the
902hypervisor.
903This is mainly for PMUs that have built-in support for handling this
904(such as POWER).
905Extra support is needed for handling hypervisor measurements on most
906machines.
f2b1d720
MK
907.TP
908.IR "exclude_idle"
909If set, don't count when the CPU is idle.
f2b1d720
MK
910.TP
911.IR "mmap"
912The
913.I mmap
75ee11e5 914bit enables generation of
cd7c700a 915.B PERF_RECORD_MMAP
75ee11e5
VW
916samples for every
917.BR mmap (2)
918call that has
cd7c700a 919.B PROT_EXEC
75ee11e5
VW
920set.
921This allows tools to notice new executable code being mapped into
922a program (dynamic shared libraries for example)
923so that addresses can be mapped back to the original code.
f2b1d720
MK
924.TP
925.IR "comm"
926The
927.I comm
928bit enables tracking of process command name as modified by the
cd7c700a 929.BR exec (2)
f2b1d720 930and
cd7c700a 931.BR prctl (PR_SET_NAME)
49bc411c
VW
932system calls as well as writing to
933.IR /proc/self/comm .
790ee6d6 934If the
49bc411c 935.I comm_exec
790ee6d6 936flag is also successfully set (possible since Linux 3.16),
747a6e7c 937.\" commit 82b897782d10fcc4930c9d4a15b175348fdd2871
49bc411c
VW
938then the misc flag
939.B PERF_RECORD_MISC_COMM_EXEC
940can be used to differentiate the
941.BR exec (2)
942case from the others.
f2b1d720
MK
943.TP
944.IR "freq"
945If this bit is set, then
946.I sample_frequency
947not
948.I sample_period
949is used when setting up the sampling interval.
f2b1d720
MK
950.TP
951.IR "inherit_stat"
952This bit enables saving of event counts on context switch for
953inherited tasks.
33a0ccb2 954This is meaningful only if the
f2b1d720
MK
955.I inherit
956field is set.
f2b1d720
MK
957.TP
958.IR "enable_on_exec"
959If this bit is set, a counter is automatically
960enabled after a call to
961.BR exec (2).
f2b1d720
MK
962.TP
963.IR "task"
964If this bit is set, then
965fork/exit notifications are included in the ring buffer.
f2b1d720
MK
966.TP
967.IR "watermark"
21977c9d 968If set, have an overflow notification happen when we cross the
f2b1d720
MK
969.I wakeup_watermark
970boundary.
21977c9d 971Otherwise, overflow notifications happen after
f2b1d720
MK
972.I wakeup_events
973samples.
f2b1d720 974.TP
31c1f2b0 975.IR "precise_ip" " (since Linux 2.6.35)"
747a6e7c 976.\" commit ab608344bcbde4f55ec4cd911b686b0ce3eae076
f2b1d720
MK
977This controls the amount of skid.
978Skid is how many instructions
979execute between an event of interest happening and the kernel
980being able to stop and record the event.
981Smaller skid is
982better and allows more accurate reporting of which events
983correspond to which instructions, but hardware is often limited
984with how small this can be.
efeece04 985.IP
5d73bc3f 986The possible values of this field are the following:
f2b1d720 987.RS
dc9ec146 988.IP 0 3
f2b1d720 989.B SAMPLE_IP
2b538c3e 990can have arbitrary skid.
dc9ec146 991.IP 1
f2b1d720 992.B SAMPLE_IP
2b538c3e 993must have constant skid.
dc9ec146 994.IP 2
f2b1d720 995.B SAMPLE_IP
2b538c3e 996requested to have 0 skid.
dc9ec146 997.IP 3
f2b1d720
MK
998.B SAMPLE_IP
999must have 0 skid.
5d73bc3f 1000See also the description of
f2b1d720
MK
1001.BR PERF_RECORD_MISC_EXACT_IP .
1002.RE
f2b1d720 1003.TP
31c1f2b0 1004.IR "mmap_data" " (since Linux 2.6.36)"
747a6e7c 1005.\" commit 3af9e859281bda7eb7c20b51879cf43aa788ac2e
b01ae37b 1006This is the counterpart of the
f2b1d720 1007.I mmap
75ee11e5
VW
1008field.
1009This enables generation of
cd7c700a 1010.B PERF_RECORD_MMAP
75ee11e5
VW
1011samples for
1012.BR mmap (2)
1013calls that do not have
cd7c700a 1014.B PROT_EXEC
75ee11e5 1015set (for example data and SysV shared memory).
f2b1d720 1016.TP
31c1f2b0 1017.IR "sample_id_all" " (since Linux 2.6.38)"
747a6e7c 1018.\" commit c980d1091810df13f21aabbce545fd98f545bbf7
7480dabb 1019If set, then TID, TIME, ID, STREAM_ID, and CPU can
f2b1d720
MK
1020additionally be included in
1021.RB non- PERF_RECORD_SAMPLE s
1022if the corresponding
1023.I sample_type
1024is selected.
efeece04 1025.IP
e9bd9b2c 1026If
7480dabb 1027.B PERF_SAMPLE_IDENTIFIER
37bee118 1028is specified, then an additional ID value is included
7480dabb
VW
1029as the last value to ease parsing the record stream.
1030This may lead to the
e9bd9b2c 1031.I id
7480dabb 1032value appearing twice.
efeece04 1033.IP
7480dabb 1034The layout is described by this pseudo-structure:
efeece04 1035.IP
7480dabb 1036.in +4n
b8302363 1037.EX
7480dabb 1038struct sample_id {
5b0fbedb
MK
1039 { u32 pid, tid; } /* if PERF_SAMPLE_TID set */
1040 { u64 time; } /* if PERF_SAMPLE_TIME set */
1041 { u64 id; } /* if PERF_SAMPLE_ID set */
1042 { u64 stream_id;} /* if PERF_SAMPLE_STREAM_ID set */
1043 { u32 cpu, res; } /* if PERF_SAMPLE_CPU set */
1044 { u64 id; } /* if PERF_SAMPLE_IDENTIFIER set */
7480dabb 1045};
5383b93b
MK
1046.EE
1047.in
f2b1d720 1048.TP
31c1f2b0 1049.IR "exclude_host" " (since Linux 3.2)"
747a6e7c 1050.\" commit a240f76165e6255384d4bdb8139895fac7988799
e38fb93e 1051When conducting measurements that include processes running
5d73bc3f
MK
1052VM instances (i.e., have executed a
1053.B KVM_RUN
1054.BR ioctl (2)),
1055only measure events happening inside a guest instance.
e38fb93e
VW
1056This is only meaningful outside the guests; this setting does
1057not change counts gathered inside of a guest.
34d4e61d 1058Currently, this functionality is x86 only.
f2b1d720 1059.TP
31c1f2b0 1060.IR "exclude_guest" " (since Linux 3.2)"
747a6e7c 1061.\" commit a240f76165e6255384d4bdb8139895fac7988799
e38fb93e 1062When conducting measurements that include processes running
5d73bc3f
MK
1063VM instances (i.e., have executed a
1064.B KVM_RUN
1065.BR ioctl (2)),
1066do not measure events happening inside guest instances.
e38fb93e
VW
1067This is only meaningful outside the guests; this setting does
1068not change counts gathered inside of a guest.
34d4e61d 1069Currently, this functionality is x86 only.
f2b1d720 1070.TP
31c1f2b0 1071.IR "exclude_callchain_kernel" " (since Linux 3.7)"
747a6e7c 1072.\" commit d077526485d5c9b12fe85d0b2b3b7041e6bc5f91
f2b1d720 1073Do not include kernel callchains.
f2b1d720 1074.TP
31c1f2b0 1075.IR "exclude_callchain_user" " (since Linux 3.7)"
747a6e7c 1076.\" commit d077526485d5c9b12fe85d0b2b3b7041e6bc5f91
f2b1d720 1077Do not include user callchains.
f2b1d720 1078.TP
9bfc542b 1079.IR "mmap2" " (since Linux 3.16)"
747a6e7c
VW
1080.\" commit 13d7a2410fa637f450a29ecb515ac318ee40c741
1081.\" This is tricky; was committed during 3.12 development
1082.\" but right before release was disabled.
1083.\" So while you could select mmap2 starting with 3.12
1084.\" it did not work until 3.16
1085.\" commit a5a5ba72843dd05f991184d6cb9a4471acce1005
9bfc542b
VW
1086Generate an extended executable mmap record that contains enough
1087additional information to uniquely identify shared mappings.
1088The
1089.I mmap
1090flag must also be set for this to work.
1091.TP
49bc411c 1092.IR "comm_exec" " (since Linux 3.16)"
747a6e7c 1093.\" commit 82b897782d10fcc4930c9d4a15b175348fdd2871
5ab35ae5 1094This is purely a feature-detection flag; it does not change
49bc411c 1095kernel behavior.
5ab35ae5 1096If this flag can successfully be set, then, when
49bc411c 1097.I comm
5ab35ae5 1098is enabled, the
49bc411c
VW
1099.B PERF_RECORD_MISC_COMM_EXEC
1100flag will be set in the
1101.I misc
1102field of a comm record header if the rename event being
1103reported was caused by a call to
1104.BR exec (2).
1105This allows tools to distinguish between the various
1106types of process renaming.
1107.TP
6bd5186a
VW
1108.IR "use_clockid" " (since Linux 4.1)"
1109.\" commit 34f439278cef7b1177f8ce24f9fc81dfc6221d3b
1110This allows selecting which internal Linux clock to use
1111when generating timestamps via the
1112.I clockid
1113field.
1114This can make it easier to correlate perf sample times with
1115timestamps generated by other tools.
1116.TP
9277a75d
VW
1117.IR "context_switch" " (since Linux 4.3)"
1118.\" commit 45ac1403f564f411c6a383a2448688ba8dd705a4
1119This enables the generation of
1120.B PERF_RECORD_SWITCH
1121records when a context switch occurs.
1122It also enables the generation of
1123.B PERF_RECORD_SWITCH_CPU_WIDE
d5a24378 1124records when sampling in CPU-wide mode.
9277a75d
VW
1125This functionality is in addition to existing tracepoint and
1126software events for measuring context switches.
54905b0f
MK
1127The advantage of this method is that it will give full
1128information even with strict
9277a75d
VW
1129.I perf_event_paranoid
1130settings.
1131.TP
f2b1d720
MK
1132.IR "wakeup_events" ", " "wakeup_watermark"
1133This union sets how many samples
1134.RI ( wakeup_events )
1135or bytes
1136.RI ( wakeup_watermark )
21977c9d 1137happen before an overflow notification happens.
f2b1d720
MK
1138Which one is used is selected by the
1139.I watermark
cb8a928f 1140bit flag.
efeece04 1141.IP
751c0f1a 1142.I wakeup_events
6170255e 1143counts only
751c0f1a 1144.B PERF_RECORD_SAMPLE
51700fd7 1145record types.
21977c9d 1146To receive overflow notification for all
751c0f1a 1147.B PERF_RECORD
21977c9d 1148types choose watermark and set
751c0f1a
VW
1149.I wakeup_watermark
1150to 1.
efeece04 1151.IP
fc79d996 1152Prior to Linux 3.0, setting
747a6e7c 1153.\" commit f506b3dc0ec454a16d40cab9ee5d75435b39dc50
21977c9d
VW
1154.I wakeup_events
1155to 0 resulted in no overflow notifications;
1156more recent kernels treat 0 the same as 1.
f2b1d720 1157.TP
31c1f2b0 1158.IR "bp_type" " (since Linux 2.6.33)"
747a6e7c 1159.\" commit 24f1e32c60c45c89a997c73395b69c8af6f0a84e
f2b1d720
MK
1160This chooses the breakpoint type.
1161It is one of:
1162.RS
1163.TP
1164.BR HW_BREAKPOINT_EMPTY
2b538c3e 1165No breakpoint.
f2b1d720
MK
1166.TP
1167.BR HW_BREAKPOINT_R
2b538c3e 1168Count when we read the memory location.
f2b1d720
MK
1169.TP
1170.BR HW_BREAKPOINT_W
2b538c3e 1171Count when we write the memory location.
f2b1d720
MK
1172.TP
1173.BR HW_BREAKPOINT_RW
2b538c3e 1174Count when we read or write the memory location.
f2b1d720
MK
1175.TP
1176.BR HW_BREAKPOINT_X
2b538c3e 1177Count when we execute code at the memory location.
dd3568a1 1178.PP
7db515ef 1179The values can be combined via a bitwise or, but the
f2b1d720
MK
1180combination of
1181.B HW_BREAKPOINT_R
1182or
1183.B HW_BREAKPOINT_W
1184with
1185.B HW_BREAKPOINT_X
1186is not allowed.
1187.RE
f2b1d720 1188.TP
31c1f2b0 1189.IR "bp_addr" " (since Linux 2.6.33)"
747a6e7c 1190.\" commit 24f1e32c60c45c89a997c73395b69c8af6f0a84e
5d73bc3f 1191This is the address of the breakpoint.
4b3a5f01
MK
1192For execution breakpoints, this is the memory address of the instruction
1193of interest; for read and write breakpoints, it is the memory address
f2b1d720 1194of the memory location of interest.
f2b1d720 1195.TP
31c1f2b0 1196.IR "config1" " (since Linux 2.6.39)"
747a6e7c 1197.\" commit a7e3ed1e470116c9d12c2f778431a481a6be8ab6
f2b1d720
MK
1198.I config1
1199is used for setting events that need an extra register or otherwise
1200do not fit in the regular config field.
1201Raw OFFCORE_EVENTS on Nehalem/Westmere/SandyBridge use this field
4b3a5f01 1202on Linux 3.3 and later kernels.
f2b1d720 1203.TP
31c1f2b0 1204.IR "bp_len" " (since Linux 2.6.33)"
747a6e7c 1205.\" commit 24f1e32c60c45c89a997c73395b69c8af6f0a84e
f2b1d720
MK
1206.I bp_len
1207is the length of the breakpoint being measured if
1208.I type
1209is
1210.BR PERF_TYPE_BREAKPOINT .
1211Options are
1212.BR HW_BREAKPOINT_LEN_1 ,
1213.BR HW_BREAKPOINT_LEN_2 ,
1214.BR HW_BREAKPOINT_LEN_4 ,
4b3a5f01 1215and
f2b1d720
MK
1216.BR HW_BREAKPOINT_LEN_8 .
1217For an execution breakpoint, set this to
1218.IR sizeof(long) .
f2b1d720 1219.TP
31c1f2b0 1220.IR "config2" " (since Linux 2.6.39)"
747a6e7c 1221.\" commit a7e3ed1e470116c9d12c2f778431a481a6be8ab6
f2b1d720
MK
1222.I config2
1223is a further extension of the
1224.I config1
1225field.
f2b1d720 1226.TP
31c1f2b0 1227.IR "branch_sample_type" " (since Linux 3.4)"
747a6e7c 1228.\" commit bce38cd53e5ddba9cb6d708c4ef3d04a4016ec7e
8a94e783 1229If
045bf4d3
VW
1230.B PERF_SAMPLE_BRANCH_STACK
1231is enabled, then this specifies what branches to include
1232in the branch record.
efeece04 1233.IP
e3c9782b 1234The first part of the value is the privilege level, which
4b3a5f01 1235is a combination of one or more of the values listed below.
045bf4d3
VW
1236If the user does not set privilege level explicitly, the kernel
1237will use the event's privilege level.
1238Event and branch privilege levels do not have to match.
f2b1d720
MK
1239.RS
1240.TP
1241.B PERF_SAMPLE_BRANCH_USER
33d6e2c7 1242Branch target is in user space.
f2b1d720
MK
1243.TP
1244.B PERF_SAMPLE_BRANCH_KERNEL
33d6e2c7 1245Branch target is in kernel space.
f2b1d720
MK
1246.TP
1247.B PERF_SAMPLE_BRANCH_HV
33d6e2c7 1248Branch target is in hypervisor.
e3c9782b
VW
1249.TP
1250.B PERF_SAMPLE_BRANCH_PLM_ALL
1251A convenience value that is the three preceding values ORed together.
11ac5b51 1252.PP
e3c9782b
VW
1253In addition to the privilege value, at least one of the
1254following bits must be set.
f2b1d720
MK
1255.TP
1256.B PERF_SAMPLE_BRANCH_ANY
33d6e2c7 1257Any branch type.
f2b1d720
MK
1258.TP
1259.B PERF_SAMPLE_BRANCH_ANY_CALL
c6e5df74 1260Any call branch (includes direct calls, indirect calls, and far jumps).
f2b1d720 1261.TP
e3c9782b 1262.B PERF_SAMPLE_BRANCH_IND_CALL
33d6e2c7 1263Indirect calls.
f2b1d720 1264.TP
c6e5df74
VW
1265.BR PERF_SAMPLE_BRANCH_CALL " (since Linux 4.4)"
1266.\" commit c229bf9dc179d2023e185c0f705bdf68484c1e73
1267Direct calls.
1268.TP
1269.B PERF_SAMPLE_BRANCH_ANY_RETURN
1270Any return branch.
1271.TP
dde354c9
VW
1272.BR PERF_SAMPLE_BRANCH_IND_JUMP " (since Linux 4.2)"
1273.\" commit c9fdfa14c3792c0160849c484e83aa57afd80ccc
1274Indirect jumps.
1275.TP
aea60aad 1276.BR PERF_SAMPLE_BRANCH_COND " (since Linux 3.16)"
60dafbc1 1277.\" commit bac52139f0b7ab31330e98fd87fc5a2664951050
aea60aad
VW
1278Conditional branches.
1279.TP
31c1f2b0 1280.BR PERF_SAMPLE_BRANCH_ABORT_TX " (since Linux 3.11)"
60dafbc1 1281.\" commit 135c5612c460f89657c4698fe2ea753f6f667963
33d6e2c7 1282Transactional memory aborts.
e3c9782b 1283.TP
31c1f2b0 1284.BR PERF_SAMPLE_BRANCH_IN_TX " (since Linux 3.11)"
60dafbc1 1285.\" commit 135c5612c460f89657c4698fe2ea753f6f667963
33d6e2c7 1286Branch in transactional memory transaction.
e3c9782b 1287.TP
31c1f2b0 1288.BR PERF_SAMPLE_BRANCH_NO_TX " (since Linux 3.11)"
60dafbc1 1289.\" commit 135c5612c460f89657c4698fe2ea753f6f667963
33d6e2c7 1290Branch not in transactional memory transaction.
bb7e6ff0
VW
.TP
1291.BR PERF_SAMPLE_BRANCH_CALL_STACK " (since Linux 4.1)"
1292.\" commit 2c44b1936bb3b135a3fac8b3493394d42e51cf70
95655a22 1293Branch is part of a hardware-generated call stack.
bb7e6ff0
VW
1294This requires hardware support, currently only found
1295on Intel x86 Haswell or newer.
f2b1d720 1296.RE
f2b1d720 1297.TP
31c1f2b0 1298.IR "sample_regs_user" " (since Linux 3.7)"
747a6e7c 1299.\" commit 4018994f3d8785275ef0e7391b75c3462c029e56
4651e412 1300This bit mask defines the set of user CPU registers to dump on samples.
76c637e1 1301The layout of the register mask is architecture-specific and
4b3a5f01 1302is described in the kernel header file
d1007d14 1303.IR arch/ARCH/include/uapi/asm/perf_regs.h .
f2b1d720 1304.TP
31c1f2b0 1305.IR "sample_stack_user" " (since Linux 3.7)"
747a6e7c 1306.\" commit c5ebcedb566ef17bda7b02686e0d658a7bb42ee7
d1007d14
VW
1307This defines the size of the user stack to dump if
1308.B PERF_SAMPLE_STACK_USER
1309is specified.
6bd5186a
VW
1310.TP
1311.IR "clockid" " (since Linux 4.1)"
1312.\" commit 34f439278cef7b1177f8ce24f9fc81dfc6221d3b
1313If
1314.I use_clockid
1315is set, then this field selects which internal Linux timer to
1316use for timestamps.
1317The available timers are defined in
1318.IR linux/time.h ,
1319with
95655a22
MK
1320.BR CLOCK_MONOTONIC ,
1321.BR CLOCK_MONOTONIC_RAW ,
1322.BR CLOCK_REALTIME ,
1323.BR CLOCK_BOOTTIME ,
1324and
1325.B CLOCK_TAI
6bd5186a 1326currently supported.
cdc52f4a
VW
1327.TP
1328.IR "aux_watermark" " (since Linux 4.1)"
1329.\" commit 1a5941312414c71dece6717da9a0fa1303127afa
1330This specifies how much data is required to trigger a
1331.B PERF_RECORD_AUX
1332sample.
fd133d5d
VW
1333.TP
1334.IR "sample_max_stack" " (since Linux 4.8)"
1335.\" commit 97c79a38cd454602645f0470ffb444b3b75ce574
1336When
1337.I sample_type
1338includes
5dd3feec 1339.BR PERF_SAMPLE_CALLCHAIN ,
4b3a5f01 1340this field specifies how many stack frames to report when
fd133d5d 1341generating the callchain.
73d8cece 1342.SS Reading results
f2b1d720 1343Once a
7db515ef 1344.BR perf_event_open ()
3d1ee497 1345file descriptor has been opened, the values
f2b1d720
MK
1346of the events can be read from the file descriptor.
1347The values that are there are specified by the
1348.I read_format
7db515ef
MK
1349field in the
1350.I attr
1351structure at open time.
efeece04 1352.PP
f2b1d720 1353If you attempt to read into a buffer that is not big enough to hold the
4b3a5f01 1354data, the error
f2b1d720 1355.B ENOSPC
4b3a5f01 1356results.
efeece04 1357.PP
f2b1d720 1358Here is the layout of the data returned by a read:
e525b89f 1359.IP * 2
f2b1d720
MK
1360If
1361.B PERF_FORMAT_GROUP
1362was specified to allow reading all events in a group at once:
efeece04 1363.IP
f2b1d720 1364.in +4n
b8302363 1365.EX
f2b1d720 1366struct read_format {
e525b89f
MK
1367 u64 nr; /* The number of events */
1368 u64 time_enabled; /* if PERF_FORMAT_TOTAL_TIME_ENABLED */
1369 u64 time_running; /* if PERF_FORMAT_TOTAL_TIME_RUNNING */
e307112d 1370 struct {
e525b89f
MK
1371 u64 value; /* The value of the event */
1372 u64 id; /* if PERF_FORMAT_ID */
f2b1d720
MK
1373 } values[nr];
1374};
b8302363 1375.EE
f2b1d720 1376.in
e525b89f 1377.IP *
f2b1d720
MK
1378If
1379.B PERF_FORMAT_GROUP
1380was
1381.I not
e525b89f 1382specified:
efeece04 1383.IP
f2b1d720 1384.in +4n
b8302363 1385.EX
f2b1d720
MK
1386struct read_format {
1387 u64 value; /* The value of the event */
1388 u64 time_enabled; /* if PERF_FORMAT_TOTAL_TIME_ENABLED */
1389 u64 time_running; /* if PERF_FORMAT_TOTAL_TIME_RUNNING */
1390 u64 id; /* if PERF_FORMAT_ID */
1391};
b8302363 1392.EE
f2b1d720 1393.in
e525b89f
MK
1394.PP
1395The values read are as follows:
f2b1d720
MK
1396.TP
1397.I nr
1398The number of events in this file descriptor.
fcc4f4f4 1399Available only if
f2b1d720
MK
1400.B PERF_FORMAT_GROUP
1401was specified.
f2b1d720
MK
1402.TP
1403.IR time_enabled ", " time_running
1404Total time the event was enabled and running.
4b3a5f01 1405Normally these values are the same.
37bee118
MK
1406If more events are started
1407than available counter slots on the PMU, then multiplexing
33a0ccb2 1408happens and events run only part of the time.
37bee118 1409In that case, the
f2b1d720
MK
1410.I time_enabled
1411and
1412.I time_running
1413values can be used to scale an estimated value for the count.
f2b1d720
MK
1414.TP
1415.I value
1416An unsigned 64-bit value containing the counter result.
f2b1d720
MK
1417.TP
1418.I id
4b3a5f01 1419A globally unique value for this particular event; only present if
f2b1d720 1420.B PERF_FORMAT_ID
e525b89f
MK
1421was specified in
1422.IR read_format .
73d8cece 1423.SS MMAP layout
f2b1d720 1424When using
7db515ef 1425.BR perf_event_open ()
f2b1d720
MK
1426in sampled mode, asynchronous events
1427(like counter overflow or
1428.B PROT_EXEC
1429mmap tracking)
1430are logged into a ring-buffer.
1431This ring-buffer is created and accessed through
1432.BR mmap (2).
efeece04 1433.PP
f2b1d720
MK
1434The mmap size should be 1+2^n pages, where the first page is a
1435metadata page
e525b89f 1436.RI ( "struct perf_event_mmap_page" )
f2b1d720
MK
1437that contains various
1438bits of information such as where the ring-buffer head is.
efeece04 1439.PP
95655a22 1440Before kernel 2.6.39, there is a bug that means you must allocate an mmap
f2b1d720 1441ring buffer when sampling even if you do not plan to access it.
efeece04 1442.PP
f2b1d720 1443The structure of the first metadata mmap page is as follows:
efeece04 1444.PP
f2b1d720 1445.in +4n
b8302363 1446.EX
f2b1d720 1447struct perf_event_mmap_page {
ce88f77b
MK
1448 __u32 version; /* version number of this structure */
1449 __u32 compat_version; /* lowest version this is compat with */
1450 __u32 lock; /* seqlock for synchronization */
1451 __u32 index; /* hardware counter identifier */
1452 __s64 offset; /* add to hardware counter value */
1453 __u64 time_enabled; /* time event active */
1454 __u64 time_running; /* time event on CPU */
f2b1d720
MK
1455 union {
1456 __u64 capabilities;
135cba8b 1457 struct {
ce88f77b
MK
1458 __u64 cap_usr_time / cap_usr_rdpmc / cap_bit0 : 1,
1459 cap_bit0_is_deprecated : 1,
1460 cap_user_rdpmc : 1,
1461 cap_user_time : 1,
1462 cap_user_time_zero : 1,
135cba8b 1463 };
f2b1d720 1464 };
ce88f77b
MK
1465 __u16 pmc_width;
1466 __u16 time_shift;
1467 __u32 time_mult;
1468 __u64 time_offset;
ee8655b5 1469 __u64 __reserved[120]; /* Pad to 1 k */
ce88f77b
MK
1470 __u64 data_head; /* head in the data section */
1471 __u64 data_tail; /* user-space written tail */
21d9849a
VW
1472 __u64 data_offset; /* where the buffer starts */
1473 __u64 data_size; /* data buffer size */
4e47c6e5
VW
1474 __u64 aux_head;
1475 __u64 aux_tail;
1476 __u64 aux_offset;
1477 __u64 aux_size;
21d9849a 1478
f2b1d720 1479}
b8302363 1480.EE
f2b1d720 1481.in
efeece04 1482.PP
ce88f77b 1483The following list describes the fields in the
f2b1d720 1484.I perf_event_mmap_page
e525b89f 1485structure in more detail:
f2b1d720
MK
1486.TP
1487.I version
1488Version number of this structure.
f2b1d720
MK
1489.TP
1490.I compat_version
1491The lowest version this is compatible with.
f2b1d720
MK
1492.TP
1493.I lock
1494A seqlock for synchronization.
f2b1d720
MK
1495.TP
1496.I index
1497A unique hardware counter identifier.
f2b1d720
MK
1498.TP
1499.I offset
135cba8b
VW
1500When using rdpmc for reads this offset value
1501must be added to the one returned by rdpmc to get
1502the current total event count.
f2b1d720
MK
1503.TP
1504.I time_enabled
1505Time the event was active.
f2b1d720
MK
1506.TP
1507.I time_running
1508Time the event was running.
f2b1d720 1509.TP
31c1f2b0 1510.IR cap_usr_time " / " cap_usr_rdpmc " / " cap_bit0 " (since Linux 3.4)"
747a6e7c 1511.\" commit c7206205d00ab375839bd6c7ddb247d600693c09
e9bd9b2c 1512There was a bug in the definition of
f2b1d720 1513.I cap_usr_time
135cba8b
VW
1514and
1515.I cap_usr_rdpmc
1516from Linux 3.4 until Linux 3.11.
1517Both bits were defined to point to the same location, so it was
e9bd9b2c 1518impossible to know if
135cba8b
VW
1519.I cap_usr_time
1520or
1521.I cap_usr_rdpmc
1522were actually set.
efeece04 1523.IP
4010bc07 1524Starting with Linux 3.12, these are renamed to
747a6e7c 1525.\" commit fa7315871046b9a4c48627905691dbde57e51033
135cba8b 1526.I cap_bit0
e41c36b2 1527and you should use the
135cba8b
VW
1528.I cap_user_time
1529and
1530.I cap_user_rdpmc
1531fields instead.
f2b1d720 1532.TP
31c1f2b0 1533.IR cap_bit0_is_deprecated " (since Linux 3.12)"
747a6e7c 1534.\" commit fa7315871046b9a4c48627905691dbde57e51033
37bee118 1535If set, this bit indicates that the kernel supports
135cba8b
VW
1536the properly separated
1537.I cap_user_time
1538and
1539.I cap_user_rdpmc
1540bits.
efeece04 1541.IP
135cba8b
VW
1542If not-set, it indicates an older kernel where
1543.I cap_usr_time
1544and
f2b1d720 1545.I cap_usr_rdpmc
135cba8b
VW
1546map to the same bit and thus both features should
1547be used with caution.
135cba8b 1548.TP
31c1f2b0 1549.IR cap_user_rdpmc " (since Linux 3.12)"
747a6e7c 1550.\" commit fa7315871046b9a4c48627905691dbde57e51033
f2b1d720
MK
1551If the hardware supports user-space read of performance counters
1552without syscall (this is the "rdpmc" instruction on x86), then
1553the following code can be used to do a read:
efeece04 1554.IP
f2b1d720 1555.in +4n
b8302363 1556.EX
f2b1d720
MK
1557u32 seq, time_mult, time_shift, idx, width;
1558u64 count, enabled, running;
1559u64 cyc, time_offset;
f2b1d720
MK
1560
1561do {
1562 seq = pc\->lock;
1563 barrier();
1564 enabled = pc\->time_enabled;
1565 running = pc\->time_running;
1566
1567 if (pc\->cap_usr_time && enabled != running) {
1568 cyc = rdtsc();
1569 time_offset = pc\->time_offset;
1570 time_mult = pc\->time_mult;
1571 time_shift = pc\->time_shift;
1572 }
1573
1574 idx = pc\->index;
1575 count = pc\->offset;
1576
1577 if (pc\->cap_usr_rdpmc && idx) {
1578 width = pc\->pmc_width;
135cba8b 1579 count += rdpmc(idx \- 1);
f2b1d720
MK
1580 }
1581
1582 barrier();
1583} while (pc\->lock != seq);
b8302363 1584.EE
f2b1d720 1585.in
f2b1d720 1586.TP
cc19ea28 1587.IR cap_user_time " (since Linux 3.12)"
747a6e7c 1588.\" commit fa7315871046b9a4c48627905691dbde57e51033
7d182bb6 1589This bit indicates the hardware has a constant, nonstop
135cba8b
VW
1590timestamp counter (TSC on x86).
1591.TP
31c1f2b0 1592.IR cap_user_time_zero " (since Linux 3.12)"
747a6e7c 1593.\" commit fa7315871046b9a4c48627905691dbde57e51033
135cba8b
VW
1594Indicates the presence of
1595.I time_zero
1596which allows mapping timestamp values to
1597the hardware clock.
1598.TP
f2b1d720
MK
1599.I pmc_width
1600If
1601.IR cap_usr_rdpmc ,
1602this field provides the bit-width of the value
1603read using the rdpmc or equivalent instruction.
1604This can be used to sign extend the result like:
efeece04 1605.IP
f2b1d720 1606.in +4n
b8302363 1607.EX
f2b1d720
MK
1608pmc <<= 64 \- pmc_width;
1609pmc >>= 64 \- pmc_width; // signed shift right
1610count += pmc;
b8302363 1611.EE
f2b1d720 1612.in
f2b1d720
MK
1613.TP
1614.IR time_shift ", " time_mult ", " time_offset
efeece04 1615.IP
f2b1d720
MK
1616If
1617.IR cap_usr_time ,
1618these fields can be used to compute the time
4b3a5f01
MK
1619delta since
1620.I time_enabled
1621(in nanoseconds) using rdtsc or similar.
408731d4 1622.IP
f2b1d720 1623.nf
f2b1d720
MK
1624 u64 quot, rem;
1625 u64 delta;
1626 quot = (cyc >> time_shift);
988688f6 1627 rem = cyc & (((u64)1 << time_shift) \- 1);
f2b1d720
MK
1628 delta = time_offset + quot * time_mult +
1629 ((rem * time_mult) >> time_shift);
1630.fi
efeece04 1631.IP
7db515ef
MK
1632Where
1633.IR time_offset ,
1634.IR time_mult ,
1635.IR time_shift ,
1636and
1637.IR cyc
1638are read in the
f2b1d720
MK
1639seqcount loop described above.
1640This delta can then be added to
1641enabled and possibly running (if idx), improving the scaling:
408731d4 1642.IP
f2b1d720 1643.nf
f2b1d720
MK
1644 enabled += delta;
1645 if (idx)
1646 running += delta;
1647 quot = count / running;
1648 rem = count % running;
1649 count = quot * enabled + (rem * enabled) / running;
1650.fi
f2b1d720 1651.TP
31c1f2b0 1652.IR time_zero " (since Linux 3.12)"
747a6e7c 1653.\" commit fa7315871046b9a4c48627905691dbde57e51033
efeece04 1654.IP
e9bd9b2c 1655If
135cba8b 1656.I cap_user_time_zero
37bee118 1657is set, then the hardware clock (the TSC timestamp counter on x86)
135cba8b
VW
1658can be calculated from the
1659.IR time_zero ", " time_mult ", and " time_shift " values:"
efeece04 1660.IP
135cba8b
VW
1661.nf
1662 time = timestamp \- time_zero;
1663 quot = time / time_mult;
1664 rem = time % time_mult;
1665 cyc = (quot << time_shift) + (rem << time_shift) / time_mult;
1666.fi
efeece04 1667.IP
135cba8b 1668And vice versa:
efeece04 1669.IP
135cba8b
VW
1670.nf
1671 quot = cyc >> time_shift;
988688f6 1672 rem = cyc & (((u64)1 << time_shift) \- 1);
135cba8b
VW
1673 timestamp = time_zero + quot * time_mult +
1674 ((rem * time_mult) >> time_shift);
1675.fi
1676.TP
f2b1d720
MK
1677.I data_head
1678This points to the head of the data section.
7db515ef
MK
1679The value continuously increases; it does not wrap.
1680The value needs to be manually wrapped by the size of the mmap buffer
f2b1d720 1681before accessing the samples.
efeece04 1682.IP
ce88f77b
MK
1683On SMP-capable platforms, after reading the
1684.I data_head
1685value,
ad73a2cc 1686user space should issue an rmb().
f2b1d720 1687.TP
fecd584f 1688.I data_tail
f2b1d720
MK
1689When the mapping is
1690.BR PROT_WRITE ,
7db515ef
MK
1691the
1692.I data_tail
1693value should be written by user space to reflect the last read data.
31020de9 1694In this case, the kernel will not overwrite unread data.
21d9849a
VW
1695.TP
1696.IR data_offset " (since Linux 4.1)"
1697.\" commit e8c6deac69629c0cb97c3d3272f8631ef17f8f0f
1698Contains the offset of the location in the mmap buffer
1699where perf sample data begins.
1700.TP
1701.IR data_size " (since Linux 4.1)"
1702.\" commit e8c6deac69629c0cb97c3d3272f8631ef17f8f0f
1703Contains the size of the perf sample region within
1704the mmap buffer.
4e47c6e5
VW
1705.TP
1706.IR aux_head ", " aux_tail ", " aux_offset ", " aux_size " (since Linux 4.1)"
1707.\" commit 45bfb2e50471abbbfd83d40d28c986078b0d24ff
95655a22
MK
1708The AUX region allows mmaping a separate sample buffer for
1709high-bandwidth data streams (separate from the main perf sample buffer).
1710An example of a high-bandwidth stream is instruction tracing support,
4e47c6e5 1711as is found in newer Intel processors.
efeece04 1712.IP
4e47c6e5
VW
1713To set up an AUX area, first
1714.I aux_offset
1715needs to be set with an offset greater than
1716.IR data_offset + data_size
1717and
1718.I aux_size
1719needs to be set to the desired buffer size.
1720The desired offset and size must be page aligned, and the size
1721must be a power of two.
1722These values are then passed to mmap in order to map the AUX buffer.
95655a22
MK
1723Pages in the AUX buffer are included as part of the
1724.BR RLIMIT_MEMLOCK
1725resource limit (see
1726.BR setrlimit (2)),
1727and also as part of the
4e47c6e5
VW
1728.I perf_event_mlock_kb
1729allowance.
efeece04 1730.IP
95655a22 1731By default, the AUX buffer will be truncated if it will not fit
b1355f6a
VW
1732in the available space in the ring buffer.
1733If the AUX buffer is mapped as a read only buffer, then it will
1734operate in ring buffer mode where old data will be overwritten
1735by new.
95655a22 1736In overwrite mode, it might not be possible to infer where the
b1355f6a
VW
1737new data began, and it is the consumer's job to disable
1738measurement while reading to avoid possible data races.
efeece04 1739.IP
4e47c6e5
VW
1740The
1741.IR aux_head " and " aux_tail
1742ring buffer pointers have the same behavior and ordering
1743rules as the previously described
1744.IR data_head " and " data_tail .
e525b89f 1745.PP
f2b1d720 1746The following 2^n ring-buffer pages have the layout described below.
efeece04 1747.PP
f2b1d720
MK
1748If
1749.I perf_event_attr.sample_id_all
1750is set, then all event types will
1751have the sample_type selected fields related to where/when (identity)
1752an event took place (TID, TIME, ID, CPU, STREAM_ID) described in
1753.B PERF_RECORD_SAMPLE
1754below, it will be stashed just after the
7db515ef
MK
1755.I perf_event_header
1756and the fields already present for the existing
3d1ee497 1757fields, that is, at the end of the payload.
4b3a5f01
MK
1758This allows a newer perf.data
1759file to be supported by older perf tools, with the new optional
f2b1d720 1760fields being ignored.
efeece04 1761.PP
f2b1d720 1762The mmap values start with a header:
efeece04 1763.PP
f2b1d720 1764.in +4n
b8302363 1765.EX
f2b1d720
MK
1766struct perf_event_header {
1767 __u32 type;
1768 __u16 misc;
1769 __u16 size;
1770};
b8302363 1771.EE
f2b1d720 1772.in
efeece04 1773.PP
f2b1d720
MK
1774Below, we describe the
1775.I perf_event_header
1776fields in more detail.
4047bc6c
MK
1777For ease of reading,
1778the fields with shorter descriptions are presented first.
1779.TP
1780.I size
1781This indicates the size of the record.
1782.TP
1783.I misc
1784The
1785.I misc
1786field contains additional information about the sample.
efeece04 1787.IP
4047bc6c
MK
1788The CPU mode can be determined from this value by masking with
1789.B PERF_RECORD_MISC_CPUMODE_MASK
1790and looking for one of the following (note these are not
1791bit masks, only one can be set at a time):
1792.RS
1793.TP
1794.B PERF_RECORD_MISC_CPUMODE_UNKNOWN
1795Unknown CPU mode.
1796.TP
1797.B PERF_RECORD_MISC_KERNEL
1798Sample happened in the kernel.
1799.TP
1800.B PERF_RECORD_MISC_USER
1801Sample happened in user code.
1802.TP
1803.B PERF_RECORD_MISC_HYPERVISOR
1804Sample happened in the hypervisor.
1805.TP
747a6e7c 1806.BR PERF_RECORD_MISC_GUEST_KERNEL " (since Linux 2.6.35)"
60dafbc1 1807.\" commit 39447b386c846bbf1c56f6403c5282837486200f
4047bc6c
MK
1808Sample happened in the guest kernel.
1809.TP
747a6e7c 1810.BR PERF_RECORD_MISC_GUEST_USER " (since Linux 2.6.35)"
60dafbc1 1811.\" commit 39447b386c846bbf1c56f6403c5282837486200f
4047bc6c
MK
1812Sample happened in guest user code.
1813.RE
efeece04 1814.PP
4047bc6c 1815.RS
d5a24378
MK
1816Since the following three statuses are generated by
1817different record types, they alias to the same bit:
4047bc6c 1818.TP
60dafbc1
MK
1819.BR PERF_RECORD_MISC_MMAP_DATA " (since Linux 3.10)"
1820.\" commit 2fe85427e3bf65d791700d065132772fc26e4d75
4047bc6c
MK
1821This is set when the mapping is not executable;
1822otherwise the mapping is executable.
1823.TP
60dafbc1
MK
1824.BR PERF_RECORD_MISC_COMM_EXEC " (since Linux 3.16)"
1825.\" commit 82b897782d10fcc4930c9d4a15b175348fdd2871
49bc411c
VW
1826This is set for a
1827.B PERF_RECORD_COMM
1828record on Linux 3.16 and later kernels
1829if a process name change was caused by an
1830.BR exec (2)
1831system call.
9277a75d
VW
1832.TP
1833.BR PERF_RECORD_MISC_SWITCH_OUT " (since Linux 4.3)"
1834.\" commit 45ac1403f564f411c6a383a2448688ba8dd705a4
1835When a
d5a24378
MK
1836.BR PERF_RECORD_SWITCH
1837or
1838.BR PERF_RECORD_SWITCH_CPU_WIDE
1839record is generated, this bit indicates that the
9277a75d 1840context switch is away from the current process
d5a24378 1841(instead of into the current process).
9277a75d 1842.RE
efeece04 1843.PP
9277a75d
VW
1844.RS
1845In addition, the following bits can be set:
49bc411c 1846.TP
4047bc6c
MK
1847.B PERF_RECORD_MISC_EXACT_IP
1848This indicates that the content of
1849.B PERF_SAMPLE_IP
1850points
1851to the actual instruction that triggered the event.
1852See also
1853.IR perf_event_attr.precise_ip .
1854.TP
60dafbc1
MK
1855.BR PERF_RECORD_MISC_EXT_RESERVED " (since Linux 2.6.35)"
1856.\" commit 1676b8a077c352085d52578fb4f29350b58b6e74
4047bc6c 1857This indicates there is extended data available (currently not used).
ffbc7c02
VW
1858.TP
1859.B PERF_RECORD_MISC_PROC_MAP_PARSE_TIMEOUT
1860.\" commit 930e6fcd2bcce9bcd9d4aa7e755678d33f3fe6f4
1861This bit is not set by the kernel.
141efa1b
MK
1862It is reserved for the user-space perf utility to indicate that
1863.I /proc/[pid]/maps
1864parsing was taking too long and was stopped, and thus the mmap
ffbc7c02 1865records may be truncated.
4047bc6c 1866.RE
f2b1d720
MK
1867.TP
1868.I type
1869The
1870.I type
1871value is one of the below.
1872The values in the corresponding record (that follows the header)
1873depend on the
1874.I type
1875selected as shown.
f2b1d720 1876.RS
7db515ef 1877.TP 4
f2b1d720
MK
1878.B PERF_RECORD_MMAP
1879The MMAP events record the
1880.B PROT_EXEC
1881mappings so that we can correlate
ad73a2cc 1882user-space IPs to code.
f2b1d720 1883They have the following structure:
efeece04 1884.IP
f2b1d720 1885.in +4n
b8302363 1886.EX
f2b1d720
MK
1887struct {
1888 struct perf_event_header header;
1889 u32 pid, tid;
1890 u64 addr;
1891 u64 len;
1892 u64 pgoff;
1893 char filename[];
1894};
b8302363 1895.EE
f2b1d720 1896.in
9bfc542b
VW
1897.RS
1898.TP
1899.I pid
3a058284 1900is the process ID.
9bfc542b
VW
1901.TP
1902.I tid
3a058284 1903is the thread ID.
9bfc542b
VW
1904.TP
1905.I addr
1906is the address of the allocated memory.
.TP
1907.I len
1908is the length of the allocated memory.
.TP
1909.I pgoff
1910is the page offset of the allocated memory.
.TP
1911.I filename
1912is a string describing the backing of the allocated memory.
1913.RE
f2b1d720
MK
1914.TP
1915.B PERF_RECORD_LOST
1916This record indicates when events are lost.
efeece04 1917.IP
f2b1d720 1918.in +4n
b8302363 1919.EX
f2b1d720
MK
1920struct {
1921 struct perf_event_header header;
7a10da70
MK
1922 u64 id;
1923 u64 lost;
7480dabb 1924 struct sample_id sample_id;
f2b1d720 1925};
b8302363 1926.EE
f2b1d720 1927.in
f2b1d720
MK
1928.RS
1929.TP
1930.I id
1931is the unique event ID for the samples that were lost.
1932.TP
1933.I lost
1934is the number of events that were lost.
1935.RE
f2b1d720
MK
1936.TP
1937.B PERF_RECORD_COMM
1938This record indicates a change in the process name.
efeece04 1939.IP
f2b1d720 1940.in +4n
b8302363 1941.EX
f2b1d720
MK
1942struct {
1943 struct perf_event_header header;
7a10da70
MK
1944 u32 pid;
1945 u32 tid;
1946 char comm[];
7480dabb 1947 struct sample_id sample_id;
f2b1d720 1948};
b8302363 1949.EE
f2b1d720 1950.in
49bc411c
VW
1951.RS
1952.TP
1953.I pid
5ab35ae5 1954is the process ID.
49bc411c
VW
1955.TP
1956.I tid
5ab35ae5 1957is the thread ID.
49bc411c
VW
1958.TP
1959.I comm
1960is a string containing the new name of the process.
1961.RE
f2b1d720
MK
1962.TP
1963.B PERF_RECORD_EXIT
1964This record indicates a process exit event.
efeece04 1965.IP
f2b1d720 1966.in +4n
b8302363 1967.EX
f2b1d720
MK
1968struct {
1969 struct perf_event_header header;
7a10da70
MK
1970 u32 pid, ppid;
1971 u32 tid, ptid;
1972 u64 time;
7480dabb 1973 struct sample_id sample_id;
f2b1d720 1974};
b8302363 1975.EE
f2b1d720 1976.in
f2b1d720
MK
1977.TP
1978.BR PERF_RECORD_THROTTLE ", " PERF_RECORD_UNTHROTTLE
1979This record indicates a throttle/unthrottle event.
efeece04 1980.IP
f2b1d720 1981.in +4n
b8302363 1982.EX
f2b1d720
MK
1983struct {
1984 struct perf_event_header header;
7a10da70
MK
1985 u64 time;
1986 u64 id;
1987 u64 stream_id;
7480dabb 1988 struct sample_id sample_id;
f2b1d720 1989};
b8302363 1990.EE
f2b1d720 1991.in
f2b1d720
MK
1992.TP
1993.B PERF_RECORD_FORK
1994This record indicates a fork event.
efeece04 1995.IP
f2b1d720 1996.in +4n
b8302363 1997.EX
f2b1d720
MK
1998struct {
1999 struct perf_event_header header;
7a10da70
MK
2000 u32 pid, ppid;
2001 u32 tid, ptid;
2002 u64 time;
7480dabb 2003 struct sample_id sample_id;
f2b1d720 2004};
b8302363 2005.EE
f2b1d720 2006.in
f2b1d720
MK
2007.TP
2008.B PERF_RECORD_READ
2009This record indicates a read event.
efeece04 2010.IP
f2b1d720 2011.in +4n
b8302363 2012.EX
f2b1d720
MK
2013struct {
2014 struct perf_event_header header;
7a10da70 2015 u32 pid, tid;
f2b1d720 2016 struct read_format values;
7480dabb 2017 struct sample_id sample_id;
f2b1d720 2018};
b8302363 2019.EE
f2b1d720 2020.in
f2b1d720
MK
2021.TP
2022.B PERF_RECORD_SAMPLE
2023This record indicates a sample.
efeece04 2024.IP
f2b1d720 2025.in +4n
b8302363 2026.EX
f2b1d720
MK
2027struct {
2028 struct perf_event_header header;
880403e9
MK
2029 u64 sample_id; /* if PERF_SAMPLE_IDENTIFIER */
2030 u64 ip; /* if PERF_SAMPLE_IP */
2031 u32 pid, tid; /* if PERF_SAMPLE_TID */
2032 u64 time; /* if PERF_SAMPLE_TIME */
2033 u64 addr; /* if PERF_SAMPLE_ADDR */
2034 u64 id; /* if PERF_SAMPLE_ID */
2035 u64 stream_id; /* if PERF_SAMPLE_STREAM_ID */
2036 u32 cpu, res; /* if PERF_SAMPLE_CPU */
2037 u64 period; /* if PERF_SAMPLE_PERIOD */
2038 struct read_format v; /* if PERF_SAMPLE_READ */
2039 u64 nr; /* if PERF_SAMPLE_CALLCHAIN */
2040 u64 ips[nr]; /* if PERF_SAMPLE_CALLCHAIN */
2041 u32 size; /* if PERF_SAMPLE_RAW */
2042 char data[size]; /* if PERF_SAMPLE_RAW */
2043 u64 bnr; /* if PERF_SAMPLE_BRANCH_STACK */
7db515ef 2044 struct perf_branch_entry lbr[bnr];
880403e9
MK
2045 /* if PERF_SAMPLE_BRANCH_STACK */
2046 u64 abi; /* if PERF_SAMPLE_REGS_USER */
7a10da70 2047 u64 regs[weight(mask)];
880403e9
MK
2048 /* if PERF_SAMPLE_REGS_USER */
2049 u64 size; /* if PERF_SAMPLE_STACK_USER */
2050 char data[size]; /* if PERF_SAMPLE_STACK_USER */
2051 u64 dyn_size; /* if PERF_SAMPLE_STACK_USER &&
2052 size != 0 */
2053 u64 weight; /* if PERF_SAMPLE_WEIGHT */
2054 u64 data_src; /* if PERF_SAMPLE_DATA_SRC */
2055 u64 transaction; /* if PERF_SAMPLE_TRANSACTION */
2056 u64 abi; /* if PERF_SAMPLE_REGS_INTR */
7a10da70 2057 u64 regs[weight(mask)];
880403e9 2058 /* if PERF_SAMPLE_REGS_INTR */
f2b1d720 2059};
ba4924aa 2060.EE
4047bc6c
MK
2061.RS 4
2062.TP 4
7480dabb
VW
2063.I sample_id
2064If
2065.B PERF_SAMPLE_IDENTIFIER
2066is enabled, a 64-bit unique ID is included.
e9bd9b2c 2067This is a duplication of the
7480dabb
VW
2068.B PERF_SAMPLE_ID
2069.I id
2070value, but included at the beginning of the sample
2071so parsers can easily obtain the value.
2072.TP
f2b1d720 2073.I ip
7db515ef
MK
2074If
2075.B PERF_SAMPLE_IP
2076is enabled, then a 64-bit instruction
f2b1d720 2077pointer value is included.
f2b1d720 2078.TP
7db515ef
MK
2079.IR pid ", " tid
2080If
2081.B PERF_SAMPLE_TID
2082is enabled, then a 32-bit process ID
2083and 32-bit thread ID are included.
f2b1d720
MK
2084.TP
2085.I time
7db515ef
MK
2086If
2087.B PERF_SAMPLE_TIME
2088is enabled, then a 64-bit timestamp
f2b1d720
MK
2089is included.
2090This is obtained via local_clock() which is a hardware timestamp
2091if available and the jiffies value if not.
f2b1d720
MK
2092.TP
2093.I addr
7db515ef
MK
2094If
2095.B PERF_SAMPLE_ADDR
2096is enabled, then a 64-bit address is included.
f2b1d720
MK
2097This is usually the address of a tracepoint,
2098breakpoint, or software event; otherwise the value is 0.
f2b1d720
MK
2099.TP
2100.I id
7db515ef
MK
2101If
2102.B PERF_SAMPLE_ID
2103is enabled, a 64-bit unique ID is included.
f2b1d720 2104If the event is a member of an event group, the group leader ID is returned.
7db515ef
MK
2105This ID is the same as the one returned by
2106.BR PERF_FORMAT_ID .
f2b1d720
MK
2107.TP
2108.I stream_id
7db515ef
MK
2109If
2110.B PERF_SAMPLE_STREAM_ID
2111is enabled, a 64-bit unique ID is included.
f2b1d720
MK
2112Unlike
2113.B PERF_SAMPLE_ID
2114the actual ID is returned, not the group leader.
7db515ef
MK
2115This ID is the same as the one returned by
2116.BR PERF_FORMAT_ID .
f2b1d720 2117.TP
7db515ef
MK
2118.IR cpu ", " res
2119If
2120.B PERF_SAMPLE_CPU
2121is enabled, this is a 32-bit value indicating
f2b1d720
MK
2122which CPU was being used, in addition to a reserved (unused)
212332-bit value.
f2b1d720
MK
2124.TP
2125.I period
7db515ef
MK
2126If
2127.B PERF_SAMPLE_PERIOD
2128is enabled, a 64-bit value indicating
f2b1d720 2129the current sampling period is written.
f2b1d720
MK
2130.TP
2131.I v
7db515ef
MK
2132If
2133.B PERF_SAMPLE_READ
2134is enabled, a structure of type read_format
f2b1d720
MK
2135is included which has values for all events in the event group.
2136The values included depend on the
2137.I read_format
7db515ef
MK
2138value used at
2139.BR perf_event_open ()
2140time.
f2b1d720 2141.TP
7db515ef
MK
2142.IR nr ", " ips[nr]
2143If
2144.B PERF_SAMPLE_CALLCHAIN
2145is enabled, then a 64-bit number is included
f2b1d720 2146which indicates how many 64-bit instruction pointers will
7db515ef
MK
2147follow.
2148This is the current callchain.
f2b1d720 2149.TP
7ede2f66 2150.IR size ", " data[size]
7db515ef
MK
2151If
2152.B PERF_SAMPLE_RAW
2153is enabled, then a 32-bit value indicating size
f2b1d720
MK
2154is included followed by an array of 8-bit values of length size.
2155The values are padded with 0 to have 64-bit alignment.
efeece04 2156.IP
f2b1d720
MK
2157This RAW record data is opaque with respect to the ABI.
2158The ABI doesn't make any promises with respect to the stability
2159of its content, it may vary depending
2160on event, hardware, and kernel version.
f2b1d720 2161.TP
7db515ef
MK
2162.IR bnr ", " lbr[bnr]
2163If
2164.B PERF_SAMPLE_BRANCH_STACK
2165is enabled, then a 64-bit value indicating
2166the number of records is included, followed by
2167.I bnr
2168.I perf_branch_entry
045bf4d3
VW
2169structures which each include the fields:
2170.RS
2171.TP
2172.I from
2b538c3e 2173This indicates the source instruction (may not be a branch).
045bf4d3
VW
2174.TP
2175.I to
2b538c3e 2176The branch target.
045bf4d3
VW
2177.TP
2178.I mispred
2b538c3e 2179The branch target was mispredicted.
045bf4d3
VW
2180.TP
2181.I predicted
2b538c3e 2182The branch target was predicted.
e3c9782b 2183.TP
31c1f2b0 2184.IR in_tx " (since Linux 3.11)"
747a6e7c 2185.\" commit 135c5612c460f89657c4698fe2ea753f6f667963
2b538c3e 2186The branch was in a transactional memory transaction.
e3c9782b 2187.TP
31c1f2b0 2188.IR abort " (since Linux 3.11)"
747a6e7c 2189.\" commit 135c5612c460f89657c4698fe2ea753f6f667963
2b538c3e 2190The branch was in an aborted transactional memory transaction.
96919592
VW
2191.TP
2192.IR cycles " (since Linux 4.3)"
2193.\" commit 71ef3c6b9d4665ee7afbbe4c208a98917dcfc32f
2194This reports the number of cycles elapsed since the
2195previous branch stack update.
11ac5b51 2196.PP
045bf4d3
VW
2197The entries are from most to least recent, so the first entry
2198has the most recent branch.
efeece04 2199.PP
8a94e783 2200Support for
dceb9af6
MK
2201.IR mispred ,
2202.IR predicted ,
2203and
2204.IR cycles
96919592 2205is optional; if not supported, those
045bf4d3 2206values will be 0.
efeece04 2207.PP
e3c9782b
VW
2208The type of branches recorded is specified by the
2209.I branch_sample_type
2210field.
2211.RE
f2b1d720 2212.TP
7db515ef
MK
2213.IR abi ", " regs[weight(mask)]
2214If
2215.B PERF_SAMPLE_REGS_USER
d1007d14 2216is enabled, then the user CPU registers are recorded.
efeece04 2217.IP
f2b1d720
MK
2218The
2219.I abi
2220field is one of
2221.BR PERF_SAMPLE_REGS_ABI_NONE ", " PERF_SAMPLE_REGS_ABI_32 " or "
7db515ef 2222.BR PERF_SAMPLE_REGS_ABI_64 .
efeece04 2223.IP
d1007d14
VW
2224The
2225.I regs
2226field is an array of the CPU registers that were specified by
2227the
2228.I sample_regs_user
2229attr field.
2230The number of values is the number of bits set in the
51700fd7 2231.I sample_regs_user
4651e412 2232bit mask.
f2b1d720 2233.TP
7db515ef
MK
2234.IR size ", " data[size] ", " dyn_size
2235If
2236.B PERF_SAMPLE_STACK_USER
02ca78a0
VW
2237is enabled, then the user stack is recorded.
2238This can be used to generate stack backtraces.
d1007d14
VW
2239.I size
2240is the size requested by the user in
02ca78a0 2241.I sample_stack_user
d1007d14
VW
2242or else the maximum record size.
2243.I data
02ca78a0
VW
2244is the stack data (a raw dump of the memory pointed to by the
2245stack pointer at the time of sampling).
d1007d14
VW
2246.I dyn_size
2247is the amount of data actually dumped (can be less than
460e3d7a 2248.IR size ).
4dc411dd
KF
2249Note that
2250.I dyn_size
2251is omitted if
2252.I size
2253is 0.
d1007d14 2254.TP
51700fd7 2255.I weight
d1007d14
VW
2256If
2257.B PERF_SAMPLE_WEIGHT
7de4a1e3 2258is enabled, then a 64-bit value provided by the hardware
d1007d14
VW
2259is recorded that indicates how costly the event was.
2260This allows expensive events to stand out more clearly
2261in profiles.
2262.TP
2263.I data_src
51700fd7 2264If
d1007d14 2265.B PERF_SAMPLE_DATA_SRC
7de4a1e3 2266is enabled, then a 64-bit value is recorded that is made up of
d1007d14
VW
2267the following fields:
2268.RS
2b538c3e 2269.TP 4
d1007d14 2270.I mem_op
2b538c3e 2271Type of opcode, a bitwise combination of:
efeece04 2272.IP
2b538c3e
MK
2273.PD 0
2274.RS
2275.TP 24
d1007d14 2276.B PERF_MEM_OP_NA
2b538c3e
MK
2277Not available
2278.TP
d1007d14 2279.B PERF_MEM_OP_LOAD
2b538c3e
MK
2280Load instruction
2281.TP
d1007d14 2282.B PERF_MEM_OP_STORE
2b538c3e
MK
2283Store instruction
2284.TP
d1007d14 2285.B PERF_MEM_OP_PFETCH
2b538c3e
MK
2286Prefetch
2287.TP
d1007d14 2288.B PERF_MEM_OP_EXEC
2b538c3e
MK
2289Executable code
2290.RE
2291.PD
d1007d14
VW
2292.TP
2293.I mem_lvl
bc9d90b5 2294Memory hierarchy level hit or miss, a bitwise combination of
ef4f4031 2295the following, shifted left by
bc9d90b5 2296.BR PERF_MEM_LVL_SHIFT :
efeece04 2297.IP
2b538c3e
MK
2298.PD 0
2299.RS
2300.TP 24
d1007d14 2301.B PERF_MEM_LVL_NA
2b538c3e
MK
2302Not available
2303.TP
d1007d14 2304.B PERF_MEM_LVL_HIT
2b538c3e
MK
2305Hit
2306.TP
d1007d14 2307.B PERF_MEM_LVL_MISS
2b538c3e
MK
2308Miss
2309.TP
d1007d14 2310.B PERF_MEM_LVL_L1
2b538c3e
MK
2311Level 1 cache
2312.TP
d1007d14 2313.B PERF_MEM_LVL_LFB
2b538c3e
MK
2314Line fill buffer
2315.TP
d1007d14 2316.B PERF_MEM_LVL_L2
2b538c3e
MK
2317Level 2 cache
2318.TP
d1007d14 2319.B PERF_MEM_LVL_L3
2b538c3e
MK
2320Level 3 cache
2321.TP
d1007d14 2322.B PERF_MEM_LVL_LOC_RAM
2b538c3e
MK
2323Local DRAM
2324.TP
d1007d14 2325.B PERF_MEM_LVL_REM_RAM1
2b538c3e
MK
2326Remote DRAM 1 hop
2327.TP
d1007d14 2328.B PERF_MEM_LVL_REM_RAM2
2b538c3e
MK
2329Remote DRAM 2 hops
2330.TP
d1007d14 2331.B PERF_MEM_LVL_REM_CCE1
2b538c3e
MK
2332Remote cache 1 hop
2333.TP
d1007d14 2334.B PERF_MEM_LVL_REM_CCE2
2b538c3e
MK
2335Remote cache 2 hops
2336.TP
d1007d14 2337.B PERF_MEM_LVL_IO
2b538c3e
MK
2338I/O memory
2339.TP
d1007d14 2340.B PERF_MEM_LVL_UNC
2b538c3e
MK
2341Uncached memory
2342.RE
2343.PD
d1007d14
VW
2344.TP
2345.I mem_snoop
bc9d90b5
VW
2346Snoop mode, a bitwise combination of the following, shifted left by
2347.BR PERF_MEM_SNOOP_SHIFT :
efeece04 2348.IP
2b538c3e
MK
2349.PD 0
2350.RS
2351.TP 24
d1007d14 2352.B PERF_MEM_SNOOP_NA
2b538c3e
MK
2353Not available
2354.TP
d1007d14 2355.B PERF_MEM_SNOOP_NONE
2b538c3e
MK
2356No snoop
2357.TP
d1007d14 2358.B PERF_MEM_SNOOP_HIT
2b538c3e
MK
2359Snoop hit
2360.TP
d1007d14 2361.B PERF_MEM_SNOOP_MISS
2b538c3e
MK
2362Snoop miss
2363.TP
d1007d14 2364.B PERF_MEM_SNOOP_HITM
2b538c3e
MK
2365Snoop hit modified
2366.RE
2367.PD
d1007d14
VW
2368.TP
2369.I mem_lock
bc9d90b5
VW
2370Lock instruction, a bitwise combination of the following, shifted left by
2371.BR PERF_MEM_LOCK_SHIFT :
efeece04 2372.IP
2b538c3e
MK
2373.PD 0
2374.RS
2375.TP 24
d1007d14 2376.B PERF_MEM_LOCK_NA
2b538c3e
MK
2377Not available
2378.TP
d1007d14 2379.B PERF_MEM_LOCK_LOCKED
2b538c3e
MK
2380Locked transaction
2381.RE
2382.PD
d1007d14
VW
2383.TP
2384.I mem_dtlb
bc9d90b5
VW
2385TLB access hit or miss, a bitwise combination of the following, shifted
2386left by
2387.BR PERF_MEM_TLB_SHIFT :
efeece04 2388.IP
2b538c3e
MK
2389.PD 0
2390.RS
2391.TP 24
d1007d14 2392.B PERF_MEM_TLB_NA
2b538c3e
MK
2393Not available
2394.TP
d1007d14 2395.B PERF_MEM_TLB_HIT
2b538c3e
MK
2396Hit
2397.TP
d1007d14 2398.B PERF_MEM_TLB_MISS
2b538c3e
MK
2399Miss
2400.TP
d1007d14 2401.B PERF_MEM_TLB_L1
2b538c3e
MK
2402Level 1 TLB
2403.TP
d1007d14 2404.B PERF_MEM_TLB_L2
2b538c3e
MK
2405Level 2 TLB
2406.TP
d1007d14 2407.B PERF_MEM_TLB_WK
2b538c3e
MK
2408Hardware walker
2409.TP
d1007d14 2410.B PERF_MEM_TLB_OS
2b538c3e
MK
2411OS fault handler
2412.RE
2413.PD
d1007d14 2414.RE
1e043959
VW
2415.TP
2416.I transaction
2417If the
2418.B PERF_SAMPLE_TRANSACTION
37bee118 2419flag is set, then a 64-bit field is recorded describing
1e043959 2420the sources of any transactional memory aborts.
efeece04 2421.IP
1e043959
VW
2422The field is a bitwise combination of the following values:
2423.RS
2424.TP
2425.B PERF_TXN_ELISION
b3f39642 2426Abort from an elision type transaction (Intel-CPU-specific).
1e043959
VW
2427.TP
2428.B PERF_TXN_TRANSACTION
b3f39642 2429Abort from a generic transaction.
1e043959
VW
2430.TP
2431.B PERF_TXN_SYNC
b3f39642 2432Synchronous abort (related to the reported instruction).
1e043959
VW
2433.TP
2434.B PERF_TXN_ASYNC
b3f39642 2435Asynchronous abort (not related to the reported instruction).
1e043959
VW
2436.TP
2437.B PERF_TXN_RETRY
053a3e08 2438Retryable abort (retrying the transaction may have succeeded).
1e043959
VW
2439.TP
2440.B PERF_TXN_CONFLICT
b3f39642 2441Abort due to memory conflicts with other threads.
1e043959
VW
2442.TP
2443.B PERF_TXN_CAPACITY_WRITE
b3f39642 2444Abort due to write capacity overflow.
1e043959
VW
2445.TP
2446.B PERF_TXN_CAPACITY_READ
b3f39642 2447Abort due to read capacity overflow.
1e043959 2448.RE
b3f39642
MK
2449.IP
2450In addition, a user-specified abort code can be obtained from
2451the high 32 bits of the field by shifting right by
1e043959 2452.B PERF_TXN_ABORT_SHIFT
4b3a5f01 2453and masking with the value
1e043959 2454.BR PERF_TXN_ABORT_MASK .
f5281dfd
VW
2455.TP
2456.IR abi ", " regs[weight(mask)]
2457If
2458.B PERF_SAMPLE_REGS_INTR
2459is enabled, then the CPU register state at the time of the interrupt is recorded.
efeece04 2460.IP
f5281dfd
VW
2461The
2462.I abi
2463field is one of
4b3a5f01
MK
2464.BR PERF_SAMPLE_REGS_ABI_NONE ,
2465.BR PERF_SAMPLE_REGS_ABI_32 ,
2466or
f5281dfd 2467.BR PERF_SAMPLE_REGS_ABI_64 .
efeece04 2468.IP
f5281dfd
VW
2469The
2470.I regs
2471field is an array of the CPU registers that were specified by
2472the
2473.I sample_regs_intr
2474attr field.
2475The number of values is the number of bits set in the
2476.I sample_regs_intr
2477bit mask.
f2b1d720 2478.RE
9bfc542b
VW
2479.TP
2480.B PERF_RECORD_MMAP2
2481This record includes extended information on
2482.BR mmap (2)
2483calls returning executable mappings.
2484The format is similar to that of the
2485.B PERF_RECORD_MMAP
3a058284 2486record, but includes extra values that allow uniquely identifying
9bfc542b 2487shared mappings.
efeece04 2488.IP
9bfc542b 2489.in +4n
b8302363 2490.EX
9bfc542b
VW
2491struct {
2492 struct perf_event_header header;
7a10da70
MK
2493 u32 pid;
2494 u32 tid;
2495 u64 addr;
2496 u64 len;
2497 u64 pgoff;
2498 u32 maj;
2499 u32 min;
2500 u64 ino;
2501 u64 ino_generation;
2502 u32 prot;
2503 u32 flags;
2504 char filename[];
9bfc542b
VW
2505 struct sample_id sample_id;
2506};
ba4924aa 2507.EE
9bfc542b
VW
2508.RS
2509.TP
2510.I pid
3a058284 2511is the process ID.
9bfc542b
VW
2512.TP
2513.I tid
3a058284 2514is the thread ID.
9bfc542b
VW
2515.TP
2516.I addr
2517is the address of the allocated memory.
2518.TP
2519.I len
2520is the length of the allocated memory.
2521.TP
2522.I pgoff
2523is the page offset of the allocated memory.
2524.TP
2525.I maj
3a058284 2526is the major ID of the underlying device.
9bfc542b
VW
2527.TP
2528.I min
3a058284 2529is the minor ID of the underlying device.
9bfc542b
VW
2530.TP
2531.I ino
3a058284 2532is the inode number.
9bfc542b
VW
2533.TP
2534.I ino_generation
2535is the inode generation.
2536.TP
2537.I prot
2538is the protection information.
2539.TP
2540.I flags
2541is the flags information.
2542.TP
2543.I filename
2544is a string describing the backing of the allocated memory.
2545.RE
1fda209c
VW
2546.TP
2547.BR PERF_RECORD_AUX " (since Linux 4.1)"
2548.\" commit 68db7e98c3a6ebe7284b6cf14906ed7c55f3f7f0
2549This record reports that new data is available in the separate
2550AUX buffer region.
efeece04 2551.IP
1fda209c 2552.in +4n
b8302363 2553.EX
1fda209c
VW
2554struct {
2555 struct perf_event_header header;
7a10da70
MK
2556 u64 aux_offset;
2557 u64 aux_size;
2558 u64 flags;
1fda209c
VW
2559 struct sample_id sample_id;
2560};
ba4924aa 2561.EE
1fda209c
VW
2562.RS
2563.TP
2564.I aux_offset
2565offset in the AUX mmap region where the new data begins.
2566.TP
2567.I aux_size
2568size of the data made available.
2569.TP
2570.I flags
95655a22 2571describes the AUX update.
1fda209c
VW
2572.RS
2573.TP
2574.B PERF_AUX_FLAG_TRUNCATED
95655a22 2575if set, then the data returned was truncated to fit the available
1fda209c 2576buffer size.
b1355f6a
VW
2577.TP
2578.B PERF_AUX_FLAG_OVERWRITE
2579.\" commit 2023a0d2829e521fe6ad6b9907f3f90bfbf57142
95655a22 2580if set, then the data returned has overwritten previous data.
1fda209c
VW
2581.RE
2582.RE
6932aac3
VW
2583.TP
2584.BR PERF_RECORD_ITRACE_START " (since Linux 4.1)"
2585.\" commit ec0d7729bbaed4b9d2d3fada693278e13a3d1368
2586This record indicates which process has initiated an instruction
2587trace event, allowing tools to properly correlate the instruction
2588addresses in the AUX buffer with the proper executable.
efeece04 2589.IP
6932aac3 2590.in +4n
b8302363 2591.EX
6932aac3
VW
2592struct {
2593 struct perf_event_header header;
7a10da70
MK
2594 u32 pid;
2595 u32 tid;
6932aac3 2596};
ba4924aa 2597.EE
6932aac3
VW
2598.RS
2599.TP
2600.I pid
95655a22 2601process ID of the thread starting an instruction trace.
6932aac3
VW
2602.TP
2603.I tid
95655a22 2604thread ID of the thread starting an instruction trace.
6932aac3 2605.RE
46012ba3
DH
2606.TP
2607.BR PERF_RECORD_LOST_SAMPLES " (since Linux 4.2)"
2608.\" commit f38b0dbb491a6987e198aa6b428db8692a6480f8
2609When using hardware sampling (such as Intel PEBS) this record
4199d3a1 2610indicates some number of samples that may have been lost.
efeece04 2611.IP
46012ba3 2612.in +4n
b8302363 2613.EX
46012ba3
DH
2614struct {
2615 struct perf_event_header header;
7a10da70 2616 u64 lost;
46012ba3
DH
2617 struct sample_id sample_id;
2618};
ba4924aa 2619.EE
46012ba3
DH
2620.RS
2621.TP
2622.I lost
2623the number of potentially lost samples.
2624.RE
9277a75d
VW
2625.TP
2626.BR PERF_RECORD_SWITCH " (since Linux 4.3)"
2627.\" commit 45ac1403f564f411c6a383a2448688ba8dd705a4
2628This record indicates a context switch has happened.
2629The
2630.B PERF_RECORD_MISC_SWITCH_OUT
2631bit in the
2632.I misc
2633field indicates whether it was a context switch into
2634or away from the current process.
efeece04 2635.IP
9277a75d 2636.in +4n
b8302363 2637.EX
9277a75d
VW
2638struct {
2639 struct perf_event_header header;
2640 struct sample_id sample_id;
2641};
ba4924aa 2642.EE
9277a75d
VW
2643.TP
2644.BR PERF_RECORD_SWITCH_CPU_WIDE " (since Linux 4.3)"
2645.\" commit 45ac1403f564f411c6a383a2448688ba8dd705a4
2646As with
2647.B PERF_RECORD_SWITCH
2648this record indicates a context switch has happened,
d5a24378 2649but it only occurs when sampling in CPU-wide mode
9277a75d
VW
2650and provides additional information on the process
2651being switched to/from.
2652The
2653.B PERF_RECORD_MISC_SWITCH_OUT
2654bit in the
2655.I misc
2656field indicates whether it was a context switch into
2657or away from the current process.
efeece04 2658.IP
9277a75d 2659.in +4n
b8302363 2660.EX
9277a75d
VW
2661struct {
2662 struct perf_event_header header;
2663 u32 next_prev_pid;
2664 u32 next_prev_tid;
2665 struct sample_id sample_id;
2666};
ba4924aa 2667.EE
9277a75d
VW
2668.RS
2669.TP
2670.I next_prev_pid
d5a24378 2671The process ID of the previous (if switching in)
9277a75d
VW
2672or next (if switching out) process on the CPU.
2673.TP
2674.I next_prev_tid
d5a24378 2675The thread ID of the previous (if switching in)
9277a75d
VW
2676or next (if switching out) thread on the CPU.
2677.RE
f2b1d720 2678.RE
21977c9d
VW
2679.SS Overflow handling
2680Events can be set to notify when a threshold is crossed,
2681indicating an overflow.
2682Overflow conditions can be captured by monitoring the
2683event file descriptor with
f2b1d720
MK
2684.BR poll (2),
2685.BR select (2),
21977c9d 2686or
4b3a5f01 2687.BR epoll (7).
6831ba6b
MK
2688Alternatively, the overflow events can be captured via a signal handler,
2689by enabling I/O signaling on the file descriptor; see the discussion of the
fc79d996 2690.BR F_SETOWN
6831ba6b
MK
2691and
2692.BR F_SETSIG
2693operations in
2694.BR fcntl (2).
efeece04 2695.PP
6170255e 2696Overflows are generated only by sampling events
f2b1d720 2697.RI ( sample_period
7d182bb6 2698must have a nonzero value).
efeece04 2699.PP
21977c9d 2700There are two ways to generate overflow notifications.
efeece04 2701.PP
f2b1d720
MK
2702The first is to set a
2703.I wakeup_events
2704or
2705.I wakeup_watermark
21977c9d 2706value that will trigger if a certain number of samples
f2b1d720 2707or bytes have been written to the mmap ring buffer.
fc79d996 2708In this case,
7db515ef 2709.B POLL_IN
21977c9d 2710is indicated.
efeece04 2711.PP
f2b1d720 2712The other way is by use of the
7db515ef 2713.B PERF_EVENT_IOC_REFRESH
f2b1d720
MK
2714ioctl.
2715This ioctl adds to a counter that decrements each time the event overflows.
21977c9d 2716When nonzero,
7db515ef 2717.B POLL_IN
21977c9d
VW
2718is indicated, but
2719once the counter reaches 0
7db515ef 2720.B POLL_HUP
21977c9d 2721is indicated and
f2b1d720 2722the underlying event is disabled.
efeece04 2723.PP
50e4319c
VW
2724Refreshing an event group leader refreshes all siblings and
2725refreshing with a parameter of 0 currently enables infinite
2726refreshes;
2727these behaviors are unsupported and should not be relied on.
2728.\" See https://lkml.org/lkml/2011/5/24/337
efeece04 2729.PP
4010bc07 2730Starting with Linux 3.18,
747a6e7c 2731.\" commit 179033b3e064d2cd3f5f9945e76b0a0f0fbf4883
21977c9d
VW
2732.B POLL_HUP
2733is indicated if the event being monitored is attached to a different
2734process and that process exits.
73d8cece 2735.SS rdpmc instruction
f2b1d720 2736Starting with Linux 3.4 on x86, you can use the
747a6e7c 2737.\" commit c7206205d00ab375839bd6c7ddb247d600693c09
f2b1d720
MK
2738.I rdpmc
2739instruction to get low-latency reads without having to enter the kernel.
2740Note that using
2741.I rdpmc
2742is not necessarily faster than other methods for reading event values.
efeece04 2743.PP
f2b1d720
MK
2744Support for this can be detected with the
2745.I cap_usr_rdpmc
2746field in the mmap page; documentation on how
2747to calculate event values can be found in that section.
efeece04 2748.PP
562c69f6
VW
2749Originally, when rdpmc support was enabled, any process (not just ones
2750with an active perf event) could use the rdpmc instruction to access
2751the counters.
fc79d996 2752Starting with Linux 4.0,
562c69f6
VW
2753.\" 7911d3f7af14a614617e38245fedf98a724e46a9
2754rdpmc support is only allowed if an event is currently enabled
95655a22 2755in a process's context.
562c69f6
VW
2756To restore the old behavior, write the value 2 to
2757.IR /sys/devices/cpu/rdpmc .
73d8cece 2758.SS perf_event ioctl calls
f2b1d720
MK
2759.PP
2760Various ioctls act on
7db515ef 2761.BR perf_event_open ()
ce88f77b 2762file descriptors:
f2b1d720
MK
2763.TP
2764.B PERF_EVENT_IOC_ENABLE
ce88f77b 2765This enables the individual event or event group specified by the
7db515ef 2766file descriptor argument.
efeece04 2767.IP
51700fd7 2768If the
8cc8b90d 2769.B PERF_IOC_FLAG_GROUP
51700fd7 2770bit is set in the ioctl argument, then all events in a group are
dbc01ecd
VW
2771enabled, even if the event specified is not the group leader
2772(but see BUGS).
f2b1d720
MK
2773.TP
2774.B PERF_EVENT_IOC_DISABLE
ce88f77b 2775This disables the individual counter or event group specified by the
7db515ef 2776file descriptor argument.
efeece04 2777.IP
f2b1d720
MK
2778Enabling or disabling the leader of a group enables or disables the
2779entire group; that is, while the group leader is disabled, none of the
2780counters in the group will count.
33a0ccb2
MK
2781Enabling or disabling a member of a group other than the leader
2782affects only that counter; disabling a non-leader
f2b1d720 2783stops that counter from counting but doesn't affect any other counter.
efeece04 2784.IP
51700fd7 2785If the
8cc8b90d 2786.B PERF_IOC_FLAG_GROUP
51700fd7 2787bit is set in the ioctl argument, then all events in a group are
dbc01ecd
VW
2788disabled, even if the event specified is not the group leader
2789(but see BUGS).
f2b1d720
MK
2790.TP
2791.B PERF_EVENT_IOC_REFRESH
2792Non-inherited overflow counters can use this
2793to enable a counter for a number of overflows specified by the argument,
2794after which it is disabled.
2795Subsequent calls of this ioctl add the argument value to the current
2796count.
21977c9d 2797An overflow notification with
7db515ef
MK
2798.B POLL_IN
2799set will happen on each overflow until the
21977c9d
VW
2800count reaches 0; when that happens a notification with
2801.B POLL_HUP
7db515ef 2802set is sent and the event is disabled.
f2b1d720 2803Using an argument of 0 is considered undefined behavior.
f2b1d720
MK
2804.TP
2805.B PERF_EVENT_IOC_RESET
36127c0e 2806Reset the event count specified by the
6061d29f 2807file descriptor argument to zero.
33a0ccb2 2808This resets only the counts; there is no way to reset the
f2b1d720
MK
2809multiplexing
2810.I time_enabled
2811or
2812.I time_running
2813values.
efeece04 2814.IP
51700fd7 2815If the
8cc8b90d 2816.B PERF_IOC_FLAG_GROUP
51700fd7 2817bit is set in the ioctl argument, then all events in a group are
dbc01ecd
VW
2818reset, even if the event specified is not the group leader
2819(but see BUGS).
f2b1d720
MK
2820.TP
2821.B PERF_EVENT_IOC_PERIOD
e6cf5694 2822This updates the overflow period for the event.
efeece04 2823.IP
747a6e7c
VW
2824Since Linux 3.7 (on ARM)
2825.\" commit 3581fe0ef37ce12ac7a4f74831168352ae848edc
2826and Linux 3.14 (all other architectures),
2827.\" commit bad7192b842c83e580747ca57104dd51fe08c223
3f118a29 2828the new period takes effect immediately.
ed81fdd9 2829On older kernels, the new period did not take effect until
3f118a29 2830after the next overflow.
efeece04 2831.IP
f2b1d720
MK
2832The argument is a pointer to a 64-bit value containing the
2833desired new period.
efeece04 2834.IP
fc79d996 2835Prior to Linux 2.6.36,
747a6e7c
VW
2836.\" commit ad0cf3478de8677f720ee06393b3147819568d6a
2837this ioctl always failed due to a bug
e6cf5694 2838in the kernel.
f2b1d720
MK
2839.TP
2840.B PERF_EVENT_IOC_SET_OUTPUT
2841This tells the kernel to report event notifications to the specified
2842file descriptor rather than the default one.
2843The file descriptors must all be on the same CPU.
efeece04 2844.IP
f2b1d720
MK
2845The argument specifies the desired file descriptor, or \-1 if
2846output should be ignored.
f2b1d720 2847.TP
31c1f2b0 2848.BR PERF_EVENT_IOC_SET_FILTER " (since Linux 2.6.33)"
60dafbc1 2849.\" commit 6fb2915df7f0747d9044da9dbff5b46dc2e20830
f2b1d720 2850This adds an ftrace filter to this event.
efeece04 2851.IP
f2b1d720 2852The argument is a pointer to the desired ftrace filter.
a0dcc8dd 2853.TP
31c1f2b0 2854.BR PERF_EVENT_IOC_ID " (since Linux 3.12)"
60dafbc1 2855.\" commit cf4957f17f2a89984915ea808876d9c82225b862
bec6277e 2856This returns the event ID value for the given event file descriptor.
efeece04 2857.IP
a0dcc8dd
VW
2858The argument is a pointer to a 64-bit unsigned integer
2859to hold the result.
b0f7b411
VW
2860.TP
2861.BR PERF_EVENT_IOC_SET_BPF " (since Linux 4.1)"
2862.\" commit 2541517c32be2531e0da59dfd7efc1ce844644f5
2863This allows attaching a Berkeley Packet Filter (BPF)
2864program to an existing kprobe tracepoint event.
2865You need
2866.B CAP_SYS_ADMIN
2867privileges to use this ioctl.
efeece04 2868.IP
b0f7b411
VW
2869The argument is a BPF program file descriptor that was created by
2870a previous
2871.BR bpf (2)
2872system call.
fc79d996 2873.SS Using prctl(2)
f2b1d720
MK
2874A process can enable or disable all the event groups that are
2875attached to it using the
2876.BR prctl (2)
2877.B PR_TASK_PERF_EVENTS_ENABLE
2878and
2879.B PR_TASK_PERF_EVENTS_DISABLE
2880operations.
ee7b0cbf 2881This applies to all counters on the calling process, whether created by
f2b1d720
MK
2882this process or by another, and does not affect any counters that this
2883process has created on other processes.
33a0ccb2 2884It enables or disables only
f2b1d720 2885the group leaders, not any other members in the groups.
f2b1d720 2886.SS perf_event related configuration files
efeece04 2887.PP
7db515ef
MK
2888Files in
2889.I /proc/sys/kernel/
7db515ef 2890.RS 4
f2b1d720 2891.TP
7db515ef 2892.I /proc/sys/kernel/perf_event_paranoid
f2b1d720
MK
2893The
2894.I perf_event_paranoid
2895file can be set to restrict access to the performance counters.
efeece04 2896.IP
dc9ec146 2897.PD 0
2b538c3e
MK
2898.RS
2899.IP 2 4
3eb95192 2900allow only user-space measurements (default since Linux 4.6).
b5eb75f7 2901.\" default changed in commit 0161028b7c8aebef64194d3d73e43bc3b53b5c66
2b538c3e 2902.IP 1
3eb95192 2903allow both kernel and user measurements (default before Linux 4.6).
2b538c3e
MK
2904.IP 0
2905allow access to CPU-specific data but not raw tracepoint samples.
2906.IP \-1
2907no restrictions.
2908.RE
dc9ec146 2909.PD
2b538c3e 2910.IP
f2b1d720
MK
2911The existence of the
2912.I perf_event_paranoid
2913file is the official method for determining if a kernel supports
7db515ef 2914.BR perf_event_open ().
f2b1d720
MK
2915.TP
2916.I /proc/sys/kernel/perf_event_max_sample_rate
7db515ef
MK
2917This sets the maximum sample rate.
2918Setting this too high can allow
f2b1d720 2919users to sample at a rate that impacts overall machine performance
7db515ef
MK
2920and potentially lock up the machine.
2921The default value is
f2b1d720 2922100000 (samples per second).
fd133d5d
VW
2923.TP
2924.I /proc/sys/kernel/perf_event_max_stack
2925.\" Introduced in c5dfd78eb79851e278b7973031b9ca363da87a7e
5dd3feec 2926This file sets the maximum depth of stack frame entries reported
fd133d5d 2927when generating a call trace.
f2b1d720
MK
2928.TP
2929.I /proc/sys/kernel/perf_event_mlock_kb
ce88f77b
MK
2930Maximum amount of memory (in kilobytes) an unprivileged user can
2931.BR mlock (2).
f2b1d720
MK
2932The default is 516 (kB).
2933.RE
efeece04 2934.PP
7db515ef
MK
2935Files in
2936.I /sys/bus/event_source/devices/
efeece04 2937.PP
7db515ef 2938.RS 4
ce88f77b 2939Since Linux 2.6.34, the kernel supports having multiple PMUs
f2b1d720
MK
2940available for monitoring.
2941Information on how to program these PMUs can be found under
2942.IR /sys/bus/event_source/devices/ .
2943Each subdirectory corresponds to a different PMU.
f2b1d720 2944.TP
31c1f2b0 2945.IR /sys/bus/event_source/devices/*/type " (since Linux 2.6.38)"
747a6e7c 2946.\" commit abe43400579d5de0078c2d3a760e6598e183f871
f2b1d720
MK
2947This contains an integer that can be used in the
2948.I type
ce88f77b
MK
2949field of
2950.I perf_event_attr
2951to indicate that you wish to use this PMU.
f2b1d720 2952.TP
562c69f6 2953.IR /sys/bus/event_source/devices/cpu/rdpmc " (since Linux 3.4)"
747a6e7c 2954.\" commit 0c9d42ed4cee2aa1dfc3a260b741baae8615744f
8a94e783 2955If this file is 1, then direct user-space access to the
e30dc77f
VW
2956performance counter registers is allowed via the rdpmc instruction.
2957This can be disabled by echoing 0 to the file.
efeece04 2958.IP
562c69f6
VW
2959As of Linux 4.0
2960.\" a66734297f78707ce39d756b656bfae861d53f62
2961.\" 7911d3f7af14a614617e38245fedf98a724e46a9
2962the behavior has changed, so that 1 now means only allow access
2963to processes with active perf events, with 2 indicating the old
2964allow-anyone-access behavior.
f2b1d720 2965.TP
31c1f2b0 2966.IR /sys/bus/event_source/devices/*/format/ " (since Linux 3.4)"
747a6e7c 2967.\" commit 641cc938815dfd09f8fa1ec72deb814f0938ac33
7d182bb6
MK
2968This subdirectory contains information on the architecture-specific
2969subfields available for programming the various
f2b1d720 2970.I config
ce88f77b
MK
2971fields in the
2972.I perf_event_attr
2973struct.
efeece04 2974.IP
e30dc77f
VW
2975The content of each file is the name of the config field, followed
2976by a colon, followed by a series of integer bit ranges separated by
2977commas.
8a94e783 2978For example, the file
e30dc77f
VW
2979.I event
2980may contain the value
d2fdb1e3
MK
2981.I config1:1,6\-10,44
2982which indicates that event is an attribute that occupies bits 1, 6\(en10, and 44
ce88f77b
MK
2983of
2984.IR perf_event_attr::config1 .
e30dc77f 2985.TP
31c1f2b0 2986.IR /sys/bus/event_source/devices/*/events/ " (since Linux 3.4)"
747a6e7c 2987.\" commit 641cc938815dfd09f8fa1ec72deb814f0938ac33
7d182bb6 2988This subdirectory contains files with predefined events.
f2b1d720 2989The contents are strings describing the event settings
e30dc77f 2990expressed in terms of the fields found in the previously mentioned
f2b1d720
MK
2991.I ./format/
2992directory.
2993These are not necessarily complete lists of all events supported by
2994a PMU, but usually a subset of events deemed useful or interesting.
efeece04 2995.IP
e30dc77f 2996The content of each file is a list of attribute names
8a94e783
MK
2997separated by commas.
2998Each entry has an optional value (either hex or decimal).
37bee118 2999If no value is specified, then it is assumed to be a single-bit
e30dc77f
VW
3000field with a value of 1.
3001An example entry may look like this:
699893d8 3002.IR event=0x2,inv,ldlat=3 .
f2b1d720
MK
3003.TP
3004.I /sys/bus/event_source/devices/*/uevent
e30dc77f
VW
3005This file is the standard kernel device interface
3006for injecting hotplug events.
3007.TP
31c1f2b0 3008.IR /sys/bus/event_source/devices/*/cpumask " (since Linux 3.7)"
747a6e7c 3009.\" commit 314d9f63f385096580e9e2a06eaa0745d92fe4ac
699893d8
DP
3010The
3011.I cpumask
3012file contains a comma-separated list of integers that
3013indicate a representative CPU number for each socket (package)
e30dc77f
VW
3014on the motherboard.
3015This is needed when setting up uncore or northbridge events, as
3016those PMUs present socket-wide events.
f2b1d720 3017.RE
47297adb 3018.SH RETURN VALUE
f2b1d720
MK
3019.BR perf_event_open ()
3020returns the new file descriptor, or \-1 if an error occurred
3021(in which case,
3022.I errno
3023is set appropriately).
3024.SH ERRORS
d8b7d950
VW
3025The errors returned by
3026.BR perf_event_open ()
3027can be inconsistent, and may
3028vary across processor architectures and performance monitoring units.
f2b1d720 3029.TP
82b09254 3030.B E2BIG
ce88f77b
MK
3031Returned if the
3032.I perf_event_attr
82b09254
VW
3033.I size
3034value is too small
3035(smaller than
3036.BR PERF_ATTR_SIZE_VER0 ),
3037too big (larger than the page size),
3038or larger than the kernel supports and the extra bytes are not zero.
3039When
3040.B E2BIG
ce88f77b
MK
3041is returned, the
3042.I perf_event_attr
e9bd9b2c 3043.I size
d6af98f8 3044field is overwritten by the kernel to be the size of the structure
82b09254
VW
3045it was expecting.
3046.TP
d8b7d950 3047.B EACCES
27f0af8e
VW
3048Returned when the requested event requires
3049.B CAP_SYS_ADMIN
3050permissions (or a more permissive perf_event paranoid setting).
3051Some common cases where an unprivileged process
3052may encounter this error:
3053attaching to a process owned by a different user;
2b23ecbd
MK
3054monitoring all processes on a given CPU (i.e., specifying the
3055.I pid
3056argument as \-1);
079928f3 3057and not setting
accec051 3058.I exclude_kernel
079928f3 3059when the paranoid setting requires it.
d8b7d950
VW
3060.TP
3061.B EBADF
3062Returned if the
3063.I group_fd
accec051
MK
3064file descriptor is not valid, or, if
3065.B PERF_FLAG_PID_CGROUP
3066is set,
d8b7d950
VW
3067the cgroup file descriptor in
3068.I pid
3069is not valid.
3070.TP
f27486cb
VW
3071.BR EBUSY " (since Linux 4.1)"
3072.\" bed5b25ad9c8a2f5d735ef0bc746ec870c01c1b0
3073Returned if another event already has exclusive
3074access to the PMU.
3075.TP
d8b7d950
VW
3076.B EFAULT
3077Returned if the
3078.I attr
3079pointer points at an invalid memory address.
3080.TP
f2b1d720 3081.B EINVAL
d8b7d950
VW
3082Returned if the specified event is invalid.
3083There are many possible reasons for this.
3084A non-exhaustive list:
3085.I sample_freq
accec051 3086is higher than the maximum setting;
d8b7d950
VW
3087the
3088.I cpu
accec051 3089to monitor does not exist;
d8b7d950 3090.I read_format
accec051 3091is out of range;
d8b7d950 3092.I sample_type
accec051 3093is out of range;
d8b7d950
VW
3094the
3095.I flags
accec051 3096value is out of range;
d8b7d950
VW
3097.I exclusive
3098or
3099.I pinned
accec051 3100is set and the event is not a group leader;
d8b7d950
VW
3101the event
3102.I config
accec051
MK
3103values are out of range or set reserved bits;
3104the generic event selected is not supported; or
d8b7d950
VW
3105there is not enough room to add the selected event.
3106.TP
3107.B EMFILE
3108Each opened event uses one file descriptor.
26c32fab
MK
3109If a large number of events are opened,
3110the per-process limit on the number of open file descriptors will be reached,
3111and no more events can be created.
d8b7d950
VW
3112.TP
3113.B ENODEV
3114Returned when the event involves a feature not supported
accec051 3115by the current CPU.
d8b7d950
VW
3116.TP
3117.B ENOENT
3118Returned if the
3119.I type
3120setting is not valid.
accec051 3121This error is also returned for
d8b7d950 3122some unsupported generic events.
f2b1d720
MK
3123.TP
3124.B ENOSPC
3125Prior to Linux 3.3, if there was not enough room for the event,
747a6e7c 3126.\" commit aa2bc1ade59003a379ffc485d6da2d92ea3370a6
f2b1d720
MK
3127.B ENOSPC
3128was returned.
accec051 3129In Linux 3.3, this was changed to
f2b1d720
MK
3130.BR EINVAL .
3131.B ENOSPC
d8b7d950 3132is still returned if you try to add more breakpoint events
accec051 3133than supported by the hardware.
d8b7d950
VW
3134.TP
3135.B ENOSYS
3136Returned if
3137.B PERF_SAMPLE_STACK_USER
3138is set in
3139.I sample_type
3140and it is not supported by hardware.
3141.TP
3142.B EOPNOTSUPP
3143Returned if an event requiring a specific hardware feature is
3144requested but there is no hardware support.
3145This includes requesting low-skid events if not supported,
3146branch tracing if it is not available, sampling if no PMU
3147interrupt is available, and branch stacks for software events.
3148.TP
fd133d5d
VW
3149.BR EOVERFLOW " (since Linux 4.8)"
3150.\" 97c79a38cd454602645f0470ffb444b3b75ce574
3151Returned if
3152.B PERF_SAMPLE_CALLCHAIN
3153is requested and
3154.I sample_max_stack
3155is larger than the maximum specified in
3156.IR /proc/sys/kernel/perf_event_max_stack .
3157.TP
d8b7d950 3158.B EPERM
27f0af8e
VW
3159Returned on many (but not all) architectures when an unsupported
3160.IR exclude_hv ", " exclude_idle ", " exclude_user ", or " exclude_kernel
3161setting is specified.
efeece04 3162.IP
27f0af8e
VW
3163It can also happen, as with
3164.BR EACCES ,
3165when the requested event requires
3166.B CAP_SYS_ADMIN
3167permissions (or a more permissive perf_event paranoid setting).
3168This includes setting a breakpoint on a kernel address,
3169and (since Linux 3.13) setting a kernel function-trace tracepoint.
747a6e7c 3170.\" commit a4e95fc2cbb31d70a65beffeaf8773f881328c34
d8b7d950
VW
3171.TP
3172.B ESRCH
3173Returned if attempting to attach to a process that does not exist.
f2b1d720 3174.SH VERSION
f2b1d720
MK
3175.BR perf_event_open ()
3176was introduced in Linux 2.6.31 but was called
747a6e7c 3177.\" commit 0793a61d4df8daeac6492dbf8d2f3e5713caae5e
ffd4dec0 3178.BR perf_counter_open ().
f2b1d720 3179It was renamed in Linux 2.6.32.
747a6e7c 3180.\" commit cdd6c482c9ff9c55475ee7392ec8f672eddb7be6
f2b1d720 3181.SH CONFORMING TO
7db515ef
MK
3182This
3183.BR perf_event_open ()
dc9ec146 3184system call is Linux-specific
f2b1d720 3185and should not be used in programs intended to be portable.
f2b1d720
MK
3186.SH NOTES
3187Glibc does not provide a wrapper for this system call; call it using
3188.BR syscall (2).
7db515ef 3189See the example below.
efeece04 3190.PP
f2b1d720 3191The official way of knowing if
7db515ef 3192.BR perf_event_open ()
f2b1d720
MK
3193support is enabled is checking
3194for the existence of the file
7db515ef 3195.IR /proc/sys/kernel/perf_event_paranoid .
f2b1d720 3196.SH BUGS
f2b1d720
MK
3197The
3198.B F_SETOWN_EX
3199option to
7db515ef 3200.BR fcntl (2)
f2b1d720
MK
3201is needed to properly get overflow signals in threads.
3202This was introduced in Linux 2.6.32.
747a6e7c 3203.\" commit ba0a6c9f6fceed11c6a99e8326f0477fe383e6b5
efeece04 3204.PP
747a6e7c
VW
3205Prior to Linux 2.6.33 (at least for x86),
3206.\" commit b690081d4d3f6a23541493f1682835c3cd5c54a1
3207the kernel did not check
f2b1d720
MK
3208if events could be scheduled together until read time.
3209The same happens on all known kernels if the NMI watchdog is enabled.
3210This means that, to see if a given set of events works, you have to
3211.BR perf_event_open (),
3212start, then read before you know for sure you
3213can get valid measurements.
efeece04 3214.PP
b5190152
MK
3215Prior to Linux 2.6.34,
3216.\" FIXME . cannot find a kernel commit for this one
3217event constraints were not enforced by the kernel.
f2b1d720
MK
3218In that case, some events would silently return "0" if the kernel
3219scheduled them in an improper counter slot.
efeece04 3220.PP
ce88f77b 3221Prior to Linux 2.6.34, there was a bug when multiplexing where the
f2b1d720 3222wrong results could be returned.
747a6e7c 3223.\" commit 45e16a6834b6af098702e5ea6c9a40de42ff77d8
efeece04 3224.PP
f2b1d720
MK
3225Linux 2.6.35 to Linux 2.6.39 can quickly crash if
3226"inherit" is enabled and many threads are started.
747a6e7c 3227.\" commit 38b435b16c36b0d863efcf3f07b34a6fac9873fd
efeece04 3228.PP
f2b1d720 3229Prior to Linux 2.6.35,
747a6e7c 3230.\" commit 050735b08ca8a016bbace4445fa025b88fee770b
f2b1d720
MK
3231.B PERF_FORMAT_GROUP
3232did not work with attached processes.
efeece04 3233.PP
f2b1d720
MK
3234There is a bug in the kernel code between
3235Linux 2.6.36 and Linux 3.0 that ignores the
3236"watermark" field and acts as if a wakeup_event
3237was chosen if the union has a
7d182bb6 3238nonzero value in it.
747a6e7c 3239.\" commit 4ec8363dfc1451f8c8f86825731fe712798ada02
efeece04 3240.PP
8a94e783 3241From Linux 2.6.31 to Linux 3.4, the
dbc01ecd
VW
3242.B PERF_IOC_FLAG_GROUP
3243ioctl argument was broken and would repeatedly operate
3244on the event specified rather than iterating across
3245all sibling events in a group.
747a6e7c 3246.\" commit 724b6daa13e100067c30cfc4d1ad06629609dc4e
efeece04 3247.PP
7205b8df 3248From Linux 3.4 to Linux 3.11, the mmap
747a6e7c 3249.\" commit fa7315871046b9a4c48627905691dbde57e51033
135cba8b
VW
3250.I cap_usr_rdpmc
3251and
3252.I cap_usr_time
3253bits mapped to the same location.
3254Code should migrate to the new
3255.I cap_user_rdpmc
3256and
3257.I cap_user_time
3258fields instead.
efeece04 3259.PP
7db515ef
MK
3260Always double-check your results!
3261Various generalized events have had wrong values.
f2b1d720
MK
3262For example, retired branches measured
3263the wrong thing on AMD machines until Linux 2.6.35.
747a6e7c 3264.\" commit f287d332ce835f77a4f5077d2c0ef1e3f9ea42d2
f2b1d720
MK
3265.SH EXAMPLE
3266The following is a short example that measures the total
7db515ef
MK
3267instruction count of a call to
3268.BR printf (3).
408731d4
MK
3269.PP
3270.EX
f2b1d720
MK
3271#include <stdlib.h>
3272#include <stdio.h>
3273#include <unistd.h>
3274#include <string.h>
3275#include <sys/ioctl.h>
3276#include <linux/perf_event.h>
3277#include <asm/unistd.h>
3278
571767ca 3279static long
7db515ef
MK
3280perf_event_open(struct perf_event_attr *hw_event, pid_t pid,
3281 int cpu, int group_fd, unsigned long flags)
f2b1d720
MK
3282{
3283 int ret;
3284
7db515ef
MK
3285 ret = syscall(__NR_perf_event_open, hw_event, pid, cpu,
3286 group_fd, flags);
f2b1d720
MK
3287 return ret;
3288}
3289
f2b1d720
MK
3290int
3291main(int argc, char **argv)
3292{
f2b1d720
MK
3293 struct perf_event_attr pe;
3294 long long count;
3295 int fd;
3296
3297 memset(&pe, 0, sizeof(struct perf_event_attr));
3298 pe.type = PERF_TYPE_HARDWARE;
3299 pe.size = sizeof(struct perf_event_attr);
3300 pe.config = PERF_COUNT_HW_INSTRUCTIONS;
3301 pe.disabled = 1;
3302 pe.exclude_kernel = 1;
3303 pe.exclude_hv = 1;
3304
3305 fd = perf_event_open(&pe, 0, \-1, \-1, 0);
7db515ef 3306 if (fd == \-1) {
f2b1d720 3307 fprintf(stderr, "Error opening leader %llx\\n", pe.config);
7db515ef 3308 exit(EXIT_FAILURE);
f2b1d720
MK
3309 }
3310
3311 ioctl(fd, PERF_EVENT_IOC_RESET, 0);
3312 ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
3313
3314 printf("Measuring instruction count for this printf\\n");
3315
3316 ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);
3317 read(fd, &count, sizeof(long long));
3318
3319 printf("Used %lld instructions\\n", count);
3320
3321 close(fd);
3322}
408731d4 3323.EE
47297adb 3324.SH SEE ALSO
f2b1d720
MK
3325.BR fcntl (2),
3326.BR mmap (2),
3327.BR open (2),
3328.BR prctl (2),
3329.BR read (2)