]> git.ipfire.org Git - thirdparty/man-pages.git/blame - man2/perf_event_open.2
All pages: Remove the 5th argument to .TH
[thirdparty/man-pages.git] / man2 / perf_event_open.2
CommitLineData
f2b1d720
MK
1.\" Copyright (c) 2012, Vincent Weaver
2.\"
e4a74ca8 3.\" SPDX-License-Identifier: GPL-2.0-or-later
f2b1d720
MK
4.\"
5.\" This document is based on the perf_event.h header file, the
6.\" tools/perf/design.txt file, and a lot of bitter experience.
7.\"
45186a5d 8.TH PERF_EVENT_OPEN 2 2021-08-27 "Linux man-pages (unreleased)"
f2b1d720
MK
9.SH NAME
10perf_event_open \- set up performance monitoring
35183937
AC
11.SH LIBRARY
12Standard C library
8fc3b2cf 13.RI ( libc ", " \-lc )
f2b1d720
MK
14.SH SYNOPSIS
15.nf
e6915791
AC
16.BR "#include <linux/perf_event.h>" " /* Definition of " PERF_* " constants */"
17.BR "#include <linux/hw_breakpoint.h>" " /* Definition of " HW_* " constants */"
18.BR "#include <sys/syscall.h>" " /* Definition of " SYS_* " constants */"
19.B #include <unistd.h>
68e4db0a 20.PP
e6915791
AC
21.BI "int syscall(SYS_perf_event_open, struct perf_event_attr *" attr ,
22.BI " pid_t " pid ", int " cpu ", int " group_fd \
23", unsigned long " flags );
f2b1d720 24.fi
dbfe9c70 25.PP
f2b1d720 26.IR Note :
e6915791
AC
27glibc provides no wrapper for
28.BR perf_event_open (),
29necessitating the use of
30.BR syscall (2).
f2b1d720
MK
31.SH DESCRIPTION
32Given a list of parameters,
33.BR perf_event_open ()
34returns a file descriptor, for use in subsequent system calls
35.RB ( read "(2), " mmap "(2), " prctl "(2), " fcntl "(2), etc.)."
36.PP
37A call to
38.BR perf_event_open ()
39creates a file descriptor that allows measuring performance
40information.
41Each file descriptor corresponds to one
42event that is measured; these can be grouped together
43to measure multiple events simultaneously.
44.PP
45Events can be enabled and disabled in two ways: via
46.BR ioctl (2)
47and via
0fe9e4b1 48.BR prctl (2).
f2b1d720
MK
49When an event is disabled it does not count or generate overflows but does
50continue to exist and maintain its count value.
51.PP
52Events come in two flavors: counting and sampled.
53A
54.I counting
55event is one that is used for counting the aggregate number of events
56that occur.
57In general, counting event results are gathered with a
58.BR read (2)
59call.
60A
61.I sampling
62event periodically writes measurements to a buffer that can then
63be accessed via
0fe9e4b1 64.BR mmap (2).
f2b1d720 65.SS Arguments
f2b1d720 66The
a02a1737 67.I pid
f2b1d720 68and
a02a1737
VW
69.I cpu
70arguments allow specifying which process and CPU to monitor:
71.TP
f2d15dc9 72.BR "pid == 0" " and " "cpu == \-1"
ee7b0cbf 73This measures the calling process/thread on any CPU.
a02a1737 74.TP
f2d15dc9 75.BR "pid == 0" " and " "cpu >= 0"
ee7b0cbf 76This measures the calling process/thread only
a02a1737
VW
77when running on the specified CPU.
78.TP
f2d15dc9 79.BR "pid > 0" " and " "cpu == \-1"
a02a1737
VW
80This measures the specified process/thread on any CPU.
81.TP
f2d15dc9 82.BR "pid > 0" " and " "cpu >= 0"
a02a1737
VW
83This measures the specified process/thread only
84when running on the specified CPU.
85.TP
f2d15dc9 86.BR "pid == \-1" " and " "cpu >= 0"
a02a1737 87This measures all processes/threads on the specified CPU.
ce88f77b 88This requires
d19b29a5
AB
89.B CAP_PERFMON
90(since Linux 5.8) or
f2b1d720
MK
91.B CAP_SYS_ADMIN
92capability or a
93.I /proc/sys/kernel/perf_event_paranoid
94value of less than 1.
a02a1737 95.TP
ce88f77b 96.BR "pid == \-1" " and " "cpu == \-1"
a02a1737 97This setting is invalid and will return an error.
11ac5b51 98.PP
13ec13dc
MK
99When
100.I pid
101is greater than zero, permission to perform this system call
d19b29a5
AB
102is governed by
103.B CAP_PERFMON
104(since Linux 5.9) and a ptrace access mode
13ec13dc 105.B PTRACE_MODE_READ_REALCREDS
d19b29a5 106check on older Linux versions; see
13ec13dc 107.BR ptrace (2).
efeece04 108.PP
f2b1d720
MK
109The
110.I group_fd
111argument allows event groups to be created.
112An event group has one event which is the group leader.
113The leader is created first, with
114.IR group_fd " = \-1."
115The rest of the group members are created with subsequent
116.BR perf_event_open ()
117calls with
2b9bf369 118.I group_fd
bec6277e 119being set to the file descriptor of the group leader.
f2b1d720
MK
120(A single event on its own is created with
121.IR group_fd " = \-1"
122and is considered to be a group with only 1 member.)
33a0ccb2 123An event group is scheduled onto the CPU as a unit: it will
d1007d14 124be put onto the CPU only if all of the events in the group can be put onto
f2b1d720
MK
125the CPU.
126This means that the values of the member events can be
ce88f77b 127meaningfully compared\(emadded, divided (to get ratios), and so on\(emwith each
f2b1d720
MK
128other, since they have counted events for the same set of executed
129instructions.
11ac5b51 130.PP
f2b1d720
MK
131The
132.I flags
08e325e8 133argument is formed by ORing together zero or more of the following values:
f2b1d720 134.TP
60dafbc1
MK
135.BR PERF_FLAG_FD_CLOEXEC " (since Linux 3.14)"
136.\" commit a21b0b354d4ac39be691f51c53562e2c24443d9e
e9b1ab78
MK
137This flag enables the close-on-exec flag for the created
138event file descriptor,
139so that the file descriptor is automatically closed on
140.BR execve (2).
8bad22e5
MK
141Setting the close-on-exec flags at creation time, rather than later with
142.BR fcntl (2),
e9b1ab78
MK
143avoids potential race conditions where the calling thread invokes
144.BR perf_event_open ()
a61dba34
MK
145and
146.BR fcntl (2)
e9b1ab78
MK
147at the same time as another thread calls
148.BR fork (2)
149then
150.BR execve (2).
151.TP
1ae6b2c7 152.B PERF_FLAG_FD_NO_GROUP
31266c04 153This flag tells the event to ignore the
2b9bf369 154.I group_fd
31266c04
VW
155parameter except for the purpose of setting up output redirection
156using the
157.B PERF_FLAG_FD_OUTPUT
158flag.
f2b1d720 159.TP
3117263f 160.BR PERF_FLAG_FD_OUTPUT " (broken since Linux 2.6.35)"
747a6e7c 161.\" commit ac9721f3f54b27a16c7e1afb2481e7ee95a70318
31266c04
VW
162This flag re-routes the event's sampled output to instead
163be included in the mmap buffer of the event specified by
164.IR group_fd .
f2b1d720 165.TP
3117263f 166.BR PERF_FLAG_PID_CGROUP " (since Linux 2.6.39)"
60dafbc1 167.\" commit e5d1367f17ba6a6fed5fd8b74e4d5720923e0c25
f2b1d720
MK
168This flag activates per-container system-wide monitoring.
169A container
ce88f77b 170is an abstraction that isolates a set of resources for finer-grained
699893d8 171control (CPUs, memory, etc.).
f2b1d720
MK
172In this mode, the event is measured
173only if the thread running on the monitored CPU belongs to the designated
174container (cgroup).
175The cgroup is identified by passing a file descriptor
176opened on its directory in the cgroupfs filesystem.
177For instance, if the
178cgroup to monitor is called
179.IR test ,
180then a file descriptor opened on
181.I /dev/cgroup/test
182(assuming cgroupfs is mounted on
183.IR /dev/cgroup )
184must be passed as the
185.I pid
186parameter.
33a0ccb2 187cgroup monitoring is available only
f2b1d720 188for system-wide events and may therefore require extra permissions.
11ac5b51 189.PP
f2b1d720
MK
190The
191.I perf_event_attr
192structure provides detailed configuration information
193for the event being created.
efeece04 194.PP
f2b1d720 195.in +4n
b8302363 196.EX
f2b1d720 197struct perf_event_attr {
da8bd8a4
MK
198 __u32 type; /* Type of event */
199 __u32 size; /* Size of attribute structure */
d064d41a 200 __u64 config; /* Type\-specific configuration */
f2b1d720
MK
201
202 union {
203 __u64 sample_period; /* Period of sampling */
204 __u64 sample_freq; /* Frequency of sampling */
205 };
206
ce88f77b
MK
207 __u64 sample_type; /* Specifies values included in sample */
208 __u64 read_format; /* Specifies values returned in read */
209
210 __u64 disabled : 1, /* off by default */
211 inherit : 1, /* children inherit it */
212 pinned : 1, /* must always be on PMU */
213 exclusive : 1, /* only group on PMU */
861d36ba
MK
214 exclude_user : 1, /* don\(aqt count user */
215 exclude_kernel : 1, /* don\(aqt count kernel */
216 exclude_hv : 1, /* don\(aqt count hypervisor */
217 exclude_idle : 1, /* don\(aqt count when idle */
ce88f77b
MK
218 mmap : 1, /* include mmap data */
219 comm : 1, /* include comm data */
220 freq : 1, /* use freq, not period */
221 inherit_stat : 1, /* per task counts */
222 enable_on_exec : 1, /* next exec enables */
223 task : 1, /* trace fork/exit */
224 watermark : 1, /* wakeup_watermark */
225 precise_ip : 2, /* skid constraint */
d064d41a 226 mmap_data : 1, /* non\-exec mmap data */
ce88f77b 227 sample_id_all : 1, /* sample_type all events */
861d36ba
MK
228 exclude_host : 1, /* don\(aqt count in host */
229 exclude_guest : 1, /* don\(aqt count in guest */
ce88f77b
MK
230 exclude_callchain_kernel : 1,
231 /* exclude kernel callchains */
232 exclude_callchain_user : 1,
233 /* exclude user callchains */
9bfc542b 234 mmap2 : 1, /* include mmap with inode data */
dc9ec146
MK
235 comm_exec : 1, /* flag comm events that are
236 due to exec */
6bd5186a 237 use_clockid : 1, /* use clockid for time fields */
9277a75d 238 context_switch : 1, /* context switch data */
1e554f3e
NK
239 write_backward : 1, /* Write ring buffer from end
240 to beginning */
241 namespaces : 1, /* include namespaces data */
242 ksymbol : 1, /* include ksymbol events */
243 bpf_event : 1, /* include bpf events */
244 aux_output : 1, /* generate AUX records
245 instead of events */
246 cgroup : 1, /* include cgroup events */
247 text_poke : 1, /* include text poke events */
6bd5186a 248
1e554f3e 249 __reserved_1 : 30;
f2b1d720
MK
250
251 union {
252 __u32 wakeup_events; /* wakeup every n events */
7db515ef 253 __u32 wakeup_watermark; /* bytes before wakeup */
f2b1d720
MK
254 };
255
256 __u32 bp_type; /* breakpoint type */
257
258 union {
259 __u64 bp_addr; /* breakpoint address */
7d8449ba
SL
260 __u64 kprobe_func; /* for perf_kprobe */
261 __u64 uprobe_path; /* for perf_uprobe */
f2b1d720
MK
262 __u64 config1; /* extension of config */
263 };
264
265 union {
266 __u64 bp_len; /* breakpoint length */
7d8449ba
SL
267 __u64 kprobe_addr; /* with kprobe_func == NULL */
268 __u64 probe_offset; /* for perf_[k,u]probe */
f2b1d720
MK
269 __u64 config2; /* extension of config1 */
270 };
ce88f77b
MK
271 __u64 branch_sample_type; /* enum perf_branch_sample_type */
272 __u64 sample_regs_user; /* user regs to dump on samples */
273 __u32 sample_stack_user; /* size of stack to dump on
7db515ef 274 samples */
6bd5186a 275 __s32 clockid; /* clock to use for time fields */
f5281dfd 276 __u64 sample_regs_intr; /* regs to dump on samples */
cdc52f4a 277 __u32 aux_watermark; /* aux bytes before wakeup */
fd133d5d
VW
278 __u16 sample_max_stack; /* max frames in callchain */
279 __u16 __reserved_2; /* align to u64 */
cdc52f4a 280
f2b1d720 281};
b8302363 282.EE
f2b1d720 283.in
efeece04 284.PP
f2b1d720
MK
285The fields of the
286.I perf_event_attr
287structure are described in more detail below:
f2b1d720
MK
288.TP
289.I type
290This field specifies the overall event type.
291It has one of the following values:
292.RS
293.TP
294.B PERF_TYPE_HARDWARE
295This indicates one of the "generalized" hardware events provided
296by the kernel.
297See the
298.I config
299field definition for more details.
300.TP
301.B PERF_TYPE_SOFTWARE
302This indicates one of the software-defined events provided by the kernel
303(even if no hardware support is available).
304.TP
305.B PERF_TYPE_TRACEPOINT
306This indicates a tracepoint
307provided by the kernel tracepoint infrastructure.
308.TP
309.B PERF_TYPE_HW_CACHE
310This indicates a hardware cache event.
311This has a special encoding, described in the
312.I config
313field definition.
314.TP
315.B PERF_TYPE_RAW
316This indicates a "raw" implementation-specific event in the
317.IR config " field."
318.TP
31c1f2b0 319.BR PERF_TYPE_BREAKPOINT " (since Linux 2.6.33)"
60dafbc1 320.\" commit 24f1e32c60c45c89a997c73395b69c8af6f0a84e
f2b1d720
MK
321This indicates a hardware breakpoint as provided by the CPU.
322Breakpoints can be read/write accesses to an address as well as
323execution of an instruction address.
324.TP
7d281e0a 325dynamic PMU
747a6e7c
VW
326Since Linux 2.6.38,
327.\" commit 2e80a82a49c4c7eca4e35734380f28298ba5db19
7db515ef 328.BR perf_event_open ()
f2b1d720
MK
329can support multiple PMUs.
330To enable this, a value exported by the kernel can be used in the
331.I type
332field to indicate which PMU to use.
333The value to use can be found in the sysfs filesystem:
334there is a subdirectory per PMU instance under
335.IR /sys/bus/event_source/devices .
7d182bb6 336In each subdirectory there is a
f2b1d720
MK
337.I type
338file whose content is an integer that can be used in the
339.I type
340field.
341For instance,
342.I /sys/bus/event_source/devices/cpu/type
343contains the value for the core CPU PMU, which is usually 4.
7d8449ba 344.TP
6170a241
MK
345.BR kprobe " and " uprobe " (since Linux 4.17)"
346.\" commit 65074d43fc77bcae32776724b7fa2696923c78e4
347.\" commit e12f03d7031a977356e3d7b75a68c2185ff8d155
348.\" commit 33ea4b24277b06dbc55d7f5772a46f029600255e
7d8449ba 349These two dynamic PMUs create a kprobe/uprobe and attach it to the
c87e72a2
MK
350file descriptor generated by perf_event_open.
351The kprobe/uprobe will be destroyed on the destruction of the file descriptor.
7d8449ba 352See fields
2b9bf369
AC
353.IR kprobe_func ,
354.IR uprobe_path ,
355.IR kprobe_addr ,
356and
357.I probe_offset
7d8449ba 358for more details.
f2b1d720 359.RE
f2b1d720
MK
360.TP
361.I "size"
362The size of the
363.I perf_event_attr
364structure for forward/backward compatibility.
365Set this using
366.I sizeof(struct perf_event_attr)
367to allow the kernel to see
368the struct size at the time of compilation.
efeece04 369.IP
f2b1d720
MK
370The related define
371.B PERF_ATTR_SIZE_VER0
372is set to 64; this was the size of the first published struct.
373.B PERF_ATTR_SIZE_VER1
374is 72, corresponding to the addition of breakpoints in Linux 2.6.33.
747a6e7c
VW
375.\" commit cb5d76999029ae7a517cb07dfa732c1b5a934fc2
376.\" this was added much later when PERF_ATTR_SIZE_VER2 happened
377.\" but the actual attr_size had increased in 2.6.33
f2b1d720
MK
378.B PERF_ATTR_SIZE_VER2
379is 80 corresponding to the addition of branch sampling in Linux 3.4.
747a6e7c 380.\" commit cb5d76999029ae7a517cb07dfa732c1b5a934fc2
d2a6be2f 381.B PERF_ATTR_SIZE_VER3
f2b1d720 382is 96 corresponding to the addition
7ede2f66
DP
383of
384.I sample_regs_user
385and
386.I sample_stack_user
387in Linux 3.7.
747a6e7c 388.\" commit 1659d129ed014b715b0b2120e6fd929bdd33ed03
f5281dfd
VW
389.B PERF_ATTR_SIZE_VER4
390is 104 corresponding to the addition of
391.I sample_regs_intr
392in Linux 3.19.
393.\" commit 60e2364e60e86e81bc6377f49779779e6120977f
cdc52f4a
VW
394.B PERF_ATTR_SIZE_VER5
395is 112 corresponding to the addition of
2050c098 396.I aux_watermark
cdc52f4a
VW
397in Linux 4.1.
398.\" commit 1a5941312414c71dece6717da9a0fa1303127afa
f2b1d720
MK
399.TP
400.I "config"
401This specifies which event you want, in conjunction with
402the
403.I type
404field.
405The
2b9bf369
AC
406.I config1
407and
408.I config2
f2b1d720
MK
409fields are also taken into account in cases where 64 bits is not
410enough to fully specify the event.
411The encoding of these fields is event dependent.
efeece04 412.IP
f2b1d720
MK
413There are various ways to set the
414.I config
415field that are dependent on the value of the previously
416described
417.I type
418field.
419What follows are various possible settings for
420.I config
421separated out by
422.IR type .
efeece04 423.IP
f2b1d720
MK
424If
425.I type
426is
427.BR PERF_TYPE_HARDWARE ,
428we are measuring one of the generalized hardware CPU events.
429Not all of these are available on all platforms.
430Set
431.I config
432to one of the following:
433.RS 12
434.TP
435.B PERF_COUNT_HW_CPU_CYCLES
436Total cycles.
2b538c3e 437Be wary of what happens during CPU frequency scaling.
f2b1d720
MK
438.TP
439.B PERF_COUNT_HW_INSTRUCTIONS
440Retired instructions.
441Be careful, these can be affected by various
2b538c3e 442issues, most notably hardware interrupt counts.
f2b1d720
MK
443.TP
444.B PERF_COUNT_HW_CACHE_REFERENCES
445Cache accesses.
446Usually this indicates Last Level Cache accesses but this may
447vary depending on your CPU.
448This may include prefetches and coherency messages; again this
449depends on the design of your CPU.
450.TP
451.B PERF_COUNT_HW_CACHE_MISSES
452Cache misses.
453Usually this indicates Last Level Cache misses; this is intended to be
454used in conjunction with the
455.B PERF_COUNT_HW_CACHE_REFERENCES
456event to calculate cache miss rates.
457.TP
458.B PERF_COUNT_HW_BRANCH_INSTRUCTIONS
459Retired branch instructions.
747a6e7c 460Prior to Linux 2.6.35, this used
f2b1d720 461the wrong event on AMD processors.
747a6e7c 462.\" commit f287d332ce835f77a4f5077d2c0ef1e3f9ea42d2
f2b1d720
MK
463.TP
464.B PERF_COUNT_HW_BRANCH_MISSES
465Mispredicted branch instructions.
466.TP
467.B PERF_COUNT_HW_BUS_CYCLES
468Bus cycles, which can be different from total cycles.
469.TP
31c1f2b0 470.BR PERF_COUNT_HW_STALLED_CYCLES_FRONTEND " (since Linux 3.0)"
747a6e7c 471.\" commit 8f62242246351b5a4bc0c1f00c0c7003edea128a
f2b1d720
MK
472Stalled cycles during issue.
473.TP
31c1f2b0 474.BR PERF_COUNT_HW_STALLED_CYCLES_BACKEND " (since Linux 3.0)"
747a6e7c 475.\" commit 8f62242246351b5a4bc0c1f00c0c7003edea128a
f2b1d720
MK
476Stalled cycles during retirement.
477.TP
31c1f2b0 478.BR PERF_COUNT_HW_REF_CPU_CYCLES " (since Linux 3.3)"
60dafbc1 479.\" commit c37e17497e01fc0f5d2d6feb5723b210b3ab8890
f2b1d720
MK
480Total cycles; not affected by CPU frequency scaling.
481.RE
482.IP
483If
484.I type
485is
486.BR PERF_TYPE_SOFTWARE ,
487we are measuring software events provided by the kernel.
488Set
489.I config
490to one of the following:
491.RS 12
492.TP
493.B PERF_COUNT_SW_CPU_CLOCK
494This reports the CPU clock, a high-resolution per-CPU timer.
495.TP
496.B PERF_COUNT_SW_TASK_CLOCK
497This reports a clock count specific to the task that is running.
498.TP
499.B PERF_COUNT_SW_PAGE_FAULTS
500This reports the number of page faults.
501.TP
502.B PERF_COUNT_SW_CONTEXT_SWITCHES
503This counts context switches.
504Until Linux 2.6.34, these were all reported as user-space
505events; after that, they are reported as happening in the kernel.
747a6e7c 506.\" commit e49a5bd38159dfb1928fd25b173bc9de4bbadb21
f2b1d720
MK
507.TP
508.B PERF_COUNT_SW_CPU_MIGRATIONS
509This reports the number of times the process
510has migrated to a new CPU.
511.TP
512.B PERF_COUNT_SW_PAGE_FAULTS_MIN
513This counts the number of minor page faults.
514These did not require disk I/O to handle.
515.TP
516.B PERF_COUNT_SW_PAGE_FAULTS_MAJ
517This counts the number of major page faults.
518These required disk I/O to handle.
519.TP
31c1f2b0 520.BR PERF_COUNT_SW_ALIGNMENT_FAULTS " (since Linux 2.6.33)"
60dafbc1 521.\" commit f7d7986060b2890fc26db6ab5203efbd33aa2497
f2b1d720
MK
522This counts the number of alignment faults.
523These happen when unaligned memory accesses happen; the kernel
524can handle these but it reduces performance.
33a0ccb2 525This happens only on some architectures (never on x86).
f2b1d720 526.TP
31c1f2b0 527.BR PERF_COUNT_SW_EMULATION_FAULTS " (since Linux 2.6.33)"
60dafbc1 528.\" commit f7d7986060b2890fc26db6ab5203efbd33aa2497
f2b1d720
MK
529This counts the number of emulation faults.
530The kernel sometimes traps on unimplemented instructions
7db515ef 531and emulates them for user space.
f2b1d720 532This can negatively impact performance.
dab38455 533.TP
31c1f2b0 534.BR PERF_COUNT_SW_DUMMY " (since Linux 3.12)"
60dafbc1 535.\" commit fa0097ee690693006ab1aea6c01ad3c851b65c77
dab38455
VW
536This is a placeholder event that counts nothing.
537Informational sample record types such as mmap or comm
538must be associated with an active event.
539This dummy event allows gathering such records without requiring
540a counting event.
f2b1d720 541.RE
efeece04 542.PP
f2b1d720
MK
543.RS
544If
545.I type
546is
547.BR PERF_TYPE_TRACEPOINT ,
548then we are measuring kernel tracepoints.
549The value to use in
550.I config
551can be obtained from under debugfs
552.I tracing/events/*/*/id
553if ftrace is enabled in the kernel.
f2b1d720 554.RE
efeece04 555.PP
f2b1d720
MK
556.RS
557If
558.I type
559is
560.BR PERF_TYPE_HW_CACHE ,
561then we are measuring a hardware CPU cache event.
562To calculate the appropriate
563.I config
300ef55c 564value, use the following equation:
5020bc28 565.RS 4
2b9bf369
AC
566.PP
567.in +4n
5020bc28
AC
568.EX
569config = (perf_hw_cache_id) |
570 (perf_hw_cache_op_id << 8) |
571 (perf_hw_cache_op_result_id << 16);
572.EE
2b9bf369 573.in
11ac5b51 574.PP
f2b1d720
MK
575where
576.I perf_hw_cache_id
577is one of:
7db515ef 578.RS 4
f2b1d720
MK
579.TP
580.B PERF_COUNT_HW_CACHE_L1D
581for measuring Level 1 Data Cache
582.TP
583.B PERF_COUNT_HW_CACHE_L1I
584for measuring Level 1 Instruction Cache
585.TP
586.B PERF_COUNT_HW_CACHE_LL
587for measuring Last-Level Cache
588.TP
589.B PERF_COUNT_HW_CACHE_DTLB
590for measuring the Data TLB
591.TP
592.B PERF_COUNT_HW_CACHE_ITLB
593for measuring the Instruction TLB
594.TP
595.B PERF_COUNT_HW_CACHE_BPU
596for measuring the branch prediction unit
597.TP
5a69ce9c
MK
598.BR PERF_COUNT_HW_CACHE_NODE " (since Linux 3.1)"
599.\" commit 89d6c0b5bdbb1927775584dcf532d98b3efe1477
f2b1d720
MK
600for measuring local memory accesses
601.RE
11ac5b51 602.PP
f2b1d720
MK
603and
604.I perf_hw_cache_op_id
4af27572 605is one of:
7db515ef 606.RS 4
f2b1d720
MK
607.TP
608.B PERF_COUNT_HW_CACHE_OP_READ
609for read accesses
610.TP
611.B PERF_COUNT_HW_CACHE_OP_WRITE
612for write accesses
613.TP
614.B PERF_COUNT_HW_CACHE_OP_PREFETCH
615for prefetch accesses
616.RE
11ac5b51 617.PP
f2b1d720
MK
618and
619.I perf_hw_cache_op_result_id
4af27572 620is one of:
7db515ef 621.RS 4
f2b1d720
MK
622.TP
623.B PERF_COUNT_HW_CACHE_RESULT_ACCESS
624to measure accesses
625.TP
626.B PERF_COUNT_HW_CACHE_RESULT_MISS
627to measure misses
628.RE
629.RE
efeece04 630.PP
f2b1d720
MK
631If
632.I type
633is
634.BR PERF_TYPE_RAW ,
635then a custom "raw"
636.I config
637value is needed.
638Most CPUs support events that are not covered by the "generalized" events.
639These are implementation defined; see your CPU manual (for example
640the Intel Volume 3B documentation or the AMD BIOS and Kernel Developer
641Guide).
642The libpfm4 library can be used to translate from the name in the
643architectural manuals to the raw hex value
644.BR perf_event_open ()
645expects in this field.
efeece04 646.PP
f2b1d720
MK
647If
648.I type
649is
650.BR PERF_TYPE_BREAKPOINT ,
651then leave
652.I config
653set to zero.
654Its parameters are set in other places.
7d8449ba
SL
655.PP
656If
657.I type
658is
2b9bf369 659.B kprobe
7d8449ba
SL
660or
661.BR uprobe ,
662set
2b9bf369 663.I retprobe
7d8449ba
SL
664(bit 0 of
665.IR config ,
c87e72a2
MK
666see
667.IR /sys/bus/event_source/devices/[k,u]probe/format/retprobe )
668for kretprobe/uretprobe.
669See fields
2b9bf369
AC
670.IR kprobe_func ,
671.IR uprobe_path ,
672.IR kprobe_addr ,
673and
674.I probe_offset
7d8449ba
SL
675for more details.
676.RE
677.TP
678.IR kprobe_func ", " uprobe_path ", " kprobe_addr ", and " probe_offset
c87e72a2 679These fields describe the kprobe/uprobe for dynamic PMUs
2b9bf369 680.B kprobe
7d8449ba
SL
681and
682.BR uprobe .
683For
c87e72a2 684.BR kprobe :
7d8449ba
SL
685use
686.I kprobe_func
687and
688.IR probe_offset ,
689or use
690.I kprobe_addr
691and leave
692.I kprobe_func
c87e72a2
MK
693as NULL.
694For
695.BR uprobe :
7d8449ba
SL
696use
697.I uprobe_path
698and
699.IR probe_offset .
f2b1d720
MK
700.TP
701.IR sample_period ", " sample_freq
21977c9d 702A "sampling" event is one that generates an overflow notification
f2b1d720
MK
703every N events, where N is given by
704.IR sample_period .
21977c9d 705A sampling event has
f2b1d720 706.IR sample_period " > 0."
21977c9d 707When an overflow occurs, requested data is recorded
f2b1d720
MK
708in the mmap buffer.
709The
710.I sample_type
21977c9d 711field controls what data is recorded on each overflow.
efeece04 712.IP
f2b1d720
MK
713.I sample_freq
714can be used if you wish to use frequency rather than period.
37bee118 715In this case, you set the
f2b1d720
MK
716.I freq
717flag.
718The kernel will adjust the sampling period
719to try to achieve the desired rate.
720The rate of adjustment is a
721timer tick.
f2b1d720 722.TP
2b9bf369 723.I sample_type
f2b1d720
MK
724The various bits in this field specify which values to include
725in the sample.
726They will be recorded in a ring-buffer,
ad73a2cc 727which is available to user space using
f2b1d720
MK
728.BR mmap (2).
729The order in which the values are saved in the
730sample is documented in the MMAP Layout subsection below;
731it is not the
732.I "enum perf_event_sample_format"
733order.
734.RS
735.TP
736.B PERF_SAMPLE_IP
737Records instruction pointer.
738.TP
739.B PERF_SAMPLE_TID
7db515ef 740Records the process and thread IDs.
f2b1d720
MK
741.TP
742.B PERF_SAMPLE_TIME
743Records a timestamp.
744.TP
745.B PERF_SAMPLE_ADDR
746Records an address, if applicable.
747.TP
748.B PERF_SAMPLE_READ
749Records counter values for all events in a group, not just the group leader.
750.TP
751.B PERF_SAMPLE_CALLCHAIN
752Records the callchain (stack backtrace).
753.TP
754.B PERF_SAMPLE_ID
755Records a unique ID for the opened event's group leader.
756.TP
757.B PERF_SAMPLE_CPU
758Records CPU number.
759.TP
760.B PERF_SAMPLE_PERIOD
761Records the current sampling period.
762.TP
763.B PERF_SAMPLE_STREAM_ID
764Records a unique ID for the opened event.
765Unlike
766.B PERF_SAMPLE_ID
767the actual ID is returned, not that of the group leader.
8859d3a9
DP
768This ID is the same as the one returned by
769.BR PERF_FORMAT_ID .
f2b1d720
MK
770.TP
771.B PERF_SAMPLE_RAW
772Records additional data, if applicable.
773Usually returned by tracepoint events.
774.TP
31c1f2b0 775.BR PERF_SAMPLE_BRANCH_STACK " (since Linux 3.4)"
60dafbc1 776.\" commit bce38cd53e5ddba9cb6d708c4ef3d04a4016ec7e
045bf4d3
VW
777This provides a record of recent branches, as provided
778by CPU branch sampling hardware (such as Intel Last Branch Record).
779Not all hardware supports this feature.
efeece04 780.IP
045bf4d3
VW
781See the
782.I branch_sample_type
783field for how to filter which branches are reported.
f2b1d720 784.TP
31c1f2b0 785.BR PERF_SAMPLE_REGS_USER " (since Linux 3.7)"
60dafbc1 786.\" commit 4018994f3d8785275ef0e7391b75c3462c029e56
d1007d14
VW
787Records the current user-level CPU register state
788(the values in the process before the kernel was called).
f2b1d720 789.TP
31c1f2b0 790.BR PERF_SAMPLE_STACK_USER " (since Linux 3.7)"
60dafbc1 791.\" commit c5ebcedb566ef17bda7b02686e0d658a7bb42ee7
d1007d14
VW
792Records the user-level stack, allowing stack unwinding.
793.TP
31c1f2b0 794.BR PERF_SAMPLE_WEIGHT " (since Linux 3.10)"
60dafbc1 795.\" commit c3feedf2aaf9ac8bad6f19f5d21e4ee0b4b87e9c
d1007d14 796Records a hardware-provided weight value that expresses how
51700fd7 797costly the sampled event was.
d1007d14
VW
798This allows the hardware to highlight expensive events in
799a profile.
800.TP
31c1f2b0 801.BR PERF_SAMPLE_DATA_SRC " (since Linux 3.10)"
60dafbc1 802.\" commit d6be9ad6c960f43800a6f118932bc8a5a4eadcd1
d1007d14
VW
803Records the data source: where in the memory hierarchy
804the data associated with the sampled instruction came from.
6170255e 805This is available only if the underlying hardware
d1007d14 806supports this feature.
7480dabb 807.TP
31c1f2b0 808.BR PERF_SAMPLE_IDENTIFIER " (since Linux 3.12)"
60dafbc1 809.\" commit ff3d527cebc1fa3707c617bfe9e74f53fcfb0955
8859d3a9
DP
810Places the
811.B SAMPLE_ID
812value in a fixed position in the record,
7480dabb
VW
813either at the beginning (for sample events) or at the end
814(if a non-sample event).
efeece04 815.IP
7480dabb
VW
816This was necessary because a sample stream may have
817records from various different event sources with different
818.I sample_type
819settings.
e9bd9b2c 820Parsing the event stream properly was not possible because the
8859d3a9
DP
821format of the record was needed to find
822.BR SAMPLE_ID ,
823but
27f52b52 824the format could not be found without knowing what
7480dabb
VW
825event the sample belonged to (causing a circular
826dependency).
efeece04 827.IP
e41c36b2 828The
7480dabb
VW
829.B PERF_SAMPLE_IDENTIFIER
830setting makes the event stream always parsable
8859d3a9
DP
831by putting
832.B SAMPLE_ID
833in a fixed location, even though
834it means having duplicate
835.B SAMPLE_ID
836values in records.
1e043959 837.TP
60dafbc1
MK
838.BR PERF_SAMPLE_TRANSACTION " (since Linux 3.13)"
839.\" commit fdfbbd07e91f8fe387140776f3fd94605f0c89e5
84fc2a6e 840Records reasons for transactional memory abort events
1e043959 841(for example, from Intel TSX transactional memory support).
efeece04 842.IP
1e043959
VW
843The
844.I precise_ip
b3f39642 845setting must be greater than 0 and a transactional memory abort
1e043959 846event must be measured or no values will be recorded.
84fc2a6e
MK
847Also note that some perf_event measurements, such as sampled
848cycle counting, may cause extraneous aborts (by causing an
1e043959 849interrupt during a transaction).
f5281dfd
VW
850.TP
851.BR PERF_SAMPLE_REGS_INTR " (since Linux 3.19)"
852.\" commit 60e2364e60e86e81bc6377f49779779e6120977f
853Records a subset of the current CPU register state
854as specified by
855.IR sample_regs_intr .
856Unlike
857.B PERF_SAMPLE_REGS_USER
858the register values will return kernel register
859state if the overflow happened while kernel
860code is running.
861If the CPU supports hardware sampling of
b01ae37b 862register state (e.g., PEBS on Intel x86) and
f5281dfd
VW
863.I precise_ip
864is set higher than zero then the register
865values returned are those captured by
866hardware at the time of the sampled
867instruction's retirement.
1e554f3e
NK
868.TP
869.BR PERF_SAMPLE_PHYS_ADDR " (since Linux 4.13)"
870.\" commit fc7ce9c74c3ad232b084d80148654f926d01ece7
871Records the physical address of data, as in
5ae2634d 872.BR PERF_SAMPLE_ADDR .
1e554f3e
NK
873.TP
874.BR PERF_SAMPLE_CGROUP " (since Linux 5.7)"
875.\" commit 96aaab686505c449e24d76e76507290dcc30e008
5ae2634d 876Records (perf_event) cgroup ID of the process.
1e554f3e
NK
877This corresponds to the
878.I id
879field in the
880.B PERF_RECORD_CGROUP
881event.
f2b1d720 882.RE
f2b1d720 883.TP
2b9bf369 884.I read_format
f2b1d720
MK
885This field specifies the format of the data returned by
886.BR read (2)
887on a
7db515ef 888.BR perf_event_open ()
f2b1d720
MK
889file descriptor.
890.RS
891.TP
892.B PERF_FORMAT_TOTAL_TIME_ENABLED
7ede2f66
DP
893Adds the 64-bit
894.I time_enabled
895field.
f2b1d720
MK
896This can be used to calculate estimated totals if
897the PMU is overcommitted and multiplexing is happening.
898.TP
899.B PERF_FORMAT_TOTAL_TIME_RUNNING
7ede2f66
DP
900Adds the 64-bit
901.I time_running
902field.
f2b1d720 903This can be used to calculate estimated totals if
3d1ee497 904the PMU is overcommitted and multiplexing is happening.
f2b1d720
MK
905.TP
906.B PERF_FORMAT_ID
907Adds a 64-bit unique value that corresponds to the event group.
908.TP
909.B PERF_FORMAT_GROUP
910Allows all counter values in an event group to be read with one read.
911.RE
f2b1d720 912.TP
2b9bf369 913.I disabled
f2b1d720
MK
914The
915.I disabled
916bit specifies whether the counter starts out disabled or enabled.
917If disabled, the event can later be enabled by
918.BR ioctl (2),
919.BR prctl (2),
920or
921.IR enable_on_exec .
efeece04 922.IP
406650db
VW
923When creating an event group, typically the group leader is initialized
924with
925.I disabled
926set to 1 and any child events are initialized with
927.I disabled
928set to 0.
929Despite
930.I disabled
931being 0, the child events will not start until the group leader
932is enabled.
f2b1d720 933.TP
2b9bf369 934.I inherit
f2b1d720
MK
935The
936.I inherit
937bit specifies that this counter should count events of child
938tasks as well as the task specified.
33a0ccb2 939This applies only to new children, not to any existing children at
f2b1d720
MK
940the time the counter is created (nor to any new children of
941existing children).
efeece04 942.IP
f2b1d720 943Inherit does not work for some combinations of
1ae6b2c7 944.I read_format
4b3a5f01 945values, such as
f2b1d720 946.BR PERF_FORMAT_GROUP .
f2b1d720 947.TP
2b9bf369 948.I pinned
f2b1d720
MK
949The
950.I pinned
951bit specifies that the counter should always be on the CPU if at all
952possible.
33a0ccb2 953It applies only to hardware counters and only to group leaders.
f2b1d720
MK
954If a pinned counter cannot be put onto the CPU (e.g., because there are
955not enough hardware counters or because of a conflict with some other
956event), then the counter goes into an 'error' state, where reads
957return end-of-file (i.e.,
958.BR read (2)
959returns 0) until the counter is subsequently enabled or disabled.
f2b1d720 960.TP
2b9bf369 961.I exclusive
f2b1d720
MK
962The
963.I exclusive
964bit specifies that when this counter's group is on the CPU,
965it should be the only group using the CPU's counters.
966In the future this may allow monitoring programs to
967support PMU features that need to run alone so that they do not
968disrupt other hardware counters.
efeece04 969.IP
bea10c8c
VW
970Note that many unexpected situations may prevent events with the
971.I exclusive
d3532647 972bit set from ever running.
bea10c8c 973This includes any users running a system-wide
d3532647 974measurement as well as any kernel use of the performance counters
bea10c8c 975(including the commonly enabled NMI Watchdog Timer interface).
f2b1d720 976.TP
2b9bf369 977.I exclude_user
ad73a2cc 978If this bit is set, the count excludes events that happen in user space.
f2b1d720 979.TP
2b9bf369 980.I exclude_kernel
edb3e316 981If this bit is set, the count excludes events that happen in kernel space.
f2b1d720 982.TP
2b9bf369 983.I exclude_hv
f2b1d720
MK
984If this bit is set, the count excludes events that happen in the
985hypervisor.
986This is mainly for PMUs that have built-in support for handling this
987(such as POWER).
988Extra support is needed for handling hypervisor measurements on most
989machines.
f2b1d720 990.TP
2b9bf369 991.I exclude_idle
38b581e8
VW
992If set, don't count when the CPU is running the idle task.
993While you can currently enable this for any event type, it is ignored
994for all but software events.
f2b1d720 995.TP
2b9bf369 996.I mmap
f2b1d720
MK
997The
998.I mmap
75ee11e5 999bit enables generation of
cd7c700a 1000.B PERF_RECORD_MMAP
75ee11e5
VW
1001samples for every
1002.BR mmap (2)
1003call that has
cd7c700a 1004.B PROT_EXEC
75ee11e5
VW
1005set.
1006This allows tools to notice new executable code being mapped into
1007a program (dynamic shared libraries for example)
1008so that addresses can be mapped back to the original code.
f2b1d720 1009.TP
2b9bf369 1010.I comm
f2b1d720
MK
1011The
1012.I comm
1013bit enables tracking of process command name as modified by the
ee2379ef 1014.BR execve (2)
f2b1d720 1015and
cd7c700a 1016.BR prctl (PR_SET_NAME)
49bc411c
VW
1017system calls as well as writing to
1018.IR /proc/self/comm .
790ee6d6 1019If the
49bc411c 1020.I comm_exec
790ee6d6 1021flag is also successfully set (possible since Linux 3.16),
747a6e7c 1022.\" commit 82b897782d10fcc4930c9d4a15b175348fdd2871
49bc411c
VW
1023then the misc flag
1024.B PERF_RECORD_MISC_COMM_EXEC
1025can be used to differentiate the
ee2379ef 1026.BR execve (2)
49bc411c 1027case from the others.
f2b1d720 1028.TP
2b9bf369 1029.I freq
f2b1d720
MK
1030If this bit is set, then
1031.I sample_frequency
1032not
1033.I sample_period
1034is used when setting up the sampling interval.
f2b1d720 1035.TP
2b9bf369 1036.I inherit_stat
f2b1d720
MK
1037This bit enables saving of event counts on context switch for
1038inherited tasks.
33a0ccb2 1039This is meaningful only if the
f2b1d720
MK
1040.I inherit
1041field is set.
f2b1d720 1042.TP
2b9bf369 1043.I enable_on_exec
f2b1d720
MK
1044If this bit is set, a counter is automatically
1045enabled after a call to
ee2379ef 1046.BR execve (2).
f2b1d720 1047.TP
2b9bf369 1048.I task
f2b1d720
MK
1049If this bit is set, then
1050fork/exit notifications are included in the ring buffer.
f2b1d720 1051.TP
2b9bf369 1052.I watermark
21977c9d 1053If set, have an overflow notification happen when we cross the
f2b1d720
MK
1054.I wakeup_watermark
1055boundary.
21977c9d 1056Otherwise, overflow notifications happen after
f2b1d720
MK
1057.I wakeup_events
1058samples.
f2b1d720 1059.TP
2b9bf369 1060.IR precise_ip " (since Linux 2.6.35)"
747a6e7c 1061.\" commit ab608344bcbde4f55ec4cd911b686b0ce3eae076
f2b1d720
MK
1062This controls the amount of skid.
1063Skid is how many instructions
1064execute between an event of interest happening and the kernel
1065being able to stop and record the event.
1066Smaller skid is
1067better and allows more accurate reporting of which events
1068correspond to which instructions, but hardware is often limited
1069with how small this can be.
efeece04 1070.IP
5d73bc3f 1071The possible values of this field are the following:
f2b1d720 1072.RS
dc9ec146 1073.IP 0 3
f2b1d720 1074.B SAMPLE_IP
2b538c3e 1075can have arbitrary skid.
dc9ec146 1076.IP 1
f2b1d720 1077.B SAMPLE_IP
2b538c3e 1078must have constant skid.
dc9ec146 1079.IP 2
f2b1d720 1080.B SAMPLE_IP
2b538c3e 1081requested to have 0 skid.
dc9ec146 1082.IP 3
f2b1d720
MK
1083.B SAMPLE_IP
1084must have 0 skid.
5d73bc3f 1085See also the description of
f2b1d720
MK
1086.BR PERF_RECORD_MISC_EXACT_IP .
1087.RE
f2b1d720 1088.TP
2b9bf369 1089.IR mmap_data " (since Linux 2.6.36)"
747a6e7c 1090.\" commit 3af9e859281bda7eb7c20b51879cf43aa788ac2e
b01ae37b 1091This is the counterpart of the
f2b1d720 1092.I mmap
75ee11e5
VW
1093field.
1094This enables generation of
cd7c700a 1095.B PERF_RECORD_MMAP
75ee11e5
VW
1096samples for
1097.BR mmap (2)
1098calls that do not have
cd7c700a 1099.B PROT_EXEC
75ee11e5 1100set (for example data and SysV shared memory).
f2b1d720 1101.TP
2b9bf369 1102.IR sample_id_all " (since Linux 2.6.38)"
747a6e7c 1103.\" commit c980d1091810df13f21aabbce545fd98f545bbf7
7480dabb 1104If set, then TID, TIME, ID, STREAM_ID, and CPU can
f2b1d720
MK
1105additionally be included in
1106.RB non- PERF_RECORD_SAMPLE s
1107if the corresponding
1108.I sample_type
1109is selected.
efeece04 1110.IP
e9bd9b2c 1111If
7480dabb 1112.B PERF_SAMPLE_IDENTIFIER
37bee118 1113is specified, then an additional ID value is included
7480dabb
VW
1114as the last value to ease parsing the record stream.
1115This may lead to the
e9bd9b2c 1116.I id
7480dabb 1117value appearing twice.
efeece04 1118.IP
7480dabb 1119The layout is described by this pseudo-structure:
efeece04 1120.IP
7480dabb 1121.in +4n
b8302363 1122.EX
7480dabb 1123struct sample_id {
5b0fbedb
MK
1124 { u32 pid, tid; } /* if PERF_SAMPLE_TID set */
1125 { u64 time; } /* if PERF_SAMPLE_TIME set */
1126 { u64 id; } /* if PERF_SAMPLE_ID set */
1127 { u64 stream_id;} /* if PERF_SAMPLE_STREAM_ID set */
1128 { u32 cpu, res; } /* if PERF_SAMPLE_CPU set */
1129 { u64 id; } /* if PERF_SAMPLE_IDENTIFIER set */
7480dabb 1130};
5383b93b 1131.EE
c0b34c18 1132.in
f2b1d720 1133.TP
2b9bf369 1134.IR exclude_host " (since Linux 3.2)"
747a6e7c 1135.\" commit a240f76165e6255384d4bdb8139895fac7988799
e38fb93e 1136When conducting measurements that include processes running
5d73bc3f
MK
1137VM instances (i.e., have executed a
1138.B KVM_RUN
1139.BR ioctl (2)),
1140only measure events happening inside a guest instance.
e38fb93e
VW
1141This is only meaningful outside the guests; this setting does
1142not change counts gathered inside of a guest.
34d4e61d 1143Currently, this functionality is x86 only.
f2b1d720 1144.TP
2b9bf369 1145.IR exclude_guest " (since Linux 3.2)"
747a6e7c 1146.\" commit a240f76165e6255384d4bdb8139895fac7988799
e38fb93e 1147When conducting measurements that include processes running
5d73bc3f
MK
1148VM instances (i.e., have executed a
1149.B KVM_RUN
1150.BR ioctl (2)),
1151do not measure events happening inside guest instances.
e38fb93e
VW
1152This is only meaningful outside the guests; this setting does
1153not change counts gathered inside of a guest.
34d4e61d 1154Currently, this functionality is x86 only.
f2b1d720 1155.TP
2b9bf369 1156.IR exclude_callchain_kernel " (since Linux 3.7)"
747a6e7c 1157.\" commit d077526485d5c9b12fe85d0b2b3b7041e6bc5f91
f2b1d720 1158Do not include kernel callchains.
f2b1d720 1159.TP
2b9bf369 1160.IR exclude_callchain_user " (since Linux 3.7)"
747a6e7c 1161.\" commit d077526485d5c9b12fe85d0b2b3b7041e6bc5f91
f2b1d720 1162Do not include user callchains.
f2b1d720 1163.TP
2b9bf369 1164.IR mmap2 " (since Linux 3.16)"
747a6e7c
VW
1165.\" commit 13d7a2410fa637f450a29ecb515ac318ee40c741
1166.\" This is tricky; was committed during 3.12 development
1167.\" but right before release was disabled.
1168.\" So while you could select mmap2 starting with 3.12
1169.\" it did not work until 3.16
1170.\" commit a5a5ba72843dd05f991184d6cb9a4471acce1005
9bfc542b
VW
1171Generate an extended executable mmap record that contains enough
1172additional information to uniquely identify shared mappings.
1173The
1174.I mmap
1175flag must also be set for this to work.
1176.TP
2b9bf369 1177.IR comm_exec " (since Linux 3.16)"
747a6e7c 1178.\" commit 82b897782d10fcc4930c9d4a15b175348fdd2871
5ab35ae5 1179This is purely a feature-detection flag; it does not change
49bc411c 1180kernel behavior.
5ab35ae5 1181If this flag can successfully be set, then, when
49bc411c 1182.I comm
5ab35ae5 1183is enabled, the
49bc411c
VW
1184.B PERF_RECORD_MISC_COMM_EXEC
1185flag will be set in the
1186.I misc
1187field of a comm record header if the rename event being
1188reported was caused by a call to
ee2379ef 1189.BR execve (2).
49bc411c
VW
1190This allows tools to distinguish between the various
1191types of process renaming.
1192.TP
2b9bf369 1193.IR use_clockid " (since Linux 4.1)"
6bd5186a
VW
1194.\" commit 34f439278cef7b1177f8ce24f9fc81dfc6221d3b
1195This allows selecting which internal Linux clock to use
1196when generating timestamps via the
1197.I clockid
1198field.
1199This can make it easier to correlate perf sample times with
1200timestamps generated by other tools.
1201.TP
2b9bf369 1202.IR context_switch " (since Linux 4.3)"
9277a75d
VW
1203.\" commit 45ac1403f564f411c6a383a2448688ba8dd705a4
1204This enables the generation of
1205.B PERF_RECORD_SWITCH
1206records when a context switch occurs.
1207It also enables the generation of
1208.B PERF_RECORD_SWITCH_CPU_WIDE
d5a24378 1209records when sampling in CPU-wide mode.
9277a75d
VW
1210This functionality is in addition to existing tracepoint and
1211software events for measuring context switches.
54905b0f
MK
1212The advantage of this method is that it will give full
1213information even with strict
9277a75d
VW
1214.I perf_event_paranoid
1215settings.
1216.TP
1e554f3e
NK
1217.IR write_backward " (since Linux 4.6)"
1218.\" commit 9ecda41acb971ebd07c8fb35faf24005c0baea12
5ae2634d 1219This causes the ring buffer to be written from the end to the beginning.
1e554f3e
NK
1220This is to support reading from an overwritable ring buffer.
1221.TP
1222.IR namespaces " (since Linux 4.11)"
1223.\" commit e422267322cd319e2695a535e47c5b1feeac45eb
1224This enables the generation of
1225.B PERF_RECORD_NAMESPACES
5ae2634d 1226records when a task enters a new namespace.
1e554f3e
NK
1227Each namespace has a combination of device and inode numbers.
1228.TP
1229.IR ksymbol " (since Linux 5.0)"
1230.\" commit 76193a94522f1d4edf2447a536f3f796ce56343b
1231This enables the generation of
1232.B PERF_RECORD_KSYMBOL
5ae2634d 1233records when new kernel symbols are registered or unregistered.
1e554f3e
NK
1234This is useful for analyzing dynamic kernel functions like eBPF.
1235.TP
1236.IR bpf_event " (since Linux 5.0)"
1237.\" commit 6ee52e2a3fe4ea35520720736e6791df1fb67106
1238This enables the generation of
1239.B PERF_RECORD_BPF_EVENT
5ae2634d 1240records when an eBPF program is loaded or unloaded.
1e554f3e
NK
1241.TP
1242.IR auxevent " (since Linux 5.4)"
1243.\" commit ab43762ef010967e4ccd53627f70a2eecbeafefb
1244This allows normal (non-AUX) events to generate data for AUX events
1245if the hardware supports it.
1246.TP
1247.IR cgroup " (since Linux 5.7)"
1248.\" commit 96aaab686505c449e24d76e76507290dcc30e008
1249This enables the generation of
1250.B PERF_RECORD_CGROUP
1251records when a new cgroup is created (and activated).
1252.TP
1253.IR text_poke " (since Linux 5.8)"
1254.\" commit e17d43b93e544f5016c0251d2074c15568d5d963
1255This enables the generation of
1256.B PERF_RECORD_TEXT_POKE
a96c61dd 1257records when there's a change to the kernel text
5ae2634d 1258(i.e., self-modifying code).
1e554f3e 1259.TP
2b9bf369 1260.IR wakeup_events ", " wakeup_watermark
f2b1d720
MK
1261This union sets how many samples
1262.RI ( wakeup_events )
1263or bytes
1264.RI ( wakeup_watermark )
21977c9d 1265happen before an overflow notification happens.
f2b1d720
MK
1266Which one is used is selected by the
1267.I watermark
cb8a928f 1268bit flag.
efeece04 1269.IP
751c0f1a 1270.I wakeup_events
6170255e 1271counts only
751c0f1a 1272.B PERF_RECORD_SAMPLE
51700fd7 1273record types.
21977c9d 1274To receive overflow notification for all
751c0f1a 1275.B PERF_RECORD
21977c9d 1276types choose watermark and set
751c0f1a
VW
1277.I wakeup_watermark
1278to 1.
efeece04 1279.IP
fc79d996 1280Prior to Linux 3.0, setting
747a6e7c 1281.\" commit f506b3dc0ec454a16d40cab9ee5d75435b39dc50
21977c9d
VW
1282.I wakeup_events
1283to 0 resulted in no overflow notifications;
1284more recent kernels treat 0 the same as 1.
f2b1d720 1285.TP
2b9bf369 1286.IR bp_type " (since Linux 2.6.33)"
747a6e7c 1287.\" commit 24f1e32c60c45c89a997c73395b69c8af6f0a84e
f2b1d720
MK
1288This chooses the breakpoint type.
1289It is one of:
1290.RS
1291.TP
2b9bf369 1292.B HW_BREAKPOINT_EMPTY
2b538c3e 1293No breakpoint.
f2b1d720 1294.TP
2b9bf369 1295.B HW_BREAKPOINT_R
2b538c3e 1296Count when we read the memory location.
f2b1d720 1297.TP
2b9bf369 1298.B HW_BREAKPOINT_W
2b538c3e 1299Count when we write the memory location.
f2b1d720 1300.TP
2b9bf369 1301.B HW_BREAKPOINT_RW
2b538c3e 1302Count when we read or write the memory location.
f2b1d720 1303.TP
2b9bf369 1304.B HW_BREAKPOINT_X
2b538c3e 1305Count when we execute code at the memory location.
dd3568a1 1306.PP
7db515ef 1307The values can be combined via a bitwise or, but the
f2b1d720
MK
1308combination of
1309.B HW_BREAKPOINT_R
1310or
1311.B HW_BREAKPOINT_W
1312with
1313.B HW_BREAKPOINT_X
1314is not allowed.
1315.RE
f2b1d720 1316.TP
2b9bf369 1317.IR bp_addr " (since Linux 2.6.33)"
747a6e7c 1318.\" commit 24f1e32c60c45c89a997c73395b69c8af6f0a84e
5d73bc3f 1319This is the address of the breakpoint.
4b3a5f01
MK
1320For execution breakpoints, this is the memory address of the instruction
1321of interest; for read and write breakpoints, it is the memory address
f2b1d720 1322of the memory location of interest.
f2b1d720 1323.TP
2b9bf369 1324.IR config1 " (since Linux 2.6.39)"
747a6e7c 1325.\" commit a7e3ed1e470116c9d12c2f778431a481a6be8ab6
f2b1d720
MK
1326.I config1
1327is used for setting events that need an extra register or otherwise
1328do not fit in the regular config field.
1329Raw OFFCORE_EVENTS on Nehalem/Westmere/SandyBridge use this field
4b3a5f01 1330on Linux 3.3 and later kernels.
f2b1d720 1331.TP
2b9bf369 1332.IR bp_len " (since Linux 2.6.33)"
747a6e7c 1333.\" commit 24f1e32c60c45c89a997c73395b69c8af6f0a84e
f2b1d720
MK
1334.I bp_len
1335is the length of the breakpoint being measured if
1336.I type
1337is
1338.BR PERF_TYPE_BREAKPOINT .
1339Options are
1340.BR HW_BREAKPOINT_LEN_1 ,
1341.BR HW_BREAKPOINT_LEN_2 ,
1342.BR HW_BREAKPOINT_LEN_4 ,
4b3a5f01 1343and
f2b1d720
MK
1344.BR HW_BREAKPOINT_LEN_8 .
1345For an execution breakpoint, set this to
1346.IR sizeof(long) .
f2b1d720 1347.TP
2b9bf369 1348.IR config2 " (since Linux 2.6.39)"
747a6e7c 1349.\" commit a7e3ed1e470116c9d12c2f778431a481a6be8ab6
f2b1d720
MK
1350.I config2
1351is a further extension of the
1352.I config1
1353field.
f2b1d720 1354.TP
2b9bf369 1355.IR branch_sample_type " (since Linux 3.4)"
747a6e7c 1356.\" commit bce38cd53e5ddba9cb6d708c4ef3d04a4016ec7e
8a94e783 1357If
045bf4d3
VW
1358.B PERF_SAMPLE_BRANCH_STACK
1359is enabled, then this specifies what branches to include
1360in the branch record.
efeece04 1361.IP
e3c9782b 1362The first part of the value is the privilege level, which
4b3a5f01 1363is a combination of one or more of the values listed below.
045bf4d3
VW
1364If the user does not set privilege level explicitly, the kernel
1365will use the event's privilege level.
1366Event and branch privilege levels do not have to match.
f2b1d720
MK
1367.RS
1368.TP
1369.B PERF_SAMPLE_BRANCH_USER
33d6e2c7 1370Branch target is in user space.
f2b1d720
MK
1371.TP
1372.B PERF_SAMPLE_BRANCH_KERNEL
33d6e2c7 1373Branch target is in kernel space.
f2b1d720
MK
1374.TP
1375.B PERF_SAMPLE_BRANCH_HV
33d6e2c7 1376Branch target is in hypervisor.
e3c9782b
VW
1377.TP
1378.B PERF_SAMPLE_BRANCH_PLM_ALL
1379A convenience value that is the three preceding values ORed together.
11ac5b51 1380.PP
e3c9782b
VW
1381In addition to the privilege value, at least one of the
1382following bits must be set.
f2b1d720
MK
1383.TP
1384.B PERF_SAMPLE_BRANCH_ANY
33d6e2c7 1385Any branch type.
f2b1d720
MK
1386.TP
1387.B PERF_SAMPLE_BRANCH_ANY_CALL
c6e5df74 1388Any call branch (includes direct calls, indirect calls, and far jumps).
f2b1d720 1389.TP
e3c9782b 1390.B PERF_SAMPLE_BRANCH_IND_CALL
33d6e2c7 1391Indirect calls.
f2b1d720 1392.TP
c6e5df74
VW
1393.BR PERF_SAMPLE_BRANCH_CALL " (since Linux 4.4)"
1394.\" commit c229bf9dc179d2023e185c0f705bdf68484c1e73
1395Direct calls.
1396.TP
1397.B PERF_SAMPLE_BRANCH_ANY_RETURN
1398Any return branch.
1399.TP
dde354c9
VW
1400.BR PERF_SAMPLE_BRANCH_IND_JUMP " (since Linux 4.2)"
1401.\" commit c9fdfa14c3792c0160849c484e83aa57afd80ccc
1402Indirect jumps.
1403.TP
aea60aad 1404.BR PERF_SAMPLE_BRANCH_COND " (since Linux 3.16)"
60dafbc1 1405.\" commit bac52139f0b7ab31330e98fd87fc5a2664951050
aea60aad
VW
1406Conditional branches.
1407.TP
31c1f2b0 1408.BR PERF_SAMPLE_BRANCH_ABORT_TX " (since Linux 3.11)"
60dafbc1 1409.\" commit 135c5612c460f89657c4698fe2ea753f6f667963
33d6e2c7 1410Transactional memory aborts.
e3c9782b 1411.TP
31c1f2b0 1412.BR PERF_SAMPLE_BRANCH_IN_TX " (since Linux 3.11)"
60dafbc1 1413.\" commit 135c5612c460f89657c4698fe2ea753f6f667963
33d6e2c7 1414Branch in transactional memory transaction.
e3c9782b 1415.TP
31c1f2b0 1416.BR PERF_SAMPLE_BRANCH_NO_TX " (since Linux 3.11)"
60dafbc1 1417.\" commit 135c5612c460f89657c4698fe2ea753f6f667963
33d6e2c7 1418Branch not in transactional memory transaction.
bb7e6ff0
VW
.TP
1419.BR PERF_SAMPLE_BRANCH_CALL_STACK " (since Linux 4.1)"
1420.\" commit 2c44b1936bb3b135a3fac8b3493394d42e51cf70
95655a22 1421Branch is part of a hardware-generated call stack.
bb7e6ff0
VW
1422This requires hardware support, currently only found
1423on Intel x86 Haswell or newer.
f2b1d720 1424.RE
f2b1d720 1425.TP
2b9bf369 1426.IR sample_regs_user " (since Linux 3.7)"
747a6e7c 1427.\" commit 4018994f3d8785275ef0e7391b75c3462c029e56
4651e412 1428This bit mask defines the set of user CPU registers to dump on samples.
76c637e1 1429The layout of the register mask is architecture-specific and
4b3a5f01 1430is described in the kernel header file
d1007d14 1431.IR arch/ARCH/include/uapi/asm/perf_regs.h .
f2b1d720 1432.TP
2b9bf369 1433.IR sample_stack_user " (since Linux 3.7)"
747a6e7c 1434.\" commit c5ebcedb566ef17bda7b02686e0d658a7bb42ee7
d1007d14
VW
1435This defines the size of the user stack to dump if
1436.B PERF_SAMPLE_STACK_USER
1437is specified.
6bd5186a 1438.TP
2b9bf369 1439.IR clockid " (since Linux 4.1)"
6bd5186a
VW
1440.\" commit 34f439278cef7b1177f8ce24f9fc81dfc6221d3b
1441If
1442.I use_clockid
1443is set, then this field selects which internal Linux timer to
1444use for timestamps.
1445The available timers are defined in
1446.IR linux/time.h ,
1447with
95655a22
MK
1448.BR CLOCK_MONOTONIC ,
1449.BR CLOCK_MONOTONIC_RAW ,
1450.BR CLOCK_REALTIME ,
1451.BR CLOCK_BOOTTIME ,
1452and
1453.B CLOCK_TAI
6bd5186a 1454currently supported.
cdc52f4a 1455.TP
2b9bf369 1456.IR aux_watermark " (since Linux 4.1)"
cdc52f4a
VW
1457.\" commit 1a5941312414c71dece6717da9a0fa1303127afa
1458This specifies how much data is required to trigger a
1459.B PERF_RECORD_AUX
1460sample.
fd133d5d 1461.TP
2b9bf369 1462.IR sample_max_stack " (since Linux 4.8)"
fd133d5d
VW
1463.\" commit 97c79a38cd454602645f0470ffb444b3b75ce574
1464When
1465.I sample_type
1466includes
5dd3feec 1467.BR PERF_SAMPLE_CALLCHAIN ,
4b3a5f01 1468this field specifies how many stack frames to report when
fd133d5d 1469generating the callchain.
73d8cece 1470.SS Reading results
f2b1d720 1471Once a
7db515ef 1472.BR perf_event_open ()
3d1ee497 1473file descriptor has been opened, the values
f2b1d720
MK
1474of the events can be read from the file descriptor.
1475The values that are there are specified by the
1476.I read_format
7db515ef
MK
1477field in the
1478.I attr
1479structure at open time.
efeece04 1480.PP
f2b1d720 1481If you attempt to read into a buffer that is not big enough to hold the
4b3a5f01 1482data, the error
f2b1d720 1483.B ENOSPC
4b3a5f01 1484results.
efeece04 1485.PP
f2b1d720 1486Here is the layout of the data returned by a read:
e525b89f 1487.IP * 2
f2b1d720
MK
1488If
1489.B PERF_FORMAT_GROUP
1490was specified to allow reading all events in a group at once:
efeece04 1491.IP
f2b1d720 1492.in +4n
b8302363 1493.EX
f2b1d720 1494struct read_format {
e525b89f
MK
1495 u64 nr; /* The number of events */
1496 u64 time_enabled; /* if PERF_FORMAT_TOTAL_TIME_ENABLED */
1497 u64 time_running; /* if PERF_FORMAT_TOTAL_TIME_RUNNING */
e307112d 1498 struct {
e525b89f
MK
1499 u64 value; /* The value of the event */
1500 u64 id; /* if PERF_FORMAT_ID */
f2b1d720
MK
1501 } values[nr];
1502};
b8302363 1503.EE
f2b1d720 1504.in
e525b89f 1505.IP *
f2b1d720
MK
1506If
1507.B PERF_FORMAT_GROUP
1508was
1509.I not
e525b89f 1510specified:
efeece04 1511.IP
f2b1d720 1512.in +4n
b8302363 1513.EX
f2b1d720
MK
1514struct read_format {
1515 u64 value; /* The value of the event */
1516 u64 time_enabled; /* if PERF_FORMAT_TOTAL_TIME_ENABLED */
1517 u64 time_running; /* if PERF_FORMAT_TOTAL_TIME_RUNNING */
1518 u64 id; /* if PERF_FORMAT_ID */
1519};
b8302363 1520.EE
f2b1d720 1521.in
e525b89f
MK
1522.PP
1523The values read are as follows:
f2b1d720
MK
1524.TP
1525.I nr
1526The number of events in this file descriptor.
fcc4f4f4 1527Available only if
f2b1d720
MK
1528.B PERF_FORMAT_GROUP
1529was specified.
f2b1d720
MK
1530.TP
1531.IR time_enabled ", " time_running
1532Total time the event was enabled and running.
4b3a5f01 1533Normally these values are the same.
34211ee3
VW
1534Multiplexing happens if the number of events is more than the
1535number of available PMU counter slots.
1536In that case the events run only part of the time and the
f2b1d720
MK
1537.I time_enabled
1538and
1539.I time_running
1540values can be used to scale an estimated value for the count.
f2b1d720
MK
1541.TP
1542.I value
1543An unsigned 64-bit value containing the counter result.
f2b1d720
MK
1544.TP
1545.I id
4b3a5f01 1546A globally unique value for this particular event; only present if
f2b1d720 1547.B PERF_FORMAT_ID
e525b89f
MK
1548was specified in
1549.IR read_format .
73d8cece 1550.SS MMAP layout
f2b1d720 1551When using
7db515ef 1552.BR perf_event_open ()
f2b1d720
MK
1553in sampled mode, asynchronous events
1554(like counter overflow or
1555.B PROT_EXEC
1556mmap tracking)
1557are logged into a ring-buffer.
1558This ring-buffer is created and accessed through
1559.BR mmap (2).
efeece04 1560.PP
f2b1d720
MK
1561The mmap size should be 1+2^n pages, where the first page is a
1562metadata page
e525b89f 1563.RI ( "struct perf_event_mmap_page" )
f2b1d720
MK
1564that contains various
1565bits of information such as where the ring-buffer head is.
efeece04 1566.PP
95655a22 1567Before kernel 2.6.39, there is a bug that means you must allocate an mmap
f2b1d720 1568ring buffer when sampling even if you do not plan to access it.
efeece04 1569.PP
f2b1d720 1570The structure of the first metadata mmap page is as follows:
efeece04 1571.PP
f2b1d720 1572.in +4n
b8302363 1573.EX
f2b1d720 1574struct perf_event_mmap_page {
ce88f77b
MK
1575 __u32 version; /* version number of this structure */
1576 __u32 compat_version; /* lowest version this is compat with */
1577 __u32 lock; /* seqlock for synchronization */
1578 __u32 index; /* hardware counter identifier */
1579 __s64 offset; /* add to hardware counter value */
1580 __u64 time_enabled; /* time event active */
1581 __u64 time_running; /* time event on CPU */
f2b1d720
MK
1582 union {
1583 __u64 capabilities;
135cba8b 1584 struct {
ce88f77b
MK
1585 __u64 cap_usr_time / cap_usr_rdpmc / cap_bit0 : 1,
1586 cap_bit0_is_deprecated : 1,
1587 cap_user_rdpmc : 1,
1588 cap_user_time : 1,
1589 cap_user_time_zero : 1,
135cba8b 1590 };
f2b1d720 1591 };
ce88f77b
MK
1592 __u16 pmc_width;
1593 __u16 time_shift;
1594 __u32 time_mult;
1595 __u64 time_offset;
ee8655b5 1596 __u64 __reserved[120]; /* Pad to 1 k */
ce88f77b 1597 __u64 data_head; /* head in the data section */
d064d41a 1598 __u64 data_tail; /* user\-space written tail */
21d9849a
VW
1599 __u64 data_offset; /* where the buffer starts */
1600 __u64 data_size; /* data buffer size */
4e47c6e5
VW
1601 __u64 aux_head;
1602 __u64 aux_tail;
1603 __u64 aux_offset;
1604 __u64 aux_size;
21d9849a 1605
f2b1d720 1606};
b8302363 1607.EE
f2b1d720 1608.in
efeece04 1609.PP
ce88f77b 1610The following list describes the fields in the
f2b1d720 1611.I perf_event_mmap_page
e525b89f 1612structure in more detail:
f2b1d720
MK
1613.TP
1614.I version
1615Version number of this structure.
f2b1d720
MK
1616.TP
1617.I compat_version
1618The lowest version this is compatible with.
f2b1d720
MK
1619.TP
1620.I lock
1621A seqlock for synchronization.
f2b1d720
MK
1622.TP
1623.I index
1624A unique hardware counter identifier.
f2b1d720
MK
1625.TP
1626.I offset
135cba8b
VW
1627When using rdpmc for reads this offset value
1628must be added to the one returned by rdpmc to get
1629the current total event count.
f2b1d720
MK
1630.TP
1631.I time_enabled
1632Time the event was active.
f2b1d720
MK
1633.TP
1634.I time_running
1635Time the event was running.
f2b1d720 1636.TP
31c1f2b0 1637.IR cap_usr_time " / " cap_usr_rdpmc " / " cap_bit0 " (since Linux 3.4)"
747a6e7c 1638.\" commit c7206205d00ab375839bd6c7ddb247d600693c09
e9bd9b2c 1639There was a bug in the definition of
f2b1d720 1640.I cap_usr_time
135cba8b
VW
1641and
1642.I cap_usr_rdpmc
1643from Linux 3.4 until Linux 3.11.
1644Both bits were defined to point to the same location, so it was
e9bd9b2c 1645impossible to know if
135cba8b
VW
1646.I cap_usr_time
1647or
1648.I cap_usr_rdpmc
1649were actually set.
efeece04 1650.IP
4010bc07 1651Starting with Linux 3.12, these are renamed to
747a6e7c 1652.\" commit fa7315871046b9a4c48627905691dbde57e51033
135cba8b 1653.I cap_bit0
e41c36b2 1654and you should use the
135cba8b
VW
1655.I cap_user_time
1656and
1657.I cap_user_rdpmc
1658fields instead.
f2b1d720 1659.TP
31c1f2b0 1660.IR cap_bit0_is_deprecated " (since Linux 3.12)"
747a6e7c 1661.\" commit fa7315871046b9a4c48627905691dbde57e51033
37bee118 1662If set, this bit indicates that the kernel supports
135cba8b
VW
1663the properly separated
1664.I cap_user_time
1665and
1666.I cap_user_rdpmc
1667bits.
efeece04 1668.IP
135cba8b
VW
1669If not set, it indicates an older kernel where
1670.I cap_usr_time
1671and
f2b1d720 1672.I cap_usr_rdpmc
135cba8b
VW
1673map to the same bit and thus both features should
1674be used with caution.
135cba8b 1675.TP
31c1f2b0 1676.IR cap_user_rdpmc " (since Linux 3.12)"
747a6e7c 1677.\" commit fa7315871046b9a4c48627905691dbde57e51033
f2b1d720
MK
1678If the hardware supports user-space read of performance counters
1679without syscall (this is the "rdpmc" instruction on x86), then
1680the following code can be used to do a read:
efeece04 1681.IP
f2b1d720 1682.in +4n
b8302363 1683.EX
f2b1d720
MK
1684u32 seq, time_mult, time_shift, idx, width;
1685u64 count, enabled, running;
1686u64 cyc, time_offset;
f2b1d720
MK
1687
1688do {
1689 seq = pc\->lock;
1690 barrier();
1691 enabled = pc\->time_enabled;
1692 running = pc\->time_running;
1693
1694 if (pc\->cap_usr_time && enabled != running) {
1695 cyc = rdtsc();
1696 time_offset = pc\->time_offset;
1697 time_mult = pc\->time_mult;
1698 time_shift = pc\->time_shift;
1699 }
1700
1701 idx = pc\->index;
1702 count = pc\->offset;
1703
1704 if (pc\->cap_usr_rdpmc && idx) {
1705 width = pc\->pmc_width;
135cba8b 1706 count += rdpmc(idx \- 1);
f2b1d720
MK
1707 }
1708
1709 barrier();
1710} while (pc\->lock != seq);
b8302363 1711.EE
f2b1d720 1712.in
f2b1d720 1713.TP
cc19ea28 1714.IR cap_user_time " (since Linux 3.12)"
747a6e7c 1715.\" commit fa7315871046b9a4c48627905691dbde57e51033
7d182bb6 1716This bit indicates the hardware has a constant, nonstop
135cba8b
VW
1717timestamp counter (TSC on x86).
1718.TP
31c1f2b0 1719.IR cap_user_time_zero " (since Linux 3.12)"
747a6e7c 1720.\" commit fa7315871046b9a4c48627905691dbde57e51033
135cba8b
VW
1721Indicates the presence of
1722.I time_zero
1723which allows mapping timestamp values to
1724the hardware clock.
1725.TP
f2b1d720
MK
1726.I pmc_width
1727If
1728.IR cap_usr_rdpmc ,
1729this field provides the bit-width of the value
1730read using the rdpmc or equivalent instruction.
1731This can be used to sign extend the result like:
efeece04 1732.IP
f2b1d720 1733.in +4n
b8302363 1734.EX
f2b1d720
MK
1735pmc <<= 64 \- pmc_width;
1736pmc >>= 64 \- pmc_width; // signed shift right
1737count += pmc;
b8302363 1738.EE
f2b1d720 1739.in
f2b1d720
MK
1740.TP
1741.IR time_shift ", " time_mult ", " time_offset
efeece04 1742.IP
f2b1d720
MK
1743If
1744.IR cap_usr_time ,
1745these fields can be used to compute the time
4b3a5f01
MK
1746delta since
1747.I time_enabled
1748(in nanoseconds) using rdtsc or similar.
408731d4 1749.IP
2b9bf369
AC
1750.in +4n
1751.EX
1752u64 quot, rem;
1753u64 delta;
1754
1755quot = cyc >> time_shift;
1756rem = cyc & (((u64)1 << time_shift) \- 1);
1757delta = time_offset + quot * time_mult +
1758 ((rem * time_mult) >> time_shift);
1759.EE
1760.in
efeece04 1761.IP
7db515ef
MK
1762Where
1763.IR time_offset ,
1764.IR time_mult ,
1765.IR time_shift ,
1766and
2b9bf369 1767.I cyc
7db515ef 1768are read in the
f2b1d720
MK
1769seqcount loop described above.
1770This delta can then be added to
1771enabled and possibly running (if idx), improving the scaling:
408731d4 1772.IP
2b9bf369
AC
1773.in +4n
1774.EX
1775enabled += delta;
1776if (idx)
1777 running += delta;
1778quot = count / running;
1779rem = count % running;
1780count = quot * enabled + (rem * enabled) / running;
1781.EE
1782.in
f2b1d720 1783.TP
31c1f2b0 1784.IR time_zero " (since Linux 3.12)"
747a6e7c 1785.\" commit fa7315871046b9a4c48627905691dbde57e51033
efeece04 1786.IP
e9bd9b2c 1787If
135cba8b 1788.I cap_user_time_zero
37bee118 1789is set, then the hardware clock (the TSC timestamp counter on x86)
135cba8b 1790can be calculated from the
2b9bf369
AC
1791.IR time_zero ,
1792.IR time_mult ,
1793and
1794.I time_shift
1795values:
efeece04 1796.IP
2b9bf369
AC
1797.in +4n
1798.EX
d064d41a 1799time = timestamp \- time_zero;
2b9bf369
AC
1800quot = time / time_mult;
1801rem = time % time_mult;
1802cyc = (quot << time_shift) + (rem << time_shift) / time_mult;
1803.EE
1804.in
efeece04 1805.IP
135cba8b 1806And vice versa:
efeece04 1807.IP
2b9bf369
AC
1808.in +4n
1809.EX
1810quot = cyc >> time_shift;
d064d41a 1811rem = cyc & (((u64)1 << time_shift) \- 1);
2b9bf369
AC
1812timestamp = time_zero + quot * time_mult +
1813 ((rem * time_mult) >> time_shift);
1814.EE
1815.in
135cba8b 1816.TP
f2b1d720
MK
1817.I data_head
1818This points to the head of the data section.
7db515ef
MK
1819The value continuously increases; it does not wrap.
1820The value needs to be manually wrapped by the size of the mmap buffer
f2b1d720 1821before accessing the samples.
efeece04 1822.IP
ce88f77b
MK
1823On SMP-capable platforms, after reading the
1824.I data_head
1825value,
ad73a2cc 1826user space should issue an rmb().
f2b1d720 1827.TP
fecd584f 1828.I data_tail
f2b1d720
MK
1829When the mapping is
1830.BR PROT_WRITE ,
7db515ef
MK
1831the
1832.I data_tail
1833value should be written by user space to reflect the last read data.
31020de9 1834In this case, the kernel will not overwrite unread data.
21d9849a
VW
1835.TP
1836.IR data_offset " (since Linux 4.1)"
1837.\" commit e8c6deac69629c0cb97c3d3272f8631ef17f8f0f
1838Contains the offset of the location in the mmap buffer
1839where perf sample data begins.
1840.TP
1841.IR data_size " (since Linux 4.1)"
1842.\" commit e8c6deac69629c0cb97c3d3272f8631ef17f8f0f
1843Contains the size of the perf sample region within
1844the mmap buffer.
4e47c6e5 1845.TP
9bfc9cb1 1846.IR aux_head ", " aux_tail ", " aux_offset ", " aux_size " (since Linux 4.1)"
4e47c6e5 1847.\" commit 45bfb2e50471abbbfd83d40d28c986078b0d24ff
4111ac76
MK
1848The AUX region allows
1849.BR mmap (2)-ing
1850a separate sample buffer for
95655a22
MK
1851high-bandwidth data streams (separate from the main perf sample buffer).
1852An example of a high-bandwidth stream is instruction tracing support,
4e47c6e5 1853as is found in newer Intel processors.
efeece04 1854.IP
4e47c6e5
VW
1855To set up an AUX area, first
1856.I aux_offset
1857needs to be set with an offset greater than
1858.IR data_offset + data_size
1859and
1860.I aux_size
1861needs to be set to the desired buffer size.
1862The desired offset and size must be page aligned, and the size
1863must be a power of two.
1864These values are then passed to mmap in order to map the AUX buffer.
95655a22 1865Pages in the AUX buffer are included as part of the
2b9bf369 1866.B RLIMIT_MEMLOCK
95655a22
MK
1867resource limit (see
1868.BR setrlimit (2)),
1869and also as part of the
4e47c6e5
VW
1870.I perf_event_mlock_kb
1871allowance.
efeece04 1872.IP
95655a22 1873By default, the AUX buffer will be truncated if it will not fit
b1355f6a
VW
1874in the available space in the ring buffer.
1875If the AUX buffer is mapped as a read only buffer, then it will
1876operate in ring buffer mode where old data will be overwritten
1877by new.
95655a22 1878In overwrite mode, it might not be possible to infer where the
b1355f6a
VW
1879new data began, and it is the consumer's job to disable
1880measurement while reading to avoid possible data races.
efeece04 1881.IP
4e47c6e5 1882The
2b9bf369
AC
1883.I aux_head
1884and
1885.I aux_tail
4e47c6e5
VW
1886ring buffer pointers have the same behavior and ordering
1887rules as the previously described
2b9bf369
AC
1888.I data_head
1889and
1890.IR data_tail .
e525b89f 1891.PP
f2b1d720 1892The following 2^n ring-buffer pages have the layout described below.
efeece04 1893.PP
f2b1d720
MK
1894If
1895.I perf_event_attr.sample_id_all
1896is set, then all event types will
1897have the sample_type selected fields related to where/when (identity)
1898an event took place (TID, TIME, ID, CPU, STREAM_ID) described in
1899.B PERF_RECORD_SAMPLE
1900below; they will be stashed just after the
7db515ef
MK
1901.I perf_event_header
1902and the fields already present for the existing
3d1ee497 1903fields, that is, at the end of the payload.
4b3a5f01
MK
1904This allows a newer perf.data
1905file to be supported by older perf tools, with the new optional
f2b1d720 1906fields being ignored.
efeece04 1907.PP
f2b1d720 1908The mmap values start with a header:
efeece04 1909.PP
f2b1d720 1910.in +4n
b8302363 1911.EX
f2b1d720
MK
1912struct perf_event_header {
1913 __u32 type;
1914 __u16 misc;
1915 __u16 size;
1916};
b8302363 1917.EE
f2b1d720 1918.in
efeece04 1919.PP
f2b1d720
MK
1920Below, we describe the
1921.I perf_event_header
1922fields in more detail.
4047bc6c
MK
1923For ease of reading,
1924the fields with shorter descriptions are presented first.
1925.TP
1926.I size
1927This indicates the size of the record.
1928.TP
1929.I misc
1930The
1931.I misc
1932field contains additional information about the sample.
efeece04 1933.IP
4047bc6c
MK
1934The CPU mode can be determined from this value by masking with
1935.B PERF_RECORD_MISC_CPUMODE_MASK
1936and looking for one of the following (note these are not
1937bit masks, only one can be set at a time):
1938.RS
1939.TP
1940.B PERF_RECORD_MISC_CPUMODE_UNKNOWN
1941Unknown CPU mode.
1942.TP
1943.B PERF_RECORD_MISC_KERNEL
1944Sample happened in the kernel.
1945.TP
1946.B PERF_RECORD_MISC_USER
1947Sample happened in user code.
1948.TP
1949.B PERF_RECORD_MISC_HYPERVISOR
1950Sample happened in the hypervisor.
1951.TP
747a6e7c 1952.BR PERF_RECORD_MISC_GUEST_KERNEL " (since Linux 2.6.35)"
60dafbc1 1953.\" commit 39447b386c846bbf1c56f6403c5282837486200f
4047bc6c
MK
1954Sample happened in the guest kernel.
1955.TP
747a6e7c 1956.BR PERF_RECORD_MISC_GUEST_USER " (since Linux 2.6.35)"
60dafbc1 1957.\" commit 39447b386c846bbf1c56f6403c5282837486200f
4047bc6c
MK
1958Sample happened in guest user code.
1959.RE
efeece04 1960.PP
4047bc6c 1961.RS
d5a24378
MK
1962Since the following three statuses are generated by
1963different record types, they alias to the same bit:
4047bc6c 1964.TP
60dafbc1
MK
1965.BR PERF_RECORD_MISC_MMAP_DATA " (since Linux 3.10)"
1966.\" commit 2fe85427e3bf65d791700d065132772fc26e4d75
4047bc6c
MK
1967This is set when the mapping is not executable;
1968otherwise the mapping is executable.
1969.TP
60dafbc1
MK
1970.BR PERF_RECORD_MISC_COMM_EXEC " (since Linux 3.16)"
1971.\" commit 82b897782d10fcc4930c9d4a15b175348fdd2871
49bc411c
VW
1972This is set for a
1973.B PERF_RECORD_COMM
1974record on kernels more recent than Linux 3.16
1975if a process name change was caused by an
ee2379ef 1976.BR execve (2)
49bc411c 1977system call.
9277a75d
VW
1978.TP
1979.BR PERF_RECORD_MISC_SWITCH_OUT " (since Linux 4.3)"
1980.\" commit 45ac1403f564f411c6a383a2448688ba8dd705a4
1981When a
2b9bf369 1982.B PERF_RECORD_SWITCH
d5a24378 1983or
2b9bf369 1984.B PERF_RECORD_SWITCH_CPU_WIDE
d5a24378 1985record is generated, this bit indicates that the
9277a75d 1986context switch is away from the current process
d5a24378 1987(instead of into the current process).
9277a75d 1988.RE
efeece04 1989.PP
9277a75d
VW
1990.RS
1991In addition, the following bits can be set:
49bc411c 1992.TP
4047bc6c
MK
1993.B PERF_RECORD_MISC_EXACT_IP
1994This indicates that the content of
1995.B PERF_SAMPLE_IP
1996points
1997to the actual instruction that triggered the event.
1998See also
1999.IR perf_event_attr.precise_ip .
2000.TP
60dafbc1
MK
2001.BR PERF_RECORD_MISC_EXT_RESERVED " (since Linux 2.6.35)"
2002.\" commit 1676b8a077c352085d52578fb4f29350b58b6e74
4047bc6c 2003This indicates there is extended data available (currently not used).
ffbc7c02
VW
2004.TP
2005.B PERF_RECORD_MISC_PROC_MAP_PARSE_TIMEOUT
2006.\" commit 930e6fcd2bcce9bcd9d4aa7e755678d33f3fe6f4
2007This bit is not set by the kernel.
141efa1b
MK
2008It is reserved for the user-space perf utility to indicate that
2009.I /proc/[pid]/maps
2010parsing was taking too long and was stopped, and thus the mmap
ffbc7c02 2011records may be truncated.
4047bc6c 2012.RE
f2b1d720
MK
2013.TP
2014.I type
2015The
2016.I type
2017value is one of the below.
2018The values in the corresponding record (that follows the header)
2019depend on the
2020.I type
2021selected as shown.
f2b1d720 2022.RS
7db515ef 2023.TP 4
f2b1d720
MK
2024.B PERF_RECORD_MMAP
2025The MMAP events record the
2026.B PROT_EXEC
2027mappings so that we can correlate
ad73a2cc 2028user-space IPs to code.
f2b1d720 2029They have the following structure:
efeece04 2030.IP
f2b1d720 2031.in +4n
b8302363 2032.EX
f2b1d720
MK
2033struct {
2034 struct perf_event_header header;
2035 u32 pid, tid;
2036 u64 addr;
2037 u64 len;
2038 u64 pgoff;
2039 char filename[];
2040};
b8302363 2041.EE
f2b1d720 2042.in
9bfc542b
VW
2043.RS
2044.TP
2045.I pid
3a058284 2046is the process ID.
9bfc542b
VW
2047.TP
2048.I tid
3a058284 2049is the thread ID.
9bfc542b
VW
2050.TP
2051.I addr
2052is the address of the allocated memory.
.TP
2053.I len
2054is the length of the allocated memory.
.TP
2055.I pgoff
2056is the page offset of the allocated memory.
.TP
2057.I filename
2058is a string describing the backing of the allocated memory.
2059.RE
f2b1d720
MK
2060.TP
2061.B PERF_RECORD_LOST
2062This record indicates when events are lost.
efeece04 2063.IP
f2b1d720 2064.in +4n
b8302363 2065.EX
f2b1d720
MK
2066struct {
2067 struct perf_event_header header;
7a10da70
MK
2068 u64 id;
2069 u64 lost;
7480dabb 2070 struct sample_id sample_id;
f2b1d720 2071};
b8302363 2072.EE
f2b1d720 2073.in
f2b1d720
MK
2074.RS
2075.TP
2076.I id
2077is the unique event ID for the samples that were lost.
2078.TP
2079.I lost
2080is the number of events that were lost.
2081.RE
f2b1d720
MK
2082.TP
2083.B PERF_RECORD_COMM
2084This record indicates a change in the process name.
efeece04 2085.IP
f2b1d720 2086.in +4n
b8302363 2087.EX
f2b1d720
MK
2088struct {
2089 struct perf_event_header header;
7a10da70
MK
2090 u32 pid;
2091 u32 tid;
2092 char comm[];
7480dabb 2093 struct sample_id sample_id;
f2b1d720 2094};
b8302363 2095.EE
f2b1d720 2096.in
49bc411c
VW
2097.RS
2098.TP
2099.I pid
5ab35ae5 2100is the process ID.
49bc411c
VW
2101.TP
2102.I tid
5ab35ae5 2103is the thread ID.
49bc411c
VW
2104.TP
2105.I comm
2106is a string containing the new name of the process.
2107.RE
f2b1d720
MK
2108.TP
2109.B PERF_RECORD_EXIT
2110This record indicates a process exit event.
efeece04 2111.IP
f2b1d720 2112.in +4n
b8302363 2113.EX
f2b1d720
MK
2114struct {
2115 struct perf_event_header header;
7a10da70
MK
2116 u32 pid, ppid;
2117 u32 tid, ptid;
2118 u64 time;
7480dabb 2119 struct sample_id sample_id;
f2b1d720 2120};
b8302363 2121.EE
f2b1d720 2122.in
f2b1d720
MK
2123.TP
2124.BR PERF_RECORD_THROTTLE ", " PERF_RECORD_UNTHROTTLE
2125This record indicates a throttle/unthrottle event.
efeece04 2126.IP
f2b1d720 2127.in +4n
b8302363 2128.EX
f2b1d720
MK
2129struct {
2130 struct perf_event_header header;
7a10da70
MK
2131 u64 time;
2132 u64 id;
2133 u64 stream_id;
7480dabb 2134 struct sample_id sample_id;
f2b1d720 2135};
b8302363 2136.EE
f2b1d720 2137.in
f2b1d720
MK
2138.TP
2139.B PERF_RECORD_FORK
2140This record indicates a fork event.
efeece04 2141.IP
f2b1d720 2142.in +4n
b8302363 2143.EX
f2b1d720
MK
2144struct {
2145 struct perf_event_header header;
7a10da70
MK
2146 u32 pid, ppid;
2147 u32 tid, ptid;
2148 u64 time;
7480dabb 2149 struct sample_id sample_id;
f2b1d720 2150};
b8302363 2151.EE
f2b1d720 2152.in
f2b1d720
MK
2153.TP
2154.B PERF_RECORD_READ
2155This record indicates a read event.
efeece04 2156.IP
f2b1d720 2157.in +4n
b8302363 2158.EX
f2b1d720
MK
2159struct {
2160 struct perf_event_header header;
7a10da70 2161 u32 pid, tid;
f2b1d720 2162 struct read_format values;
7480dabb 2163 struct sample_id sample_id;
f2b1d720 2164};
b8302363 2165.EE
f2b1d720 2166.in
f2b1d720
MK
2167.TP
2168.B PERF_RECORD_SAMPLE
2169This record indicates a sample.
efeece04 2170.IP
f2b1d720 2171.in +4n
b8302363 2172.EX
f2b1d720
MK
2173struct {
2174 struct perf_event_header header;
f96e6174
MK
2175 u64 sample_id; /* if PERF_SAMPLE_IDENTIFIER */
2176 u64 ip; /* if PERF_SAMPLE_IP */
2177 u32 pid, tid; /* if PERF_SAMPLE_TID */
2178 u64 time; /* if PERF_SAMPLE_TIME */
2179 u64 addr; /* if PERF_SAMPLE_ADDR */
2180 u64 id; /* if PERF_SAMPLE_ID */
2181 u64 stream_id; /* if PERF_SAMPLE_STREAM_ID */
2182 u32 cpu, res; /* if PERF_SAMPLE_CPU */
2183 u64 period; /* if PERF_SAMPLE_PERIOD */
2184 struct read_format v;
2185 /* if PERF_SAMPLE_READ */
2186 u64 nr; /* if PERF_SAMPLE_CALLCHAIN */
2187 u64 ips[nr]; /* if PERF_SAMPLE_CALLCHAIN */
2188 u32 size; /* if PERF_SAMPLE_RAW */
1e554f3e 2189 char data[size]; /* if PERF_SAMPLE_RAW */
f96e6174 2190 u64 bnr; /* if PERF_SAMPLE_BRANCH_STACK */
7db515ef 2191 struct perf_branch_entry lbr[bnr];
f96e6174
MK
2192 /* if PERF_SAMPLE_BRANCH_STACK */
2193 u64 abi; /* if PERF_SAMPLE_REGS_USER */
7a10da70 2194 u64 regs[weight(mask)];
f96e6174
MK
2195 /* if PERF_SAMPLE_REGS_USER */
2196 u64 size; /* if PERF_SAMPLE_STACK_USER */
2197 char data[size]; /* if PERF_SAMPLE_STACK_USER */
2198 u64 dyn_size; /* if PERF_SAMPLE_STACK_USER &&
2199 size != 0 */
2200 u64 weight; /* if PERF_SAMPLE_WEIGHT */
2201 u64 data_src; /* if PERF_SAMPLE_DATA_SRC */
2202 u64 transaction; /* if PERF_SAMPLE_TRANSACTION */
2203 u64 abi; /* if PERF_SAMPLE_REGS_INTR */
7a10da70 2204 u64 regs[weight(mask)];
f96e6174 2205 /* if PERF_SAMPLE_REGS_INTR */
1e554f3e
NK
2206 u64 phys_addr; /* if PERF_SAMPLE_PHYS_ADDR */
2207 u64 cgroup; /* if PERF_SAMPLE_CGROUP */
f2b1d720 2208};
ba4924aa 2209.EE
c089afee 2210.in
4047bc6c
MK
2211.RS 4
2212.TP 4
7480dabb
VW
2213.I sample_id
2214If
2215.B PERF_SAMPLE_IDENTIFIER
2216is enabled, a 64-bit unique ID is included.
e9bd9b2c 2217This is a duplication of the
7480dabb
VW
2218.B PERF_SAMPLE_ID
2219.I id
2220value, but included at the beginning of the sample
2221so parsers can easily obtain the value.
2222.TP
f2b1d720 2223.I ip
7db515ef
MK
2224If
2225.B PERF_SAMPLE_IP
2226is enabled, then a 64-bit instruction
f2b1d720 2227pointer value is included.
f2b1d720 2228.TP
7db515ef
MK
2229.IR pid ", " tid
2230If
2231.B PERF_SAMPLE_TID
2232is enabled, then a 32-bit process ID
2233and 32-bit thread ID are included.
f2b1d720
MK
2234.TP
2235.I time
7db515ef
MK
2236If
2237.B PERF_SAMPLE_TIME
2238is enabled, then a 64-bit timestamp
f2b1d720
MK
2239is included.
2240This is obtained via local_clock() which is a hardware timestamp
2241if available and the jiffies value if not.
f2b1d720
MK
2242.TP
2243.I addr
7db515ef
MK
2244If
2245.B PERF_SAMPLE_ADDR
2246is enabled, then a 64-bit address is included.
f2b1d720
MK
2247This is usually the address of a tracepoint,
2248breakpoint, or software event; otherwise the value is 0.
f2b1d720
MK
2249.TP
2250.I id
7db515ef
MK
2251If
2252.B PERF_SAMPLE_ID
2253is enabled, a 64-bit unique ID is included.
f2b1d720 2254If the event is a member of an event group, the group leader ID is returned.
7db515ef
MK
2255This ID is the same as the one returned by
2256.BR PERF_FORMAT_ID .
f2b1d720
MK
2257.TP
2258.I stream_id
7db515ef
MK
2259If
2260.B PERF_SAMPLE_STREAM_ID
2261is enabled, a 64-bit unique ID is included.
f2b1d720
MK
2262Unlike
2263.B PERF_SAMPLE_ID
2264the actual ID is returned, not the group leader.
7db515ef
MK
2265This ID is the same as the one returned by
2266.BR PERF_FORMAT_ID .
f2b1d720 2267.TP
7db515ef
MK
2268.IR cpu ", " res
2269If
2270.B PERF_SAMPLE_CPU
2271is enabled, this is a 32-bit value indicating
f2b1d720
MK
2272which CPU was being used, in addition to a reserved (unused)
227332-bit value.
f2b1d720
MK
2274.TP
2275.I period
7db515ef
MK
2276If
2277.B PERF_SAMPLE_PERIOD
2278is enabled, a 64-bit value indicating
f2b1d720 2279the current sampling period is written.
f2b1d720
MK
2280.TP
2281.I v
7db515ef
MK
2282If
2283.B PERF_SAMPLE_READ
2284is enabled, a structure of type read_format
f2b1d720
MK
2285is included which has values for all events in the event group.
2286The values included depend on the
2287.I read_format
7db515ef
MK
2288value used at
2289.BR perf_event_open ()
2290time.
f2b1d720 2291.TP
7db515ef
MK
2292.IR nr ", " ips[nr]
2293If
2294.B PERF_SAMPLE_CALLCHAIN
2295is enabled, then a 64-bit number is included
f2b1d720 2296which indicates how many 64-bit instruction pointers will
7db515ef
MK
2297follow.
2298This is the current callchain.
f2b1d720 2299.TP
7ede2f66 2300.IR size ", " data[size]
7db515ef
MK
2301If
2302.B PERF_SAMPLE_RAW
2303is enabled, then a 32-bit value indicating size
f2b1d720
MK
2304is included followed by an array of 8-bit values of length size.
2305The values are padded with 0 to have 64-bit alignment.
efeece04 2306.IP
f2b1d720
MK
2307This RAW record data is opaque with respect to the ABI.
2308The ABI doesn't make any promises with respect to the stability
2309of its content; it may vary depending
2310on event, hardware, and kernel version.
f2b1d720 2311.TP
7db515ef
MK
2312.IR bnr ", " lbr[bnr]
2313If
2314.B PERF_SAMPLE_BRANCH_STACK
2315is enabled, then a 64-bit value indicating
2316the number of records is included, followed by
2317.I bnr
2318.I perf_branch_entry
045bf4d3
VW
2319structures which each include the fields:
2320.RS
2321.TP
2322.I from
2b538c3e 2323This indicates the source instruction (may not be a branch).
045bf4d3
VW
2324.TP
2325.I to
2b538c3e 2326The branch target.
045bf4d3
VW
2327.TP
2328.I mispred
2b538c3e 2329The branch target was mispredicted.
045bf4d3
VW
2330.TP
2331.I predicted
2b538c3e 2332The branch target was predicted.
e3c9782b 2333.TP
31c1f2b0 2334.IR in_tx " (since Linux 3.11)"
747a6e7c 2335.\" commit 135c5612c460f89657c4698fe2ea753f6f667963
2b538c3e 2336The branch was in a transactional memory transaction.
e3c9782b 2337.TP
31c1f2b0 2338.IR abort " (since Linux 3.11)"
747a6e7c 2339.\" commit 135c5612c460f89657c4698fe2ea753f6f667963
2b538c3e 2340The branch was in an aborted transactional memory transaction.
96919592
VW
2341.TP
2342.IR cycles " (since Linux 4.3)"
2343.\" commit 71ef3c6b9d4665ee7afbbe4c208a98917dcfc32f
2344This reports the number of cycles elapsed since the
2345previous branch stack update.
11ac5b51 2346.PP
045bf4d3
VW
2347The entries are from most to least recent, so the first entry
2348has the most recent branch.
efeece04 2349.PP
8a94e783 2350Support for
dceb9af6
MK
2351.IR mispred ,
2352.IR predicted ,
2353and
2b9bf369 2354.I cycles
96919592 2355is optional; if not supported, those
045bf4d3 2356values will be 0.
efeece04 2357.PP
e3c9782b
VW
2358The type of branches recorded is specified by the
2359.I branch_sample_type
2360field.
2361.RE
f2b1d720 2362.TP
7db515ef
MK
2363.IR abi ", " regs[weight(mask)]
2364If
2365.B PERF_SAMPLE_REGS_USER
d1007d14 2366is enabled, then the user CPU registers are recorded.
efeece04 2367.IP
f2b1d720
MK
2368The
2369.I abi
2370field is one of
2b9bf369
AC
2371.BR PERF_SAMPLE_REGS_ABI_NONE ,
2372.BR PERF_SAMPLE_REGS_ABI_32 ,
2373or
7db515ef 2374.BR PERF_SAMPLE_REGS_ABI_64 .
efeece04 2375.IP
d1007d14
VW
2376The
2377.I regs
2378field is an array of the CPU registers that were specified by
2379the
2380.I sample_regs_user
2381attr field.
2382The number of values is the number of bits set in the
51700fd7 2383.I sample_regs_user
4651e412 2384bit mask.
f2b1d720 2385.TP
7db515ef
MK
2386.IR size ", " data[size] ", " dyn_size
2387If
2388.B PERF_SAMPLE_STACK_USER
02ca78a0
VW
2389is enabled, then the user stack is recorded.
2390This can be used to generate stack backtraces.
d1007d14
VW
2391.I size
2392is the size requested by the user in
02ca78a0 2393.I sample_stack_user
d1007d14
VW
2394or else the maximum record size.
2395.I data
02ca78a0
VW
2396is the stack data (a raw dump of the memory pointed to by the
2397stack pointer at the time of sampling).
d1007d14
VW
2398.I dyn_size
2399is the amount of data actually dumped (can be less than
460e3d7a 2400.IR size ).
4dc411dd
KF
2401Note that
2402.I dyn_size
2403is omitted if
2404.I size
2405is 0.
d1007d14 2406.TP
51700fd7 2407.I weight
d1007d14
VW
2408If
2409.B PERF_SAMPLE_WEIGHT
7de4a1e3 2410is enabled, then a 64-bit value provided by the hardware
d1007d14
VW
2411is recorded that indicates how costly the event was.
2412This allows expensive events to stand out more clearly
2413in profiles.
2414.TP
2415.I data_src
51700fd7 2416If
d1007d14 2417.B PERF_SAMPLE_DATA_SRC
7de4a1e3 2418is enabled, then a 64-bit value is recorded that is made up of
d1007d14
VW
2419the following fields:
2420.RS
2b538c3e 2421.TP 4
d1007d14 2422.I mem_op
2b538c3e 2423Type of opcode, a bitwise combination of:
efeece04 2424.IP
2b538c3e
MK
2425.PD 0
2426.RS
2427.TP 24
d1007d14 2428.B PERF_MEM_OP_NA
2b538c3e
MK
2429Not available
2430.TP
d1007d14 2431.B PERF_MEM_OP_LOAD
2b538c3e
MK
2432Load instruction
2433.TP
d1007d14 2434.B PERF_MEM_OP_STORE
2b538c3e
MK
2435Store instruction
2436.TP
d1007d14 2437.B PERF_MEM_OP_PFETCH
2b538c3e
MK
2438Prefetch
2439.TP
d1007d14 2440.B PERF_MEM_OP_EXEC
2b538c3e
MK
2441Executable code
2442.RE
2443.PD
d1007d14
VW
2444.TP
2445.I mem_lvl
bc9d90b5 2446Memory hierarchy level hit or miss, a bitwise combination of
ef4f4031 2447the following, shifted left by
bc9d90b5 2448.BR PERF_MEM_LVL_SHIFT :
efeece04 2449.IP
2b538c3e
MK
2450.PD 0
2451.RS
2452.TP 24
d1007d14 2453.B PERF_MEM_LVL_NA
2b538c3e
MK
2454Not available
2455.TP
d1007d14 2456.B PERF_MEM_LVL_HIT
2b538c3e
MK
2457Hit
2458.TP
d1007d14 2459.B PERF_MEM_LVL_MISS
2b538c3e
MK
2460Miss
2461.TP
d1007d14 2462.B PERF_MEM_LVL_L1
2b538c3e
MK
2463Level 1 cache
2464.TP
d1007d14 2465.B PERF_MEM_LVL_LFB
2b538c3e
MK
2466Line fill buffer
2467.TP
d1007d14 2468.B PERF_MEM_LVL_L2
2b538c3e
MK
2469Level 2 cache
2470.TP
d1007d14 2471.B PERF_MEM_LVL_L3
2b538c3e
MK
2472Level 3 cache
2473.TP
d1007d14 2474.B PERF_MEM_LVL_LOC_RAM
2b538c3e
MK
2475Local DRAM
2476.TP
d1007d14 2477.B PERF_MEM_LVL_REM_RAM1
2b538c3e
MK
2478Remote DRAM 1 hop
2479.TP
d1007d14 2480.B PERF_MEM_LVL_REM_RAM2
2b538c3e
MK
2481Remote DRAM 2 hops
2482.TP
d1007d14 2483.B PERF_MEM_LVL_REM_CCE1
2b538c3e
MK
2484Remote cache 1 hop
2485.TP
d1007d14 2486.B PERF_MEM_LVL_REM_CCE2
2b538c3e
MK
2487Remote cache 2 hops
2488.TP
d1007d14 2489.B PERF_MEM_LVL_IO
2b538c3e
MK
2490I/O memory
2491.TP
d1007d14 2492.B PERF_MEM_LVL_UNC
2b538c3e
MK
2493Uncached memory
2494.RE
2495.PD
d1007d14
VW
2496.TP
2497.I mem_snoop
bc9d90b5
VW
2498Snoop mode, a bitwise combination of the following, shifted left by
2499.BR PERF_MEM_SNOOP_SHIFT :
efeece04 2500.IP
2b538c3e
MK
2501.PD 0
2502.RS
2503.TP 24
d1007d14 2504.B PERF_MEM_SNOOP_NA
2b538c3e
MK
2505Not available
2506.TP
d1007d14 2507.B PERF_MEM_SNOOP_NONE
2b538c3e
MK
2508No snoop
2509.TP
d1007d14 2510.B PERF_MEM_SNOOP_HIT
2b538c3e
MK
2511Snoop hit
2512.TP
d1007d14 2513.B PERF_MEM_SNOOP_MISS
2b538c3e
MK
2514Snoop miss
2515.TP
d1007d14 2516.B PERF_MEM_SNOOP_HITM
2b538c3e
MK
2517Snoop hit modified
2518.RE
2519.PD
d1007d14
VW
2520.TP
2521.I mem_lock
bc9d90b5
VW
2522Lock instruction, a bitwise combination of the following, shifted left by
2523.BR PERF_MEM_LOCK_SHIFT :
efeece04 2524.IP
2b538c3e
MK
2525.PD 0
2526.RS
2527.TP 24
d1007d14 2528.B PERF_MEM_LOCK_NA
2b538c3e
MK
2529Not available
2530.TP
d1007d14 2531.B PERF_MEM_LOCK_LOCKED
2b538c3e
MK
2532Locked transaction
2533.RE
2534.PD
d1007d14
VW
2535.TP
2536.I mem_dtlb
bc9d90b5
VW
2537TLB access hit or miss, a bitwise combination of the following, shifted
2538left by
2539.BR PERF_MEM_TLB_SHIFT :
efeece04 2540.IP
2b538c3e
MK
2541.PD 0
2542.RS
2543.TP 24
d1007d14 2544.B PERF_MEM_TLB_NA
2b538c3e
MK
2545Not available
2546.TP
d1007d14 2547.B PERF_MEM_TLB_HIT
2b538c3e
MK
2548Hit
2549.TP
d1007d14 2550.B PERF_MEM_TLB_MISS
2b538c3e
MK
2551Miss
2552.TP
d1007d14 2553.B PERF_MEM_TLB_L1
2b538c3e
MK
2554Level 1 TLB
2555.TP
d1007d14 2556.B PERF_MEM_TLB_L2
2b538c3e
MK
2557Level 2 TLB
2558.TP
d1007d14 2559.B PERF_MEM_TLB_WK
2b538c3e
MK
2560Hardware walker
2561.TP
d1007d14 2562.B PERF_MEM_TLB_OS
2b538c3e
MK
2563OS fault handler
2564.RE
2565.PD
d1007d14 2566.RE
1e043959
VW
2567.TP
2568.I transaction
2569If the
2570.B PERF_SAMPLE_TRANSACTION
37bee118 2571flag is set, then a 64-bit field is recorded describing
1e043959 2572the sources of any transactional memory aborts.
efeece04 2573.IP
1e043959
VW
2574The field is a bitwise combination of the following values:
2575.RS
2576.TP
2577.B PERF_TXN_ELISION
b3f39642 2578Abort from an elision type transaction (Intel-CPU-specific).
1e043959
VW
2579.TP
2580.B PERF_TXN_TRANSACTION
b3f39642 2581Abort from a generic transaction.
1e043959
VW
2582.TP
2583.B PERF_TXN_SYNC
b3f39642 2584Synchronous abort (related to the reported instruction).
1e043959
VW
2585.TP
2586.B PERF_TXN_ASYNC
b3f39642 2587Asynchronous abort (not related to the reported instruction).
1e043959
VW
2588.TP
2589.B PERF_TXN_RETRY
053a3e08 2590Retryable abort (retrying the transaction may have succeeded).
1e043959
VW
2591.TP
2592.B PERF_TXN_CONFLICT
b3f39642 2593Abort due to memory conflicts with other threads.
1e043959
VW
2594.TP
2595.B PERF_TXN_CAPACITY_WRITE
b3f39642 2596Abort due to write capacity overflow.
1e043959
VW
2597.TP
2598.B PERF_TXN_CAPACITY_READ
b3f39642 2599Abort due to read capacity overflow.
1e043959 2600.RE
b3f39642
MK
2601.IP
2602In addition, a user-specified abort code can be obtained from
2603the high 32 bits of the field by shifting right by
1e043959 2604.B PERF_TXN_ABORT_SHIFT
4b3a5f01 2605and masking with the value
1e043959 2606.BR PERF_TXN_ABORT_MASK .
f5281dfd
VW
2607.TP
2608.IR abi ", " regs[weight(mask)]
2609If
2610.B PERF_SAMPLE_REGS_INTR
2611is enabled, then the CPU registers at the time of the interrupt are recorded.
efeece04 2612.IP
f5281dfd
VW
2613The
2614.I abi
2615field is one of
4b3a5f01
MK
2616.BR PERF_SAMPLE_REGS_ABI_NONE ,
2617.BR PERF_SAMPLE_REGS_ABI_32 ,
2618or
f5281dfd 2619.BR PERF_SAMPLE_REGS_ABI_64 .
efeece04 2620.IP
f5281dfd
VW
2621The
2622.I regs
2623field is an array of the CPU registers that were specified by
2624the
2625.I sample_regs_intr
2626attr field.
2627The number of values is the number of bits set in the
2628.I sample_regs_intr
2629bit mask.
1e554f3e
NK
2630.TP
2631.I phys_addr
2632If the
2633.B PERF_SAMPLE_PHYS_ADDR
5ae2634d 2634flag is set, then the 64-bit physical address is recorded.
1e554f3e
NK
2635.TP
2636.I cgroup
2637If the
2638.B PERF_SAMPLE_CGROUP
5ae2634d
MK
2639flag is set,
2640then the 64-bit cgroup ID (for the perf_event subsystem) is recorded.
2641To get the pathname of the cgroup, the ID should be matched to one in a
1e554f3e 2642.BR PERF_RECORD_CGROUP .
f2b1d720 2643.RE
9bfc542b
VW
2644.TP
2645.B PERF_RECORD_MMAP2
2646This record includes extended information on
2647.BR mmap (2)
2648calls returning executable mappings.
2649The format is similar to that of the
2650.B PERF_RECORD_MMAP
3a058284 2651record, but includes extra values that allow uniquely identifying
9bfc542b 2652shared mappings.
efeece04 2653.IP
9bfc542b 2654.in +4n
b8302363 2655.EX
9bfc542b
VW
2656struct {
2657 struct perf_event_header header;
7a10da70
MK
2658 u32 pid;
2659 u32 tid;
2660 u64 addr;
2661 u64 len;
2662 u64 pgoff;
2663 u32 maj;
2664 u32 min;
2665 u64 ino;
2666 u64 ino_generation;
2667 u32 prot;
2668 u32 flags;
2669 char filename[];
9bfc542b
VW
2670 struct sample_id sample_id;
2671};
ba4924aa 2672.EE
c089afee 2673.in
9bfc542b
VW
2674.RS
2675.TP
2676.I pid
3a058284 2677is the process ID.
9bfc542b
VW
2678.TP
2679.I tid
3a058284 2680is the thread ID.
9bfc542b
VW
2681.TP
2682.I addr
2683is the address of the allocated memory.
2684.TP
2685.I len
2686is the length of the allocated memory.
2687.TP
2688.I pgoff
2689is the page offset of the allocated memory.
2690.TP
2691.I maj
3a058284 2692is the major ID of the underlying device.
9bfc542b
VW
2693.TP
2694.I min
3a058284 2695is the minor ID of the underlying device.
9bfc542b
VW
2696.TP
2697.I ino
3a058284 2698is the inode number.
9bfc542b
VW
2699.TP
2700.I ino_generation
2701is the inode generation.
2702.TP
2703.I prot
2704is the protection information.
2705.TP
2706.I flags
2707is the flags information.
2708.TP
2709.I filename
2710is a string describing the backing of the allocated memory.
2711.RE
1fda209c
VW
2712.TP
2713.BR PERF_RECORD_AUX " (since Linux 4.1)"
243d656f 2714.\" commit 68db7e98c3a6ebe7284b6cf14906ed7c55f3f7f0
1fda209c
VW
2715This record reports that new data is available in the separate
2716AUX buffer region.
efeece04 2717.IP
1fda209c 2718.in +4n
b8302363 2719.EX
1fda209c
VW
2720struct {
2721 struct perf_event_header header;
7a10da70
MK
2722 u64 aux_offset;
2723 u64 aux_size;
2724 u64 flags;
1fda209c
VW
2725 struct sample_id sample_id;
2726};
ba4924aa 2727.EE
c089afee 2728.in
1fda209c
VW
2729.RS
2730.TP
2731.I aux_offset
2732offset in the AUX mmap region where the new data begins.
2733.TP
2734.I aux_size
2735size of the data made available.
2736.TP
2737.I flags
95655a22 2738describes the AUX update.
1fda209c
VW
2739.RS
2740.TP
2741.B PERF_AUX_FLAG_TRUNCATED
95655a22 2742if set, then the data returned was truncated to fit the available
1fda209c 2743buffer size.
b1355f6a
VW
2744.TP
2745.B PERF_AUX_FLAG_OVERWRITE
2746.\" commit 2023a0d2829e521fe6ad6b9907f3f90bfbf57142
95655a22 2747if set, then the data returned has overwritten previous data.
1fda209c
VW
2748.RE
2749.RE
6932aac3
VW
2750.TP
2751.BR PERF_RECORD_ITRACE_START " (since Linux 4.1)"
243d656f 2752.\" ec0d7729bbaed4b9d2d3fada693278e13a3d1368
6932aac3
VW
2753This record indicates which process has initiated an instruction
2754trace event, allowing tools to properly correlate the instruction
2755addresses in the AUX buffer with the proper executable.
efeece04 2756.IP
6932aac3 2757.in +4n
b8302363 2758.EX
6932aac3
VW
2759struct {
2760 struct perf_event_header header;
7a10da70
MK
2761 u32 pid;
2762 u32 tid;
6932aac3 2763};
ba4924aa 2764.EE
c089afee 2765.in
6932aac3
VW
2766.RS
2767.TP
2768.I pid
95655a22 2769process ID of the thread starting an instruction trace.
6932aac3
VW
2770.TP
2771.I tid
95655a22 2772thread ID of the thread starting an instruction trace.
6932aac3 2773.RE
46012ba3
DH
2774.TP
2775.BR PERF_RECORD_LOST_SAMPLES " (since Linux 4.2)"
243d656f 2776.\" f38b0dbb491a6987e198aa6b428db8692a6480f8
46012ba3 2777When using hardware sampling (such as Intel PEBS) this record
4199d3a1 2778indicates some number of samples that may have been lost.
efeece04 2779.IP
46012ba3 2780.in +4n
b8302363 2781.EX
46012ba3
DH
2782struct {
2783 struct perf_event_header header;
7a10da70 2784 u64 lost;
46012ba3
DH
2785 struct sample_id sample_id;
2786};
ba4924aa 2787.EE
c089afee 2788.in
46012ba3
DH
2789.RS
2790.TP
2791.I lost
2792the number of potentially lost samples.
2793.RE
9277a75d
VW
2794.TP
2795.BR PERF_RECORD_SWITCH " (since Linux 4.3)"
243d656f 2796.\" commit 45ac1403f564f411c6a383a2448688ba8dd705a4
9277a75d
VW
2797This record indicates a context switch has happened.
2798The
2799.B PERF_RECORD_MISC_SWITCH_OUT
2800bit in the
2801.I misc
2802field indicates whether it was a context switch into
2803or away from the current process.
efeece04 2804.IP
9277a75d 2805.in +4n
b8302363 2806.EX
9277a75d
VW
2807struct {
2808 struct perf_event_header header;
2809 struct sample_id sample_id;
2810};
ba4924aa 2811.EE
c089afee 2812.in
9277a75d
VW
2813.TP
2814.BR PERF_RECORD_SWITCH_CPU_WIDE " (since Linux 4.3)"
243d656f 2815.\" commit 45ac1403f564f411c6a383a2448688ba8dd705a4
9277a75d
VW
2816As with
2817.B PERF_RECORD_SWITCH
2818this record indicates a context switch has happened,
d5a24378 2819but it only occurs when sampling in CPU-wide mode
9277a75d
VW
2820and provides additional information on the process
2821being switched to/from.
2822The
2823.B PERF_RECORD_MISC_SWITCH_OUT
2824bit in the
2825.I misc
2826field indicates whether it was a context switch into
2827or away from the current process.
efeece04 2828.IP
9277a75d 2829.in +4n
b8302363 2830.EX
9277a75d
VW
2831struct {
2832 struct perf_event_header header;
2833 u32 next_prev_pid;
2834 u32 next_prev_tid;
2835 struct sample_id sample_id;
2836};
ba4924aa 2837.EE
c089afee 2838.in
9277a75d
VW
2839.RS
2840.TP
2841.I next_prev_pid
d5a24378 2842The process ID of the previous (if switching in)
9277a75d
VW
2843or next (if switching out) process on the CPU.
2844.TP
2845.I next_prev_tid
d5a24378 2846The thread ID of the previous (if switching in)
9277a75d
VW
2847or next (if switching out) thread on the CPU.
2848.RE
1e554f3e
NK
2849.TP
2850.BR PERF_RECORD_NAMESPACES " (since Linux 4.11)"
2851.\" commit e422267322cd319e2695a535e47c5b1feeac45eb
2852This record includes various namespace information of a process.
2853.IP
2854.in +4n
2855.EX
2856struct {
2857 struct perf_event_header header;
5ae2634d
MK
2858 u32 pid;
2859 u32 tid;
2860 u64 nr_namespaces;
1e554f3e
NK
2861 struct { u64 dev, inode; } [nr_namespaces];
2862 struct sample_id sample_id;
2863};
2864.EE
2865.in
2866.RS
2867.TP
2868.I pid
2869is the process ID.
2870.TP
2871.I tid
2872is the thread ID.
2873.TP
2874.I nr_namespaces
2875is the number of namespaces in this record.
2876.RE
2877.IP
2878Each namespace has
2879.I dev
2880and
2881.I inode
2882fields and is recorded at a
2883fixed position, as listed below:
2884.RS
2885.TP
2886.BR NET_NS_INDEX = 0
2887Network namespace
2888.TP
2889.BR UTS_NS_INDEX = 1
2890UTS namespace
2891.TP
2892.BR IPC_NS_INDEX = 2
2893IPC namespace
2894.TP
2895.BR PID_NS_INDEX = 3
2896PID namespace
2897.TP
2898.BR USER_NS_INDEX = 4
2899User namespace
2900.TP
2901.BR MNT_NS_INDEX = 5
2902Mount namespace
2903.TP
2904.BR CGROUP_NS_INDEX = 6
2905Cgroup namespace
2906.RE
2907.TP
2908.BR PERF_RECORD_KSYMBOL " (since Linux 5.0)"
2909.\" commit 76193a94522f1d4edf2447a536f3f796ce56343b
2910This record indicates kernel symbol register/unregister events.
2911.IP
2912.in +4n
2913.EX
2914struct {
2915 struct perf_event_header header;
5ae2634d
MK
2916 u64 addr;
2917 u32 len;
2918 u16 ksym_type;
2919 u16 flags;
2920 char name[];
1e554f3e
NK
2921 struct sample_id sample_id;
2922};
2923.EE
2924.in
2925.RS
2926.TP
2927.I addr
5ae2634d 2928is the address of the kernel symbol.
1e554f3e
NK
2929.TP
2930.I len
5ae2634d 2931is the length of the kernel symbol.
1e554f3e
NK
2932.TP
2933.I ksym_type
2934is the type of the kernel symbol.
5ae2634d 2935Currently the following types are available:
1e554f3e
NK
2936.RS
2937.TP
2938.B PERF_RECORD_KSYMBOL_TYPE_BPF
5ae2634d 2939The kernel symbol is a BPF function.
1e554f3e
NK
2940.RE
2941.TP
2942.I flags
2943If the
2944.B PERF_RECORD_KSYMBOL_FLAGS_UNREGISTER
2945is set, then this event is for unregistering the kernel symbol.
2946.RE
2947.TP
2948.BR PERF_RECORD_BPF_EVENT " (since Linux 5.0)"
2949.\" commit 6ee52e2a3fe4ea35520720736e6791df1fb67106
2950This record indicates that a BPF program is loaded or unloaded.
2951.IP
2952.in +4n
2953.EX
2954struct {
2955 struct perf_event_header header;
2956 u16 type;
2957 u16 flags;
2958 u32 id;
2959 u8 tag[BPF_TAG_SIZE];
2960 struct sample_id sample_id;
2961};
2962.EE
2963.in
2964.RS
2965.TP
2966.I type
2967is one of the following values:
2968.RS
2969.TP
2970.B PERF_BPF_EVENT_PROG_LOAD
2971A BPF program is loaded
2972.TP
2973.B PERF_BPF_EVENT_PROG_UNLOAD
2974A BPF program is unloaded
2975.RE
2976.TP
2977.I id
5ae2634d 2978is the ID of the BPF program.
1e554f3e
NK
2979.TP
2980.I tag
2981is the tag of the BPF program.
2982Currently,
2983.B BPF_TAG_SIZE
2984is defined as 8.
2985.RE
2986.TP
2987.BR PERF_RECORD_CGROUP " (since Linux 5.7)"
2988.\" commit 96aaab686505c449e24d76e76507290dcc30e008
2989This record indicates a new cgroup is created and activated.
2990.IP
2991.in +4n
2992.EX
2993struct {
2994 struct perf_event_header header;
5ae2634d
MK
2995 u64 id;
2996 char path[];
1e554f3e
NK
2997 struct sample_id sample_id;
2998};
2999.EE
3000.in
3001.RS
3002.TP
3003.I id
3004is the cgroup identifier.
5ae2634d 3005This can also be retrieved by
1e554f3e
NK
3006.BR name_to_handle_at (2)
3007on the cgroup path (as a file handle).
3008.TP
3009.I path
3010is the path of the cgroup from the root.
3011.RE
3012.TP
3013.BR PERF_RECORD_TEXT_POKE " (since Linux 5.8)"
3014.\" commit e17d43b93e544f5016c0251d2074c15568d5d963
3015This record indicates a change in the kernel text.
3016This includes addition and removal of the text
3017and the corresponding length is zero in this case.
3018.IP
3019.in +4n
3020.EX
3021struct {
3022 struct perf_event_header header;
5ae2634d
MK
3023 u64 addr;
3024 u16 old_len;
3025 u16 new_len;
3026 u8 bytes[];
1e554f3e
NK
3027 struct sample_id sample_id;
3028};
3029.EE
3030.in
3031.RS
3032.TP
3033.I addr
3034is the address of the change
3035.TP
3036.I old_len
3037is the old length
3038.TP
3039.I new_len
3040is the new length
3041.TP
3042.I bytes
3043contains old bytes immediately followed by new bytes.
3044.RE
f2b1d720 3045.RE
21977c9d
VW
3046.SS Overflow handling
3047Events can be set to notify when a threshold is crossed,
3048indicating an overflow.
3049Overflow conditions can be captured by monitoring the
3050event file descriptor with
f2b1d720
MK
3051.BR poll (2),
3052.BR select (2),
21977c9d 3053or
4b3a5f01 3054.BR epoll (7).
6831ba6b
MK
3055Alternatively, the overflow events can be captured via a signal handler,
3056by enabling I/O signaling on the file descriptor; see the discussion of the
1ae6b2c7 3057.B F_SETOWN
6831ba6b 3058and
1ae6b2c7 3059.B F_SETSIG
6831ba6b
MK
3060operations in
3061.BR fcntl (2).
efeece04 3062.PP
6170255e 3063Overflows are generated only by sampling events
f2b1d720 3064.RI ( sample_period
7d182bb6 3065must have a nonzero value).
efeece04 3066.PP
21977c9d 3067There are two ways to generate overflow notifications.
efeece04 3068.PP
f2b1d720
MK
3069The first is to set a
3070.I wakeup_events
3071or
3072.I wakeup_watermark
21977c9d 3073value that will trigger if a certain number of samples
f2b1d720 3074or bytes have been written to the mmap ring buffer.
fc79d996 3075In this case,
7db515ef 3076.B POLL_IN
21977c9d 3077is indicated.
efeece04 3078.PP
f2b1d720 3079The other way is by use of the
7db515ef 3080.B PERF_EVENT_IOC_REFRESH
f2b1d720
MK
3081ioctl.
3082This ioctl adds to a counter that decrements each time the event overflows.
21977c9d 3083When nonzero,
7db515ef 3084.B POLL_IN
21977c9d
VW
3085is indicated, but
3086once the counter reaches 0
7db515ef 3087.B POLL_HUP
21977c9d 3088is indicated and
f2b1d720 3089the underlying event is disabled.
efeece04 3090.PP
50e4319c
VW
3091Refreshing an event group leader refreshes all siblings and
3092refreshing with a parameter of 0 currently enables infinite
3093refreshes;
3094these behaviors are unsupported and should not be relied on.
3095.\" See https://lkml.org/lkml/2011/5/24/337
efeece04 3096.PP
4010bc07 3097Starting with Linux 3.18,
747a6e7c 3098.\" commit 179033b3e064d2cd3f5f9945e76b0a0f0fbf4883
21977c9d
VW
3099.B POLL_HUP
3100is indicated if the event being monitored is attached to a different
3101process and that process exits.
73d8cece 3102.SS rdpmc instruction
f2b1d720 3103Starting with Linux 3.4 on x86, you can use the
747a6e7c 3104.\" commit c7206205d00ab375839bd6c7ddb247d600693c09
f2b1d720
MK
3105.I rdpmc
3106instruction to get low-latency reads without having to enter the kernel.
3107Note that using
3108.I rdpmc
3109is not necessarily faster than other methods for reading event values.
efeece04 3110.PP
f2b1d720
MK
3111Support for this can be detected with the
3112.I cap_usr_rdpmc
3113field in the mmap page; documentation on how
3114to calculate event values can be found in that section.
efeece04 3115.PP
562c69f6
VW
3116Originally, when rdpmc support was enabled, any process (not just ones
3117with an active perf event) could use the rdpmc instruction to access
3118the counters.
fc79d996 3119Starting with Linux 4.0,
562c69f6
VW
3120.\" 7911d3f7af14a614617e38245fedf98a724e46a9
3121rdpmc support is only allowed if an event is currently enabled
95655a22 3122in a process's context.
562c69f6
VW
3123To restore the old behavior, write the value 2 to
3124.IR /sys/devices/cpu/rdpmc .
73d8cece 3125.SS perf_event ioctl calls
f2b1d720 3126Various ioctls act on
7db515ef 3127.BR perf_event_open ()
ce88f77b 3128file descriptors:
f2b1d720
MK
3129.TP
3130.B PERF_EVENT_IOC_ENABLE
ce88f77b 3131This enables the individual event or event group specified by the
7db515ef 3132file descriptor argument.
efeece04 3133.IP
51700fd7 3134If the
8cc8b90d 3135.B PERF_IOC_FLAG_GROUP
51700fd7 3136bit is set in the ioctl argument, then all events in a group are
dbc01ecd
VW
3137enabled, even if the event specified is not the group leader
3138(but see BUGS).
f2b1d720
MK
3139.TP
3140.B PERF_EVENT_IOC_DISABLE
ce88f77b 3141This disables the individual counter or event group specified by the
7db515ef 3142file descriptor argument.
efeece04 3143.IP
f2b1d720
MK
3144Enabling or disabling the leader of a group enables or disables the
3145entire group; that is, while the group leader is disabled, none of the
3146counters in the group will count.
33a0ccb2
MK
3147Enabling or disabling a member of a group other than the leader
3148affects only that counter; disabling a non-leader
f2b1d720 3149stops that counter from counting but doesn't affect any other counter.
efeece04 3150.IP
51700fd7 3151If the
8cc8b90d 3152.B PERF_IOC_FLAG_GROUP
51700fd7 3153bit is set in the ioctl argument, then all events in a group are
dbc01ecd
VW
3154disabled, even if the event specified is not the group leader
3155(but see BUGS).
f2b1d720
MK
3156.TP
3157.B PERF_EVENT_IOC_REFRESH
3158Non-inherited overflow counters can use this
3159to enable a counter for a number of overflows specified by the argument,
3160after which it is disabled.
3161Subsequent calls of this ioctl add the argument value to the current
3162count.
21977c9d 3163An overflow notification with
7db515ef
MK
3164.B POLL_IN
3165set will happen on each overflow until the
21977c9d
VW
3166count reaches 0; when that happens a notification with
3167.B POLL_HUP
7db515ef 3168set is sent and the event is disabled.
f2b1d720 3169Using an argument of 0 is considered undefined behavior.
f2b1d720
MK
3170.TP
3171.B PERF_EVENT_IOC_RESET
36127c0e 3172Reset the event count specified by the
6061d29f 3173file descriptor argument to zero.
33a0ccb2 3174This resets only the counts; there is no way to reset the
f2b1d720
MK
3175multiplexing
3176.I time_enabled
3177or
3178.I time_running
3179values.
efeece04 3180.IP
51700fd7 3181If the
8cc8b90d 3182.B PERF_IOC_FLAG_GROUP
51700fd7 3183bit is set in the ioctl argument, then all events in a group are
dbc01ecd
VW
3184reset, even if the event specified is not the group leader
3185(but see BUGS).
f2b1d720
MK
3186.TP
3187.B PERF_EVENT_IOC_PERIOD
e6cf5694 3188This updates the overflow period for the event.
efeece04 3189.IP
747a6e7c
VW
3190Since Linux 3.7 (on ARM)
3191.\" commit 3581fe0ef37ce12ac7a4f74831168352ae848edc
3192and Linux 3.14 (all other architectures),
3193.\" commit bad7192b842c83e580747ca57104dd51fe08c223
3f118a29 3194the new period takes effect immediately.
ed81fdd9 3195On older kernels, the new period did not take effect until
3f118a29 3196after the next overflow.
efeece04 3197.IP
f2b1d720
MK
3198The argument is a pointer to a 64-bit value containing the
3199desired new period.
efeece04 3200.IP
fc79d996 3201Prior to Linux 2.6.36,
747a6e7c
VW
3202.\" commit ad0cf3478de8677f720ee06393b3147819568d6a
3203this ioctl always failed due to a bug
e6cf5694 3204in the kernel.
f2b1d720
MK
3205.TP
3206.B PERF_EVENT_IOC_SET_OUTPUT
3207This tells the kernel to report event notifications to the specified
3208file descriptor rather than the default one.
3209The file descriptors must all be on the same CPU.
efeece04 3210.IP
f2b1d720
MK
3211The argument specifies the desired file descriptor, or \-1 if
3212output should be ignored.
f2b1d720 3213.TP
31c1f2b0 3214.BR PERF_EVENT_IOC_SET_FILTER " (since Linux 2.6.33)"
60dafbc1 3215.\" commit 6fb2915df7f0747d9044da9dbff5b46dc2e20830
f2b1d720 3216This adds an ftrace filter to this event.
efeece04 3217.IP
f2b1d720 3218The argument is a pointer to the desired ftrace filter.
a0dcc8dd 3219.TP
31c1f2b0 3220.BR PERF_EVENT_IOC_ID " (since Linux 3.12)"
60dafbc1 3221.\" commit cf4957f17f2a89984915ea808876d9c82225b862
bec6277e 3222This returns the event ID value for the given event file descriptor.
efeece04 3223.IP
a0dcc8dd
VW
3224The argument is a pointer to a 64-bit unsigned integer
3225to hold the result.
b0f7b411
VW
3226.TP
3227.BR PERF_EVENT_IOC_SET_BPF " (since Linux 4.1)"
3228.\" commit 2541517c32be2531e0da59dfd7efc1ce844644f5
3229This allows attaching a Berkeley Packet Filter (BPF)
3230program to an existing kprobe tracepoint event.
3231You need
d19b29a5
AB
3232.B CAP_PERFMON
3233(since Linux 5.8) or
b0f7b411
VW
3234.B CAP_SYS_ADMIN
3235privileges to use this ioctl.
efeece04 3236.IP
b0f7b411
VW
3237The argument is a BPF program file descriptor that was created by
3238a previous
3239.BR bpf (2)
3240system call.
06a61b36
VW
3241.TP
3242.BR PERF_EVENT_IOC_PAUSE_OUTPUT " (since Linux 4.7)"
3243.\" commit 86e7972f690c1017fd086cdfe53d8524e68c661c
3244This allows pausing and resuming the event's ring-buffer.
3245A paused ring-buffer does not prevent generation of samples,
3246but simply discards them.
3247The discarded samples are considered lost, and cause a
1ae6b2c7 3248.B PERF_RECORD_LOST
06a61b36
VW
3249sample to be generated when possible.
3250An overflow signal may still be triggered by the discarded sample
3251even though the ring-buffer remains empty.
3252.IP
3253The argument is an unsigned 32-bit integer.
3254A nonzero value pauses the ring-buffer, while a
3255zero value resumes the ring-buffer.
8496491d
VW
3256.TP
3257.BR PERF_EVENT_MODIFY_ATTRIBUTES " (since Linux 4.17)"
3258.\" commit 32ff77e8cc9e66cc4fb38098f64fd54cc8f54573
3259This allows modifying an existing event without the overhead
3260of closing and reopening a new event.
92e696b9 3261Currently this is supported only for breakpoint events.
8496491d
VW
3262.IP
3263The argument is a pointer to a
3264.I perf_event_attr
3265structure containing the updated event settings.
43cc0d8a
VW
3266.TP
3267.BR PERF_EVENT_IOC_QUERY_BPF " (since Linux 4.16)"
3268.\" commit f371b304f12e31fe30207c41ca7754564e0ea4dc
3269This allows querying which Berkeley Packet Filter (BPF)
3270programs are attached to an existing kprobe tracepoint.
3271You can only attach one BPF program per event, but you can
3272have multiple events attached to a tracepoint.
7533c83d 3273Querying this value on one tracepoint event returns the IDs
43cc0d8a
VW
3274of all BPF programs in all events attached to the tracepoint.
3275You need
d19b29a5
AB
3276.B CAP_PERFMON
3277(since Linux 5.8) or
43cc0d8a
VW
3278.B CAP_SYS_ADMIN
3279privileges to use this ioctl.
3280.IP
3281The argument is a pointer to a structure
3282.in +4n
3283.EX
3284struct perf_event_query_bpf {
3285 __u32 ids_len;
3286 __u32 prog_cnt;
3287 __u32 ids[0];
3288};
3289.EE
c089afee 3290.in
43cc0d8a
VW
3291.IP
3292The
3293.I ids_len
3294field indicates the number of ids that can fit in the provided
3295.I ids
3296array.
3297The
3298.I prog_cnt
3299value is filled in by the kernel with the number of attached
3300BPF programs.
3301The
3302.I ids
7533c83d 3303array is filled with the ID of each attached BPF program.
43cc0d8a
VW
3304If there are more programs than will fit in the array, then the
3305kernel will return
3306.B ENOSPC
3307and
3308.I ids_len
3309will indicate the number of program IDs that were successfully copied.
06a61b36 3310.\"
fc79d996 3311.SS Using prctl(2)
d134c429
VW
3312A process can enable or disable all currently open event groups
3313using the
f2b1d720
MK
3314.BR prctl (2)
3315.B PR_TASK_PERF_EVENTS_ENABLE
3316and
3317.B PR_TASK_PERF_EVENTS_DISABLE
3318operations.
d134c429
VW
3319This applies only to events created locally by the calling process.
3320This does not apply to events created by other processes attached
3321to the calling process or inherited events from a parent process.
3322Only group leaders are enabled and disabled,
3323not any other members of the groups.
f2b1d720 3324.SS perf_event related configuration files
7db515ef
MK
3325Files in
3326.I /proc/sys/kernel/
7db515ef 3327.RS 4
f2b1d720 3328.TP
7db515ef 3329.I /proc/sys/kernel/perf_event_paranoid
f2b1d720
MK
3330The
3331.I perf_event_paranoid
3332file can be set to restrict access to the performance counters.
efeece04 3333.IP
dc9ec146 3334.PD 0
2b538c3e
MK
3335.RS
3336.IP 2 4
3eb95192 3337allow only user-space measurements (default since Linux 4.6).
b5eb75f7 3338.\" default changed in commit 0161028b7c8aebef64194d3d73e43bc3b53b5c66
2b538c3e 3339.IP 1
3eb95192 3340allow both kernel and user measurements (default before Linux 4.6).
2b538c3e
MK
3341.IP 0
3342allow access to CPU-specific data but not raw tracepoint samples.
3343.IP \-1
3344no restrictions.
3345.RE
dc9ec146 3346.PD
2b538c3e 3347.IP
f2b1d720
MK
3348The existence of the
3349.I perf_event_paranoid
3350file is the official method for determining if a kernel supports
7db515ef 3351.BR perf_event_open ().
f2b1d720
MK
3352.TP
3353.I /proc/sys/kernel/perf_event_max_sample_rate
7db515ef
MK
3354This sets the maximum sample rate.
3355Setting this too high can allow
f2b1d720 3356users to sample at a rate that impacts overall machine performance
7db515ef
MK
3357and potentially lock up the machine.
3358The default value is
f2b1d720 3359100000 (samples per second).
fd133d5d
VW
3360.TP
3361.I /proc/sys/kernel/perf_event_max_stack
3362.\" Introduced in c5dfd78eb79851e278b7973031b9ca363da87a7e
5dd3feec 3363This file sets the maximum depth of stack frame entries reported
fd133d5d 3364when generating a call trace.
f2b1d720
MK
3365.TP
3366.I /proc/sys/kernel/perf_event_mlock_kb
ce88f77b
MK
3367Maximum number of pages an unprivileged user can
3368.BR mlock (2).
f2b1d720
MK
3369The default is 516 (kB).
3370.RE
efeece04 3371.PP
7db515ef
MK
3372Files in
3373.I /sys/bus/event_source/devices/
efeece04 3374.PP
7db515ef 3375.RS 4
ce88f77b 3376Since Linux 2.6.34, the kernel supports having multiple PMUs
f2b1d720
MK
3377available for monitoring.
3378Information on how to program these PMUs can be found under
3379.IR /sys/bus/event_source/devices/ .
3380Each subdirectory corresponds to a different PMU.
f2b1d720 3381.TP
31c1f2b0 3382.IR /sys/bus/event_source/devices/*/type " (since Linux 2.6.38)"
747a6e7c 3383.\" commit abe43400579d5de0078c2d3a760e6598e183f871
f2b1d720
MK
3384This contains an integer that can be used in the
3385.I type
ce88f77b
MK
3386field of
3387.I perf_event_attr
3388to indicate that you wish to use this PMU.
f2b1d720 3389.TP
562c69f6 3390.IR /sys/bus/event_source/devices/cpu/rdpmc " (since Linux 3.4)"
747a6e7c 3391.\" commit 0c9d42ed4cee2aa1dfc3a260b741baae8615744f
8a94e783 3392If this file is 1, then direct user-space access to the
e30dc77f
VW
3393performance counter registers is allowed via the rdpmc instruction.
3394This can be disabled by echoing 0 to the file.
efeece04 3395.IP
562c69f6
VW
3396As of Linux 4.0
3397.\" a66734297f78707ce39d756b656bfae861d53f62
3398.\" 7911d3f7af14a614617e38245fedf98a724e46a9
3399the behavior has changed, so that 1 now means only allow access
3400to processes with active perf events, with 2 indicating the old
3401allow-anyone-access behavior.
f2b1d720 3402.TP
31c1f2b0 3403.IR /sys/bus/event_source/devices/*/format/ " (since Linux 3.4)"
747a6e7c 3404.\" commit 641cc938815dfd09f8fa1ec72deb814f0938ac33
7d182bb6
MK
3405This subdirectory contains information on the architecture-specific
3406subfields available for programming the various
f2b1d720 3407.I config
ce88f77b
MK
3408fields in the
3409.I perf_event_attr
3410struct.
efeece04 3411.IP
e30dc77f
VW
3412The content of each file is the name of the config field, followed
3413by a colon, followed by a series of integer bit ranges separated by
3414commas.
8a94e783 3415For example, the file
e30dc77f
VW
3416.I event
3417may contain the value
d2fdb1e3
MK
3418.I config1:1,6\-10,44
3419which indicates that event is an attribute that occupies bits 1, 6\(en10, and 44
ce88f77b
MK
3420of
3421.IR perf_event_attr::config1 .
e30dc77f 3422.TP
31c1f2b0 3423.IR /sys/bus/event_source/devices/*/events/ " (since Linux 3.4)"
747a6e7c 3424.\" commit 641cc938815dfd09f8fa1ec72deb814f0938ac33
7d182bb6 3425This subdirectory contains files with predefined events.
f2b1d720 3426The contents are strings describing the event settings
e30dc77f 3427expressed in terms of the fields found in the previously mentioned
f2b1d720
MK
3428.I ./format/
3429directory.
3430These are not necessarily complete lists of all events supported by
3431a PMU, but usually a subset of events deemed useful or interesting.
efeece04 3432.IP
e30dc77f 3433The content of each file is a list of attribute names
8a94e783
MK
3434separated by commas.
3435Each entry has an optional value (either hex or decimal).
37bee118 3436If no value is specified, then it is assumed to be a single-bit
e30dc77f
VW
3437field with a value of 1.
3438An example entry may look like this:
699893d8 3439.IR event=0x2,inv,ldlat=3 .
f2b1d720
MK
3440.TP
3441.I /sys/bus/event_source/devices/*/uevent
e30dc77f
VW
3442This file is the standard kernel device interface
3443for injecting hotplug events.
3444.TP
31c1f2b0 3445.IR /sys/bus/event_source/devices/*/cpumask " (since Linux 3.7)"
747a6e7c 3446.\" commit 314d9f63f385096580e9e2a06eaa0745d92fe4ac
699893d8
DP
3447The
3448.I cpumask
3449file contains a comma-separated list of integers that
3450indicate a representative CPU number for each socket (package)
e30dc77f
VW
3451on the motherboard.
3452This is needed when setting up uncore or northbridge events, as
3453those PMUs present socket-wide events.
f2b1d720 3454.RE
47297adb 3455.SH RETURN VALUE
c112329f 3456On success,
f2b1d720 3457.BR perf_event_open ()
c112329f
MK
3458returns the new file descriptor.
3459On error, \-1 is returned and
f2b1d720 3460.I errno
c112329f 3461is set to indicate the error.
f2b1d720 3462.SH ERRORS
d8b7d950
VW
3463The errors returned by
3464.BR perf_event_open ()
3465can be inconsistent, and may
3466vary across processor architectures and performance monitoring units.
f2b1d720 3467.TP
82b09254 3468.B E2BIG
ce88f77b
MK
3469Returned if the
3470.I perf_event_attr
82b09254
VW
3471.I size
3472value is too small
3473(smaller than
3474.BR PERF_ATTR_SIZE_VER0 ),
3475too big (larger than the page size),
3476or larger than the kernel supports and the extra bytes are not zero.
3477When
3478.B E2BIG
ce88f77b
MK
3479is returned, the
3480.I perf_event_attr
e9bd9b2c 3481.I size
d6af98f8 3482field is overwritten by the kernel to be the size of the structure
82b09254
VW
3483it was expecting.
3484.TP
d8b7d950 3485.B EACCES
27f0af8e 3486Returned when the requested event requires
d19b29a5
AB
3487.B CAP_PERFMON
3488(since Linux 5.8) or
27f0af8e
VW
3489.B CAP_SYS_ADMIN
3490permissions (or a more permissive perf_event paranoid setting).
3491Some common cases where an unprivileged process
3492may encounter this error:
3493attaching to a process owned by a different user;
2b23ecbd
MK
3494monitoring all processes on a given CPU (i.e., specifying the
3495.I pid
3496argument as \-1);
079928f3 3497and not setting
accec051 3498.I exclude_kernel
079928f3 3499when the paranoid setting requires it.
d8b7d950
VW
3500.TP
3501.B EBADF
3502Returned if the
3503.I group_fd
accec051
MK
3504file descriptor is not valid, or, if
3505.B PERF_FLAG_PID_CGROUP
3506is set,
d8b7d950
VW
3507the cgroup file descriptor in
3508.I pid
3509is not valid.
3510.TP
f27486cb
VW
3511.BR EBUSY " (since Linux 4.1)"
3512.\" bed5b25ad9c8a2f5d735ef0bc746ec870c01c1b0
3513Returned if another event already has exclusive
3514access to the PMU.
3515.TP
d8b7d950
VW
3516.B EFAULT
3517Returned if the
3518.I attr
3519pointer points at an invalid memory address.
3520.TP
97e2d8e6
MK
3521.B EINTR
3522Returned when trying to mix perf and ftrace handling
3523for a uprobe.
3524.TP
f2b1d720 3525.B EINVAL
d8b7d950
VW
3526Returned if the specified event is invalid.
3527There are many possible reasons for this.
3528A non-exhaustive list:
3529.I sample_freq
accec051 3530is higher than the maximum setting;
d8b7d950
VW
3531the
3532.I cpu
accec051 3533to monitor does not exist;
d8b7d950 3534.I read_format
accec051 3535is out of range;
d8b7d950 3536.I sample_type
accec051 3537is out of range;
d8b7d950
VW
3538the
3539.I flags
accec051 3540value is out of range;
d8b7d950
VW
3541.I exclusive
3542or
3543.I pinned
accec051 3544set and the event is not a group leader;
d8b7d950
VW
3545the event
3546.I config
accec051
MK
3547values are out of range or set reserved bits;
3548the generic event selected is not supported; or
d8b7d950
VW
3549there is not enough room to add the selected event.
3550.TP
3551.B EMFILE
3552Each opened event uses one file descriptor.
26c32fab
MK
3553If a large number of events are opened,
3554the per-process limit on the number of open file descriptors will be reached,
3555and no more events can be created.
d8b7d950
VW
3556.TP
3557.B ENODEV
3558Returned when the event involves a feature not supported
accec051 3559by the current CPU.
d8b7d950
VW
3560.TP
3561.B ENOENT
3562Returned if the
3563.I type
3564setting is not valid.
accec051 3565This error is also returned for
d8b7d950 3566some unsupported generic events.
f2b1d720
MK
3567.TP
3568.B ENOSPC
3569Prior to Linux 3.3, if there was not enough room for the event,
747a6e7c 3570.\" commit aa2bc1ade59003a379ffc485d6da2d92ea3370a6
f2b1d720
MK
3571.B ENOSPC
3572was returned.
accec051 3573In Linux 3.3, this was changed to
f2b1d720
MK
3574.BR EINVAL .
3575.B ENOSPC
d8b7d950 3576is still returned if you try to add more breakpoint events
accec051 3577than supported by the hardware.
d8b7d950
VW
3578.TP
3579.B ENOSYS
3580Returned if
3581.B PERF_SAMPLE_STACK_USER
3582is set in
3583.I sample_type
3584and it is not supported by hardware.
3585.TP
3586.B EOPNOTSUPP
3587Returned if an event requiring a specific hardware feature is
3588requested but there is no hardware support.
3589This includes requesting low-skid events if not supported,
3590branch tracing if it is not available, sampling if no PMU
3591interrupt is available, and branch stacks for software events.
3592.TP
fd133d5d
VW
3593.BR EOVERFLOW " (since Linux 4.8)"
3594.\" 97c79a38cd454602645f0470ffb444b3b75ce574
3595Returned if
3596.B PERF_SAMPLE_CALLCHAIN
3597is requested and
3598.I sample_max_stack
3599is larger than the maximum specified in
3600.IR /proc/sys/kernel/perf_event_max_stack .
3601.TP
d8b7d950 3602.B EPERM
27f0af8e
VW
3603Returned on many (but not all) architectures when an unsupported
3604.IR exclude_hv ", " exclude_idle ", " exclude_user ", or " exclude_kernel
3605setting is specified.
efeece04 3606.IP
27f0af8e
VW
3607It can also happen, as with
3608.BR EACCES ,
3609when the requested event requires
d19b29a5
AB
3610.B CAP_PERFMON
3611(since Linux 5.8) or
27f0af8e
VW
3612.B CAP_SYS_ADMIN
3613permissions (or a more permissive perf_event paranoid setting).
3614This includes setting a breakpoint on a kernel address,
3615and (since Linux 3.13) setting a kernel function-trace tracepoint.
747a6e7c 3616.\" commit a4e95fc2cbb31d70a65beffeaf8773f881328c34
d8b7d950
VW
3617.TP
3618.B ESRCH
3619Returned if attempting to attach to a process that does not exist.
f2b1d720 3620.SH VERSION
f2b1d720
MK
3621.BR perf_event_open ()
3622was introduced in Linux 2.6.31 but was called
747a6e7c 3623.\" commit 0793a61d4df8daeac6492dbf8d2f3e5713caae5e
ffd4dec0 3624.BR perf_counter_open ().
f2b1d720 3625It was renamed in Linux 2.6.32.
747a6e7c 3626.\" commit cdd6c482c9ff9c55475ee7392ec8f672eddb7be6
3113c7f3 3627.SH STANDARDS
7db515ef
MK
3628This
3629.BR perf_event_open ()
dc9ec146 3630system call is Linux-specific
f2b1d720 3631and should not be used in programs intended to be portable.
f2b1d720 3632.SH NOTES
f2b1d720 3633The official way of knowing if
7db515ef 3634.BR perf_event_open ()
f2b1d720
MK
3635support is enabled is checking
3636for the existence of the file
7db515ef 3637.IR /proc/sys/kernel/perf_event_paranoid .
d19b29a5
AB
3638.PP
3639.B CAP_PERFMON
3640capability (since Linux 5.8) provides a secure approach to
3641performance monitoring and observability operations in a system
3642according to the principle of least privilege (POSIX IEEE 1003.1e).
3643Accessing system performance monitoring and observability operations
3644using
3645.B CAP_PERFMON
3646rather than the much more powerful
3647.B CAP_SYS_ADMIN
3648excludes chances to misuse credentials and makes operations more secure.
3649.B CAP_SYS_ADMIN
3650usage for secure system performance monitoring and observability
0f667014 3651is discouraged in favor of the
d19b29a5
AB
3652.B CAP_PERFMON
3653capability.
f2b1d720 3654.SH BUGS
f2b1d720
MK
3655The
3656.B F_SETOWN_EX
3657option to
7db515ef 3658.BR fcntl (2)
f2b1d720
MK
3659is needed to properly get overflow signals in threads.
3660This was introduced in Linux 2.6.32.
747a6e7c 3661.\" commit ba0a6c9f6fceed11c6a99e8326f0477fe383e6b5
efeece04 3662.PP
747a6e7c
VW
3663Prior to Linux 2.6.33 (at least for x86),
3664.\" commit b690081d4d3f6a23541493f1682835c3cd5c54a1
3665the kernel did not check
f2b1d720
MK
3666if events could be scheduled together until read time.
3667The same happens on all known kernels if the NMI watchdog is enabled.
3668This means to see if a given set of events works you have to
3669.BR perf_event_open (),
3670start, then read before you know for sure you
3671can get valid measurements.
efeece04 3672.PP
b5190152
MK
3673Prior to Linux 2.6.34,
3674.\" FIXME . cannot find a kernel commit for this one
3675event constraints were not enforced by the kernel.
f2b1d720
MK
3676In that case, some events would silently return "0" if the kernel
3677scheduled them in an improper counter slot.
efeece04 3678.PP
ce88f77b 3679Prior to Linux 2.6.34, there was a bug when multiplexing where the
f2b1d720 3680wrong results could be returned.
747a6e7c 3681.\" commit 45e16a6834b6af098702e5ea6c9a40de42ff77d8
efeece04 3682.PP
f2b1d720
MK
3683Kernels from Linux 2.6.35 to Linux 2.6.39 can quickly crash the kernel if
3684"inherit" is enabled and many threads are started.
747a6e7c 3685.\" commit 38b435b16c36b0d863efcf3f07b34a6fac9873fd
efeece04 3686.PP
f2b1d720 3687Prior to Linux 2.6.35,
747a6e7c 3688.\" commit 050735b08ca8a016bbace4445fa025b88fee770b
f2b1d720
MK
3689.B PERF_FORMAT_GROUP
3690did not work with attached processes.
efeece04 3691.PP
f2b1d720
MK
3692There is a bug in the kernel code between
3693Linux 2.6.36 and Linux 3.0 that ignores the
3694"watermark" field and acts as if a wakeup_event
3695was chosen if the union has a
7d182bb6 3696nonzero value in it.
747a6e7c 3697.\" commit 4ec8363dfc1451f8c8f86825731fe712798ada02
efeece04 3698.PP
8a94e783 3699From Linux 2.6.31 to Linux 3.4, the
dbc01ecd
VW
3700.B PERF_IOC_FLAG_GROUP
3701ioctl argument was broken and would repeatedly operate
3702on the event specified rather than iterating across
3703all sibling events in a group.
747a6e7c 3704.\" commit 724b6daa13e100067c30cfc4d1ad06629609dc4e
efeece04 3705.PP
7205b8df 3706From Linux 3.4 to Linux 3.11, the mmap
747a6e7c 3707.\" commit fa7315871046b9a4c48627905691dbde57e51033
135cba8b
VW
3708.I cap_usr_rdpmc
3709and
3710.I cap_usr_time
3711bits mapped to the same location.
3712Code should migrate to the new
3713.I cap_user_rdpmc
3714and
3715.I cap_user_time
3716fields instead.
efeece04 3717.PP
7db515ef
MK
3718Always double-check your results!
3719Various generalized events have had wrong values.
f2b1d720
MK
3720For example, retired branches measured
3721the wrong thing on AMD machines until Linux 2.6.35.
747a6e7c 3722.\" commit f287d332ce835f77a4f5077d2c0ef1e3f9ea42d2
a14af333 3723.SH EXAMPLES
f2b1d720 3724The following is a short example that measures the total
7db515ef
MK
3725instruction count of a call to
3726.BR printf (3).
408731d4 3727.PP
33857069 3728.\" SRC BEGIN (perf_event_open.c)
408731d4 3729.EX
ddffcbf1 3730#include <linux/perf_event.h>
f2b1d720 3731#include <stdio.h>
ddffcbf1 3732#include <stdlib.h>
f2b1d720
MK
3733#include <string.h>
3734#include <sys/ioctl.h>
ddffcbf1
AC
3735#include <sys/syscall.h>
3736#include <unistd.h>
f2b1d720 3737
571767ca 3738static long
7db515ef
MK
3739perf_event_open(struct perf_event_attr *hw_event, pid_t pid,
3740 int cpu, int group_fd, unsigned long flags)
f2b1d720
MK
3741{
3742 int ret;
3743
ddffcbf1 3744 ret = syscall(SYS_perf_event_open, hw_event, pid, cpu,
4687ab0e 3745 group_fd, flags);
f2b1d720
MK
3746 return ret;
3747}
3748
f2b1d720 3749int
c6ae6d97 3750main(void)
f2b1d720 3751{
f2b1d720
MK
3752 struct perf_event_attr pe;
3753 long long count;
3754 int fd;
3755
3376a638 3756 memset(&pe, 0, sizeof(pe));
f2b1d720 3757 pe.type = PERF_TYPE_HARDWARE;
3376a638 3758 pe.size = sizeof(pe);
f2b1d720
MK
3759 pe.config = PERF_COUNT_HW_INSTRUCTIONS;
3760 pe.disabled = 1;
3761 pe.exclude_kernel = 1;
3762 pe.exclude_hv = 1;
3763
3764 fd = perf_event_open(&pe, 0, \-1, \-1, 0);
7db515ef 3765 if (fd == \-1) {
d1a71985 3766 fprintf(stderr, "Error opening leader %llx\en", pe.config);
7db515ef 3767 exit(EXIT_FAILURE);
f2b1d720
MK
3768 }
3769
3770 ioctl(fd, PERF_EVENT_IOC_RESET, 0);
3771 ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
3772
d1a71985 3773 printf("Measuring instruction count for this printf\en");
f2b1d720
MK
3774
3775 ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);
284a3679 3776 read(fd, &count, sizeof(count));
f2b1d720 3777
d1a71985 3778 printf("Used %lld instructions\en", count);
f2b1d720
MK
3779
3780 close(fd);
3781}
408731d4 3782.EE
33857069 3783.\" SRC END
47297adb 3784.SH SEE ALSO
022b038e 3785.BR perf (1),
f2b1d720
MK
3786.BR fcntl (2),
3787.BR mmap (2),
3788.BR open (2),
3789.BR prctl (2),
3790.BR read (2)
c6ed23c5 3791.PP
1ae6b2c7 3792.I Documentation/admin\-guide/perf\-security.rst
c6ed23c5 3793in the kernel source tree