]> git.ipfire.org Git - thirdparty/man-pages.git/blame - man2/perf_event_open.2
ipc.5: Remove old link to svipc.7/sysvipc.7 page
[thirdparty/man-pages.git] / man2 / perf_event_open.2
CommitLineData
f2b1d720
MK
1.\" Copyright (c) 2012, Vincent Weaver
2.\"
1dd72f9c 3.\" %%%LICENSE_START(GPLv2+_DOC_FULL)
f2b1d720
MK
4.\" This is free documentation; you can redistribute it and/or
5.\" modify it under the terms of the GNU General Public License as
6.\" published by the Free Software Foundation; either version 2 of
7.\" the License, or (at your option) any later version.
8.\"
9.\" The GNU General Public License's references to "object code"
10.\" and "executables" are to be interpreted as the output of any
11.\" document formatting or typesetting system, including
12.\" intermediate and printed output.
13.\"
14.\" This manual is distributed in the hope that it will be useful,
15.\" but WITHOUT ANY WARRANTY; without even the implied warranty of
16.\" MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17.\" GNU General Public License for more details.
18.\"
19.\" You should have received a copy of the GNU General Public
20.\" License along with this manual; if not, see
21.\" <http://www.gnu.org/licenses/>.
6a8d8745 22.\" %%%LICENSE_END
f2b1d720
MK
23.\"
24.\" This document is based on the perf_event.h header file, the
25.\" tools/perf/design.txt file, and a lot of bitter experience.
26.\"
9ba01802 27.TH PERF_EVENT_OPEN 2 2019-03-06 "Linux" "Linux Programmer's Manual"
f2b1d720
MK
28.SH NAME
29perf_event_open \- set up performance monitoring
30.SH SYNOPSIS
31.nf
32.B #include <linux/perf_event.h>
33.B #include <linux/hw_breakpoint.h>
68e4db0a 34.PP
f2b1d720
MK
35.BI "int perf_event_open(struct perf_event_attr *" attr ,
36.BI " pid_t " pid ", int " cpu ", int " group_fd ,
37.BI " unsigned long " flags );
38.fi
dbfe9c70 39.PP
f2b1d720
MK
40.IR Note :
41There is no glibc wrapper for this system call; see NOTES.
42.SH DESCRIPTION
43Given a list of parameters,
44.BR perf_event_open ()
45returns a file descriptor, for use in subsequent system calls
46.RB ( read "(2), " mmap "(2), " prctl "(2), " fcntl "(2), etc.)."
47.PP
48A call to
49.BR perf_event_open ()
50creates a file descriptor that allows measuring performance
51information.
52Each file descriptor corresponds to one
53event that is measured; these can be grouped together
54to measure multiple events simultaneously.
55.PP
56Events can be enabled and disabled in two ways: via
57.BR ioctl (2)
58and via
0fe9e4b1 59.BR prctl (2).
f2b1d720
MK
60When an event is disabled it does not count or generate overflows but does
61continue to exist and maintain its count value.
62.PP
63Events come in two flavors: counting and sampling.
64A
65.I counting
66event is one that is used for counting the aggregate number of events
67that occur.
68In general, counting event results are gathered with a
69.BR read (2)
70call.
71A
72.I sampling
73event periodically writes measurements to a buffer that can then
74be accessed via
0fe9e4b1 75.BR mmap (2).
f2b1d720 76.SS Arguments
11ac5b51 77.PP
f2b1d720 78The
a02a1737 79.I pid
f2b1d720 80and
a02a1737
VW
81.I cpu
82arguments allow specifying which process and CPU to monitor:
83.TP
f2d15dc9 84.BR "pid == 0" " and " "cpu == \-1"
ee7b0cbf 85This measures the calling process/thread on any CPU.
a02a1737 86.TP
f2d15dc9 87.BR "pid == 0" " and " "cpu >= 0"
ee7b0cbf 88This measures the calling process/thread only
a02a1737
VW
89when running on the specified CPU.
90.TP
f2d15dc9 91.BR "pid > 0" " and " "cpu == \-1"
a02a1737
VW
92This measures the specified process/thread on any CPU.
93.TP
f2d15dc9 94.BR "pid > 0" " and " "cpu >= 0"
a02a1737
VW
95This measures the specified process/thread only
96when running on the specified CPU.
97.TP
f2d15dc9 98.BR "pid == \-1" " and " "cpu >= 0"
a02a1737 99This measures all processes/threads on the specified CPU.
ce88f77b 100This requires
f2b1d720
MK
101.B CAP_SYS_ADMIN
102capability or a
103.I /proc/sys/kernel/perf_event_paranoid
104value of less than 1.
a02a1737 105.TP
ce88f77b 106.BR "pid == \-1" " and " "cpu == \-1"
a02a1737 107This setting is invalid and will return an error.
11ac5b51 108.PP
13ec13dc
MK
109When
110.I pid
111is greater than zero, permission to perform this system call
112is governed by a ptrace access mode
113.B PTRACE_MODE_READ_REALCREDS
114check; see
115.BR ptrace (2).
efeece04 116.PP
f2b1d720
MK
117The
118.I group_fd
119argument allows event groups to be created.
120An event group has one event which is the group leader.
121The leader is created first, with
122.IR group_fd " = \-1."
123The rest of the group members are created with subsequent
124.BR perf_event_open ()
125calls with
126.IR group_fd
bec6277e 127being set to the file descriptor of the group leader.
f2b1d720
MK
128(A single event on its own is created with
129.IR group_fd " = \-1"
130and is considered to be a group with only 1 member.)
33a0ccb2 131An event group is scheduled onto the CPU as a unit: it will
d1007d14 132be put onto the CPU only if all of the events in the group can be put onto
f2b1d720
MK
133the CPU.
134This means that the values of the member events can be
ce88f77b 135meaningfully compared\(emadded, divided (to get ratios), and so on\(emwith each
f2b1d720
MK
136other, since they have counted events for the same set of executed
137instructions.
11ac5b51 138.PP
f2b1d720
MK
139The
140.I flags
08e325e8 141argument is formed by ORing together zero or more of the following values:
f2b1d720 142.TP
60dafbc1
MK
143.BR PERF_FLAG_FD_CLOEXEC " (since Linux 3.14)"
144.\" commit a21b0b354d4ac39be691f51c53562e2c24443d9e
e9b1ab78
MK
145This flag enables the close-on-exec flag for the created
146event file descriptor,
147so that the file descriptor is automatically closed on
148.BR execve (2).
8bad22e5
MK
149Setting the close-on-exec flag at creation time, rather than later with
150.BR fcntl (2),
e9b1ab78
MK
151avoids potential race conditions where the calling thread invokes
152.BR perf_event_open ()
a61dba34
MK
153and
154.BR fcntl (2)
e9b1ab78
MK
155at the same time as another thread calls
156.BR fork (2)
157then
158.BR execve (2).
159.TP
f2b1d720 160.BR PERF_FLAG_FD_NO_GROUP
31266c04
VW
161This flag tells the event to ignore the
162.IR group_fd
163parameter except for the purpose of setting up output redirection
164using the
165.B PERF_FLAG_FD_OUTPUT
166flag.
f2b1d720 167.TP
3117263f 168.BR PERF_FLAG_FD_OUTPUT " (broken since Linux 2.6.35)"
747a6e7c 169.\" commit ac9721f3f54b27a16c7e1afb2481e7ee95a70318
31266c04
VW
170This flag re-routes the event's sampled output to instead
171be included in the mmap buffer of the event specified by
172.IR group_fd .
f2b1d720 173.TP
3117263f 174.BR PERF_FLAG_PID_CGROUP " (since Linux 2.6.39)"
60dafbc1 175.\" commit e5d1367f17ba6a6fed5fd8b74e4d5720923e0c25
f2b1d720
MK
176This flag activates per-container system-wide monitoring.
177A container
ce88f77b 178is an abstraction that isolates a set of resources for finer-grained
699893d8 179control (CPUs, memory, etc.).
f2b1d720
MK
180In this mode, the event is measured
181only if the thread running on the monitored CPU belongs to the designated
182container (cgroup).
183The cgroup is identified by passing a file descriptor
184opened on its directory in the cgroupfs filesystem.
185For instance, if the
186cgroup to monitor is called
187.IR test ,
188then a file descriptor opened on
189.I /dev/cgroup/test
190(assuming cgroupfs is mounted on
191.IR /dev/cgroup )
192must be passed as the
193.I pid
194parameter.
33a0ccb2 195cgroup monitoring is available only
f2b1d720 196for system-wide events and may therefore require extra permissions.
11ac5b51 197.PP
f2b1d720
MK
198The
199.I perf_event_attr
200structure provides detailed configuration information
201for the event being created.
efeece04 202.PP
f2b1d720 203.in +4n
b8302363 204.EX
f2b1d720 205struct perf_event_attr {
da8bd8a4
MK
206 __u32 type; /* Type of event */
207 __u32 size; /* Size of attribute structure */
208 __u64 config; /* Type-specific configuration */
f2b1d720
MK
209
210 union {
211 __u64 sample_period; /* Period of sampling */
212 __u64 sample_freq; /* Frequency of sampling */
213 };
214
ce88f77b
MK
215 __u64 sample_type; /* Specifies values included in sample */
216 __u64 read_format; /* Specifies values returned in read */
217
218 __u64 disabled : 1, /* off by default */
219 inherit : 1, /* children inherit it */
220 pinned : 1, /* must always be on PMU */
221 exclusive : 1, /* only group on PMU */
222 exclude_user : 1, /* don't count user */
223 exclude_kernel : 1, /* don't count kernel */
224 exclude_hv : 1, /* don't count hypervisor */
225 exclude_idle : 1, /* don't count when idle */
226 mmap : 1, /* include mmap data */
227 comm : 1, /* include comm data */
228 freq : 1, /* use freq, not period */
229 inherit_stat : 1, /* per task counts */
230 enable_on_exec : 1, /* next exec enables */
231 task : 1, /* trace fork/exit */
232 watermark : 1, /* wakeup_watermark */
233 precise_ip : 2, /* skid constraint */
234 mmap_data : 1, /* non-exec mmap data */
235 sample_id_all : 1, /* sample_type all events */
236 exclude_host : 1, /* don't count in host */
237 exclude_guest : 1, /* don't count in guest */
238 exclude_callchain_kernel : 1,
239 /* exclude kernel callchains */
240 exclude_callchain_user : 1,
241 /* exclude user callchains */
9bfc542b 242 mmap2 : 1, /* include mmap with inode data */
dc9ec146
MK
243 comm_exec : 1, /* flag comm events that are
244 due to exec */
6bd5186a 245 use_clockid : 1, /* use clockid for time fields */
9277a75d 246 context_switch : 1, /* context switch data */
6bd5186a 247
9277a75d 248 __reserved_1 : 37;
f2b1d720
MK
249
250 union {
251 __u32 wakeup_events; /* wakeup every n events */
7db515ef 252 __u32 wakeup_watermark; /* bytes before wakeup */
f2b1d720
MK
253 };
254
255 __u32 bp_type; /* breakpoint type */
256
257 union {
258 __u64 bp_addr; /* breakpoint address */
7d8449ba
SL
259 __u64 kprobe_func; /* for perf_kprobe */
260 __u64 uprobe_path; /* for perf_uprobe */
f2b1d720
MK
261 __u64 config1; /* extension of config */
262 };
263
264 union {
265 __u64 bp_len; /* breakpoint length */
7d8449ba
SL
266 __u64 kprobe_addr; /* with kprobe_func == NULL */
267 __u64 probe_offset; /* for perf_[k,u]probe */
f2b1d720
MK
268 __u64 config2; /* extension of config1 */
269 };
ce88f77b
MK
270 __u64 branch_sample_type; /* enum perf_branch_sample_type */
271 __u64 sample_regs_user; /* user regs to dump on samples */
272 __u32 sample_stack_user; /* size of stack to dump on
7db515ef 273 samples */
6bd5186a 274 __s32 clockid; /* clock to use for time fields */
f5281dfd 275 __u64 sample_regs_intr; /* regs to dump on samples */
cdc52f4a 276 __u32 aux_watermark; /* aux bytes before wakeup */
fd133d5d
VW
277 __u16 sample_max_stack; /* max frames in callchain */
278 __u16 __reserved_2; /* align to u64 */
cdc52f4a 279
f2b1d720 280};
b8302363 281.EE
f2b1d720 282.in
efeece04 283.PP
f2b1d720
MK
284The fields of the
285.I perf_event_attr
286structure are described in more detail below:
f2b1d720
MK
287.TP
288.I type
289This field specifies the overall event type.
290It has one of the following values:
291.RS
292.TP
293.B PERF_TYPE_HARDWARE
294This indicates one of the "generalized" hardware events provided
295by the kernel.
296See the
297.I config
298field definition for more details.
299.TP
300.B PERF_TYPE_SOFTWARE
301This indicates one of the software-defined events provided by the kernel
302(even if no hardware support is available).
303.TP
304.B PERF_TYPE_TRACEPOINT
305This indicates a tracepoint
306provided by the kernel tracepoint infrastructure.
307.TP
308.B PERF_TYPE_HW_CACHE
309This indicates a hardware cache event.
310This has a special encoding, described in the
311.I config
312field definition.
313.TP
314.B PERF_TYPE_RAW
315This indicates a "raw" implementation-specific event in the
316.IR config " field."
317.TP
31c1f2b0 318.BR PERF_TYPE_BREAKPOINT " (since Linux 2.6.33)"
60dafbc1 319.\" commit 24f1e32c60c45c89a997c73395b69c8af6f0a84e
f2b1d720
MK
320This indicates a hardware breakpoint as provided by the CPU.
321Breakpoints can be read/write accesses to an address as well as
322execution of an instruction address.
323.TP
7d281e0a 324dynamic PMU
747a6e7c
VW
325Since Linux 2.6.38,
326.\" commit 2e80a82a49c4c7eca4e35734380f28298ba5db19
7db515ef 327.BR perf_event_open ()
f2b1d720
MK
328can support multiple PMUs.
329To enable this, a value exported by the kernel can be used in the
330.I type
331field to indicate which PMU to use.
332The value to use can be found in the sysfs filesystem:
333there is a subdirectory per PMU instance under
334.IR /sys/bus/event_source/devices .
7d182bb6 335In each subdirectory there is a
f2b1d720
MK
336.I type
337file whose content is an integer that can be used in the
338.I type
339field.
340For instance,
341.I /sys/bus/event_source/devices/cpu/type
342contains the value for the core CPU PMU, which is usually 4.
7d8449ba 343.TP
6170a241
MK
344.BR kprobe " and " uprobe " (since Linux 4.17)"
345.\" commit 65074d43fc77bcae32776724b7fa2696923c78e4
346.\" commit e12f03d7031a977356e3d7b75a68c2185ff8d155
347.\" commit 33ea4b24277b06dbc55d7f5772a46f029600255e
7d8449ba 348These two dynamic PMUs create a kprobe/uprobe and attach it to the
c87e72a2
MK
349file descriptor generated by perf_event_open.
350The kprobe/uprobe will be destroyed on the destruction of the file descriptor.
7d8449ba
SL
351See fields
352.IR kprobe_func ", " uprobe_path ", " kprobe_addr ", and " probe_offset
353for more details.
f2b1d720 354.RE
f2b1d720
MK
355.TP
356.I "size"
357The size of the
358.I perf_event_attr
359structure for forward/backward compatibility.
360Set this using
361.I sizeof(struct perf_event_attr)
362to allow the kernel to see
363the struct size at the time of compilation.
efeece04 364.IP
f2b1d720
MK
365The related define
366.B PERF_ATTR_SIZE_VER0
367is set to 64; this was the size of the first published struct.
368.B PERF_ATTR_SIZE_VER1
369is 72, corresponding to the addition of breakpoints in Linux 2.6.33.
747a6e7c
VW
370.\" commit cb5d76999029ae7a517cb07dfa732c1b5a934fc2
371.\" this was added much later when PERF_ATTR_SIZE_VER2 happened
372.\" but the actual attr_size had increased in 2.6.33
f2b1d720
MK
373.B PERF_ATTR_SIZE_VER2
374is 80 corresponding to the addition of branch sampling in Linux 3.4.
747a6e7c 375.\" commit cb5d76999029ae7a517cb07dfa732c1b5a934fc2
d2a6be2f 376.B PERF_ATTR_SIZE_VER3
f2b1d720 377is 96 corresponding to the addition
7ede2f66
DP
378of
379.I sample_regs_user
380and
381.I sample_stack_user
382in Linux 3.7.
747a6e7c 383.\" commit 1659d129ed014b715b0b2120e6fd929bdd33ed03
f5281dfd
VW
384.B PERF_ATTR_SIZE_VER4
385is 104 corresponding to the addition of
386.I sample_regs_intr
387in Linux 3.19.
388.\" commit 60e2364e60e86e81bc6377f49779779e6120977f
cdc52f4a
VW
389.B PERF_ATTR_SIZE_VER5
390is 112 corresponding to the addition of
2050c098 391.I aux_watermark
cdc52f4a
VW
392in Linux 4.1.
393.\" commit 1a5941312414c71dece6717da9a0fa1303127afa
f2b1d720
MK
394.TP
395.I "config"
396This specifies which event you want, in conjunction with
397the
398.I type
399field.
400The
401.IR config1 " and " config2
402fields are also taken into account in cases where 64 bits is not
403enough to fully specify the event.
404The encoding of these fields is event dependent.
efeece04 405.IP
f2b1d720
MK
406There are various ways to set the
407.I config
408field that are dependent on the value of the previously
409described
410.I type
411field.
412What follows are various possible settings for
413.I config
414separated out by
415.IR type .
efeece04 416.IP
f2b1d720
MK
417If
418.I type
419is
420.BR PERF_TYPE_HARDWARE ,
421we are measuring one of the generalized hardware CPU events.
422Not all of these are available on all platforms.
423Set
424.I config
425to one of the following:
426.RS 12
427.TP
428.B PERF_COUNT_HW_CPU_CYCLES
429Total cycles.
2b538c3e 430Be wary of what happens during CPU frequency scaling.
f2b1d720
MK
431.TP
432.B PERF_COUNT_HW_INSTRUCTIONS
433Retired instructions.
434Be careful, these can be affected by various
2b538c3e 435issues, most notably hardware interrupt counts.
f2b1d720
MK
436.TP
437.B PERF_COUNT_HW_CACHE_REFERENCES
438Cache accesses.
439Usually this indicates Last Level Cache accesses but this may
440vary depending on your CPU.
441This may include prefetches and coherency messages; again this
442depends on the design of your CPU.
443.TP
444.B PERF_COUNT_HW_CACHE_MISSES
445Cache misses.
446Usually this indicates Last Level Cache misses; this is intended to be
447used in conjunction with the
448.B PERF_COUNT_HW_CACHE_REFERENCES
449event to calculate cache miss rates.
450.TP
451.B PERF_COUNT_HW_BRANCH_INSTRUCTIONS
452Retired branch instructions.
747a6e7c 453Prior to Linux 2.6.35, this used
f2b1d720 454the wrong event on AMD processors.
747a6e7c 455.\" commit f287d332ce835f77a4f5077d2c0ef1e3f9ea42d2
f2b1d720
MK
456.TP
457.B PERF_COUNT_HW_BRANCH_MISSES
458Mispredicted branch instructions.
459.TP
460.B PERF_COUNT_HW_BUS_CYCLES
461Bus cycles, which can be different from total cycles.
462.TP
31c1f2b0 463.BR PERF_COUNT_HW_STALLED_CYCLES_FRONTEND " (since Linux 3.0)"
747a6e7c 464.\" commit 8f62242246351b5a4bc0c1f00c0c7003edea128a
f2b1d720
MK
465Stalled cycles during issue.
466.TP
31c1f2b0 467.BR PERF_COUNT_HW_STALLED_CYCLES_BACKEND " (since Linux 3.0)"
747a6e7c 468.\" commit 8f62242246351b5a4bc0c1f00c0c7003edea128a
f2b1d720
MK
469Stalled cycles during retirement.
470.TP
31c1f2b0 471.BR PERF_COUNT_HW_REF_CPU_CYCLES " (since Linux 3.3)"
60dafbc1 472.\" commit c37e17497e01fc0f5d2d6feb5723b210b3ab8890
f2b1d720
MK
473Total cycles; not affected by CPU frequency scaling.
474.RE
475.IP
476If
477.I type
478is
479.BR PERF_TYPE_SOFTWARE ,
480we are measuring software events provided by the kernel.
481Set
482.I config
483to one of the following:
484.RS 12
485.TP
486.B PERF_COUNT_SW_CPU_CLOCK
487This reports the CPU clock, a high-resolution per-CPU timer.
488.TP
489.B PERF_COUNT_SW_TASK_CLOCK
490This reports a clock count specific to the task that is running.
491.TP
492.B PERF_COUNT_SW_PAGE_FAULTS
493This reports the number of page faults.
494.TP
495.B PERF_COUNT_SW_CONTEXT_SWITCHES
496This counts context switches.
497Until Linux 2.6.34, these were all reported as user-space
498events; after that, they are reported as happening in the kernel.
747a6e7c 499.\" commit e49a5bd38159dfb1928fd25b173bc9de4bbadb21
f2b1d720
MK
500.TP
501.B PERF_COUNT_SW_CPU_MIGRATIONS
502This reports the number of times the process
503has migrated to a new CPU.
504.TP
505.B PERF_COUNT_SW_PAGE_FAULTS_MIN
506This counts the number of minor page faults.
507These did not require disk I/O to handle.
508.TP
509.B PERF_COUNT_SW_PAGE_FAULTS_MAJ
510This counts the number of major page faults.
511These required disk I/O to handle.
512.TP
31c1f2b0 513.BR PERF_COUNT_SW_ALIGNMENT_FAULTS " (since Linux 2.6.33)"
60dafbc1 514.\" commit f7d7986060b2890fc26db6ab5203efbd33aa2497
f2b1d720
MK
515This counts the number of alignment faults.
516These happen when unaligned memory accesses happen; the kernel
517can handle these but it reduces performance.
33a0ccb2 518This happens only on some architectures (never on x86).
f2b1d720 519.TP
31c1f2b0 520.BR PERF_COUNT_SW_EMULATION_FAULTS " (since Linux 2.6.33)"
60dafbc1 521.\" commit f7d7986060b2890fc26db6ab5203efbd33aa2497
f2b1d720
MK
522This counts the number of emulation faults.
523The kernel sometimes traps on unimplemented instructions
7db515ef 524and emulates them for user space.
f2b1d720 525This can negatively impact performance.
dab38455 526.TP
31c1f2b0 527.BR PERF_COUNT_SW_DUMMY " (since Linux 3.12)"
60dafbc1 528.\" commit fa0097ee690693006ab1aea6c01ad3c851b65c77
dab38455
VW
529This is a placeholder event that counts nothing.
530Informational sample record types such as mmap or comm
531must be associated with an active event.
532This dummy event allows gathering such records without requiring
533a counting event.
f2b1d720 534.RE
efeece04 535.PP
f2b1d720
MK
536.RS
537If
538.I type
539is
540.BR PERF_TYPE_TRACEPOINT ,
541then we are measuring kernel tracepoints.
542The value to use in
543.I config
544can be obtained from under debugfs
545.I tracing/events/*/*/id
546if ftrace is enabled in the kernel.
f2b1d720 547.RE
efeece04 548.PP
f2b1d720
MK
549.RS
550If
551.I type
552is
553.BR PERF_TYPE_HW_CACHE ,
554then we are measuring a hardware CPU cache event.
555To calculate the appropriate
556.I config
557value use the following equation:
408731d4 558.PP
f2b1d720
MK
559.RS 4
560.nf
f2b1d720
MK
561 (perf_hw_cache_id) | (perf_hw_cache_op_id << 8) |
562 (perf_hw_cache_op_result_id << 16)
563.fi
11ac5b51 564.PP
f2b1d720
MK
565where
566.I perf_hw_cache_id
567is one of:
7db515ef 568.RS 4
f2b1d720
MK
569.TP
570.B PERF_COUNT_HW_CACHE_L1D
571for measuring Level 1 Data Cache
572.TP
573.B PERF_COUNT_HW_CACHE_L1I
574for measuring Level 1 Instruction Cache
575.TP
576.B PERF_COUNT_HW_CACHE_LL
577for measuring Last-Level Cache
578.TP
579.B PERF_COUNT_HW_CACHE_DTLB
580for measuring the Data TLB
581.TP
582.B PERF_COUNT_HW_CACHE_ITLB
583for measuring the Instruction TLB
584.TP
585.B PERF_COUNT_HW_CACHE_BPU
586for measuring the branch prediction unit
587.TP
5a69ce9c
MK
588.BR PERF_COUNT_HW_CACHE_NODE " (since Linux 3.1)"
589.\" commit 89d6c0b5bdbb1927775584dcf532d98b3efe1477
f2b1d720
MK
590for measuring local memory accesses
591.RE
11ac5b51 592.PP
f2b1d720
MK
593and
594.I perf_hw_cache_op_id
4af27572 595is one of:
7db515ef 596.RS 4
f2b1d720
MK
597.TP
598.B PERF_COUNT_HW_CACHE_OP_READ
599for read accesses
600.TP
601.B PERF_COUNT_HW_CACHE_OP_WRITE
602for write accesses
603.TP
604.B PERF_COUNT_HW_CACHE_OP_PREFETCH
605for prefetch accesses
606.RE
11ac5b51 607.PP
f2b1d720
MK
608and
609.I perf_hw_cache_op_result_id
4af27572 610is one of:
7db515ef 611.RS 4
f2b1d720
MK
612.TP
613.B PERF_COUNT_HW_CACHE_RESULT_ACCESS
614to measure accesses
615.TP
616.B PERF_COUNT_HW_CACHE_RESULT_MISS
617to measure misses
618.RE
619.RE
efeece04 620.PP
f2b1d720
MK
621If
622.I type
623is
624.BR PERF_TYPE_RAW ,
625then a custom "raw"
626.I config
627value is needed.
628Most CPUs support events that are not covered by the "generalized" events.
629These are implementation defined; see your CPU manual (for example
630the Intel Volume 3B documentation or the AMD BIOS and Kernel Developer
631Guide).
632The libpfm4 library can be used to translate from the name in the
633architectural manuals to the raw hex value
634.BR perf_event_open ()
635expects in this field.
efeece04 636.PP
f2b1d720
MK
637If
638.I type
639is
640.BR PERF_TYPE_BREAKPOINT ,
641then leave
642.I config
643set to zero.
644Its parameters are set in other places.
7d8449ba
SL
645.PP
646If
647.I type
648is
649.BR kprobe
650or
651.BR uprobe ,
652set
653.IR retprobe
654(bit 0 of
655.IR config ,
c87e72a2
MK
656see
657.IR /sys/bus/event_source/devices/[k,u]probe/format/retprobe )
658for kretprobe/uretprobe.
659See fields
7d8449ba
SL
660.IR kprobe_func ", " uprobe_path ", " kprobe_addr ", and " probe_offset
661for more details.
662.RE
663.TP
664.IR kprobe_func ", " uprobe_path ", " kprobe_addr ", and " probe_offset
c87e72a2 665These fields describe the kprobe/uprobe for dynamic PMUs
7d8449ba
SL
666.BR kprobe
667and
668.BR uprobe .
669For
c87e72a2 670.BR kprobe :
7d8449ba
SL
671use
672.I kprobe_func
673and
674.IR probe_offset ,
675or use
676.I kprobe_addr
677and leave
678.I kprobe_func
c87e72a2
MK
679as NULL.
680For
681.BR uprobe :
7d8449ba
SL
682use
683.I uprobe_path
684and
685.IR probe_offset .
f2b1d720
MK
686.TP
687.IR sample_period ", " sample_freq
21977c9d 688A "sampling" event is one that generates an overflow notification
f2b1d720
MK
689every N events, where N is given by
690.IR sample_period .
21977c9d 691A sampling event has
f2b1d720 692.IR sample_period " > 0."
21977c9d 693When an overflow occurs, requested data is recorded
f2b1d720
MK
694in the mmap buffer.
695The
696.I sample_type
21977c9d 697field controls what data is recorded on each overflow.
efeece04 698.IP
f2b1d720
MK
699.I sample_freq
700can be used if you wish to use frequency rather than period.
37bee118 701In this case, you set the
f2b1d720
MK
702.I freq
703flag.
704The kernel will adjust the sampling period
705to try and achieve the desired rate.
706The rate of adjustment is a
707timer tick.
f2b1d720
MK
708.TP
709.I "sample_type"
710The various bits in this field specify which values to include
711in the sample.
712They will be recorded in a ring-buffer,
ad73a2cc 713which is available to user space using
f2b1d720
MK
714.BR mmap (2).
715The order in which the values are saved in the
716sample is documented in the MMAP Layout subsection below;
717it is not the
718.I "enum perf_event_sample_format"
719order.
720.RS
721.TP
722.B PERF_SAMPLE_IP
723Records instruction pointer.
724.TP
725.B PERF_SAMPLE_TID
7db515ef 726Records the process and thread IDs.
f2b1d720
MK
727.TP
728.B PERF_SAMPLE_TIME
729Records a timestamp.
730.TP
731.B PERF_SAMPLE_ADDR
732Records an address, if applicable.
733.TP
734.B PERF_SAMPLE_READ
735Records counter values for all events in a group, not just the group leader.
736.TP
737.B PERF_SAMPLE_CALLCHAIN
738Records the callchain (stack backtrace).
739.TP
740.B PERF_SAMPLE_ID
741Records a unique ID for the opened event's group leader.
742.TP
743.B PERF_SAMPLE_CPU
744Records CPU number.
745.TP
746.B PERF_SAMPLE_PERIOD
747Records the current sampling period.
748.TP
749.B PERF_SAMPLE_STREAM_ID
750Records a unique ID for the opened event.
751Unlike
752.B PERF_SAMPLE_ID
753the actual ID is returned, not that of the group leader.
8859d3a9
DP
754This ID is the same as the one returned by
755.BR PERF_FORMAT_ID .
f2b1d720
MK
756.TP
757.B PERF_SAMPLE_RAW
758Records additional data, if applicable.
759Usually returned by tracepoint events.
760.TP
31c1f2b0 761.BR PERF_SAMPLE_BRANCH_STACK " (since Linux 3.4)"
60dafbc1 762.\" commit bce38cd53e5ddba9cb6d708c4ef3d04a4016ec7e
045bf4d3
VW
763This provides a record of recent branches, as provided
764by CPU branch sampling hardware (such as Intel Last Branch Record).
765Not all hardware supports this feature.
efeece04 766.IP
045bf4d3
VW
767See the
768.I branch_sample_type
769field for how to filter which branches are reported.
f2b1d720 770.TP
31c1f2b0 771.BR PERF_SAMPLE_REGS_USER " (since Linux 3.7)"
60dafbc1 772.\" commit 4018994f3d8785275ef0e7391b75c3462c029e56
d1007d14
VW
773Records the current user-level CPU register state
774(the values in the process before the kernel was called).
f2b1d720 775.TP
31c1f2b0 776.BR PERF_SAMPLE_STACK_USER " (since Linux 3.7)"
60dafbc1 777.\" commit c5ebcedb566ef17bda7b02686e0d658a7bb42ee7
d1007d14
VW
778Records the user-level stack, allowing stack unwinding.
779.TP
31c1f2b0 780.BR PERF_SAMPLE_WEIGHT " (since Linux 3.10)"
60dafbc1 781.\" commit c3feedf2aaf9ac8bad6f19f5d21e4ee0b4b87e9c
d1007d14 782Records a hardware-provided weight value that expresses how
51700fd7 783costly the sampled event was.
d1007d14
VW
784This allows the hardware to highlight expensive events in
785a profile.
786.TP
31c1f2b0 787.BR PERF_SAMPLE_DATA_SRC " (since Linux 3.10)"
60dafbc1 788.\" commit d6be9ad6c960f43800a6f118932bc8a5a4eadcd1
d1007d14
VW
789Records the data source: where in the memory hierarchy
790the data associated with the sampled instruction came from.
6170255e 791This is available only if the underlying hardware
d1007d14 792supports this feature.
7480dabb 793.TP
31c1f2b0 794.BR PERF_SAMPLE_IDENTIFIER " (since Linux 3.12)"
60dafbc1 795.\" commit ff3d527cebc1fa3707c617bfe9e74f53fcfb0955
8859d3a9
DP
796Places the
797.B SAMPLE_ID
798value in a fixed position in the record,
7480dabb
VW
799either at the beginning (for sample events) or at the end
800(if a non-sample event).
efeece04 801.IP
7480dabb
VW
802This was necessary because a sample stream may have
803records from various different event sources with different
804.I sample_type
805settings.
e9bd9b2c 806Parsing the event stream properly was not possible because the
8859d3a9
DP
807format of the record was needed to find
808.BR SAMPLE_ID ,
809but
27f52b52 810the format could not be found without knowing what
7480dabb
VW
811event the sample belonged to (causing a circular
812dependency).
efeece04 813.IP
e41c36b2 814The
7480dabb
VW
815.B PERF_SAMPLE_IDENTIFIER
816setting makes the event stream always parsable
8859d3a9
DP
817by putting
818.B SAMPLE_ID
819in a fixed location, even though
820it means having duplicate
821.B SAMPLE_ID
822values in records.
1e043959 823.TP
60dafbc1
MK
824.BR PERF_SAMPLE_TRANSACTION " (since Linux 3.13)"
825.\" commit fdfbbd07e91f8fe387140776f3fd94605f0c89e5
84fc2a6e 826Records reasons for transactional memory abort events
1e043959 827(for example, from Intel TSX transactional memory support).
efeece04 828.IP
1e043959
VW
829The
830.I precise_ip
b3f39642 831setting must be greater than 0 and a transactional memory abort
1e043959 832event must be measured or no values will be recorded.
84fc2a6e
MK
833Also note that some perf_event measurements, such as sampled
834cycle counting, may cause extraneous aborts (by causing an
1e043959 835interrupt during a transaction).
f5281dfd
VW
836.TP
837.BR PERF_SAMPLE_REGS_INTR " (since Linux 3.19)"
838.\" commit 60e2364e60e86e81bc6377f49779779e6120977f
839Records a subset of the current CPU register state
840as specified by
841.IR sample_regs_intr .
842Unlike
843.B PERF_SAMPLE_REGS_USER
844the register values will return kernel register
845state if the overflow happened while kernel
846code is running.
847If the CPU supports hardware sampling of
b01ae37b 848register state (e.g., PEBS on Intel x86) and
f5281dfd
VW
849.I precise_ip
850is set higher than zero then the register
851values returned are those captured by
852hardware at the time of the sampled
853instruction's retirement.
f2b1d720 854.RE
f2b1d720
MK
855.TP
856.IR "read_format"
857This field specifies the format of the data returned by
858.BR read (2)
859on a
7db515ef 860.BR perf_event_open ()
f2b1d720
MK
861file descriptor.
862.RS
863.TP
864.B PERF_FORMAT_TOTAL_TIME_ENABLED
7ede2f66
DP
865Adds the 64-bit
866.I time_enabled
867field.
f2b1d720
MK
868This can be used to calculate estimated totals if
869the PMU is overcommitted and multiplexing is happening.
870.TP
871.B PERF_FORMAT_TOTAL_TIME_RUNNING
7ede2f66
DP
872Adds the 64-bit
873.I time_running
874field.
f2b1d720 875This can be used to calculate estimated totals if
3d1ee497 876the PMU is overcommitted and multiplexing is happening.
f2b1d720
MK
877.TP
878.B PERF_FORMAT_ID
879Adds a 64-bit unique value that corresponds to the event group.
880.TP
881.B PERF_FORMAT_GROUP
882Allows all counter values in an event group to be read with one read.
883.RE
f2b1d720
MK
884.TP
885.IR "disabled"
886The
887.I disabled
888bit specifies whether the counter starts out disabled or enabled.
889If disabled, the event can later be enabled by
890.BR ioctl (2),
891.BR prctl (2),
892or
893.IR enable_on_exec .
efeece04 894.IP
406650db
VW
895When creating an event group, typically the group leader is initialized
896with
897.I disabled
898set to 1 and any child events are initialized with
899.I disabled
900set to 0.
901Despite
902.I disabled
903being 0, the child events will not start until the group leader
904is enabled.
f2b1d720
MK
905.TP
906.IR "inherit"
907The
908.I inherit
909bit specifies that this counter should count events of child
910tasks as well as the task specified.
33a0ccb2 911This applies only to new children, not to any existing children at
f2b1d720
MK
912the time the counter is created (nor to any new children of
913existing children).
efeece04 914.IP
f2b1d720 915Inherit does not work for some combinations of
4b3a5f01
MK
916.IR read_format
917values, such as
f2b1d720 918.BR PERF_FORMAT_GROUP .
f2b1d720
MK
919.TP
920.IR "pinned"
921The
922.I pinned
923bit specifies that the counter should always be on the CPU if at all
924possible.
33a0ccb2 925It applies only to hardware counters and only to group leaders.
f2b1d720
MK
926If a pinned counter cannot be put onto the CPU (e.g., because there are
927not enough hardware counters or because of a conflict with some other
928event), then the counter goes into an 'error' state, where reads
929return end-of-file (i.e.,
930.BR read (2)
931returns 0) until the counter is subsequently enabled or disabled.
f2b1d720
MK
932.TP
933.IR "exclusive"
934The
935.I exclusive
936bit specifies that when this counter's group is on the CPU,
937it should be the only group using the CPU's counters.
938In the future this may allow monitoring programs to
939support PMU features that need to run alone so that they do not
940disrupt other hardware counters.
efeece04 941.IP
bea10c8c
VW
942Note that many unexpected situations may prevent events with the
943.I exclusive
d3532647 944bit set from ever running.
bea10c8c 945This includes any users running a system-wide
d3532647 946measurement as well as any kernel use of the performance counters
bea10c8c 947(including the commonly enabled NMI Watchdog Timer interface).
f2b1d720
MK
948.TP
949.IR "exclude_user"
ad73a2cc 950If this bit is set, the count excludes events that happen in user space.
f2b1d720
MK
951.TP
952.IR "exclude_kernel"
edb3e316 953If this bit is set, the count excludes events that happen in kernel space.
f2b1d720
MK
954.TP
955.IR "exclude_hv"
956If this bit is set, the count excludes events that happen in the
957hypervisor.
958This is mainly for PMUs that have built-in support for handling this
959(such as POWER).
960Extra support is needed for handling hypervisor measurements on most
961machines.
f2b1d720
MK
962.TP
963.IR "exclude_idle"
38b581e8
VW
964If set, don't count when the CPU is running the idle task.
965While you can currently enable this for any event type, it is ignored
966for all but software events.
f2b1d720
MK
967.TP
968.IR "mmap"
969The
970.I mmap
75ee11e5 971bit enables generation of
cd7c700a 972.B PERF_RECORD_MMAP
75ee11e5
VW
973samples for every
974.BR mmap (2)
975call that has
cd7c700a 976.B PROT_EXEC
75ee11e5
VW
977set.
978This allows tools to notice new executable code being mapped into
979a program (dynamic shared libraries for example)
980so that addresses can be mapped back to the original code.
f2b1d720
MK
981.TP
982.IR "comm"
983The
984.I comm
985bit enables tracking of process command name as modified by the
cd7c700a 986.BR exec (2)
f2b1d720 987and
cd7c700a 988.BR prctl (PR_SET_NAME)
49bc411c
VW
989system calls as well as writing to
990.IR /proc/self/comm .
790ee6d6 991If the
49bc411c 992.I comm_exec
790ee6d6 993flag is also successfully set (possible since Linux 3.16),
747a6e7c 994.\" commit 82b897782d10fcc4930c9d4a15b175348fdd2871
49bc411c
VW
995then the misc flag
996.B PERF_RECORD_MISC_COMM_EXEC
997can be used to differentiate the
998.BR exec (2)
999case from the others.
f2b1d720
MK
1000.TP
1001.IR "freq"
1002If this bit is set, then
1003.I sample_frequency
1004not
1005.I sample_period
1006is used when setting up the sampling interval.
f2b1d720
MK
1007.TP
1008.IR "inherit_stat"
1009This bit enables saving of event counts on context switch for
1010inherited tasks.
33a0ccb2 1011This is meaningful only if the
f2b1d720
MK
1012.I inherit
1013field is set.
f2b1d720
MK
1014.TP
1015.IR "enable_on_exec"
1016If this bit is set, a counter is automatically
1017enabled after a call to
1018.BR exec (2).
f2b1d720
MK
1019.TP
1020.IR "task"
1021If this bit is set, then
1022fork/exit notifications are included in the ring buffer.
f2b1d720
MK
1023.TP
1024.IR "watermark"
21977c9d 1025If set, have an overflow notification happen when we cross the
f2b1d720
MK
1026.I wakeup_watermark
1027boundary.
21977c9d 1028Otherwise, overflow notifications happen after
f2b1d720
MK
1029.I wakeup_events
1030samples.
f2b1d720 1031.TP
31c1f2b0 1032.IR "precise_ip" " (since Linux 2.6.35)"
747a6e7c 1033.\" commit ab608344bcbde4f55ec4cd911b686b0ce3eae076
f2b1d720
MK
1034This controls the amount of skid.
1035Skid is how many instructions
1036execute between an event of interest happening and the kernel
1037being able to stop and record the event.
1038Smaller skid is
1039better and allows more accurate reporting of which events
1040correspond to which instructions, but hardware is often limited
1041with how small this can be.
efeece04 1042.IP
5d73bc3f 1043The possible values of this field are the following:
f2b1d720 1044.RS
dc9ec146 1045.IP 0 3
f2b1d720 1046.B SAMPLE_IP
2b538c3e 1047can have arbitrary skid.
dc9ec146 1048.IP 1
f2b1d720 1049.B SAMPLE_IP
2b538c3e 1050must have constant skid.
dc9ec146 1051.IP 2
f2b1d720 1052.B SAMPLE_IP
2b538c3e 1053requested to have 0 skid.
dc9ec146 1054.IP 3
f2b1d720
MK
1055.B SAMPLE_IP
1056must have 0 skid.
5d73bc3f 1057See also the description of
f2b1d720
MK
1058.BR PERF_RECORD_MISC_EXACT_IP .
1059.RE
f2b1d720 1060.TP
31c1f2b0 1061.IR "mmap_data" " (since Linux 2.6.36)"
747a6e7c 1062.\" commit 3af9e859281bda7eb7c20b51879cf43aa788ac2e
b01ae37b 1063This is the counterpart of the
f2b1d720 1064.I mmap
75ee11e5
VW
1065field.
1066This enables generation of
cd7c700a 1067.B PERF_RECORD_MMAP
75ee11e5
VW
1068samples for
1069.BR mmap (2)
1070calls that do not have
cd7c700a 1071.B PROT_EXEC
75ee11e5 1072set (for example data and SysV shared memory).
f2b1d720 1073.TP
31c1f2b0 1074.IR "sample_id_all" " (since Linux 2.6.38)"
747a6e7c 1075.\" commit c980d1091810df13f21aabbce545fd98f545bbf7
7480dabb 1076If set, then TID, TIME, ID, STREAM_ID, and CPU can
f2b1d720
MK
1077additionally be included in
1078.RB non- PERF_RECORD_SAMPLE s
1079if the corresponding
1080.I sample_type
1081is selected.
efeece04 1082.IP
e9bd9b2c 1083If
7480dabb 1084.B PERF_SAMPLE_IDENTIFIER
37bee118 1085is specified, then an additional ID value is included
7480dabb
VW
1086as the last value to ease parsing the record stream.
1087This may lead to the
e9bd9b2c 1088.I id
7480dabb 1089value appearing twice.
efeece04 1090.IP
7480dabb 1091The layout is described by this pseudo-structure:
efeece04 1092.IP
7480dabb 1093.in +4n
b8302363 1094.EX
7480dabb 1095struct sample_id {
5b0fbedb
MK
1096 { u32 pid, tid; } /* if PERF_SAMPLE_TID set */
1097 { u64 time; } /* if PERF_SAMPLE_TIME set */
1098 { u64 id; } /* if PERF_SAMPLE_ID set */
1099 { u64 stream_id;} /* if PERF_SAMPLE_STREAM_ID set */
1100 { u32 cpu, res; } /* if PERF_SAMPLE_CPU set */
1101 { u64 id; } /* if PERF_SAMPLE_IDENTIFIER set */
7480dabb 1102};
5383b93b 1103.EE
c0b34c18 1104.in
f2b1d720 1105.TP
31c1f2b0 1106.IR "exclude_host" " (since Linux 3.2)"
747a6e7c 1107.\" commit a240f76165e6255384d4bdb8139895fac7988799
e38fb93e 1108When conducting measurements that include processes running
5d73bc3f
MK
1109VM instances (i.e., have executed a
1110.B KVM_RUN
1111.BR ioctl (2)),
1112only measure events happening inside a guest instance.
e38fb93e
VW
1113This is only meaningful outside the guests; this setting does
1114not change counts gathered inside of a guest.
34d4e61d 1115Currently, this functionality is x86 only.
f2b1d720 1116.TP
31c1f2b0 1117.IR "exclude_guest" " (since Linux 3.2)"
747a6e7c 1118.\" commit a240f76165e6255384d4bdb8139895fac7988799
e38fb93e 1119When conducting measurements that include processes running
5d73bc3f
MK
1120VM instances (i.e., have executed a
1121.B KVM_RUN
1122.BR ioctl (2)),
1123do not measure events happening inside guest instances.
e38fb93e
VW
1124This is only meaningful outside the guests; this setting does
1125not change counts gathered inside of a guest.
34d4e61d 1126Currently, this functionality is x86 only.
f2b1d720 1127.TP
31c1f2b0 1128.IR "exclude_callchain_kernel" " (since Linux 3.7)"
747a6e7c 1129.\" commit d077526485d5c9b12fe85d0b2b3b7041e6bc5f91
f2b1d720 1130Do not include kernel callchains.
f2b1d720 1131.TP
31c1f2b0 1132.IR "exclude_callchain_user" " (since Linux 3.7)"
747a6e7c 1133.\" commit d077526485d5c9b12fe85d0b2b3b7041e6bc5f91
f2b1d720 1134Do not include user callchains.
f2b1d720 1135.TP
9bfc542b 1136.IR "mmap2" " (since Linux 3.16)"
747a6e7c
VW
1137.\" commit 13d7a2410fa637f450a29ecb515ac318ee40c741
1138.\" This is tricky; was committed during 3.12 development
1139.\" but right before release was disabled.
1140.\" So while you could select mmap2 starting with 3.12
1141.\" it did not work until 3.16
1142.\" commit a5a5ba72843dd05f991184d6cb9a4471acce1005
9bfc542b
VW
1143Generate an extended executable mmap record that contains enough
1144additional information to uniquely identify shared mappings.
1145The
1146.I mmap
1147flag must also be set for this to work.
1148.TP
49bc411c 1149.IR "comm_exec" " (since Linux 3.16)"
747a6e7c 1150.\" commit 82b897782d10fcc4930c9d4a15b175348fdd2871
5ab35ae5 1151This is purely a feature-detection flag, it does not change
49bc411c 1152kernel behavior.
5ab35ae5 1153If this flag can successfully be set, then, when
49bc411c 1154.I comm
5ab35ae5 1155is enabled, the
49bc411c
VW
1156.B PERF_RECORD_MISC_COMM_EXEC
1157flag will be set in the
1158.I misc
1159field of a comm record header if the rename event being
1160reported was caused by a call to
1161.BR exec (2).
1162This allows tools to distinguish between the various
1163types of process renaming.
1164.TP
6bd5186a
VW
1165.IR "use_clockid" " (since Linux 4.1)"
1166.\" commit 34f439278cef7b1177f8ce24f9fc81dfc6221d3b
1167This allows selecting which internal Linux clock to use
1168when generating timestamps via the
1169.I clockid
1170field.
1171This can make it easier to correlate perf sample times with
1172timestamps generated by other tools.
1173.TP
9277a75d
VW
1174.IR "context_switch" " (since Linux 4.3)"
1175.\" commit 45ac1403f564f411c6a383a2448688ba8dd705a4
1176This enables the generation of
1177.B PERF_RECORD_SWITCH
1178records when a context switch occurs.
1179It also enables the generation of
1180.B PERF_RECORD_SWITCH_CPU_WIDE
d5a24378 1181records when sampling in CPU-wide mode.
9277a75d
VW
1182This functionality is in addition to existing tracepoint and
1183software events for measuring context switches.
54905b0f
MK
1184The advantage of this method is that it will give full
1185information even with strict
9277a75d
VW
1186.I perf_event_paranoid
1187settings.
1188.TP
f2b1d720
MK
1189.IR "wakeup_events" ", " "wakeup_watermark"
1190This union sets how many samples
1191.RI ( wakeup_events )
1192or bytes
1193.RI ( wakeup_watermark )
21977c9d 1194happen before an overflow notification happens.
f2b1d720
MK
1195Which one is used is selected by the
1196.I watermark
cb8a928f 1197bit flag.
efeece04 1198.IP
751c0f1a 1199.I wakeup_events
6170255e 1200counts only
751c0f1a 1201.B PERF_RECORD_SAMPLE
51700fd7 1202record types.
21977c9d 1203To receive overflow notification for all
751c0f1a 1204.B PERF_RECORD
21977c9d 1205types choose watermark and set
751c0f1a
VW
1206.I wakeup_watermark
1207to 1.
efeece04 1208.IP
fc79d996 1209Prior to Linux 3.0, setting
747a6e7c 1210.\" commit f506b3dc0ec454a16d40cab9ee5d75435b39dc50
21977c9d
VW
1211.I wakeup_events
1212to 0 resulted in no overflow notifications;
1213more recent kernels treat 0 the same as 1.
f2b1d720 1214.TP
31c1f2b0 1215.IR "bp_type" " (since Linux 2.6.33)"
747a6e7c 1216.\" commit 24f1e32c60c45c89a997c73395b69c8af6f0a84e
f2b1d720
MK
1217This chooses the breakpoint type.
1218It is one of:
1219.RS
1220.TP
1221.BR HW_BREAKPOINT_EMPTY
2b538c3e 1222No breakpoint.
f2b1d720
MK
1223.TP
1224.BR HW_BREAKPOINT_R
2b538c3e 1225Count when we read the memory location.
f2b1d720
MK
1226.TP
1227.BR HW_BREAKPOINT_W
2b538c3e 1228Count when we write the memory location.
f2b1d720
MK
1229.TP
1230.BR HW_BREAKPOINT_RW
2b538c3e 1231Count when we read or write the memory location.
f2b1d720
MK
1232.TP
1233.BR HW_BREAKPOINT_X
2b538c3e 1234Count when we execute code at the memory location.
dd3568a1 1235.PP
7db515ef 1236The values can be combined via a bitwise or, but the
f2b1d720
MK
1237combination of
1238.B HW_BREAKPOINT_R
1239or
1240.B HW_BREAKPOINT_W
1241with
1242.B HW_BREAKPOINT_X
1243is not allowed.
1244.RE
f2b1d720 1245.TP
31c1f2b0 1246.IR "bp_addr" " (since Linux 2.6.33)"
747a6e7c 1247.\" commit 24f1e32c60c45c89a997c73395b69c8af6f0a84e
5d73bc3f 1248This is the address of the breakpoint.
4b3a5f01
MK
1249For execution breakpoints, this is the memory address of the instruction
1250of interest; for read and write breakpoints, it is the memory address
f2b1d720 1251of the memory location of interest.
f2b1d720 1252.TP
31c1f2b0 1253.IR "config1" " (since Linux 2.6.39)"
747a6e7c 1254.\" commit a7e3ed1e470116c9d12c2f778431a481a6be8ab6
f2b1d720
MK
1255.I config1
1256is used for setting events that need an extra register or otherwise
1257do not fit in the regular config field.
1258Raw OFFCORE_EVENTS on Nehalem/Westmere/SandyBridge use this field
4b3a5f01 1259on Linux 3.3 and later kernels.
f2b1d720 1260.TP
31c1f2b0 1261.IR "bp_len" " (since Linux 2.6.33)"
747a6e7c 1262.\" commit 24f1e32c60c45c89a997c73395b69c8af6f0a84e
f2b1d720
MK
1263.I bp_len
1264is the length of the breakpoint being measured if
1265.I type
1266is
1267.BR PERF_TYPE_BREAKPOINT .
1268Options are
1269.BR HW_BREAKPOINT_LEN_1 ,
1270.BR HW_BREAKPOINT_LEN_2 ,
1271.BR HW_BREAKPOINT_LEN_4 ,
4b3a5f01 1272and
f2b1d720
MK
1273.BR HW_BREAKPOINT_LEN_8 .
1274For an execution breakpoint, set this to
1275.IR sizeof(long) .
f2b1d720 1276.TP
31c1f2b0 1277.IR "config2" " (since Linux 2.6.39)"
747a6e7c 1278.\" commit a7e3ed1e470116c9d12c2f778431a481a6be8ab6
f2b1d720
MK
1279.I config2
1280is a further extension of the
1281.I config1
1282field.
f2b1d720 1283.TP
31c1f2b0 1284.IR "branch_sample_type" " (since Linux 3.4)"
747a6e7c 1285.\" commit bce38cd53e5ddba9cb6d708c4ef3d04a4016ec7e
8a94e783 1286If
045bf4d3
VW
1287.B PERF_SAMPLE_BRANCH_STACK
1288is enabled, then this specifies what branches to include
1289in the branch record.
efeece04 1290.IP
e3c9782b 1291The first part of the value is the privilege level, which
4b3a5f01 1292is a combination of one or more of the values listed below.
045bf4d3
VW
1293If the user does not set privilege level explicitly, the kernel
1294will use the event's privilege level.
1295Event and branch privilege levels do not have to match.
f2b1d720
MK
1296.RS
1297.TP
1298.B PERF_SAMPLE_BRANCH_USER
33d6e2c7 1299Branch target is in user space.
f2b1d720
MK
1300.TP
1301.B PERF_SAMPLE_BRANCH_KERNEL
33d6e2c7 1302Branch target is in kernel space.
f2b1d720
MK
1303.TP
1304.B PERF_SAMPLE_BRANCH_HV
33d6e2c7 1305Branch target is in hypervisor.
e3c9782b
VW
1306.TP
1307.B PERF_SAMPLE_BRANCH_PLM_ALL
1308A convenience value that is the three preceding values ORed together.
11ac5b51 1309.PP
e3c9782b
VW
1310In addition to the privilege value, at least one of the
1311following bits must be set.
f2b1d720
MK
1312.TP
1313.B PERF_SAMPLE_BRANCH_ANY
33d6e2c7 1314Any branch type.
f2b1d720
MK
1315.TP
1316.B PERF_SAMPLE_BRANCH_ANY_CALL
c6e5df74 1317Any call branch (includes direct calls, indirect calls, and far jumps).
f2b1d720 1318.TP
e3c9782b 1319.B PERF_SAMPLE_BRANCH_IND_CALL
33d6e2c7 1320Indirect calls.
f2b1d720 1321.TP
c6e5df74
VW
1322.BR PERF_SAMPLE_BRANCH_CALL " (since Linux 4.4)"
1323.\" commit c229bf9dc179d2023e185c0f705bdf68484c1e73
1324Direct calls.
1325.TP
1326.B PERF_SAMPLE_BRANCH_ANY_RETURN
1327Any return branch.
1328.TP
dde354c9
VW
1329.BR PERF_SAMPLE_BRANCH_IND_JUMP " (since Linux 4.2)"
1330.\" commit c9fdfa14c3792c0160849c484e83aa57afd80ccc
1331Indirect jumps.
1332.TP
aea60aad 1333.BR PERF_SAMPLE_BRANCH_COND " (since Linux 3.16)"
60dafbc1 1334.\" commit bac52139f0b7ab31330e98fd87fc5a2664951050
aea60aad
VW
1335Conditional branches.
1336.TP
31c1f2b0 1337.BR PERF_SAMPLE_BRANCH_ABORT_TX " (since Linux 3.11)"
60dafbc1 1338.\" commit 135c5612c460f89657c4698fe2ea753f6f667963
33d6e2c7 1339Transactional memory aborts.
e3c9782b 1340.TP
31c1f2b0 1341.BR PERF_SAMPLE_BRANCH_IN_TX " (since Linux 3.11)"
60dafbc1 1342.\" commit 135c5612c460f89657c4698fe2ea753f6f667963
33d6e2c7 1343Branch in transactional memory transaction.
e3c9782b 1344.TP
31c1f2b0 1345.BR PERF_SAMPLE_BRANCH_NO_TX " (since Linux 3.11)"
60dafbc1 1346.\" commit 135c5612c460f89657c4698fe2ea753f6f667963
33d6e2c7 1347Branch not in transactional memory transaction.
bb7e6ff0
VW
.TP
1348.BR PERF_SAMPLE_BRANCH_CALL_STACK " (since Linux 4.1)"
1349.\" commit 2c44b1936bb3b135a3fac8b3493394d42e51cf70
95655a22 1350Branch is part of a hardware-generated call stack.
bb7e6ff0
VW
1351This requires hardware support, currently only found
1352on Intel x86 Haswell or newer.
f2b1d720 1353.RE
f2b1d720 1354.TP
31c1f2b0 1355.IR "sample_regs_user" " (since Linux 3.7)"
747a6e7c 1356.\" commit 4018994f3d8785275ef0e7391b75c3462c029e56
4651e412 1357This bit mask defines the set of user CPU registers to dump on samples.
76c637e1 1358The layout of the register mask is architecture-specific and
4b3a5f01 1359is described in the kernel header file
d1007d14 1360.IR arch/ARCH/include/uapi/asm/perf_regs.h .
f2b1d720 1361.TP
31c1f2b0 1362.IR "sample_stack_user" " (since Linux 3.7)"
747a6e7c 1363.\" commit c5ebcedb566ef17bda7b02686e0d658a7bb42ee7
d1007d14
VW
1364This defines the size of the user stack to dump if
1365.B PERF_SAMPLE_STACK_USER
1366is specified.
6bd5186a
VW
1367.TP
1368.IR "clockid" " (since Linux 4.1)"
1369.\" commit 34f439278cef7b1177f8ce24f9fc81dfc6221d3b
1370If
1371.I use_clockid
1372is set, then this field selects which internal Linux timer to
1373use for timestamps.
1374The available timers are defined in
1375.IR linux/time.h ,
1376with
95655a22
MK
1377.BR CLOCK_MONOTONIC ,
1378.BR CLOCK_MONOTONIC_RAW ,
1379.BR CLOCK_REALTIME ,
1380.BR CLOCK_BOOTTIME ,
1381and
1382.B CLOCK_TAI
6bd5186a 1383currently supported.
cdc52f4a
VW
1384.TP
1385.IR "aux_watermark" " (since Linux 4.1)"
1386.\" commit 1a5941312414c71dece6717da9a0fa1303127afa
1387This specifies how much data is required to trigger a
1388.B PERF_RECORD_AUX
1389sample.
fd133d5d
VW
1390.TP
1391.IR "sample_max_stack" " (since Linux 4.8)"
1392.\" commit 97c79a38cd454602645f0470ffb444b3b75ce574
1393When
1394.I sample_type
1395includes
5dd3feec 1396.BR PERF_SAMPLE_CALLCHAIN ,
4b3a5f01 1397this field specifies how many stack frames to report when
fd133d5d 1398generating the callchain.
73d8cece 1399.SS Reading results
f2b1d720 1400Once a
7db515ef 1401.BR perf_event_open ()
3d1ee497 1402file descriptor has been opened, the values
f2b1d720
MK
1403of the events can be read from the file descriptor.
1404The values that are there are specified by the
1405.I read_format
7db515ef
MK
1406field in the
1407.I attr
1408structure at open time.
efeece04 1409.PP
f2b1d720 1410If you attempt to read into a buffer that is not big enough to hold the
4b3a5f01 1411data, the error
f2b1d720 1412.B ENOSPC
4b3a5f01 1413results.
efeece04 1414.PP
f2b1d720 1415Here is the layout of the data returned by a read:
e525b89f 1416.IP * 2
f2b1d720
MK
1417If
1418.B PERF_FORMAT_GROUP
1419was specified to allow reading all events in a group at once:
efeece04 1420.IP
f2b1d720 1421.in +4n
b8302363 1422.EX
f2b1d720 1423struct read_format {
e525b89f
MK
1424 u64 nr; /* The number of events */
1425 u64 time_enabled; /* if PERF_FORMAT_TOTAL_TIME_ENABLED */
1426 u64 time_running; /* if PERF_FORMAT_TOTAL_TIME_RUNNING */
e307112d 1427 struct {
e525b89f
MK
1428 u64 value; /* The value of the event */
1429 u64 id; /* if PERF_FORMAT_ID */
f2b1d720
MK
1430 } values[nr];
1431};
b8302363 1432.EE
f2b1d720 1433.in
e525b89f 1434.IP *
f2b1d720
MK
1435If
1436.B PERF_FORMAT_GROUP
1437was
1438.I not
e525b89f 1439specified:
efeece04 1440.IP
f2b1d720 1441.in +4n
b8302363 1442.EX
f2b1d720
MK
1443struct read_format {
1444 u64 value; /* The value of the event */
1445 u64 time_enabled; /* if PERF_FORMAT_TOTAL_TIME_ENABLED */
1446 u64 time_running; /* if PERF_FORMAT_TOTAL_TIME_RUNNING */
1447 u64 id; /* if PERF_FORMAT_ID */
1448};
b8302363 1449.EE
f2b1d720 1450.in
e525b89f
MK
1451.PP
1452The values read are as follows:
f2b1d720
MK
1453.TP
1454.I nr
1455The number of events in this file descriptor.
fcc4f4f4 1456Available only if
f2b1d720
MK
1457.B PERF_FORMAT_GROUP
1458was specified.
f2b1d720
MK
1459.TP
1460.IR time_enabled ", " time_running
1461Total time the event was enabled and running.
4b3a5f01 1462Normally these values are the same.
34211ee3
VW
1463Multiplexing happens if the number of events is more than the
1464number of available PMU counter slots.
1465In that case the events run only part of the time and the
f2b1d720
MK
1466.I time_enabled
1467and
1468.I time_running
1469values can be used to scale an estimated value for the count.
f2b1d720
MK
1470.TP
1471.I value
1472An unsigned 64-bit value containing the counter result.
f2b1d720
MK
1473.TP
1474.I id
4b3a5f01 1475A globally unique value for this particular event; only present if
f2b1d720 1476.B PERF_FORMAT_ID
e525b89f
MK
1477was specified in
1478.IR read_format .
73d8cece 1479.SS MMAP layout
f2b1d720 1480When using
7db515ef 1481.BR perf_event_open ()
f2b1d720
MK
1482in sampled mode, asynchronous events
1483(like counter overflow or
1484.B PROT_EXEC
1485mmap tracking)
1486are logged into a ring-buffer.
1487This ring-buffer is created and accessed through
1488.BR mmap (2).
efeece04 1489.PP
f2b1d720
MK
1490The mmap size should be 1+2^n pages, where the first page is a
1491metadata page
e525b89f 1492.RI ( "struct perf_event_mmap_page" )
f2b1d720
MK
1493that contains various
1494bits of information such as where the ring-buffer head is.
efeece04 1495.PP
95655a22 1496Before kernel 2.6.39, there is a bug that means you must allocate an mmap
f2b1d720 1497ring buffer when sampling even if you do not plan to access it.
efeece04 1498.PP
f2b1d720 1499The structure of the first metadata mmap page is as follows:
efeece04 1500.PP
f2b1d720 1501.in +4n
b8302363 1502.EX
f2b1d720 1503struct perf_event_mmap_page {
ce88f77b
MK
1504 __u32 version; /* version number of this structure */
1505 __u32 compat_version; /* lowest version this is compat with */
1506 __u32 lock; /* seqlock for synchronization */
1507 __u32 index; /* hardware counter identifier */
1508 __s64 offset; /* add to hardware counter value */
1509 __u64 time_enabled; /* time event active */
1510 __u64 time_running; /* time event on CPU */
f2b1d720
MK
1511 union {
1512 __u64 capabilities;
135cba8b 1513 struct {
ce88f77b
MK
1514 __u64 cap_usr_time / cap_usr_rdpmc / cap_bit0 : 1,
1515 cap_bit0_is_deprecated : 1,
1516 cap_user_rdpmc : 1,
1517 cap_user_time : 1,
1518 cap_user_time_zero : 1,
135cba8b 1519 };
f2b1d720 1520 };
ce88f77b
MK
1521 __u16 pmc_width;
1522 __u16 time_shift;
1523 __u32 time_mult;
1524 __u64 time_offset;
ee8655b5 1525 __u64 __reserved[120]; /* Pad to 1 k */
ce88f77b
MK
1526 __u64 data_head; /* head in the data section */
1527 __u64 data_tail; /* user-space written tail */
21d9849a
VW
1528 __u64 data_offset; /* where the buffer starts */
1529 __u64 data_size; /* data buffer size */
4e47c6e5
VW
1530 __u64 aux_head;
1531 __u64 aux_tail;
1532 __u64 aux_offset;
1533 __u64 aux_size;
21d9849a 1534
f2b1d720 1535};
b8302363 1536.EE
f2b1d720 1537.in
efeece04 1538.PP
ce88f77b 1539The following list describes the fields in the
f2b1d720 1540.I perf_event_mmap_page
e525b89f 1541structure in more detail:
f2b1d720
MK
1542.TP
1543.I version
1544Version number of this structure.
f2b1d720
MK
1545.TP
1546.I compat_version
1547The lowest version this is compatible with.
f2b1d720
MK
1548.TP
1549.I lock
1550A seqlock for synchronization.
f2b1d720
MK
1551.TP
1552.I index
1553A unique hardware counter identifier.
f2b1d720
MK
1554.TP
1555.I offset
135cba8b
VW
1556When using rdpmc for reads this offset value
1557must be added to the one returned by rdpmc to get
1558the current total event count.
f2b1d720
MK
1559.TP
1560.I time_enabled
1561Time the event was active.
f2b1d720
MK
1562.TP
1563.I time_running
1564Time the event was running.
f2b1d720 1565.TP
31c1f2b0 1566.IR cap_usr_time " / " cap_usr_rdpmc " / " cap_bit0 " (since Linux 3.4)"
747a6e7c 1567.\" commit c7206205d00ab375839bd6c7ddb247d600693c09
e9bd9b2c 1568There was a bug in the definition of
f2b1d720 1569.I cap_usr_time
135cba8b
VW
1570and
1571.I cap_usr_rdpmc
1572from Linux 3.4 until Linux 3.11.
1573Both bits were defined to point to the same location, so it was
e9bd9b2c 1574impossible to know if
135cba8b
VW
1575.I cap_usr_time
1576or
1577.I cap_usr_rdpmc
1578were actually set.
efeece04 1579.IP
4010bc07 1580Starting with Linux 3.12, these are renamed to
747a6e7c 1581.\" commit fa7315871046b9a4c48627905691dbde57e51033
135cba8b 1582.I cap_bit0
e41c36b2 1583and you should use the
135cba8b
VW
1584.I cap_user_time
1585and
1586.I cap_user_rdpmc
1587fields instead.
f2b1d720 1588.TP
31c1f2b0 1589.IR cap_bit0_is_deprecated " (since Linux 3.12)"
747a6e7c 1590.\" commit fa7315871046b9a4c48627905691dbde57e51033
37bee118 1591If set, this bit indicates that the kernel supports
135cba8b
VW
1592the properly separated
1593.I cap_user_time
1594and
1595.I cap_user_rdpmc
1596bits.
efeece04 1597.IP
135cba8b
VW
1598If not set, it indicates an older kernel where
1599.I cap_usr_time
1600and
f2b1d720 1601.I cap_usr_rdpmc
135cba8b
VW
1602map to the same bit and thus both features should
1603be used with caution.
135cba8b 1604.TP
31c1f2b0 1605.IR cap_user_rdpmc " (since Linux 3.12)"
747a6e7c 1606.\" commit fa7315871046b9a4c48627905691dbde57e51033
f2b1d720
MK
1607If the hardware supports user-space read of performance counters
1608without syscall (this is the "rdpmc" instruction on x86), then
1609the following code can be used to do a read:
efeece04 1610.IP
f2b1d720 1611.in +4n
b8302363 1612.EX
f2b1d720
MK
1613u32 seq, time_mult, time_shift, idx, width;
1614u64 count, enabled, running;
1615u64 cyc, time_offset;
f2b1d720
MK
1616
1617do {
1618 seq = pc\->lock;
1619 barrier();
1620 enabled = pc\->time_enabled;
1621 running = pc\->time_running;
1622
1623 if (pc\->cap_user_time && enabled != running) {
1624 cyc = rdtsc();
1625 time_offset = pc\->time_offset;
1626 time_mult = pc\->time_mult;
1627 time_shift = pc\->time_shift;
1628 }
1629
1630 idx = pc\->index;
1631 count = pc\->offset;
1632
1633 if (pc\->cap_user_rdpmc && idx) {
1634 width = pc\->pmc_width;
135cba8b 1635 count += rdpmc(idx \- 1);
f2b1d720
MK
1636 }
1637
1638 barrier();
1639} while (pc\->lock != seq);
b8302363 1640.EE
f2b1d720 1641.in
f2b1d720 1642.TP
cc19ea28 1643.IR cap_user_time " (since Linux 3.12)"
747a6e7c 1644.\" commit fa7315871046b9a4c48627905691dbde57e51033
7d182bb6 1645This bit indicates the hardware has a constant, nonstop
135cba8b
VW
1646timestamp counter (TSC on x86).
1647.TP
31c1f2b0 1648.IR cap_user_time_zero " (since Linux 3.12)"
747a6e7c 1649.\" commit fa7315871046b9a4c48627905691dbde57e51033
135cba8b
VW
1650Indicates the presence of
1651.I time_zero
1652which allows mapping timestamp values to
1653the hardware clock.
1654.TP
f2b1d720
MK
1655.I pmc_width
1656If
1657.IR cap_user_rdpmc ,
1658this field provides the bit-width of the value
1659read using the rdpmc or equivalent instruction.
1660This can be used to sign extend the result like:
efeece04 1661.IP
f2b1d720 1662.in +4n
b8302363 1663.EX
f2b1d720
MK
1664pmc <<= 64 \- pmc_width;
1665pmc >>= 64 \- pmc_width; // signed shift right
1666count += pmc;
b8302363 1667.EE
f2b1d720 1668.in
f2b1d720
MK
1669.TP
1670.IR time_shift ", " time_mult ", " time_offset
efeece04 1671.IP
f2b1d720
MK
1672If
1673.IR cap_user_time ,
1674these fields can be used to compute the time
4b3a5f01
MK
1675delta since
1676.I time_enabled
1677(in nanoseconds) using rdtsc or similar.
408731d4 1678.IP
f2b1d720 1679.nf
f2b1d720
MK
1680 u64 quot, rem;
1681 u64 delta;
1682 quot = (cyc >> time_shift);
988688f6 1683 rem = cyc & (((u64)1 << time_shift) \- 1);
f2b1d720
MK
1684 delta = time_offset + quot * time_mult +
1685 ((rem * time_mult) >> time_shift);
1686.fi
efeece04 1687.IP
7db515ef
MK
1688Where
1689.IR time_offset ,
1690.IR time_mult ,
1691.IR time_shift ,
1692and
1693.IR cyc
1694are read in the
f2b1d720
MK
1695seqcount loop described above.
1696This delta can then be added to
1697enabled and possibly running (if idx), improving the scaling:
408731d4 1698.IP
f2b1d720 1699.nf
f2b1d720
MK
1700 enabled += delta;
1701 if (idx)
1702 running += delta;
1703 quot = count / running;
1704 rem = count % running;
1705 count = quot * enabled + (rem * enabled) / running;
1706.fi
f2b1d720 1707.TP
31c1f2b0 1708.IR time_zero " (since Linux 3.12)"
747a6e7c 1709.\" commit fa7315871046b9a4c48627905691dbde57e51033
efeece04 1710.IP
e9bd9b2c 1711If
135cba8b 1712.I cap_user_time_zero
37bee118 1713is set, then the hardware clock (the TSC timestamp counter on x86)
135cba8b
VW
1714can be calculated from the
1715.IR time_zero ", " time_mult ", and " time_shift " values:"
efeece04 1716.IP
135cba8b
VW
1717.nf
1718 time = timestamp \- time_zero;
1719 quot = time / time_mult;
1720 rem = time % time_mult;
1721 cyc = (quot << time_shift) + (rem << time_shift) / time_mult;
1722.fi
efeece04 1723.IP
135cba8b 1724And vice versa:
efeece04 1725.IP
135cba8b
VW
1726.nf
1727 quot = cyc >> time_shift;
988688f6 1728 rem = cyc & (((u64)1 << time_shift) \- 1);
135cba8b
VW
1729 timestamp = time_zero + quot * time_mult +
1730 ((rem * time_mult) >> time_shift);
1731.fi
1732.TP
f2b1d720
MK
1733.I data_head
1734This points to the head of the data section.
7db515ef
MK
1735The value continuously increases; it does not wrap.
1736The value needs to be manually wrapped by the size of the mmap buffer
f2b1d720 1737before accessing the samples.
efeece04 1738.IP
ce88f77b
MK
1739On SMP-capable platforms, after reading the
1740.I data_head
1741value,
ad73a2cc 1742user space should issue an rmb().
f2b1d720 1743.TP
fecd584f 1744.I data_tail
f2b1d720
MK
1745When the mapping is
1746.BR PROT_WRITE ,
7db515ef
MK
1747the
1748.I data_tail
1749value should be written by user space to reflect the last read data.
31020de9 1750In this case, the kernel will not overwrite unread data.
21d9849a
VW
1751.TP
1752.IR data_offset " (since Linux 4.1)"
1753.\" commit e8c6deac69629c0cb97c3d3272f8631ef17f8f0f
1754Contains the offset of the location in the mmap buffer
1755where perf sample data begins.
1756.TP
1757.IR data_size " (since Linux 4.1)"
1758.\" commit e8c6deac69629c0cb97c3d3272f8631ef17f8f0f
1759Contains the size of the perf sample region within
1760the mmap buffer.
4e47c6e5
VW
1761.TP
1762.IR aux_head ", " aux_tail ", " aux_offset ", " aux_size " (since Linux 4.1)"
1763.\" commit 45bfb2e50471abbbfd83d40d28c986078b0d24ff
95655a22
MK
1764The AUX region allows mmap-ing a separate sample buffer for
1765high-bandwidth data streams (separate from the main perf sample buffer).
1766An example of a high-bandwidth stream is instruction tracing support,
4e47c6e5 1767as is found in newer Intel processors.
efeece04 1768.IP
4e47c6e5
VW
1769To set up an AUX area, first
1770.I aux_offset
1771needs to be set with an offset greater than
1772.IR data_offset + data_size
1773and
1774.I aux_size
1775needs to be set to the desired buffer size.
1776The desired offset and size must be page aligned, and the size
1777must be a power of two.
1778These values are then passed to mmap in order to map the AUX buffer.
95655a22
MK
1779Pages in the AUX buffer are included as part of the
1780.BR RLIMIT_MEMLOCK
1781resource limit (see
1782.BR setrlimit (2)),
1783and also as part of the
4e47c6e5
VW
1784.I perf_event_mlock_kb
1785allowance.
efeece04 1786.IP
95655a22 1787By default, the AUX buffer will be truncated if it will not fit
b1355f6a
VW
1788in the available space in the ring buffer.
1789If the AUX buffer is mapped as a read only buffer, then it will
1790operate in ring buffer mode where old data will be overwritten
1791by new.
95655a22 1792In overwrite mode, it might not be possible to infer where the
b1355f6a
VW
1793new data began, and it is the consumer's job to disable
1794measurement while reading to avoid possible data races.
efeece04 1795.IP
4e47c6e5
VW
1796The
1797.IR aux_head " and " aux_tail
1798ring buffer pointers have the same behavior and ordering
1799rules as the previously described
1800.IR data_head " and " data_tail .
e525b89f 1801.PP
f2b1d720 1802The following 2^n ring-buffer pages have the layout described below.
efeece04 1803.PP
f2b1d720
MK
1804If
1805.I perf_event_attr.sample_id_all
1806is set, then all event types will
1807have the sample_type selected fields related to where/when (identity)
1808an event took place (TID, TIME, ID, CPU, STREAM_ID) described in
1809.B PERF_RECORD_SAMPLE
1810below, it will be stashed just after the
7db515ef
MK
1811.I perf_event_header
1812and the fields already present for the existing
3d1ee497 1813fields, that is, at the end of the payload.
4b3a5f01
MK
1814This allows a newer perf.data
1815file to be supported by older perf tools, with the new optional
f2b1d720 1816fields being ignored.
efeece04 1817.PP
f2b1d720 1818The mmap values start with a header:
efeece04 1819.PP
f2b1d720 1820.in +4n
b8302363 1821.EX
f2b1d720
MK
1822struct perf_event_header {
1823 __u32 type;
1824 __u16 misc;
1825 __u16 size;
1826};
b8302363 1827.EE
f2b1d720 1828.in
efeece04 1829.PP
f2b1d720
MK
1830Below, we describe the
1831.I perf_event_header
1832fields in more detail.
4047bc6c
MK
1833For ease of reading,
1834the fields with shorter descriptions are presented first.
1835.TP
1836.I size
1837This indicates the size of the record.
1838.TP
1839.I misc
1840The
1841.I misc
1842field contains additional information about the sample.
efeece04 1843.IP
4047bc6c
MK
1844The CPU mode can be determined from this value by masking with
1845.B PERF_RECORD_MISC_CPUMODE_MASK
1846and looking for one of the following (note these are not
1847bit masks, only one can be set at a time):
1848.RS
1849.TP
1850.B PERF_RECORD_MISC_CPUMODE_UNKNOWN
1851Unknown CPU mode.
1852.TP
1853.B PERF_RECORD_MISC_KERNEL
1854Sample happened in the kernel.
1855.TP
1856.B PERF_RECORD_MISC_USER
1857Sample happened in user code.
1858.TP
1859.B PERF_RECORD_MISC_HYPERVISOR
1860Sample happened in the hypervisor.
1861.TP
747a6e7c 1862.BR PERF_RECORD_MISC_GUEST_KERNEL " (since Linux 2.6.35)"
60dafbc1 1863.\" commit 39447b386c846bbf1c56f6403c5282837486200f
4047bc6c
MK
1864Sample happened in the guest kernel.
1865.TP
747a6e7c 1866.BR PERF_RECORD_MISC_GUEST_USER " (since Linux 2.6.35)"
60dafbc1 1867.\" commit 39447b386c846bbf1c56f6403c5282837486200f
4047bc6c
MK
1868Sample happened in guest user code.
1869.RE
efeece04 1870.PP
4047bc6c 1871.RS
d5a24378
MK
1872Since the following three statuses are generated by
1873different record types, they alias to the same bit:
4047bc6c 1874.TP
60dafbc1
MK
1875.BR PERF_RECORD_MISC_MMAP_DATA " (since Linux 3.10)"
1876.\" commit 2fe85427e3bf65d791700d065132772fc26e4d75
4047bc6c
MK
1877This is set when the mapping is not executable;
1878otherwise the mapping is executable.
1879.TP
60dafbc1
MK
1880.BR PERF_RECORD_MISC_COMM_EXEC " (since Linux 3.16)"
1881.\" commit 82b897782d10fcc4930c9d4a15b175348fdd2871
49bc411c
VW
1882This is set for a
1883.B PERF_RECORD_COMM
1884record on kernels more recent than Linux 3.16
1885if a process name change was caused by an
1886.BR exec (2)
1887system call.
9277a75d
VW
1888.TP
1889.BR PERF_RECORD_MISC_SWITCH_OUT " (since Linux 4.3)"
1890.\" commit 45ac1403f564f411c6a383a2448688ba8dd705a4
1891When a
d5a24378
MK
1892.BR PERF_RECORD_SWITCH
1893or
1894.BR PERF_RECORD_SWITCH_CPU_WIDE
1895record is generated, this bit indicates that the
9277a75d 1896context switch is away from the current process
d5a24378 1897(instead of into the current process).
9277a75d 1898.RE
efeece04 1899.PP
9277a75d
VW
1900.RS
1901In addition, the following bits can be set:
49bc411c 1902.TP
4047bc6c
MK
1903.B PERF_RECORD_MISC_EXACT_IP
1904This indicates that the content of
1905.B PERF_SAMPLE_IP
1906points
1907to the actual instruction that triggered the event.
1908See also
1909.IR perf_event_attr.precise_ip .
1910.TP
60dafbc1
MK
1911.BR PERF_RECORD_MISC_EXT_RESERVED " (since Linux 2.6.35)"
1912.\" commit 1676b8a077c352085d52578fb4f29350b58b6e74
4047bc6c 1913This indicates there is extended data available (currently not used).
ffbc7c02
VW
1914.TP
1915.B PERF_RECORD_MISC_PROC_MAP_PARSE_TIMEOUT
1916.\" commit 930e6fcd2bcce9bcd9d4aa7e755678d33f3fe6f4
1917This bit is not set by the kernel.
141efa1b
MK
1918It is reserved for the user-space perf utility to indicate that
1919.I /proc/[pid]/maps
1920parsing was taking too long and was stopped, and thus the mmap
ffbc7c02 1921records may be truncated.
4047bc6c 1922.RE
f2b1d720
MK
1923.TP
1924.I type
1925The
1926.I type
1927value is one of the below.
1928The values in the corresponding record (that follows the header)
1929depend on the
1930.I type
1931selected as shown.
f2b1d720 1932.RS
7db515ef 1933.TP 4
f2b1d720
MK
1934.B PERF_RECORD_MMAP
1935The MMAP events record the
1936.B PROT_EXEC
1937mappings so that we can correlate
ad73a2cc 1938user-space IPs to code.
f2b1d720 1939They have the following structure:
efeece04 1940.IP
f2b1d720 1941.in +4n
b8302363 1942.EX
f2b1d720
MK
1943struct {
1944 struct perf_event_header header;
1945 u32 pid, tid;
1946 u64 addr;
1947 u64 len;
1948 u64 pgoff;
1949 char filename[];
1950};
b8302363 1951.EE
f2b1d720 1952.in
9bfc542b
VW
1953.RS
1954.TP
1955.I pid
3a058284 1956is the process ID.
9bfc542b
VW
1957.TP
1958.I tid
3a058284 1959is the thread ID.
9bfc542b
VW
1960.TP
1961.I addr
1962is the address of the allocated memory.
.TP
1963.I len
1964is the length of the allocated memory.
.TP
1965.I pgoff
1966is the page offset of the allocated memory.
.TP
1967.I filename
1968is a string describing the backing of the allocated memory.
1969.RE
f2b1d720
MK
1970.TP
1971.B PERF_RECORD_LOST
1972This record indicates when events are lost.
efeece04 1973.IP
f2b1d720 1974.in +4n
b8302363 1975.EX
f2b1d720
MK
1976struct {
1977 struct perf_event_header header;
7a10da70
MK
1978 u64 id;
1979 u64 lost;
7480dabb 1980 struct sample_id sample_id;
f2b1d720 1981};
b8302363 1982.EE
f2b1d720 1983.in
f2b1d720
MK
1984.RS
1985.TP
1986.I id
1987is the unique event ID for the samples that were lost.
1988.TP
1989.I lost
1990is the number of events that were lost.
1991.RE
f2b1d720
MK
1992.TP
1993.B PERF_RECORD_COMM
1994This record indicates a change in the process name.
efeece04 1995.IP
f2b1d720 1996.in +4n
b8302363 1997.EX
f2b1d720
MK
1998struct {
1999 struct perf_event_header header;
7a10da70
MK
2000 u32 pid;
2001 u32 tid;
2002 char comm[];
7480dabb 2003 struct sample_id sample_id;
f2b1d720 2004};
b8302363 2005.EE
f2b1d720 2006.in
49bc411c
VW
2007.RS
2008.TP
2009.I pid
5ab35ae5 2010is the process ID.
49bc411c
VW
2011.TP
2012.I tid
5ab35ae5 2013is the thread ID.
49bc411c
VW
2014.TP
2015.I comm
2016is a string containing the new name of the process.
2017.RE
f2b1d720
MK
2018.TP
2019.B PERF_RECORD_EXIT
2020This record indicates a process exit event.
efeece04 2021.IP
f2b1d720 2022.in +4n
b8302363 2023.EX
f2b1d720
MK
2024struct {
2025 struct perf_event_header header;
7a10da70
MK
2026 u32 pid, ppid;
2027 u32 tid, ptid;
2028 u64 time;
7480dabb 2029 struct sample_id sample_id;
f2b1d720 2030};
b8302363 2031.EE
f2b1d720 2032.in
f2b1d720
MK
2033.TP
2034.BR PERF_RECORD_THROTTLE ", " PERF_RECORD_UNTHROTTLE
2035This record indicates a throttle/unthrottle event.
efeece04 2036.IP
f2b1d720 2037.in +4n
b8302363 2038.EX
f2b1d720
MK
2039struct {
2040 struct perf_event_header header;
7a10da70
MK
2041 u64 time;
2042 u64 id;
2043 u64 stream_id;
7480dabb 2044 struct sample_id sample_id;
f2b1d720 2045};
b8302363 2046.EE
f2b1d720 2047.in
f2b1d720
MK
2048.TP
2049.B PERF_RECORD_FORK
2050This record indicates a fork event.
efeece04 2051.IP
f2b1d720 2052.in +4n
b8302363 2053.EX
f2b1d720
MK
2054struct {
2055 struct perf_event_header header;
7a10da70
MK
2056 u32 pid, ppid;
2057 u32 tid, ptid;
2058 u64 time;
7480dabb 2059 struct sample_id sample_id;
f2b1d720 2060};
b8302363 2061.EE
f2b1d720 2062.in
f2b1d720
MK
2063.TP
2064.B PERF_RECORD_READ
2065This record indicates a read event.
efeece04 2066.IP
f2b1d720 2067.in +4n
b8302363 2068.EX
f2b1d720
MK
2069struct {
2070 struct perf_event_header header;
7a10da70 2071 u32 pid, tid;
f2b1d720 2072 struct read_format values;
7480dabb 2073 struct sample_id sample_id;
f2b1d720 2074};
b8302363 2075.EE
f2b1d720 2076.in
f2b1d720
MK
2077.TP
2078.B PERF_RECORD_SAMPLE
2079This record indicates a sample.
efeece04 2080.IP
f2b1d720 2081.in +4n
b8302363 2082.EX
f2b1d720
MK
2083struct {
2084 struct perf_event_header header;
f96e6174
MK
2085 u64 sample_id; /* if PERF_SAMPLE_IDENTIFIER */
2086 u64 ip; /* if PERF_SAMPLE_IP */
2087 u32 pid, tid; /* if PERF_SAMPLE_TID */
2088 u64 time; /* if PERF_SAMPLE_TIME */
2089 u64 addr; /* if PERF_SAMPLE_ADDR */
2090 u64 id; /* if PERF_SAMPLE_ID */
2091 u64 stream_id; /* if PERF_SAMPLE_STREAM_ID */
2092 u32 cpu, res; /* if PERF_SAMPLE_CPU */
2093 u64 period; /* if PERF_SAMPLE_PERIOD */
2094 struct read_format v;
2095 /* if PERF_SAMPLE_READ */
2096 u64 nr; /* if PERF_SAMPLE_CALLCHAIN */
2097 u64 ips[nr]; /* if PERF_SAMPLE_CALLCHAIN */
2098 u32 size; /* if PERF_SAMPLE_RAW */
2099 char data[size]; /* if PERF_SAMPLE_RAW */
2100 u64 bnr; /* if PERF_SAMPLE_BRANCH_STACK */
7db515ef 2101 struct perf_branch_entry lbr[bnr];
f96e6174
MK
2102 /* if PERF_SAMPLE_BRANCH_STACK */
2103 u64 abi; /* if PERF_SAMPLE_REGS_USER */
7a10da70 2104 u64 regs[weight(mask)];
f96e6174
MK
2105 /* if PERF_SAMPLE_REGS_USER */
2106 u64 size; /* if PERF_SAMPLE_STACK_USER */
2107 char data[size]; /* if PERF_SAMPLE_STACK_USER */
2108 u64 dyn_size; /* if PERF_SAMPLE_STACK_USER &&
2109 size != 0 */
2110 u64 weight; /* if PERF_SAMPLE_WEIGHT */
2111 u64 data_src; /* if PERF_SAMPLE_DATA_SRC */
2112 u64 transaction; /* if PERF_SAMPLE_TRANSACTION */
2113 u64 abi; /* if PERF_SAMPLE_REGS_INTR */
7a10da70 2114 u64 regs[weight(mask)];
f96e6174 2115 /* if PERF_SAMPLE_REGS_INTR */
f2b1d720 2116};
ba4924aa 2117.EE
4047bc6c
MK
2118.RS 4
2119.TP 4
7480dabb
VW
2120.I sample_id
2121If
2122.B PERF_SAMPLE_IDENTIFIER
2123is enabled, a 64-bit unique ID is included.
e9bd9b2c 2124This is a duplication of the
7480dabb
VW
2125.B PERF_SAMPLE_ID
2126.I id
2127value, but included at the beginning of the sample
2128so parsers can easily obtain the value.
2129.TP
f2b1d720 2130.I ip
7db515ef
MK
2131If
2132.B PERF_SAMPLE_IP
2133is enabled, then a 64-bit instruction
f2b1d720 2134pointer value is included.
f2b1d720 2135.TP
7db515ef
MK
2136.IR pid ", " tid
2137If
2138.B PERF_SAMPLE_TID
2139is enabled, then a 32-bit process ID
2140and 32-bit thread ID are included.
f2b1d720
MK
2141.TP
2142.I time
7db515ef
MK
2143If
2144.B PERF_SAMPLE_TIME
2145is enabled, then a 64-bit timestamp
f2b1d720
MK
2146is included.
2147This is obtained via local_clock() which is a hardware timestamp
2148if available and the jiffies value if not.
f2b1d720
MK
2149.TP
2150.I addr
7db515ef
MK
2151If
2152.B PERF_SAMPLE_ADDR
2153is enabled, then a 64-bit address is included.
f2b1d720
MK
2154This is usually the address of a tracepoint,
2155breakpoint, or software event; otherwise the value is 0.
f2b1d720
MK
2156.TP
2157.I id
7db515ef
MK
2158If
2159.B PERF_SAMPLE_ID
2160is enabled, a 64-bit unique ID is included.
f2b1d720 2161If the event is a member of an event group, the group leader ID is returned.
7db515ef
MK
2162This ID is the same as the one returned by
2163.BR PERF_FORMAT_ID .
f2b1d720
MK
2164.TP
2165.I stream_id
7db515ef
MK
2166If
2167.B PERF_SAMPLE_STREAM_ID
2168is enabled, a 64-bit unique ID is included.
f2b1d720
MK
2169Unlike
2170.B PERF_SAMPLE_ID
2171the actual ID is returned, not the group leader.
7db515ef
MK
2172This ID is the same as the one returned by
2173.BR PERF_FORMAT_ID .
f2b1d720 2174.TP
7db515ef
MK
2175.IR cpu ", " res
2176If
2177.B PERF_SAMPLE_CPU
2178is enabled, this is a 32-bit value indicating
f2b1d720
MK
2179which CPU was being used, in addition to a reserved (unused)
218032-bit value.
f2b1d720
MK
2181.TP
2182.I period
7db515ef
MK
2183If
2184.B PERF_SAMPLE_PERIOD
2185is enabled, a 64-bit value indicating
f2b1d720 2186the current sampling period is written.
f2b1d720
MK
2187.TP
2188.I v
7db515ef
MK
2189If
2190.B PERF_SAMPLE_READ
2191is enabled, a structure of type read_format
f2b1d720
MK
2192is included which has values for all events in the event group.
2193The values included depend on the
2194.I read_format
7db515ef
MK
2195value used at
2196.BR perf_event_open ()
2197time.
f2b1d720 2198.TP
7db515ef
MK
2199.IR nr ", " ips[nr]
2200If
2201.B PERF_SAMPLE_CALLCHAIN
2202is enabled, then a 64-bit number is included
f2b1d720 2203which indicates how many 64-bit instruction pointers will
7db515ef
MK
2204follow.
2205This is the current callchain.
f2b1d720 2206.TP
7ede2f66 2207.IR size ", " data[size]
7db515ef
MK
2208If
2209.B PERF_SAMPLE_RAW
2210is enabled, then a 32-bit value indicating size
f2b1d720
MK
2211is included followed by an array of 8-bit values of length size.
2212The values are padded with 0 to have 64-bit alignment.
efeece04 2213.IP
f2b1d720
MK
2214This RAW record data is opaque with respect to the ABI.
2215The ABI doesn't make any promises with respect to the stability
2216of its content; it may vary depending
2217on event, hardware, and kernel version.
f2b1d720 2218.TP
7db515ef
MK
2219.IR bnr ", " lbr[bnr]
2220If
2221.B PERF_SAMPLE_BRANCH_STACK
2222is enabled, then a 64-bit value indicating
2223the number of records is included, followed by
2224.I bnr
2225.I perf_branch_entry
045bf4d3
VW
2226structures which each include the fields:
2227.RS
2228.TP
2229.I from
2b538c3e 2230This indicates the source instruction (may not be a branch).
045bf4d3
VW
2231.TP
2232.I to
2b538c3e 2233The branch target.
045bf4d3
VW
2234.TP
2235.I mispred
2b538c3e 2236The branch target was mispredicted.
045bf4d3
VW
2237.TP
2238.I predicted
2b538c3e 2239The branch target was predicted.
e3c9782b 2240.TP
31c1f2b0 2241.IR in_tx " (since Linux 3.11)"
747a6e7c 2242.\" commit 135c5612c460f89657c4698fe2ea753f6f667963
2b538c3e 2243The branch was in a transactional memory transaction.
e3c9782b 2244.TP
31c1f2b0 2245.IR abort " (since Linux 3.11)"
747a6e7c 2246.\" commit 135c5612c460f89657c4698fe2ea753f6f667963
2b538c3e 2247The branch was in an aborted transactional memory transaction.
96919592
VW
2248.TP
2249.IR cycles " (since Linux 4.3)"
2250.\" commit 71ef3c6b9d4665ee7afbbe4c208a98917dcfc32f
2251This reports the number of cycles elapsed since the
2252previous branch stack update.
11ac5b51 2253.PP
045bf4d3
VW
2254The entries are from most to least recent, so the first entry
2255has the most recent branch.
efeece04 2256.PP
8a94e783 2257Support for
dceb9af6
MK
2258.IR mispred ,
2259.IR predicted ,
2260and
2261.IR cycles
96919592 2262is optional; if not supported, those
045bf4d3 2263values will be 0.
efeece04 2264.PP
e3c9782b
VW
2265The type of branches recorded is specified by the
2266.I branch_sample_type
2267field.
2268.RE
f2b1d720 2269.TP
7db515ef
MK
2270.IR abi ", " regs[weight(mask)]
2271If
2272.B PERF_SAMPLE_REGS_USER
d1007d14 2273is enabled, then the user CPU registers are recorded.
efeece04 2274.IP
f2b1d720
MK
2275The
2276.I abi
2277field is one of
2278.BR PERF_SAMPLE_REGS_ABI_NONE ", " PERF_SAMPLE_REGS_ABI_32 " or "
7db515ef 2279.BR PERF_SAMPLE_REGS_ABI_64 .
efeece04 2280.IP
d1007d14
VW
2281The
2282.I regs
2283field is an array of the CPU registers that were specified by
2284the
2285.I sample_regs_user
2286attr field.
2287The number of values is the number of bits set in the
51700fd7 2288.I sample_regs_user
4651e412 2289bit mask.
f2b1d720 2290.TP
7db515ef
MK
2291.IR size ", " data[size] ", " dyn_size
2292If
2293.B PERF_SAMPLE_STACK_USER
02ca78a0
VW
2294is enabled, then the user stack is recorded.
2295This can be used to generate stack backtraces.
d1007d14
VW
2296.I size
2297is the size requested by the user in
02ca78a0 2298.I sample_stack_user
d1007d14
VW
2299or else the maximum record size.
2300.I data
02ca78a0
VW
2301is the stack data (a raw dump of the memory pointed to by the
2302stack pointer at the time of sampling).
d1007d14
VW
2303.I dyn_size
2304is the amount of data actually dumped (can be less than
460e3d7a 2305.IR size ).
4dc411dd
KF
2306Note that
2307.I dyn_size
2308is omitted if
2309.I size
2310is 0.
d1007d14 2311.TP
51700fd7 2312.I weight
d1007d14
VW
2313If
2314.B PERF_SAMPLE_WEIGHT
7de4a1e3 2315is enabled, then a 64-bit value provided by the hardware
d1007d14
VW
2316is recorded that indicates how costly the event was.
2317This allows expensive events to stand out more clearly
2318in profiles.
2319.TP
2320.I data_src
51700fd7 2321If
d1007d14 2322.B PERF_SAMPLE_DATA_SRC
7de4a1e3 2323is enabled, then a 64-bit value is recorded that is made up of
d1007d14
VW
2324the following fields:
2325.RS
2b538c3e 2326.TP 4
d1007d14 2327.I mem_op
2b538c3e 2328Type of opcode, a bitwise combination of:
efeece04 2329.IP
2b538c3e
MK
2330.PD 0
2331.RS
2332.TP 24
d1007d14 2333.B PERF_MEM_OP_NA
2b538c3e
MK
2334Not available
2335.TP
d1007d14 2336.B PERF_MEM_OP_LOAD
2b538c3e
MK
2337Load instruction
2338.TP
d1007d14 2339.B PERF_MEM_OP_STORE
2b538c3e
MK
2340Store instruction
2341.TP
d1007d14 2342.B PERF_MEM_OP_PFETCH
2b538c3e
MK
2343Prefetch
2344.TP
d1007d14 2345.B PERF_MEM_OP_EXEC
2b538c3e
MK
2346Executable code
2347.RE
2348.PD
d1007d14
VW
2349.TP
2350.I mem_lvl
bc9d90b5 2351Memory hierarchy level hit or miss, a bitwise combination of
ef4f4031 2352the following, shifted left by
bc9d90b5 2353.BR PERF_MEM_LVL_SHIFT :
efeece04 2354.IP
2b538c3e
MK
2355.PD 0
2356.RS
2357.TP 24
d1007d14 2358.B PERF_MEM_LVL_NA
2b538c3e
MK
2359Not available
2360.TP
d1007d14 2361.B PERF_MEM_LVL_HIT
2b538c3e
MK
2362Hit
2363.TP
d1007d14 2364.B PERF_MEM_LVL_MISS
2b538c3e
MK
2365Miss
2366.TP
d1007d14 2367.B PERF_MEM_LVL_L1
2b538c3e
MK
2368Level 1 cache
2369.TP
d1007d14 2370.B PERF_MEM_LVL_LFB
2b538c3e
MK
2371Line fill buffer
2372.TP
d1007d14 2373.B PERF_MEM_LVL_L2
2b538c3e
MK
2374Level 2 cache
2375.TP
d1007d14 2376.B PERF_MEM_LVL_L3
2b538c3e
MK
2377Level 3 cache
2378.TP
d1007d14 2379.B PERF_MEM_LVL_LOC_RAM
2b538c3e
MK
2380Local DRAM
2381.TP
d1007d14 2382.B PERF_MEM_LVL_REM_RAM1
2b538c3e
MK
2383Remote DRAM 1 hop
2384.TP
d1007d14 2385.B PERF_MEM_LVL_REM_RAM2
2b538c3e
MK
2386Remote DRAM 2 hops
2387.TP
d1007d14 2388.B PERF_MEM_LVL_REM_CCE1
2b538c3e
MK
2389Remote cache 1 hop
2390.TP
d1007d14 2391.B PERF_MEM_LVL_REM_CCE2
2b538c3e
MK
2392Remote cache 2 hops
2393.TP
d1007d14 2394.B PERF_MEM_LVL_IO
2b538c3e
MK
2395I/O memory
2396.TP
d1007d14 2397.B PERF_MEM_LVL_UNC
2b538c3e
MK
2398Uncached memory
2399.RE
2400.PD
d1007d14
VW
2401.TP
2402.I mem_snoop
bc9d90b5
VW
2403Snoop mode, a bitwise combination of the following, shifted left by
2404.BR PERF_MEM_SNOOP_SHIFT :
efeece04 2405.IP
2b538c3e
MK
2406.PD 0
2407.RS
2408.TP 24
d1007d14 2409.B PERF_MEM_SNOOP_NA
2b538c3e
MK
2410Not available
2411.TP
d1007d14 2412.B PERF_MEM_SNOOP_NONE
2b538c3e
MK
2413No snoop
2414.TP
d1007d14 2415.B PERF_MEM_SNOOP_HIT
2b538c3e
MK
2416Snoop hit
2417.TP
d1007d14 2418.B PERF_MEM_SNOOP_MISS
2b538c3e
MK
2419Snoop miss
2420.TP
d1007d14 2421.B PERF_MEM_SNOOP_HITM
2b538c3e
MK
2422Snoop hit modified
2423.RE
2424.PD
d1007d14
VW
2425.TP
2426.I mem_lock
bc9d90b5
VW
2427Lock instruction, a bitwise combination of the following, shifted left by
2428.BR PERF_MEM_LOCK_SHIFT :
efeece04 2429.IP
2b538c3e
MK
2430.PD 0
2431.RS
2432.TP 24
d1007d14 2433.B PERF_MEM_LOCK_NA
2b538c3e
MK
2434Not available
2435.TP
d1007d14 2436.B PERF_MEM_LOCK_LOCKED
2b538c3e
MK
2437Locked transaction
2438.RE
2439.PD
d1007d14
VW
2440.TP
2441.I mem_dtlb
bc9d90b5
VW
2442TLB access hit or miss, a bitwise combination of the following, shifted
2443left by
2444.BR PERF_MEM_TLB_SHIFT :
efeece04 2445.IP
2b538c3e
MK
2446.PD 0
2447.RS
2448.TP 24
d1007d14 2449.B PERF_MEM_TLB_NA
2b538c3e
MK
2450Not available
2451.TP
d1007d14 2452.B PERF_MEM_TLB_HIT
2b538c3e
MK
2453Hit
2454.TP
d1007d14 2455.B PERF_MEM_TLB_MISS
2b538c3e
MK
2456Miss
2457.TP
d1007d14 2458.B PERF_MEM_TLB_L1
2b538c3e
MK
2459Level 1 TLB
2460.TP
d1007d14 2461.B PERF_MEM_TLB_L2
2b538c3e
MK
2462Level 2 TLB
2463.TP
d1007d14 2464.B PERF_MEM_TLB_WK
2b538c3e
MK
2465Hardware walker
2466.TP
d1007d14 2467.B PERF_MEM_TLB_OS
2b538c3e
MK
2468OS fault handler
2469.RE
2470.PD
d1007d14 2471.RE
1e043959
VW
2472.TP
2473.I transaction
2474If the
2475.B PERF_SAMPLE_TRANSACTION
37bee118 2476flag is set, then a 64-bit field is recorded describing
1e043959 2477the sources of any transactional memory aborts.
efeece04 2478.IP
1e043959
VW
2479The field is a bitwise combination of the following values:
2480.RS
2481.TP
2482.B PERF_TXN_ELISION
b3f39642 2483Abort from an elision type transaction (Intel-CPU-specific).
1e043959
VW
2484.TP
2485.B PERF_TXN_TRANSACTION
b3f39642 2486Abort from a generic transaction.
1e043959
VW
2487.TP
2488.B PERF_TXN_SYNC
b3f39642 2489Synchronous abort (related to the reported instruction).
1e043959
VW
2490.TP
2491.B PERF_TXN_ASYNC
b3f39642 2492Asynchronous abort (not related to the reported instruction).
1e043959
VW
2493.TP
2494.B PERF_TXN_RETRY
053a3e08 2495Retryable abort (retrying the transaction may have succeeded).
1e043959
VW
2496.TP
2497.B PERF_TXN_CONFLICT
b3f39642 2498Abort due to memory conflicts with other threads.
1e043959
VW
2499.TP
2500.B PERF_TXN_CAPACITY_WRITE
b3f39642 2501Abort due to write capacity overflow.
1e043959
VW
2502.TP
2503.B PERF_TXN_CAPACITY_READ
b3f39642 2504Abort due to read capacity overflow.
1e043959 2505.RE
b3f39642
MK
2506.IP
2507In addition, a user-specified abort code can be obtained from
2508the high 32 bits of the field by shifting right by
1e043959 2509.B PERF_TXN_ABORT_SHIFT
4b3a5f01 2510and masking with the value
1e043959 2511.BR PERF_TXN_ABORT_MASK .
f5281dfd
VW
2512.TP
2513.IR abi ", " regs[weight(mask)]
2514If
2515.B PERF_SAMPLE_REGS_INTR
2516is enabled, then the user CPU registers are recorded.
efeece04 2517.IP
f5281dfd
VW
2518The
2519.I abi
2520field is one of
4b3a5f01
MK
2521.BR PERF_SAMPLE_REGS_ABI_NONE ,
2522.BR PERF_SAMPLE_REGS_ABI_32 ,
2523or
f5281dfd 2524.BR PERF_SAMPLE_REGS_ABI_64 .
efeece04 2525.IP
f5281dfd
VW
2526The
2527.I regs
2528field is an array of the CPU registers that were specified by
2529the
2530.I sample_regs_intr
2531attr field.
2532The number of values is the number of bits set in the
2533.I sample_regs_intr
2534bit mask.
f2b1d720 2535.RE
9bfc542b
VW
2536.TP
2537.B PERF_RECORD_MMAP2
2538This record includes extended information on
2539.BR mmap (2)
2540calls returning executable mappings.
2541The format is similar to that of the
2542.B PERF_RECORD_MMAP
3a058284 2543record, but includes extra values that allow uniquely identifying
9bfc542b 2544shared mappings.
efeece04 2545.IP
9bfc542b 2546.in +4n
b8302363 2547.EX
9bfc542b
VW
2548struct {
2549 struct perf_event_header header;
7a10da70
MK
2550 u32 pid;
2551 u32 tid;
2552 u64 addr;
2553 u64 len;
2554 u64 pgoff;
2555 u32 maj;
2556 u32 min;
2557 u64 ino;
2558 u64 ino_generation;
2559 u32 prot;
2560 u32 flags;
2561 char filename[];
9bfc542b
VW
2562 struct sample_id sample_id;
2563};
ba4924aa 2564.EE
9bfc542b
VW
2565.RS
2566.TP
2567.I pid
3a058284 2568is the process ID.
9bfc542b
VW
2569.TP
2570.I tid
3a058284 2571is the thread ID.
9bfc542b
VW
2572.TP
2573.I addr
2574is the address of the allocated memory.
2575.TP
2576.I len
2577is the length of the allocated memory.
2578.TP
2579.I pgoff
2580is the page offset of the allocated memory.
2581.TP
2582.I maj
3a058284 2583is the major ID of the underlying device.
9bfc542b
VW
2584.TP
2585.I min
3a058284 2586is the minor ID of the underlying device.
9bfc542b
VW
2587.TP
2588.I ino
3a058284 2589is the inode number.
9bfc542b
VW
2590.TP
2591.I ino_generation
2592is the inode generation.
2593.TP
2594.I prot
2595is the protection information.
2596.TP
2597.I flags
2598is the flags information.
2599.TP
2600.I filename
2601is a string describing the backing of the allocated memory.
2602.RE
1fda209c
VW
2603.TP
2604.BR PERF_RECORD_AUX " (since Linux 4.1)"
2605.\" commit 68db7e98c3a6ebe7284b6cf14906ed7c55f3f7f0
2606This record reports that new data is available in the separate
2607AUX buffer region.
efeece04 2608.IP
1fda209c 2609.in +4n
b8302363 2610.EX
1fda209c
VW
2611struct {
2612 struct perf_event_header header;
7a10da70
MK
2613 u64 aux_offset;
2614 u64 aux_size;
2615 u64 flags;
1fda209c
VW
2616 struct sample_id sample_id;
2617};
ba4924aa 2618.EE
1fda209c
VW
2619.RS
2620.TP
2621.I aux_offset
2622offset in the AUX mmap region where the new data begins.
2623.TP
2624.I aux_size
2625size of the data made available.
2626.TP
2627.I flags
95655a22 2628describes the AUX update.
1fda209c
VW
2629.RS
2630.TP
2631.B PERF_AUX_FLAG_TRUNCATED
95655a22 2632if set, then the data returned was truncated to fit the available
1fda209c 2633buffer size.
b1355f6a
VW
2634.TP
2635.B PERF_AUX_FLAG_OVERWRITE
2636.\" commit 2023a0d2829e521fe6ad6b9907f3f90bfbf57142
95655a22 2637if set, then the data returned has overwritten previous data.
1fda209c
VW
2638.RE
2639.RE
6932aac3
VW
2640.TP
2641.BR PERF_RECORD_ITRACE_START " (since Linux 4.1)"
2642.\" ec0d7729bbaed4b9d2d3fada693278e13a3d1368
2643This record indicates which process has initiated an instruction
2644trace event, allowing tools to properly correlate the instruction
2645addresses in the AUX buffer with the proper executable.
efeece04 2646.IP
6932aac3 2647.in +4n
b8302363 2648.EX
6932aac3
VW
2649struct {
2650 struct perf_event_header header;
7a10da70
MK
2651 u32 pid;
2652 u32 tid;
6932aac3 2653};
ba4924aa 2654.EE
6932aac3
VW
2655.RS
2656.TP
2657.I pid
95655a22 2658process ID of the thread starting an instruction trace.
6932aac3
VW
2659.TP
2660.I tid
95655a22 2661thread ID of the thread starting an instruction trace.
6932aac3 2662.RE
46012ba3
DH
2663.TP
2664.BR PERF_RECORD_LOST_SAMPLES " (since Linux 4.2)"
2665.\" f38b0dbb491a6987e198aa6b428db8692a6480f8
2666When using hardware sampling (such as Intel PEBS) this record
4199d3a1 2667indicates some number of samples that may have been lost.
efeece04 2668.IP
46012ba3 2669.in +4n
b8302363 2670.EX
46012ba3
DH
2671struct {
2672 struct perf_event_header header;
7a10da70 2673 u64 lost;
46012ba3
DH
2674 struct sample_id sample_id;
2675};
ba4924aa 2676.EE
46012ba3
DH
2677.RS
2678.TP
2679.I lost
2680the number of potentially lost samples.
2681.RE
9277a75d
VW
2682.TP
2683.BR PERF_RECORD_SWITCH " (since Linux 4.3)"
2684.\" commit 45ac1403f564f411c6a383a2448688ba8dd705a4
2685This record indicates a context switch has happened.
2686The
2687.B PERF_RECORD_MISC_SWITCH_OUT
2688bit in the
2689.I misc
2690field indicates whether it was a context switch into
2691or away from the current process.
efeece04 2692.IP
9277a75d 2693.in +4n
b8302363 2694.EX
9277a75d
VW
2695struct {
2696 struct perf_event_header header;
2697 struct sample_id sample_id;
2698};
ba4924aa 2699.EE
9277a75d
VW
2700.TP
2701.BR PERF_RECORD_SWITCH_CPU_WIDE " (since Linux 4.3)"
2702.\" commit 45ac1403f564f411c6a383a2448688ba8dd705a4
2703As with
2704.B PERF_RECORD_SWITCH
2705this record indicates a context switch has happened,
d5a24378 2706but it only occurs when sampling in CPU-wide mode
9277a75d
VW
2707and provides additional information on the process
2708being switched to/from.
2709The
2710.B PERF_RECORD_MISC_SWITCH_OUT
2711bit in the
2712.I misc
2713field indicates whether it was a context switch into
2714or away from the current process.
efeece04 2715.IP
9277a75d 2716.in +4n
b8302363 2717.EX
9277a75d
VW
2718struct {
2719 struct perf_event_header header;
2720 u32 next_prev_pid;
2721 u32 next_prev_tid;
2722 struct sample_id sample_id;
2723};
ba4924aa 2724.EE
9277a75d
VW
2725.RS
2726.TP
2727.I next_prev_pid
d5a24378 2728The process ID of the previous (if switching in)
9277a75d
VW
2729or next (if switching out) process on the CPU.
2730.TP
2731.I next_prev_tid
d5a24378 2732The thread ID of the previous (if switching in)
9277a75d
VW
2733or next (if switching out) thread on the CPU.
2734.RE
f2b1d720 2735.RE
21977c9d
VW
2736.SS Overflow handling
2737Events can be set to notify when a threshold is crossed,
2738indicating an overflow.
2739Overflow conditions can be captured by monitoring the
2740event file descriptor with
f2b1d720
MK
2741.BR poll (2),
2742.BR select (2),
21977c9d 2743or
4b3a5f01 2744.BR epoll (7).
6831ba6b
MK
2745Alternatively, the overflow events can be captured via a signal handler,
2746by enabling I/O signaling on the file descriptor; see the discussion of the
fc79d996 2747.BR F_SETOWN
6831ba6b
MK
2748and
2749.BR F_SETSIG
2750operations in
2751.BR fcntl (2).
efeece04 2752.PP
6170255e 2753Overflows are generated only by sampling events
f2b1d720 2754.RI ( sample_period
7d182bb6 2755must have a nonzero value).
efeece04 2756.PP
21977c9d 2757There are two ways to generate overflow notifications.
efeece04 2758.PP
f2b1d720
MK
2759The first is to set a
2760.I wakeup_events
2761or
2762.I wakeup_watermark
21977c9d 2763value that will trigger if a certain number of samples
f2b1d720 2764or bytes have been written to the mmap ring buffer.
fc79d996 2765In this case,
7db515ef 2766.B POLL_IN
21977c9d 2767is indicated.
efeece04 2768.PP
f2b1d720 2769The other way is by use of the
7db515ef 2770.B PERF_EVENT_IOC_REFRESH
f2b1d720
MK
2771ioctl.
2772This ioctl adds to a counter that decrements each time the event overflows.
21977c9d 2773When nonzero,
7db515ef 2774.B POLL_IN
21977c9d
VW
2775is indicated, but
2776once the counter reaches 0
7db515ef 2777.B POLL_HUP
21977c9d 2778is indicated and
f2b1d720 2779the underlying event is disabled.
efeece04 2780.PP
50e4319c
VW
2781Refreshing an event group leader refreshes all siblings and
2782refreshing with a parameter of 0 currently enables infinite
2783refreshes;
2784these behaviors are unsupported and should not be relied on.
2785.\" See https://lkml.org/lkml/2011/5/24/337
efeece04 2786.PP
4010bc07 2787Starting with Linux 3.18,
747a6e7c 2788.\" commit 179033b3e064d2cd3f5f9945e76b0a0f0fbf4883
21977c9d
VW
2789.B POLL_HUP
2790is indicated if the event being monitored is attached to a different
2791process and that process exits.
73d8cece 2792.SS rdpmc instruction
f2b1d720 2793Starting with Linux 3.4 on x86, you can use the
747a6e7c 2794.\" commit c7206205d00ab375839bd6c7ddb247d600693c09
f2b1d720
MK
2795.I rdpmc
2796instruction to get low-latency reads without having to enter the kernel.
2797Note that using
2798.I rdpmc
2799is not necessarily faster than other methods for reading event values.
efeece04 2800.PP
f2b1d720
MK
2801Support for this can be detected with the
2802.I cap_usr_rdpmc
2803field in the mmap page; documentation on how
2804to calculate event values can be found in that section.
efeece04 2805.PP
562c69f6
VW
2806Originally, when rdpmc support was enabled, any process (not just ones
2807with an active perf event) could use the rdpmc instruction to access
2808the counters.
fc79d996 2809Starting with Linux 4.0,
562c69f6
VW
2810.\" 7911d3f7af14a614617e38245fedf98a724e46a9
2811rdpmc support is only allowed if an event is currently enabled
95655a22 2812in a process's context.
562c69f6
VW
2813To restore the old behavior, write the value 2 to
2814.IR /sys/devices/cpu/rdpmc .
73d8cece 2815.SS perf_event ioctl calls
f2b1d720
MK
2816.PP
2817Various ioctls act on
7db515ef 2818.BR perf_event_open ()
ce88f77b 2819file descriptors:
f2b1d720
MK
2820.TP
2821.B PERF_EVENT_IOC_ENABLE
ce88f77b 2822This enables the individual event or event group specified by the
7db515ef 2823file descriptor argument.
efeece04 2824.IP
51700fd7 2825If the
8cc8b90d 2826.B PERF_IOC_FLAG_GROUP
51700fd7 2827bit is set in the ioctl argument, then all events in a group are
dbc01ecd
VW
2828enabled, even if the event specified is not the group leader
2829(but see BUGS).
f2b1d720
MK
2830.TP
2831.B PERF_EVENT_IOC_DISABLE
ce88f77b 2832This disables the individual counter or event group specified by the
7db515ef 2833file descriptor argument.
efeece04 2834.IP
f2b1d720
MK
2835Enabling or disabling the leader of a group enables or disables the
2836entire group; that is, while the group leader is disabled, none of the
2837counters in the group will count.
33a0ccb2
MK
2838Enabling or disabling a member of a group other than the leader
2839affects only that counter; disabling a non-leader
f2b1d720 2840stops that counter from counting but doesn't affect any other counter.
efeece04 2841.IP
51700fd7 2842If the
8cc8b90d 2843.B PERF_IOC_FLAG_GROUP
51700fd7 2844bit is set in the ioctl argument, then all events in a group are
dbc01ecd
VW
2845disabled, even if the event specified is not the group leader
2846(but see BUGS).
f2b1d720
MK
2847.TP
2848.B PERF_EVENT_IOC_REFRESH
2849Non-inherited overflow counters can use this
2850to enable a counter for a number of overflows specified by the argument,
2851after which it is disabled.
2852Subsequent calls of this ioctl add the argument value to the current
2853count.
21977c9d 2854An overflow notification with
7db515ef
MK
2855.B POLL_IN
2856set will happen on each overflow until the
21977c9d
VW
2857count reaches 0; when that happens a notification with
2858.B POLL_HUP
7db515ef 2859set is sent and the event is disabled.
f2b1d720 2860Using an argument of 0 is considered undefined behavior.
f2b1d720
MK
2861.TP
2862.B PERF_EVENT_IOC_RESET
36127c0e 2863Reset the event count specified by the
6061d29f 2864file descriptor argument to zero.
33a0ccb2 2865This resets only the counts; there is no way to reset the
f2b1d720
MK
2866multiplexing
2867.I time_enabled
2868or
2869.I time_running
2870values.
efeece04 2871.IP
51700fd7 2872If the
8cc8b90d 2873.B PERF_IOC_FLAG_GROUP
51700fd7 2874bit is set in the ioctl argument, then all events in a group are
dbc01ecd
VW
2875reset, even if the event specified is not the group leader
2876(but see BUGS).
f2b1d720
MK
2877.TP
2878.B PERF_EVENT_IOC_PERIOD
e6cf5694 2879This updates the overflow period for the event.
efeece04 2880.IP
747a6e7c
VW
2881Since Linux 3.7 (on ARM)
2882.\" commit 3581fe0ef37ce12ac7a4f74831168352ae848edc
2883and Linux 3.14 (all other architectures),
2884.\" commit bad7192b842c83e580747ca57104dd51fe08c223
3f118a29 2885the new period takes effect immediately.
ed81fdd9 2886On older kernels, the new period did not take effect until
3f118a29 2887after the next overflow.
efeece04 2888.IP
f2b1d720
MK
2889The argument is a pointer to a 64-bit value containing the
2890desired new period.
efeece04 2891.IP
fc79d996 2892Prior to Linux 2.6.36,
747a6e7c
VW
2893.\" commit ad0cf3478de8677f720ee06393b3147819568d6a
2894this ioctl always failed due to a bug
e6cf5694 2895in the kernel.
f2b1d720
MK
2896.TP
2897.B PERF_EVENT_IOC_SET_OUTPUT
2898This tells the kernel to report event notifications to the specified
2899file descriptor rather than the default one.
2900The file descriptors must all be on the same CPU.
efeece04 2901.IP
f2b1d720
MK
2902The argument specifies the desired file descriptor, or \-1 if
2903output should be ignored.
f2b1d720 2904.TP
31c1f2b0 2905.BR PERF_EVENT_IOC_SET_FILTER " (since Linux 2.6.33)"
60dafbc1 2906.\" commit 6fb2915df7f0747d9044da9dbff5b46dc2e20830
f2b1d720 2907This adds an ftrace filter to this event.
efeece04 2908.IP
f2b1d720 2909The argument is a pointer to the desired ftrace filter.
a0dcc8dd 2910.TP
31c1f2b0 2911.BR PERF_EVENT_IOC_ID " (since Linux 3.12)"
60dafbc1 2912.\" commit cf4957f17f2a89984915ea808876d9c82225b862
bec6277e 2913This returns the event ID value for the given event file descriptor.
efeece04 2914.IP
a0dcc8dd
VW
2915The argument is a pointer to a 64-bit unsigned integer
2916to hold the result.
b0f7b411
VW
2917.TP
2918.BR PERF_EVENT_IOC_SET_BPF " (since Linux 4.1)"
2919.\" commit 2541517c32be2531e0da59dfd7efc1ce844644f5
2920This allows attaching a Berkeley Packet Filter (BPF)
2921program to an existing kprobe tracepoint event.
2922You need
2923.B CAP_SYS_ADMIN
2924privileges to use this ioctl.
efeece04 2925.IP
b0f7b411
VW
2926The argument is a BPF program file descriptor that was created by
2927a previous
2928.BR bpf (2)
2929system call.
06a61b36
VW
2930.TP
2931.BR PERF_EVENT_IOC_PAUSE_OUTPUT " (since Linux 4.7)"
2932.\" commit 86e7972f690c1017fd086cdfe53d8524e68c661c
2933This allows pausing and resuming the event's ring-buffer.
2934A paused ring-buffer does not prevent generation of samples,
2935but simply discards them.
2936The discarded samples are considered lost, and cause a
2937.B PERF_RECORD_LOST
2938sample to be generated when possible.
2939An overflow signal may still be triggered by the discarded sample
2940even though the ring-buffer remains empty.
2941.IP
2942The argument is an unsigned 32-bit integer.
2943A nonzero value pauses the ring-buffer, while a
2944zero value resumes the ring-buffer.
8496491d
VW
2945.TP
2946.BR PERF_EVENT_IOC_MODIFY_ATTRIBUTES " (since Linux 4.17)"
2947.\" commit 32ff77e8cc9e66cc4fb38098f64fd54cc8f54573
2948This allows modifying an existing event without the overhead
2949of closing and reopening a new event.
92e696b9 2950Currently this is supported only for breakpoint events.
8496491d
VW
2951.IP
2952The argument is a pointer to a
2953.I perf_event_attr
2954structure containing the updated event settings.
43cc0d8a
VW
2955.TP
2956.BR PERF_EVENT_IOC_QUERY_BPF " (since Linux 4.16)"
2957.\" commit f371b304f12e31fe30207c41ca7754564e0ea4dc
2958This allows querying which Berkeley Packet Filter (BPF)
2959programs are attached to an existing kprobe tracepoint.
2960You can only attach one BPF program per event, but you can
2961have multiple events attached to a tracepoint.
2962Querying this value on one tracepoint event returns the IDs
2963of all BPF programs in all events attached to the tracepoint.
2964You need
2965.B CAP_SYS_ADMIN
2966privileges to use this ioctl.
2967.IP
2968The argument is a pointer to a structure:
2969.in +4n
2970.EX
2971struct perf_event_query_bpf {
2972 __u32 ids_len;
2973 __u32 prog_cnt;
2974 __u32 ids[0];
2975};
2976.EE
2977.IP
2978The
2979.I ids_len
2980field indicates the number of ids that can fit in the provided
2981.I ids
2982array.
2983The
2984.I prog_cnt
2985value is filled in by the kernel with the number of attached
2986BPF programs.
2987The
2988.I ids
2989array is filled with the id of each attached BPF program.
2990If there are more programs than will fit in the array, then the
2991kernel will return
2992.B ENOSPC
2993and
2994.I ids_len
2995will indicate the number of program IDs that were successfully copied.
06a61b36 2996.\"
fc79d996 2997.SS Using prctl(2)
d134c429
VW
2998A process can enable or disable all currently open event groups
2999using the
f2b1d720
MK
3000.BR prctl (2)
3001.B PR_TASK_PERF_EVENTS_ENABLE
3002and
3003.B PR_TASK_PERF_EVENTS_DISABLE
3004operations.
d134c429
VW
3005This applies only to events created locally by the calling process.
3006This does not apply to events created by other processes attached
3007to the calling process or inherited events from a parent process.
3008Only group leaders are enabled and disabled,
3009not any other members of the groups.
f2b1d720 3010.SS perf_event related configuration files
efeece04 3011.PP
7db515ef
MK
3012Files in
3013.I /proc/sys/kernel/
7db515ef 3014.RS 4
f2b1d720 3015.TP
7db515ef 3016.I /proc/sys/kernel/perf_event_paranoid
f2b1d720
MK
3017The
3018.I perf_event_paranoid
3019file can be set to restrict access to the performance counters.
efeece04 3020.IP
dc9ec146 3021.PD 0
2b538c3e
MK
3022.RS
3023.IP 2 4
3eb95192 3024allow only user-space measurements (default since Linux 4.6).
b5eb75f7 3025.\" default changed in commit 0161028b7c8aebef64194d3d73e43bc3b53b5c66
2b538c3e 3026.IP 1
3eb95192 3027allow both kernel and user measurements (default before Linux 4.6).
2b538c3e
MK
3028.IP 0
3029allow access to CPU-specific data but not raw tracepoint samples.
3030.IP \-1
3031no restrictions.
3032.RE
dc9ec146 3033.PD
2b538c3e 3034.IP
f2b1d720
MK
3035The existence of the
3036.I perf_event_paranoid
3037file is the official method for determining if a kernel supports
7db515ef 3038.BR perf_event_open ().
f2b1d720
MK
3039.TP
3040.I /proc/sys/kernel/perf_event_max_sample_rate
7db515ef
MK
3041This sets the maximum sample rate.
3042Setting this too high can allow
f2b1d720 3043users to sample at a rate that impacts overall machine performance
7db515ef
MK
3044and potentially lock up the machine.
3045The default value is
f2b1d720 3046100000 (samples per second).
fd133d5d
VW
3047.TP
3048.I /proc/sys/kernel/perf_event_max_stack
3049.\" Introduced in c5dfd78eb79851e278b7973031b9ca363da87a7e
5dd3feec 3050This file sets the maximum depth of stack frame entries reported
fd133d5d 3051when generating a call trace.
f2b1d720
MK
3052.TP
3053.I /proc/sys/kernel/perf_event_mlock_kb
ce88f77b
MK
3054Maximum amount of memory, in kilobytes, that an unprivileged user can
3055.BR mlock (2).
f2b1d720
MK
3056The default is 516 (kB).
3057.RE
efeece04 3058.PP
7db515ef
MK
3059Files in
3060.I /sys/bus/event_source/devices/
efeece04 3061.PP
7db515ef 3062.RS 4
ce88f77b 3063Since Linux 2.6.34, the kernel supports having multiple PMUs
f2b1d720
MK
3064available for monitoring.
3065Information on how to program these PMUs can be found under
3066.IR /sys/bus/event_source/devices/ .
3067Each subdirectory corresponds to a different PMU.
f2b1d720 3068.TP
31c1f2b0 3069.IR /sys/bus/event_source/devices/*/type " (since Linux 2.6.38)"
747a6e7c 3070.\" commit abe43400579d5de0078c2d3a760e6598e183f871
f2b1d720
MK
3071This contains an integer that can be used in the
3072.I type
ce88f77b
MK
3073field of
3074.I perf_event_attr
3075to indicate that you wish to use this PMU.
f2b1d720 3076.TP
562c69f6 3077.IR /sys/bus/event_source/devices/cpu/rdpmc " (since Linux 3.4)"
747a6e7c 3078.\" commit 0c9d42ed4cee2aa1dfc3a260b741baae8615744f
8a94e783 3079If this file is 1, then direct user-space access to the
e30dc77f
VW
3080performance counter registers is allowed via the rdpmc instruction.
3081This can be disabled by echoing 0 to the file.
efeece04 3082.IP
562c69f6
VW
3083As of Linux 4.0
3084.\" a66734297f78707ce39d756b656bfae861d53f62
3085.\" 7911d3f7af14a614617e38245fedf98a724e46a9
3086the behavior has changed, so that 1 now means only allow access
3087to processes with active perf events, with 2 indicating the old
3088allow-anyone-access behavior.
f2b1d720 3089.TP
31c1f2b0 3090.IR /sys/bus/event_source/devices/*/format/ " (since Linux 3.4)"
747a6e7c 3091.\" commit 641cc938815dfd09f8fa1ec72deb814f0938ac33
7d182bb6
MK
3092This subdirectory contains information on the architecture-specific
3093subfields available for programming the various
f2b1d720 3094.I config
ce88f77b
MK
3095fields in the
3096.I perf_event_attr
3097struct.
efeece04 3098.IP
e30dc77f
VW
3099The content of each file is the name of the config field, followed
3100by a colon, followed by a series of integer bit ranges separated by
3101commas.
8a94e783 3102For example, the file
e30dc77f
VW
3103.I event
3104may contain the value
d2fdb1e3
MK
3105.I config1:1,6\-10,44
3106which indicates that event is an attribute that occupies bits 1, 6\(en10, and 44
ce88f77b
MK
3107of
3108.IR perf_event_attr::config1 .
e30dc77f 3109.TP
31c1f2b0 3110.IR /sys/bus/event_source/devices/*/events/ " (since Linux 3.4)"
747a6e7c 3111.\" commit 641cc938815dfd09f8fa1ec72deb814f0938ac33
7d182bb6 3112This subdirectory contains files with predefined events.
f2b1d720 3113The contents are strings describing the event settings
e30dc77f 3114expressed in terms of the fields found in the previously mentioned
f2b1d720
MK
3115.I ./format/
3116directory.
3117These are not necessarily complete lists of all events supported by
3118a PMU, but usually a subset of events deemed useful or interesting.
efeece04 3119.IP
e30dc77f 3120The content of each file is a list of attribute names
8a94e783
MK
3121separated by commas.
3122Each entry has an optional value (either hex or decimal).
37bee118 3123If no value is specified, then it is assumed to be a single-bit
e30dc77f
VW
3124field with a value of 1.
3125An example entry may look like this:
699893d8 3126.IR event=0x2,inv,ldlat=3 .
f2b1d720
MK
3127.TP
3128.I /sys/bus/event_source/devices/*/uevent
e30dc77f
VW
3129This file is the standard kernel device interface
3130for injecting hotplug events.
3131.TP
31c1f2b0 3132.IR /sys/bus/event_source/devices/*/cpumask " (since Linux 3.7)"
747a6e7c 3133.\" commit 314d9f63f385096580e9e2a06eaa0745d92fe4ac
699893d8
DP
3134The
3135.I cpumask
3136file contains a comma-separated list of integers that
3137indicate a representative CPU number for each socket (package)
e30dc77f
VW
3138on the motherboard.
3139This is needed when setting up uncore or northbridge events, as
3140those PMUs present socket-wide events.
f2b1d720 3141.RE
47297adb 3142.SH RETURN VALUE
f2b1d720
MK
3143.BR perf_event_open ()
3144returns the new file descriptor, or \-1 if an error occurred
3145(in which case,
3146.I errno
3147is set appropriately).
3148.SH ERRORS
d8b7d950
VW
3149The errors returned by
3150.BR perf_event_open ()
3151can be inconsistent, and may
3152vary across processor architectures and performance monitoring units.
f2b1d720 3153.TP
82b09254 3154.B E2BIG
ce88f77b
MK
3155Returned if the
3156.I perf_event_attr
82b09254
VW
3157.I size
3158value is too small
3159(smaller than
3160.BR PERF_ATTR_SIZE_VER0 ),
3161too big (larger than the page size),
3162or larger than the kernel supports and the extra bytes are not zero.
3163When
3164.B E2BIG
ce88f77b
MK
3165is returned, the
3166.I perf_event_attr
e9bd9b2c 3167.I size
d6af98f8 3168field is overwritten by the kernel to be the size of the structure
82b09254
VW
3169it was expecting.
3170.TP
d8b7d950 3171.B EACCES
27f0af8e
VW
3172Returned when the requested event requires
3173.B CAP_SYS_ADMIN
3174permissions (or a more permissive perf_event paranoid setting).
3175Some common cases where an unprivileged process
3176may encounter this error:
3177attaching to a process owned by a different user;
2b23ecbd
MK
3178monitoring all processes on a given CPU (i.e., specifying the
3179.I pid
3180argument as \-1);
079928f3 3181and not setting
accec051 3182.I exclude_kernel
079928f3 3183when the paranoid setting requires it.
d8b7d950
VW
3184.TP
3185.B EBADF
3186Returned if the
3187.I group_fd
accec051
MK
3188file descriptor is not valid, or, if
3189.B PERF_FLAG_PID_CGROUP
3190is set,
d8b7d950
VW
3191the cgroup file descriptor in
3192.I pid
3193is not valid.
3194.TP
f27486cb
VW
3195.BR EBUSY " (since Linux 4.1)"
3196.\" bed5b25ad9c8a2f5d735ef0bc746ec870c01c1b0
3197Returned if another event already has exclusive
3198access to the PMU.
3199.TP
d8b7d950
VW
3200.B EFAULT
3201Returned if the
3202.I attr
3203pointer points at an invalid memory address.
3204.TP
f2b1d720 3205.B EINVAL
d8b7d950
VW
3206Returned if the specified event is invalid.
3207There are many possible reasons for this.
3208A non-exhaustive list:
3209.I sample_freq
accec051 3210is higher than the maximum setting;
d8b7d950
VW
3211the
3212.I cpu
accec051 3213to monitor does not exist;
d8b7d950 3214.I read_format
accec051 3215is out of range;
d8b7d950 3216.I sample_type
accec051 3217is out of range;
d8b7d950
VW
3218the
3219.I flags
accec051 3220value is out of range;
d8b7d950
VW
3221.I exclusive
3222or
3223.I pinned
accec051 3224set and the event is not a group leader;
d8b7d950
VW
3225the event
3226.I config
accec051
MK
3227values are out of range or set reserved bits;
3228the generic event selected is not supported; or
d8b7d950
VW
3229there is not enough room to add the selected event.
3230.TP
3231.B EMFILE
3232Each opened event uses one file descriptor.
26c32fab
MK
3233If a large number of events are opened,
3234the per-process limit on the number of open file descriptors will be reached,
3235and no more events can be created.
d8b7d950
VW
3236.TP
3237.B ENODEV
3238Returned when the event involves a feature not supported
accec051 3239by the current CPU.
d8b7d950
VW
3240.TP
3241.B ENOENT
3242Returned if the
3243.I type
3244setting is not valid.
accec051 3245This error is also returned for
d8b7d950 3246some unsupported generic events.
f2b1d720
MK
3247.TP
3248.B ENOSPC
3249Prior to Linux 3.3, if there was not enough room for the event,
747a6e7c 3250.\" commit aa2bc1ade59003a379ffc485d6da2d92ea3370a6
f2b1d720
MK
3251.B ENOSPC
3252was returned.
accec051 3253In Linux 3.3, this was changed to
f2b1d720
MK
3254.BR EINVAL .
3255.B ENOSPC
d8b7d950 3256is still returned if you try to add more breakpoint events
accec051 3257than supported by the hardware.
d8b7d950
VW
3258.TP
3259.B ENOSYS
3260Returned if
3261.B PERF_SAMPLE_STACK_USER
3262is set in
3263.I sample_type
3264and it is not supported by hardware.
3265.TP
3266.B EOPNOTSUPP
3267Returned if an event requiring a specific hardware feature is
3268requested but there is no hardware support.
3269This includes requesting low-skid events if not supported,
3270branch tracing if it is not available, sampling if no PMU
3271interrupt is available, and branch stacks for software events.
3272.TP
fd133d5d
VW
3273.BR EOVERFLOW " (since Linux 4.8)"
3274.\" 97c79a38cd454602645f0470ffb444b3b75ce574
3275Returned if
3276.B PERF_SAMPLE_CALLCHAIN
3277is requested and
3278.I sample_max_stack
3279is larger than the maximum specified in
3280.IR /proc/sys/kernel/perf_event_max_stack .
3281.TP
d8b7d950 3282.B EPERM
27f0af8e
VW
3283Returned on many (but not all) architectures when an unsupported
3284.IR exclude_hv ", " exclude_idle ", " exclude_user ", or " exclude_kernel
3285setting is specified.
efeece04 3286.IP
27f0af8e
VW
3287It can also happen, as with
3288.BR EACCES ,
3289when the requested event requires
3290.B CAP_SYS_ADMIN
3291permissions (or a more permissive perf_event paranoid setting).
3292This includes setting a breakpoint on a kernel address,
3293and (since Linux 3.13) setting a kernel function-trace tracepoint.
747a6e7c 3294.\" commit a4e95fc2cbb31d70a65beffeaf8773f881328c34
d8b7d950
VW
3295.TP
3296.B ESRCH
3297Returned if attempting to attach to a process that does not exist.
f2b1d720 3298.SH VERSION
f2b1d720
MK
3299.BR perf_event_open ()
3300was introduced in Linux 2.6.31 but was called
747a6e7c 3301.\" commit 0793a61d4df8daeac6492dbf8d2f3e5713caae5e
ffd4dec0 3302.BR perf_counter_open ().
f2b1d720 3303It was renamed in Linux 2.6.32.
747a6e7c 3304.\" commit cdd6c482c9ff9c55475ee7392ec8f672eddb7be6
f2b1d720 3305.SH CONFORMING TO
7db515ef
MK
3306This
3307.BR perf_event_open ()
dc9ec146 3308system call is Linux-specific
f2b1d720 3309and should not be used in programs intended to be portable.
f2b1d720
MK
3310.SH NOTES
3311Glibc does not provide a wrapper for this system call; call it using
3312.BR syscall (2).
7db515ef 3313See the example below.
efeece04 3314.PP
f2b1d720 3315The official way of knowing if
7db515ef 3316.BR perf_event_open ()
f2b1d720
MK
3317support is enabled is checking
3318for the existence of the file
7db515ef 3319.IR /proc/sys/kernel/perf_event_paranoid .
f2b1d720 3320.SH BUGS
f2b1d720
MK
3321The
3322.B F_SETOWN_EX
3323option to
7db515ef 3324.BR fcntl (2)
f2b1d720
MK
3325is needed to properly get overflow signals in threads.
3326This was introduced in Linux 2.6.32.
747a6e7c 3327.\" commit ba0a6c9f6fceed11c6a99e8326f0477fe383e6b5
efeece04 3328.PP
747a6e7c
VW
3329Prior to Linux 2.6.33 (at least for x86),
3330.\" commit b690081d4d3f6a23541493f1682835c3cd5c54a1
3331the kernel did not check
f2b1d720
MK
3332if events could be scheduled together until read time.
3333The same happens on all known kernels if the NMI watchdog is enabled.
3334This means that, to see if a given set of events works, you have to
3335.BR perf_event_open (),
3336start, then read before you know for sure you
3337can get valid measurements.
efeece04 3338.PP
b5190152
MK
3339Prior to Linux 2.6.34,
3340.\" FIXME . cannot find a kernel commit for this one
3341event constraints were not enforced by the kernel.
f2b1d720
MK
3342In that case, some events would silently return "0" if the kernel
3343scheduled them in an improper counter slot.
efeece04 3344.PP
ce88f77b 3345Prior to Linux 2.6.34, there was a bug when multiplexing where the
f2b1d720 3346wrong results could be returned.
747a6e7c 3347.\" commit 45e16a6834b6af098702e5ea6c9a40de42ff77d8
efeece04 3348.PP
f2b1d720
MK
3349Linux 2.6.35 to Linux 2.6.39 can quickly crash if
3350"inherit" is enabled and many threads are started.
747a6e7c 3351.\" commit 38b435b16c36b0d863efcf3f07b34a6fac9873fd
efeece04 3352.PP
f2b1d720 3353Prior to Linux 2.6.35,
747a6e7c 3354.\" commit 050735b08ca8a016bbace4445fa025b88fee770b
f2b1d720
MK
3355.B PERF_FORMAT_GROUP
3356did not work with attached processes.
efeece04 3357.PP
f2b1d720
MK
3358There is a bug in the kernel code between
3359Linux 2.6.36 and Linux 3.0 that ignores the
3360"watermark" field and acts as if a wakeup_event
3361was chosen if the union has a
7d182bb6 3362nonzero value in it.
747a6e7c 3363.\" commit 4ec8363dfc1451f8c8f86825731fe712798ada02
efeece04 3364.PP
8a94e783 3365From Linux 2.6.31 to Linux 3.4, the
dbc01ecd
VW
3366.B PERF_IOC_FLAG_GROUP
3367ioctl argument was broken and would repeatedly operate
3368on the event specified rather than iterating across
3369all sibling events in a group.
747a6e7c 3370.\" commit 724b6daa13e100067c30cfc4d1ad06629609dc4e
efeece04 3371.PP
7205b8df 3372From Linux 3.4 to Linux 3.11, the mmap
747a6e7c 3373.\" commit fa7315871046b9a4c48627905691dbde57e51033
135cba8b
VW
3374.I cap_usr_rdpmc
3375and
3376.I cap_usr_time
3377bits mapped to the same location.
3378Code should migrate to the new
3379.I cap_user_rdpmc
3380and
3381.I cap_user_time
3382fields instead.
efeece04 3383.PP
7db515ef
MK
3384Always double-check your results!
3385Various generalized events have had wrong values.
f2b1d720
MK
3386For example, retired branches measured
3387the wrong thing on AMD machines until Linux 2.6.35.
747a6e7c 3388.\" commit f287d332ce835f77a4f5077d2c0ef1e3f9ea42d2
f2b1d720
MK
3389.SH EXAMPLE
3390The following is a short example that measures the total
7db515ef
MK
3391instruction count of a call to
3392.BR printf (3).
408731d4
MK
3393.PP
3394.EX
f2b1d720
MK
3395#include <stdlib.h>
3396#include <stdio.h>
3397#include <unistd.h>
3398#include <string.h>
3399#include <sys/ioctl.h>
3400#include <linux/perf_event.h>
3401#include <asm/unistd.h>
3402
571767ca 3403static long
7db515ef
MK
3404perf_event_open(struct perf_event_attr *hw_event, pid_t pid,
3405 int cpu, int group_fd, unsigned long flags)
f2b1d720
MK
3406{
3407 int ret;
3408
7db515ef
MK
3409 ret = syscall(__NR_perf_event_open, hw_event, pid, cpu,
3410 group_fd, flags);
f2b1d720
MK
3411 return ret;
3412}
3413
f2b1d720
MK
3414int
3415main(int argc, char **argv)
3416{
f2b1d720
MK
3417 struct perf_event_attr pe;
3418 long long count;
3419 int fd;
3420
3421 memset(&pe, 0, sizeof(struct perf_event_attr));
3422 pe.type = PERF_TYPE_HARDWARE;
3423 pe.size = sizeof(struct perf_event_attr);
3424 pe.config = PERF_COUNT_HW_INSTRUCTIONS;
3425 pe.disabled = 1;
3426 pe.exclude_kernel = 1;
3427 pe.exclude_hv = 1;
3428
3429 fd = perf_event_open(&pe, 0, \-1, \-1, 0);
7db515ef 3430 if (fd == \-1) {
d1a71985 3431 fprintf(stderr, "Error opening leader %llx\en", pe.config);
7db515ef 3432 exit(EXIT_FAILURE);
f2b1d720
MK
3433 }
3434
3435 ioctl(fd, PERF_EVENT_IOC_RESET, 0);
3436 ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
3437
d1a71985 3438 printf("Measuring instruction count for this printf\en");
f2b1d720
MK
3439
3440 ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);
3441 read(fd, &count, sizeof(long long));
3442
d1a71985 3443 printf("Used %lld instructions\en", count);
f2b1d720
MK
3444
3445 close(fd);
3446}
408731d4 3447.EE
47297adb 3448.SH SEE ALSO
022b038e 3449.BR perf (1),
f2b1d720
MK
3450.BR fcntl (2),
3451.BR mmap (2),
3452.BR open (2),
3453.BR prctl (2),
3454.BR read (2)