]> git.ipfire.org Git - thirdparty/man-pages.git/blame - man2/perf_event_open.2
pthread_attr_setguardsize.3: ATTRIBUTES: Note functions that are thread-safe
[thirdparty/man-pages.git] / man2 / perf_event_open.2
CommitLineData
f2b1d720
MK
1.\" Copyright (c) 2012, Vincent Weaver
2.\"
1dd72f9c 3.\" %%%LICENSE_START(GPLv2+_DOC_FULL)
f2b1d720
MK
4.\" This is free documentation; you can redistribute it and/or
5.\" modify it under the terms of the GNU General Public License as
6.\" published by the Free Software Foundation; either version 2 of
7.\" the License, or (at your option) any later version.
8.\"
9.\" The GNU General Public License's references to "object code"
10.\" and "executables" are to be interpreted as the output of any
11.\" document formatting or typesetting system, including
12.\" intermediate and printed output.
13.\"
14.\" This manual is distributed in the hope that it will be useful,
15.\" but WITHOUT ANY WARRANTY; without even the implied warranty of
16.\" MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17.\" GNU General Public License for more details.
18.\"
19.\" You should have received a copy of the GNU General Public
20.\" License along with this manual; if not, see
21.\" <http://www.gnu.org/licenses/>.
6a8d8745 22.\" %%%LICENSE_END
f2b1d720
MK
23.\"
24.\" This document is based on the perf_event.h header file, the
25.\" tools/perf/design.txt file, and a lot of bitter experience.
26.\"
accec051 27.TH PERF_EVENT_OPEN 2 2014-04-10 "Linux" "Linux Programmer's Manual"
f2b1d720
MK
28.SH NAME
29perf_event_open \- set up performance monitoring
30.SH SYNOPSIS
31.nf
32.B #include <linux/perf_event.h>
33.B #include <linux/hw_breakpoint.h>
34.sp
35.BI "int perf_event_open(struct perf_event_attr *" attr ,
36.BI " pid_t " pid ", int " cpu ", int " group_fd ,
37.BI " unsigned long " flags );
38.fi
39
40.IR Note :
41There is no glibc wrapper for this system call; see NOTES.
42.SH DESCRIPTION
43Given a list of parameters,
44.BR perf_event_open ()
45returns a file descriptor, for use in subsequent system calls
46.RB ( read "(2), " mmap "(2), " prctl "(2), " fcntl "(2), etc.)."
47.PP
48A call to
49.BR perf_event_open ()
50creates a file descriptor that allows measuring performance
51information.
52Each file descriptor corresponds to one
53event that is measured; these can be grouped together
54to measure multiple events simultaneously.
55.PP
56Events can be enabled and disabled in two ways: via
57.BR ioctl (2)
58and via
0fe9e4b1 59.BR prctl (2).
f2b1d720
MK
60When an event is disabled it does not count or generate overflows but does
61continue to exist and maintain its count value.
62.PP
63Events come in two flavors: counting and sampled.
64A
65.I counting
66event is one that is used for counting the aggregate number of events
67that occur.
68In general, counting event results are gathered with a
69.BR read (2)
70call.
71A
72.I sampling
73event periodically writes measurements to a buffer that can then
74be accessed via
0fe9e4b1 75.BR mmap (2).
f2b1d720
MK
76.SS Arguments
77.P
f2b1d720 78The
a02a1737 79.I pid
f2b1d720 80and
a02a1737
VW
81.I cpu
82arguments allow specifying which process and CPU to monitor:
83.TP
f2d15dc9 84.BR "pid == 0" " and " "cpu == \-1"
a02a1737
VW
85This measures the current process/thread on any CPU.
86.TP
f2d15dc9 87.BR "pid == 0" " and " "cpu >= 0"
b8efc3ed 88This measures the current process/thread only
a02a1737
VW
89when running on the specified CPU.
90.TP
f2d15dc9 91.BR "pid > 0" " and " "cpu == \-1"
a02a1737
VW
92This measures the specified process/thread on any CPU.
93.TP
f2d15dc9 94.BR "pid > 0" " and " "cpu >= 0"
a02a1737
VW
95This measures the specified process/thread only
96when running on the specified CPU.
97.TP
f2d15dc9 98.BR "pid == \-1" " and " "cpu >= 0"
a02a1737
VW
99This measures all processes/threads on the specified CPU.
100Measurements such as this require the
f2b1d720
MK
101.B CAP_SYS_ADMIN
102capability or a
103.I /proc/sys/kernel/perf_event_paranoid
104value of less than 1.
a02a1737
VW
105.TP
106.BR "pid == \-1" " and " "cpu == \-1"
107This setting is invalid and will return an error.
f2b1d720
MK
108.P
109The
110.I group_fd
111argument allows event groups to be created.
112An event group has one event which is the group leader.
113The leader is created first, with
114.IR group_fd " = \-1."
115The rest of the group members are created with subsequent
116.BR perf_event_open ()
117calls with
118.IR group_fd
119being set to the fd of the group leader.
120(A single event on its own is created with
121.IR group_fd " = \-1"
122and is considered to be a group with only 1 member.)
33a0ccb2 123An event group is scheduled onto the CPU as a unit: it will
d1007d14 124be put onto the CPU only if all of the events in the group can be put onto
f2b1d720
MK
125the CPU.
126This means that the values of the member events can be
f78ed33a 127meaningfully compared, added, divided (to get ratios), and so on, with each
f2b1d720
MK
128other, since they have counted events for the same set of executed
129instructions.
130.P
131The
132.I flags
08e325e8 133argument is formed by ORing together zero or more of the following values:
f2b1d720 134.TP
e9b1ab78
MK
135.BR PERF_FLAG_FD_CLOEXEC " (since Linux 3.14)"
136This flag enables the close-on-exec flag for the created
137event file descriptor,
138so that the file descriptor is automatically closed on
139.BR execve (2).
8bad22e5
MK
140Setting the close-on-exec flag at creation time, rather than later with
141.BR fcntl (2),
e9b1ab78
MK
142avoids potential race conditions where the calling thread invokes
143.BR perf_event_open ()
a61dba34
MK
144and
145.BR fcntl (2)
e9b1ab78
MK
146at the same time as another thread calls
147.BR fork (2)
148then
149.BR execve (2).
150.TP
f2b1d720
MK
151.BR PERF_FLAG_FD_NO_GROUP
152.\" FIXME The following sentence is unclear
153This flag allows creating an event as part of an event group but
154having no group leader.
155It is unclear why this is useful.
156.\" FIXME So, why is it useful?
157.TP
158.BR PERF_FLAG_FD_OUTPUT
7d182bb6 159This flag reroutes the output from an event to the group leader.
f2b1d720 160.TP
31c1f2b0 161.BR PERF_FLAG_PID_CGROUP " (since Linux 2.6.39)"
f2b1d720
MK
162This flag activates per-container system-wide monitoring.
163A container
164is an abstraction that isolates a set of resources for finer grain
699893d8 165control (CPUs, memory, etc.).
f2b1d720
MK
166In this mode, the event is measured
167only if the thread running on the monitored CPU belongs to the designated
168container (cgroup).
169The cgroup is identified by passing a file descriptor
170opened on its directory in the cgroupfs filesystem.
171For instance, if the
172cgroup to monitor is called
173.IR test ,
174then a file descriptor opened on
175.I /dev/cgroup/test
176(assuming cgroupfs is mounted on
177.IR /dev/cgroup )
178must be passed as the
179.I pid
180parameter.
33a0ccb2 181cgroup monitoring is available only
f2b1d720
MK
182for system-wide events and may therefore require extra permissions.
183.P
184The
185.I perf_event_attr
186structure provides detailed configuration information
187for the event being created.
188
189.in +4n
190.nf
191struct perf_event_attr {
192 __u32 type; /* Type of event */
193 __u32 size; /* Size of attribute structure */
194 __u64 config; /* Type-specific configuration */
195
196 union {
197 __u64 sample_period; /* Period of sampling */
198 __u64 sample_freq; /* Frequency of sampling */
199 };
200
201 __u64 sample_type; /* Specifies values included in sample */
202 __u64 read_format; /* Specifies values returned in read */
203
7db515ef
MK
204 __u64 disabled : 1, /* off by default */
205 inherit : 1, /* children inherit it */
206 pinned : 1, /* must always be on PMU */
207 exclusive : 1, /* only group on PMU */
208 exclude_user : 1, /* don't count user */
209 exclude_kernel : 1, /* don't count kernel */
f2b1d720 210 exclude_hv : 1, /* don't count hypervisor */
7db515ef
MK
211 exclude_idle : 1, /* don't count when idle */
212 mmap : 1, /* include mmap data */
213 comm : 1, /* include comm data */
214 freq : 1, /* use freq, not period */
215 inherit_stat : 1, /* per task counts */
216 enable_on_exec : 1, /* next exec enables */
217 task : 1, /* trace fork/exit */
218 watermark : 1, /* wakeup_watermark */
219 precise_ip : 2, /* skid constraint */
220 mmap_data : 1, /* non-exec mmap data */
f2b1d720 221 sample_id_all : 1, /* sample_type all events */
7db515ef
MK
222 exclude_host : 1, /* don't count in host */
223 exclude_guest : 1, /* don't count in guest */
224 exclude_callchain_kernel : 1,
225 /* exclude kernel callchains */
226 exclude_callchain_user : 1,
227 /* exclude user callchains */
f2b1d720
MK
228 __reserved_1 : 41;
229
230 union {
231 __u32 wakeup_events; /* wakeup every n events */
7db515ef 232 __u32 wakeup_watermark; /* bytes before wakeup */
f2b1d720
MK
233 };
234
235 __u32 bp_type; /* breakpoint type */
236
237 union {
238 __u64 bp_addr; /* breakpoint address */
239 __u64 config1; /* extension of config */
240 };
241
242 union {
243 __u64 bp_len; /* breakpoint length */
244 __u64 config2; /* extension of config1 */
245 };
7db515ef
MK
246 __u64 branch_sample_type; /* enum perf_branch_sample_type */
247 __u64 sample_regs_user; /* user regs to dump on samples */
248 __u32 sample_stack_user; /* size of stack to dump on
249 samples */
250 __u32 __reserved_2; /* Align to u64 */
f2b1d720
MK
251
252};
253.fi
254.in
255
256The fields of the
257.I perf_event_attr
258structure are described in more detail below:
f2b1d720
MK
259.TP
260.I type
261This field specifies the overall event type.
262It has one of the following values:
263.RS
264.TP
265.B PERF_TYPE_HARDWARE
266This indicates one of the "generalized" hardware events provided
267by the kernel.
268See the
269.I config
270field definition for more details.
271.TP
272.B PERF_TYPE_SOFTWARE
273This indicates one of the software-defined events provided by the kernel
274(even if no hardware support is available).
275.TP
276.B PERF_TYPE_TRACEPOINT
277This indicates a tracepoint
278provided by the kernel tracepoint infrastructure.
279.TP
280.B PERF_TYPE_HW_CACHE
281This indicates a hardware cache event.
282This has a special encoding, described in the
283.I config
284field definition.
285.TP
286.B PERF_TYPE_RAW
287This indicates a "raw" implementation-specific event in the
288.IR config " field."
289.TP
31c1f2b0 290.BR PERF_TYPE_BREAKPOINT " (since Linux 2.6.33)"
f2b1d720
MK
291This indicates a hardware breakpoint as provided by the CPU.
292Breakpoints can be read/write accesses to an address as well as
293execution of an instruction address.
294.TP
295.RB "dynamic PMU"
296Since Linux 2.6.39,
7db515ef 297.BR perf_event_open ()
f2b1d720
MK
298can support multiple PMUs.
299To enable this, a value exported by the kernel can be used in the
300.I type
301field to indicate which PMU to use.
302The value to use can be found in the sysfs filesystem:
303there is a subdirectory per PMU instance under
304.IR /sys/bus/event_source/devices .
7d182bb6 305In each subdirectory there is a
f2b1d720
MK
306.I type
307file whose content is an integer that can be used in the
308.I type
309field.
310For instance,
311.I /sys/bus/event_source/devices/cpu/type
312contains the value for the core CPU PMU, which is usually 4.
313.RE
f2b1d720
MK
314.TP
315.I "size"
316The size of the
317.I perf_event_attr
318structure for forward/backward compatibility.
319Set this using
320.I sizeof(struct perf_event_attr)
321to allow the kernel to see
322the struct size at the time of compilation.
323
324The related define
325.B PERF_ATTR_SIZE_VER0
326is set to 64; this was the size of the first published struct.
327.B PERF_ATTR_SIZE_VER1
328is 72, corresponding to the addition of breakpoints in Linux 2.6.33.
329.B PERF_ATTR_SIZE_VER2
330is 80 corresponding to the addition of branch sampling in Linux 3.4.
331.B PERF_ATTR_SIZE_VER3
332is 96 corresponding to the addition
7ede2f66
DP
333of
334.I sample_regs_user
335and
336.I sample_stack_user
337in Linux 3.7.
f2b1d720
MK
338.TP
339.I "config"
340This specifies which event you want, in conjunction with
341the
342.I type
343field.
344The
345.IR config1 " and " config2
346fields are also taken into account in cases where 64 bits is not
347enough to fully specify the event.
348The encoding of these fields is event dependent.
349
350The most significant bit (bit 63) of
351.I config
352signifies CPU-specific (raw) counter configuration data;
353if the most significant bit is unset, the next 7 bits are an event
354type and the rest of the bits are the event identifier.
355
356There are various ways to set the
357.I config
358field that are dependent on the value of the previously
359described
360.I type
361field.
362What follows are various possible settings for
363.I config
364separated out by
365.IR type .
366
367If
368.I type
369is
370.BR PERF_TYPE_HARDWARE ,
371we are measuring one of the generalized hardware CPU events.
372Not all of these are available on all platforms.
373Set
374.I config
375to one of the following:
376.RS 12
377.TP
378.B PERF_COUNT_HW_CPU_CYCLES
379Total cycles.
2b538c3e 380Be wary of what happens during CPU frequency scaling.
f2b1d720
MK
381.TP
382.B PERF_COUNT_HW_INSTRUCTIONS
383Retired instructions.
384Be careful, these can be affected by various
2b538c3e 385issues, most notably hardware interrupt counts.
f2b1d720
MK
386.TP
387.B PERF_COUNT_HW_CACHE_REFERENCES
388Cache accesses.
389Usually this indicates Last Level Cache accesses but this may
390vary depending on your CPU.
391This may include prefetches and coherency messages; again this
392depends on the design of your CPU.
393.TP
394.B PERF_COUNT_HW_CACHE_MISSES
395Cache misses.
396Usually this indicates Last Level Cache misses; this is intended to be
397used in conjunction with the
398.B PERF_COUNT_HW_CACHE_REFERENCES
399event to calculate cache miss rates.
400.TP
401.B PERF_COUNT_HW_BRANCH_INSTRUCTIONS
402Retired branch instructions.
403Prior to Linux 2.6.34, this used
404the wrong event on AMD processors.
405.TP
406.B PERF_COUNT_HW_BRANCH_MISSES
407Mispredicted branch instructions.
408.TP
409.B PERF_COUNT_HW_BUS_CYCLES
410Bus cycles, which can be different from total cycles.
411.TP
31c1f2b0 412.BR PERF_COUNT_HW_STALLED_CYCLES_FRONTEND " (since Linux 3.0)"
f2b1d720
MK
413Stalled cycles during issue.
414.TP
31c1f2b0 415.BR PERF_COUNT_HW_STALLED_CYCLES_BACKEND " (since Linux 3.0)"
f2b1d720
MK
416Stalled cycles during retirement.
417.TP
31c1f2b0 418.BR PERF_COUNT_HW_REF_CPU_CYCLES " (since Linux 3.3)"
f2b1d720
MK
419Total cycles; not affected by CPU frequency scaling.
420.RE
421.IP
422If
423.I type
424is
425.BR PERF_TYPE_SOFTWARE ,
426we are measuring software events provided by the kernel.
427Set
428.I config
429to one of the following:
430.RS 12
431.TP
432.B PERF_COUNT_SW_CPU_CLOCK
433This reports the CPU clock, a high-resolution per-CPU timer.
434.TP
435.B PERF_COUNT_SW_TASK_CLOCK
436This reports a clock count specific to the task that is running.
437.TP
438.B PERF_COUNT_SW_PAGE_FAULTS
439This reports the number of page faults.
440.TP
441.B PERF_COUNT_SW_CONTEXT_SWITCHES
442This counts context switches.
443Until Linux 2.6.34, these were all reported as user-space
444events, after that they are reported as happening in the kernel.
445.TP
446.B PERF_COUNT_SW_CPU_MIGRATIONS
447This reports the number of times the process
448has migrated to a new CPU.
449.TP
450.B PERF_COUNT_SW_PAGE_FAULTS_MIN
451This counts the number of minor page faults.
452These did not require disk I/O to handle.
453.TP
454.B PERF_COUNT_SW_PAGE_FAULTS_MAJ
455This counts the number of major page faults.
456These required disk I/O to handle.
457.TP
31c1f2b0 458.BR PERF_COUNT_SW_ALIGNMENT_FAULTS " (since Linux 2.6.33)"
f2b1d720
MK
459This counts the number of alignment faults.
460These happen when unaligned memory accesses happen; the kernel
461can handle these but it reduces performance.
33a0ccb2 462This happens only on some architectures (never on x86).
f2b1d720 463.TP
31c1f2b0 464.BR PERF_COUNT_SW_EMULATION_FAULTS " (since Linux 2.6.33)"
f2b1d720
MK
465This counts the number of emulation faults.
466The kernel sometimes traps on unimplemented instructions
7db515ef 467and emulates them for user space.
f2b1d720 468This can negatively impact performance.
dab38455 469.TP
31c1f2b0 470.BR PERF_COUNT_SW_DUMMY " (since Linux 3.12)"
dab38455
VW
471This is a placeholder event that counts nothing.
472Informational sample record types such as mmap or comm
473must be associated with an active event.
474This dummy event allows gathering such records without requiring
475a counting event.
f2b1d720 476.RE
f2b1d720 477
f2b1d720
MK
478.RS
479If
480.I type
481is
482.BR PERF_TYPE_TRACEPOINT ,
483then we are measuring kernel tracepoints.
484The value to use in
485.I config
486can be obtained from under debugfs
487.I tracing/events/*/*/id
488if ftrace is enabled in the kernel.
f2b1d720 489.RE
1f22e274 490
f2b1d720
MK
491.RS
492If
493.I type
494is
495.BR PERF_TYPE_HW_CACHE ,
496then we are measuring a hardware CPU cache event.
497To calculate the appropriate
498.I config
499value use the following equation:
500.RS 4
501.nf
502
503 (perf_hw_cache_id) | (perf_hw_cache_op_id << 8) |
504 (perf_hw_cache_op_result_id << 16)
505.fi
506.P
507where
508.I perf_hw_cache_id
509is one of:
7db515ef 510.RS 4
f2b1d720
MK
511.TP
512.B PERF_COUNT_HW_CACHE_L1D
513for measuring Level 1 Data Cache
514.TP
515.B PERF_COUNT_HW_CACHE_L1I
516for measuring Level 1 Instruction Cache
517.TP
518.B PERF_COUNT_HW_CACHE_LL
519for measuring Last-Level Cache
520.TP
521.B PERF_COUNT_HW_CACHE_DTLB
522for measuring the Data TLB
523.TP
524.B PERF_COUNT_HW_CACHE_ITLB
525for measuring the Instruction TLB
526.TP
527.B PERF_COUNT_HW_CACHE_BPU
528for measuring the branch prediction unit
529.TP
31c1f2b0 530.BR PERF_COUNT_HW_CACHE_NODE " (since Linux 3.0)"
f2b1d720
MK
531for measuring local memory accesses
532.RE
f2b1d720
MK
533.P
534and
535.I perf_hw_cache_op_id
536is one of
7db515ef 537.RS 4
f2b1d720
MK
538.TP
539.B PERF_COUNT_HW_CACHE_OP_READ
540for read accesses
541.TP
542.B PERF_COUNT_HW_CACHE_OP_WRITE
543for write accesses
544.TP
545.B PERF_COUNT_HW_CACHE_OP_PREFETCH
546for prefetch accesses
547.RE
f2b1d720
MK
548.P
549and
550.I perf_hw_cache_op_result_id
551is one of
7db515ef 552.RS 4
f2b1d720
MK
553.TP
554.B PERF_COUNT_HW_CACHE_RESULT_ACCESS
555to measure accesses
556.TP
557.B PERF_COUNT_HW_CACHE_RESULT_MISS
558to measure misses
559.RE
560.RE
561
562If
563.I type
564is
565.BR PERF_TYPE_RAW ,
566then a custom "raw"
567.I config
568value is needed.
569Most CPUs support events that are not covered by the "generalized" events.
570These are implementation defined; see your CPU manual (for example
571the Intel Volume 3B documentation or the AMD BIOS and Kernel Developer
572Guide).
573The libpfm4 library can be used to translate from the name in the
574architectural manuals to the raw hex value
575.BR perf_event_open ()
576expects in this field.
577
578If
579.I type
580is
581.BR PERF_TYPE_BREAKPOINT ,
582then leave
583.I config
584set to zero.
585Its parameters are set in other places.
586.RE
587.TP
588.IR sample_period ", " sample_freq
589A "sampling" counter is one that generates an interrupt
590every N events, where N is given by
591.IR sample_period .
592A sampling counter has
593.IR sample_period " > 0."
594When an overflow interrupt occurs, requested data is recorded
595in the mmap buffer.
596The
597.I sample_type
598field controls what data is recorded on each interrupt.
599
600.I sample_freq
601can be used if you wish to use frequency rather than period.
37bee118 602In this case, you set the
f2b1d720
MK
603.I freq
604flag.
605The kernel will adjust the sampling period
606to try and achieve the desired rate.
607The rate of adjustment is a
608timer tick.
f2b1d720
MK
609.TP
610.I "sample_type"
611The various bits in this field specify which values to include
612in the sample.
613They will be recorded in a ring-buffer,
ad73a2cc 614which is available to user space using
f2b1d720
MK
615.BR mmap (2).
616The order in which the values are saved in the
617sample is documented in the MMAP Layout subsection below;
618it is not the
619.I "enum perf_event_sample_format"
620order.
621.RS
622.TP
623.B PERF_SAMPLE_IP
624Records instruction pointer.
625.TP
626.B PERF_SAMPLE_TID
7db515ef 627Records the process and thread IDs.
f2b1d720
MK
628.TP
629.B PERF_SAMPLE_TIME
630Records a timestamp.
631.TP
632.B PERF_SAMPLE_ADDR
633Records an address, if applicable.
634.TP
635.B PERF_SAMPLE_READ
636Record counter values for all events in a group, not just the group leader.
637.TP
638.B PERF_SAMPLE_CALLCHAIN
639Records the callchain (stack backtrace).
640.TP
641.B PERF_SAMPLE_ID
642Records a unique ID for the opened event's group leader.
643.TP
644.B PERF_SAMPLE_CPU
645Records CPU number.
646.TP
647.B PERF_SAMPLE_PERIOD
648Records the current sampling period.
649.TP
650.B PERF_SAMPLE_STREAM_ID
651Records a unique ID for the opened event.
652Unlike
653.B PERF_SAMPLE_ID
654the actual ID is returned, not the group leader.
8859d3a9
DP
655This ID is the same as the one returned by
656.BR PERF_FORMAT_ID .
f2b1d720
MK
657.TP
658.B PERF_SAMPLE_RAW
659Records additional data, if applicable.
660Usually returned by tracepoint events.
661.TP
31c1f2b0 662.BR PERF_SAMPLE_BRANCH_STACK " (since Linux 3.4)"
045bf4d3
VW
663This provides a record of recent branches, as provided
664by CPU branch sampling hardware (such as Intel Last Branch Record).
665Not all hardware supports this feature.
666
667See the
668.I branch_sample_type
669field for how to filter which branches are reported.
f2b1d720 670.TP
31c1f2b0 671.BR PERF_SAMPLE_REGS_USER " (since Linux 3.7)"
d1007d14
VW
672Records the current user-level CPU register state
673(the values in the process before the kernel was called).
f2b1d720 674.TP
31c1f2b0 675.BR PERF_SAMPLE_STACK_USER " (since Linux 3.7)"
d1007d14
VW
676Records the user level stack, allowing stack unwinding.
677.TP
31c1f2b0 678.BR PERF_SAMPLE_WEIGHT " (since Linux 3.10)"
d1007d14 679Records a hardware provided weight value that expresses how
51700fd7 680costly the sampled event was.
d1007d14
VW
681This allows the hardware to highlight expensive events in
682a profile.
683.TP
31c1f2b0 684.BR PERF_SAMPLE_DATA_SRC " (since Linux 3.10)"
d1007d14
VW
685Records the data source: where in the memory hierarchy
686the data associated with the sampled instruction came from.
687This is only available if the underlying hardware
688supports this feature.
7480dabb 689.TP
31c1f2b0 690.BR PERF_SAMPLE_IDENTIFIER " (since Linux 3.12)"
8859d3a9
DP
691Places the
692.B SAMPLE_ID
693value in a fixed position in the record,
7480dabb
VW
694either at the beginning (for sample events) or at the end
695(if a non-sample event).
696
697This was necessary because a sample stream may have
698records from various different event sources with different
699.I sample_type
700settings.
e9bd9b2c 701Parsing the event stream properly was not possible because the
8859d3a9
DP
702format of the record was needed to find
703.BR SAMPLE_ID ,
704but
27f52b52 705the format could not be found without knowing what
7480dabb
VW
706event the sample belonged to (causing a circular
707dependency).
708
709This new
710.B PERF_SAMPLE_IDENTIFIER
711setting makes the event stream always parsable
8859d3a9
DP
712by putting
713.B SAMPLE_ID
714in a fixed location, even though
715it means having duplicate
716.B SAMPLE_ID
717values in records.
1e043959
VW
718.TP
719.BR PERF_SAMPLE_TRANSACTION " (since Linux 3.13)"
84fc2a6e 720Records reasons for transactional memory abort events
1e043959
VW
721(for example, from Intel TSX transactional memory support).
722
723The
724.I precise_ip
b3f39642 725setting must be greater than 0 and a transactional memory abort
1e043959 726event must be measured or no values will be recorded.
84fc2a6e
MK
727Also note that some perf_event measurements, such as sampled
728cycle counting, may cause extraneous aborts (by causing an
1e043959 729interrupt during a transaction).
f2b1d720 730.RE
f2b1d720
MK
731.TP
732.IR "read_format"
733This field specifies the format of the data returned by
734.BR read (2)
735on a
7db515ef 736.BR perf_event_open ()
f2b1d720
MK
737file descriptor.
738.RS
739.TP
740.B PERF_FORMAT_TOTAL_TIME_ENABLED
7ede2f66
DP
741Adds the 64-bit
742.I time_enabled
743field.
f2b1d720
MK
744This can be used to calculate estimated totals if
745the PMU is overcommitted and multiplexing is happening.
746.TP
747.B PERF_FORMAT_TOTAL_TIME_RUNNING
7ede2f66
DP
748Adds the 64-bit
749.I time_running
750field.
f2b1d720
MK
751This can be used to calculate estimated totals if
752the PMU is overcommitted and multiplexing is happening.
753.TP
754.B PERF_FORMAT_ID
755Adds a 64-bit unique value that corresponds to the event group.
756.TP
757.B PERF_FORMAT_GROUP
758Allows all counter values in an event group to be read with one read.
759.RE
f2b1d720
MK
760.TP
761.IR "disabled"
762The
763.I disabled
764bit specifies whether the counter starts out disabled or enabled.
765If disabled, the event can later be enabled by
766.BR ioctl (2),
767.BR prctl (2),
768or
769.IR enable_on_exec .
406650db
VW
770
771When creating an event group, typically the group leader is initialized
772with
773.I disabled
774set to 1 and any child events are initialized with
775.I disabled
776set to 0.
777Despite
778.I disabled
779being 0, the child events will not start until the group leader
780is enabled.
f2b1d720
MK
781.TP
782.IR "inherit"
783The
784.I inherit
785bit specifies that this counter should count events of child
786tasks as well as the task specified.
33a0ccb2 787This applies only to new children, not to any existing children at
f2b1d720
MK
788the time the counter is created (nor to any new children of
789existing children).
790
791Inherit does not work for some combinations of
792.IR read_format s,
793such as
794.BR PERF_FORMAT_GROUP .
f2b1d720
MK
795.TP
796.IR "pinned"
797The
798.I pinned
799bit specifies that the counter should always be on the CPU if at all
800possible.
33a0ccb2 801It applies only to hardware counters and only to group leaders.
f2b1d720
MK
802If a pinned counter cannot be put onto the CPU (e.g., because there are
803not enough hardware counters or because of a conflict with some other
804event), then the counter goes into an 'error' state, where reads
805return end-of-file (i.e.,
806.BR read (2)
807returns 0) until the counter is subsequently enabled or disabled.
f2b1d720
MK
808.TP
809.IR "exclusive"
810The
811.I exclusive
812bit specifies that when this counter's group is on the CPU,
813it should be the only group using the CPU's counters.
814In the future this may allow monitoring programs to
815support PMU features that need to run alone so that they do not
816disrupt other hardware counters.
bea10c8c
VW
817
818Note that many unexpected situations may prevent events with the
819.I exclusive
d3532647 820bit set from ever running.
bea10c8c 821This includes any users running a system-wide
d3532647 822measurement as well as any kernel use of the performance counters
bea10c8c 823(including the commonly enabled NMI Watchdog Timer interface).
f2b1d720
MK
824.TP
825.IR "exclude_user"
ad73a2cc 826If this bit is set, the count excludes events that happen in user space.
f2b1d720
MK
827.TP
828.IR "exclude_kernel"
829If this bit is set, the count excludes events that happen in kernel space.
f2b1d720
MK
830.TP
831.IR "exclude_hv"
832If this bit is set, the count excludes events that happen in the
833hypervisor.
834This is mainly for PMUs that have built-in support for handling this
835(such as POWER).
836Extra support is needed for handling hypervisor measurements on most
837machines.
f2b1d720
MK
838.TP
839.IR "exclude_idle"
840If set, don't count when the CPU is idle.
f2b1d720
MK
841.TP
842.IR "mmap"
843The
844.I mmap
75ee11e5 845bit enables generation of
cd7c700a 846.B PERF_RECORD_MMAP
75ee11e5
VW
847samples for every
848.BR mmap (2)
849call that has
cd7c700a 850.B PROT_EXEC
75ee11e5
VW
851set.
852This allows tools to notice new executable code being mapped into
853a program (dynamic shared libraries for example)
854so that addresses can be mapped back to the original code.
f2b1d720
MK
855.TP
856.IR "comm"
857The
858.I comm
859bit enables tracking of process command name as modified by the
cd7c700a 860.BR exec (2)
f2b1d720 861and
cd7c700a 862.BR prctl (PR_SET_NAME)
f2b1d720
MK
863system calls.
864Unfortunately for tools,
865there is no way to distinguish one system call versus the other.
f2b1d720
MK
866.TP
867.IR "freq"
868If this bit is set, then
869.I sample_freq
870not
871.I sample_period
872is used when setting up the sampling interval.
f2b1d720
MK
873.TP
874.IR "inherit_stat"
875This bit enables saving of event counts on context switch for
876inherited tasks.
33a0ccb2 877This is meaningful only if the
f2b1d720
MK
878.I inherit
879field is set.
f2b1d720
MK
880.TP
881.IR "enable_on_exec"
882If this bit is set, a counter is automatically
883enabled after a call to
884.BR exec (2).
f2b1d720
MK
885.TP
886.IR "task"
887If this bit is set, then
888fork/exit notifications are included in the ring buffer.
f2b1d720
MK
889.TP
890.IR "watermark"
891If set, have a sampling interrupt happen when we cross the
892.I wakeup_watermark
893boundary.
894Otherwise interrupts happen after
895.I wakeup_events
896samples.
f2b1d720 897.TP
31c1f2b0 898.IR "precise_ip" " (since Linux 2.6.35)"
f2b1d720
MK
899This controls the amount of skid.
900Skid is how many instructions
901execute between an event of interest happening and the kernel
902being able to stop and record the event.
903Smaller skid is
904better and allows more accurate reporting of which events
905correspond to which instructions, but hardware is often limited
906with how small this can be.
907
908The values of this are the following:
909.RS
910.TP
9110 -
912.B SAMPLE_IP
2b538c3e 913can have arbitrary skid.
f2b1d720
MK
914.TP
9151 -
916.B SAMPLE_IP
2b538c3e 917must have constant skid.
f2b1d720
MK
918.TP
9192 -
920.B SAMPLE_IP
2b538c3e 921requested to have 0 skid.
f2b1d720
MK
922.TP
9233 -
924.B SAMPLE_IP
925must have 0 skid.
926See also
927.BR PERF_RECORD_MISC_EXACT_IP .
928.RE
f2b1d720 929.TP
31c1f2b0 930.IR "mmap_data" " (since Linux 2.6.36)"
f2b1d720
MK
931The counterpart of the
932.I mmap
75ee11e5
VW
933field.
934This enables generation of
cd7c700a 935.B PERF_RECORD_MMAP
75ee11e5
VW
936samples for
937.BR mmap (2)
938calls that do not have
cd7c700a 939.B PROT_EXEC
75ee11e5 940set (for example data and SysV shared memory).
f2b1d720 941.TP
31c1f2b0 942.IR "sample_id_all" " (since Linux 2.6.38)"
7480dabb 943If set, then TID, TIME, ID, STREAM_ID, and CPU can
f2b1d720
MK
944additionally be included in
945.RB non- PERF_RECORD_SAMPLE s
946if the corresponding
947.I sample_type
948is selected.
7480dabb 949
e9bd9b2c 950If
7480dabb 951.B PERF_SAMPLE_IDENTIFIER
37bee118 952is specified, then an additional ID value is included
7480dabb
VW
953as the last value to ease parsing the record stream.
954This may lead to the
e9bd9b2c 955.I id
7480dabb
VW
956value appearing twice.
957
958The layout is described by this pseudo-structure:
959.in +4n
960.nf
961struct sample_id {
962 { u32 pid, tid; } /* if PERF_SAMPLE_TID set */
963 { u64 time; } /* if PERF_SAMPLE_TIME set */
964 { u64 id; } /* if PERF_SAMPLE_ID set */
965 { u64 stream_id;} /* if PERF_SAMPLE_STREAM_ID set */
966 { u32 cpu, res; } /* if PERF_SAMPLE_CPU set */
967 { u64 id; } /* if PERF_SAMPLE_IDENTIFIER set */
968};
969.fi
f2b1d720 970.TP
31c1f2b0 971.IR "exclude_host" " (since Linux 3.2)"
33d6e2c7 972Do not measure time spent in VM host.
f2b1d720 973.TP
31c1f2b0 974.IR "exclude_guest" " (since Linux 3.2)"
33d6e2c7 975Do not measure time spent in VM guest.
f2b1d720 976.TP
31c1f2b0 977.IR "exclude_callchain_kernel" " (since Linux 3.7)"
f2b1d720 978Do not include kernel callchains.
f2b1d720 979.TP
31c1f2b0 980.IR "exclude_callchain_user" " (since Linux 3.7)"
f2b1d720 981Do not include user callchains.
f2b1d720
MK
982.TP
983.IR "wakeup_events" ", " "wakeup_watermark"
984This union sets how many samples
985.RI ( wakeup_events )
986or bytes
987.RI ( wakeup_watermark )
988happen before an overflow signal happens.
989Which one is used is selected by the
990.I watermark
991bitflag.
751c0f1a
VW
992
993.I wakeup_events
994only counts
995.B PERF_RECORD_SAMPLE
51700fd7 996record types.
751c0f1a
VW
997To receive a signal for every incoming
998.B PERF_RECORD
999type set
1000.I wakeup_watermark
1001to 1.
f2b1d720 1002.TP
31c1f2b0 1003.IR "bp_type" " (since Linux 2.6.33)"
f2b1d720
MK
1004This chooses the breakpoint type.
1005It is one of:
1006.RS
1007.TP
1008.BR HW_BREAKPOINT_EMPTY
2b538c3e 1009No breakpoint.
f2b1d720
MK
1010.TP
1011.BR HW_BREAKPOINT_R
2b538c3e 1012Count when we read the memory location.
f2b1d720
MK
1013.TP
1014.BR HW_BREAKPOINT_W
2b538c3e 1015Count when we write the memory location.
f2b1d720
MK
1016.TP
1017.BR HW_BREAKPOINT_RW
2b538c3e 1018Count when we read or write the memory location.
f2b1d720
MK
1019.TP
1020.BR HW_BREAKPOINT_X
2b538c3e 1021Count when we execute code at the memory location.
f2b1d720 1022.LP
7db515ef 1023The values can be combined via a bitwise or, but the
f2b1d720
MK
1024combination of
1025.B HW_BREAKPOINT_R
1026or
1027.B HW_BREAKPOINT_W
1028with
1029.B HW_BREAKPOINT_X
1030is not allowed.
1031.RE
f2b1d720 1032.TP
31c1f2b0 1033.IR "bp_addr" " (since Linux 2.6.33)"
f2b1d720
MK
1034.I bp_addr
1035is the address of the breakpoint.
1036For execution breakpoints this is the memory address of the instruction
1037of interest; for read and write breakpoints it is the memory address
1038of the memory location of interest.
f2b1d720 1039.TP
31c1f2b0 1040.IR "config1" " (since Linux 2.6.39)"
f2b1d720
MK
1041.I config1
1042is used for setting events that need an extra register or otherwise
1043do not fit in the regular config field.
1044Raw OFFCORE_EVENTS on Nehalem/Westmere/SandyBridge use this field
1045on 3.3 and later kernels.
f2b1d720 1046.TP
31c1f2b0 1047.IR "bp_len" " (since Linux 2.6.33)"
f2b1d720
MK
1048.I bp_len
1049is the length of the breakpoint being measured if
1050.I type
1051is
1052.BR PERF_TYPE_BREAKPOINT .
1053Options are
1054.BR HW_BREAKPOINT_LEN_1 ,
1055.BR HW_BREAKPOINT_LEN_2 ,
1056.BR HW_BREAKPOINT_LEN_4 ,
1057.BR HW_BREAKPOINT_LEN_8 .
1058For an execution breakpoint, set this to
1059.IR sizeof(long) .
f2b1d720 1060.TP
31c1f2b0 1061.IR "config2" " (since Linux 2.6.39)"
f2b1d720
MK
1062
1063.I config2
1064is a further extension of the
1065.I config1
1066field.
f2b1d720 1067.TP
31c1f2b0 1068.IR "branch_sample_type" " (since Linux 3.4)"
8a94e783 1069If
045bf4d3
VW
1070.B PERF_SAMPLE_BRANCH_STACK
1071is enabled, then this specifies what branches to include
1072in the branch record.
e3c9782b
VW
1073
1074The first part of the value is the privilege level, which
1075is a combination of one of the following values.
045bf4d3
VW
1076If the user does not set privilege level explicitly, the kernel
1077will use the event's privilege level.
1078Event and branch privilege levels do not have to match.
f2b1d720
MK
1079.RS
1080.TP
1081.B PERF_SAMPLE_BRANCH_USER
33d6e2c7 1082Branch target is in user space.
f2b1d720
MK
1083.TP
1084.B PERF_SAMPLE_BRANCH_KERNEL
33d6e2c7 1085Branch target is in kernel space.
f2b1d720
MK
1086.TP
1087.B PERF_SAMPLE_BRANCH_HV
33d6e2c7 1088Branch target is in hypervisor.
e3c9782b
VW
1089.TP
1090.B PERF_SAMPLE_BRANCH_PLM_ALL
1091A convenience value that is the three preceding values ORed together.
1092
1093.P
1094In addition to the privilege value, at least one of the
1095following bits must be set.
1096
f2b1d720
MK
1097.TP
1098.B PERF_SAMPLE_BRANCH_ANY
33d6e2c7 1099Any branch type.
f2b1d720
MK
1100.TP
1101.B PERF_SAMPLE_BRANCH_ANY_CALL
33d6e2c7 1102Any call branch.
f2b1d720
MK
1103.TP
1104.B PERF_SAMPLE_BRANCH_ANY_RETURN
33d6e2c7 1105Any return branch.
f2b1d720 1106.TP
e3c9782b 1107.B PERF_SAMPLE_BRANCH_IND_CALL
33d6e2c7 1108Indirect calls.
f2b1d720 1109.TP
31c1f2b0 1110.BR PERF_SAMPLE_BRANCH_ABORT_TX " (since Linux 3.11)"
33d6e2c7 1111Transactional memory aborts.
e3c9782b 1112.TP
31c1f2b0 1113.BR PERF_SAMPLE_BRANCH_IN_TX " (since Linux 3.11)"
33d6e2c7 1114Branch in transactional memory transaction.
e3c9782b 1115.TP
31c1f2b0 1116.BR PERF_SAMPLE_BRANCH_NO_TX " (since Linux 3.11)"
33d6e2c7 1117Branch not in transactional memory transaction.
f2b1d720 1118.RE
e3c9782b 1119
f2b1d720 1120.TP
31c1f2b0 1121.IR "sample_regs_user" " (since Linux 3.7)"
4651e412 1122This bit mask defines the set of user CPU registers to dump on samples.
76c637e1 1123The layout of the register mask is architecture-specific and
d1007d14
VW
1124described in the kernel header
1125.IR arch/ARCH/include/uapi/asm/perf_regs.h .
f2b1d720 1126.TP
31c1f2b0 1127.IR "sample_stack_user" " (since Linux 3.7)"
d1007d14
VW
1128This defines the size of the user stack to dump if
1129.B PERF_SAMPLE_STACK_USER
1130is specified.
73d8cece 1131.SS Reading results
f2b1d720 1132Once a
7db515ef 1133.BR perf_event_open ()
f2b1d720
MK
1134file descriptor has been opened, the values
1135of the events can be read from the file descriptor.
1136The values that are there are specified by the
1137.I read_format
7db515ef
MK
1138field in the
1139.I attr
1140structure at open time.
f2b1d720
MK
1141
1142If you attempt to read into a buffer that is not big enough to hold the
1143data
1144.B ENOSPC
1145is returned.
1146
1147Here is the layout of the data returned by a read:
e525b89f 1148.IP * 2
f2b1d720
MK
1149If
1150.B PERF_FORMAT_GROUP
1151was specified to allow reading all events in a group at once:
1152
1153.in +4n
1154.nf
1155struct read_format {
e525b89f
MK
1156 u64 nr; /* The number of events */
1157 u64 time_enabled; /* if PERF_FORMAT_TOTAL_TIME_ENABLED */
1158 u64 time_running; /* if PERF_FORMAT_TOTAL_TIME_RUNNING */
50b2aa27 1159 struct {
e525b89f
MK
1160 u64 value; /* The value of the event */
1161 u64 id; /* if PERF_FORMAT_ID */
f2b1d720
MK
1162 } values[nr];
1163};
1164.fi
1165.in
e525b89f 1166.IP *
f2b1d720
MK
1167If
1168.B PERF_FORMAT_GROUP
1169was
1170.I not
e525b89f 1171specified:
f2b1d720
MK
1172
1173.in +4n
1174.nf
1175struct read_format {
1176 u64 value; /* The value of the event */
1177 u64 time_enabled; /* if PERF_FORMAT_TOTAL_TIME_ENABLED */
1178 u64 time_running; /* if PERF_FORMAT_TOTAL_TIME_RUNNING */
1179 u64 id; /* if PERF_FORMAT_ID */
1180};
1181.fi
1182.in
e525b89f
MK
1183.PP
1184The values read are as follows:
f2b1d720
MK
1185.TP
1186.I nr
1187The number of events in this file descriptor.
1188Only available if
1189.B PERF_FORMAT_GROUP
1190was specified.
f2b1d720
MK
1191.TP
1192.IR time_enabled ", " time_running
1193Total time the event was enabled and running.
1194Normally these are the same.
37bee118
MK
1195If more events are started
1196than available counter slots on the PMU, then multiplexing
33a0ccb2 1197happens and events run only part of the time.
37bee118 1198In that case, the
f2b1d720
MK
1199.I time_enabled
1200and
1201.I time_running
1202values can be used to scale an estimated value for the count.
f2b1d720
MK
1203.TP
1204.I value
1205An unsigned 64-bit value containing the counter result.
f2b1d720
MK
1206.TP
1207.I id
1208A globally unique value for this particular event, only there if
1209.B PERF_FORMAT_ID
e525b89f
MK
1210was specified in
1211.IR read_format .
73d8cece 1212.SS MMAP layout
f2b1d720 1213When using
7db515ef 1214.BR perf_event_open ()
f2b1d720
MK
1215in sampled mode, asynchronous events
1216(like counter overflow or
1217.B PROT_EXEC
1218mmap tracking)
1219are logged into a ring-buffer.
1220This ring-buffer is created and accessed through
1221.BR mmap (2).
1222
1223The mmap size should be 1+2^n pages, where the first page is a
1224metadata page
e525b89f 1225.RI ( "struct perf_event_mmap_page" )
f2b1d720
MK
1226that contains various
1227bits of information such as where the ring-buffer head is.
1228
1229Before kernel 2.6.39, there is a bug that means you must allocate a mmap
1230ring buffer when sampling even if you do not plan to access it.
1231
1232The structure of the first metadata mmap page is as follows:
1233
1234.in +4n
1235.nf
1236struct perf_event_mmap_page {
7db515ef 1237 __u32 version; /* version number of this structure */
f2b1d720 1238 __u32 compat_version; /* lowest version this is compat with */
7db515ef
MK
1239 __u32 lock; /* seqlock for synchronization */
1240 __u32 index; /* hardware counter identifier */
1241 __s64 offset; /* add to hardware counter value */
1242 __u64 time_enabled; /* time event active */
1243 __u64 time_running; /* time event on CPU */
f2b1d720
MK
1244 union {
1245 __u64 capabilities;
135cba8b
VW
1246 struct {
1247 __u64 cap_usr_time / cap_usr_rdpmc / cap_bit0 : 1,
1248 cap_bit0_is_deprecated : 1,
1249 cap_user_rdpmc : 1,
1250 cap_user_time : 1,
1251 cap_user_time_zero : 1,
1252 };
f2b1d720
MK
1253 };
1254 __u16 pmc_width;
1255 __u16 time_shift;
1256 __u32 time_mult;
1257 __u64 time_offset;
7db515ef 1258 __u64 __reserved[120]; /* Pad to 1k */
f2b1d720 1259 __u64 data_head; /* head in the data section */
7db515ef 1260 __u64 data_tail; /* user-space written tail */
f2b1d720
MK
1261};
1262.fi
1263.in
1264
f2b1d720
MK
1265The following looks at the fields in the
1266.I perf_event_mmap_page
e525b89f 1267structure in more detail:
f2b1d720
MK
1268.TP
1269.I version
1270Version number of this structure.
f2b1d720
MK
1271.TP
1272.I compat_version
1273The lowest version this is compatible with.
f2b1d720
MK
1274.TP
1275.I lock
1276A seqlock for synchronization.
f2b1d720
MK
1277.TP
1278.I index
1279A unique hardware counter identifier.
f2b1d720
MK
1280.TP
1281.I offset
135cba8b
VW
1282When using rdpmc for reads this offset value
1283must be added to the one returned by rdpmc to get
1284the current total event count.
f2b1d720
MK
1285.TP
1286.I time_enabled
1287Time the event was active.
f2b1d720
MK
1288.TP
1289.I time_running
1290Time the event was running.
f2b1d720 1291.TP
31c1f2b0 1292.IR cap_usr_time " / " cap_usr_rdpmc " / " cap_bit0 " (since Linux 3.4)"
e9bd9b2c 1293There was a bug in the definition of
f2b1d720 1294.I cap_usr_time
135cba8b
VW
1295and
1296.I cap_usr_rdpmc
1297from Linux 3.4 until Linux 3.11.
1298Both bits were defined to point to the same location, so it was
e9bd9b2c 1299impossible to know if
135cba8b
VW
1300.I cap_usr_time
1301or
1302.I cap_usr_rdpmc
1303were actually set.
1304
1305Starting with 3.12 these are renamed to
1306.I cap_bit0
1307and you should use the new
1308.I cap_user_time
1309and
1310.I cap_user_rdpmc
1311fields instead.
1312
f2b1d720 1313.TP
31c1f2b0 1314.IR cap_bit0_is_deprecated " (since Linux 3.12)"
37bee118 1315If set, this bit indicates that the kernel supports
135cba8b
VW
1316the properly separated
1317.I cap_user_time
1318and
1319.I cap_user_rdpmc
1320bits.
1321
1322If not set, it indicates an older kernel where
1323.I cap_usr_time
1324and
f2b1d720 1325.I cap_usr_rdpmc
135cba8b
VW
1326map to the same bit and thus both features should
1327be used with caution.
1328
1329.TP
31c1f2b0 1330.IR cap_user_rdpmc " (since Linux 3.12)"
f2b1d720
MK
1331If the hardware supports user-space read of performance counters
1332without syscall (this is the "rdpmc" instruction on x86), then
1333the following code can be used to do a read:
1334
1335.in +4n
1336.nf
1337u32 seq, time_mult, time_shift, idx, width;
1338u64 count, enabled, running;
1339u64 cyc, time_offset;
f2b1d720
MK
1340
1341do {
1342 seq = pc\->lock;
1343 barrier();
1344 enabled = pc\->time_enabled;
1345 running = pc\->time_running;
1346
1347 if (pc\->cap_usr_time && enabled != running) {
1348 cyc = rdtsc();
1349 time_offset = pc\->time_offset;
1350 time_mult = pc\->time_mult;
1351 time_shift = pc\->time_shift;
1352 }
1353
1354 idx = pc\->index;
1355 count = pc\->offset;
1356
1357 if (pc\->cap_usr_rdpmc && idx) {
1358 width = pc\->pmc_width;
135cba8b 1359 count += rdpmc(idx \- 1);
f2b1d720
MK
1360 }
1361
1362 barrier();
1363} while (pc\->lock != seq);
1364.fi
1365.in
f2b1d720 1366.TP
31c1f2b0 1367.IR cap_user_time " (since Linux 3.12)"
7d182bb6 1368This bit indicates the hardware has a constant, nonstop
135cba8b
VW
1369timestamp counter (TSC on x86).
1370.TP
31c1f2b0 1371.IR cap_user_time_zero " (since Linux 3.12)"
135cba8b
VW
1372Indicates the presence of
1373.I time_zero
1374which allows mapping timestamp values to
1375the hardware clock.
1376.TP
f2b1d720
MK
1377.I pmc_width
1378If
1379.IR cap_usr_rdpmc ,
1380this field provides the bit-width of the value
1381read using the rdpmc or equivalent instruction.
1382This can be used to sign extend the result like:
1383
1384.in +4n
1385.nf
1386pmc <<= 64 \- pmc_width;
1387pmc >>= 64 \- pmc_width; // signed shift right
1388count += pmc;
1389.fi
1390.in
f2b1d720
MK
1391.TP
1392.IR time_shift ", " time_mult ", " time_offset
1393
1394If
1395.IR cap_usr_time ,
1396these fields can be used to compute the time
7db515ef 1397delta since time_enabled (in nanoseconds) using rdtsc or similar.
f2b1d720
MK
1398.nf
1399
1400 u64 quot, rem;
1401 u64 delta;
1402 quot = (cyc >> time_shift);
1403 rem = cyc & ((1 << time_shift) \- 1);
1404 delta = time_offset + quot * time_mult +
1405 ((rem * time_mult) >> time_shift);
1406.fi
1407
7db515ef
MK
1408Where
1409.IR time_offset ,
1410.IR time_mult ,
1411.IR time_shift ,
1412and
1413.IR cyc
1414are read in the
f2b1d720
MK
1415seqcount loop described above.
1416This delta can then be added to
1417enabled and possibly running (if idx), improving the scaling:
1418.nf
1419
1420 enabled += delta;
1421 if (idx)
1422 running += delta;
1423 quot = count / running;
1424 rem = count % running;
1425 count = quot * enabled + (rem * enabled) / running;
1426.fi
f2b1d720 1427.TP
31c1f2b0 1428.IR time_zero " (since Linux 3.12)"
135cba8b 1429
e9bd9b2c 1430If
135cba8b 1431.I cap_user_time_zero
37bee118 1432is set, then the hardware clock (the TSC timestamp counter on x86)
135cba8b
VW
1433can be calculated from the
1434.IR time_zero ", " time_mult ", and " time_shift " values:"
1435.nf
1436 time = timestamp \- time_zero;
1437 quot = time / time_mult;
1438 rem = time % time_mult;
1439 cyc = (quot << time_shift) + (rem << time_shift) / time_mult;
1440.fi
1441And vice versa:
1442.nf
1443 quot = cyc >> time_shift;
1444 rem = cyc & ((1 << time_shift) \- 1);
1445 timestamp = time_zero + quot * time_mult +
1446 ((rem * time_mult) >> time_shift);
1447.fi
1448.TP
f2b1d720
MK
1449.I data_head
1450This points to the head of the data section.
7db515ef
MK
1451The value continuously increases; it does not wrap.
1452The value needs to be manually wrapped by the size of the mmap buffer
f2b1d720
MK
1453before accessing the samples.
1454
1455On SMP-capable platforms, after reading the data_head value,
ad73a2cc 1456user space should issue an rmb().
f2b1d720 1457.TP
fecd584f 1458.I data_tail
f2b1d720
MK
1459When the mapping is
1460.BR PROT_WRITE ,
7db515ef
MK
1461the
1462.I data_tail
1463value should be written by user space to reflect the last read data.
31020de9 1464In this case, the kernel will not overwrite unread data.
e525b89f 1465.PP
f2b1d720
MK
1466The following 2^n ring-buffer pages have the layout described below.
1467
1468If
1469.I perf_event_attr.sample_id_all
1470is set, then all event types will
1471have the sample_type selected fields related to where/when (identity)
1472an event took place (TID, TIME, ID, CPU, STREAM_ID) described in
1473.B PERF_RECORD_SAMPLE
1474below, it will be stashed just after the
7db515ef
MK
1475.I perf_event_header
1476and the fields already present for the existing
4047bc6c 1477fields, that is, at the end of the payload.
f2b1d720
MK
1478That way a newer perf.data
1479file will be supported by older perf tools, with these new optional
1480fields being ignored.
1481
1482The mmap values start with a header:
1483
1484.in +4n
1485.nf
1486struct perf_event_header {
1487 __u32 type;
1488 __u16 misc;
1489 __u16 size;
1490};
1491.fi
1492.in
1493
1494Below, we describe the
1495.I perf_event_header
1496fields in more detail.
4047bc6c
MK
1497For ease of reading,
1498the fields with shorter descriptions are presented first.
1499.TP
1500.I size
1501This indicates the size of the record.
1502.TP
1503.I misc
1504The
1505.I misc
1506field contains additional information about the sample.
1507
1508The CPU mode can be determined from this value by masking with
1509.B PERF_RECORD_MISC_CPUMODE_MASK
1510and looking for one of the following (note these are not
1511bit masks, only one can be set at a time):
1512.RS
1513.TP
1514.B PERF_RECORD_MISC_CPUMODE_UNKNOWN
1515Unknown CPU mode.
1516.TP
1517.B PERF_RECORD_MISC_KERNEL
1518Sample happened in the kernel.
1519.TP
1520.B PERF_RECORD_MISC_USER
1521Sample happened in user code.
1522.TP
1523.B PERF_RECORD_MISC_HYPERVISOR
1524Sample happened in the hypervisor.
1525.TP
1526.B PERF_RECORD_MISC_GUEST_KERNEL
1527Sample happened in the guest kernel.
1528.TP
1529.B PERF_RECORD_MISC_GUEST_USER
1530Sample happened in guest user code.
1531.RE
1532
1533.RS
1534In addition, one of the following bits can be set:
1535.TP
1536.B PERF_RECORD_MISC_MMAP_DATA
1537This is set when the mapping is not executable;
1538otherwise the mapping is executable.
1539.TP
1540.B PERF_RECORD_MISC_EXACT_IP
1541This indicates that the content of
1542.B PERF_SAMPLE_IP
1543points
1544to the actual instruction that triggered the event.
1545See also
1546.IR perf_event_attr.precise_ip .
1547.TP
1548.B PERF_RECORD_MISC_EXT_RESERVED
1549This indicates there is extended data available (currently not used).
1550.RE
f2b1d720
MK
1551.TP
1552.I type
1553The
1554.I type
1555value is one of the below.
1556The values in the corresponding record (that follows the header)
1557depend on the
1558.I type
1559selected as shown.
7480dabb 1560
f2b1d720 1561.RS
7db515ef 1562.TP 4
f2b1d720
MK
1563.B PERF_RECORD_MMAP
1564The MMAP events record the
1565.B PROT_EXEC
1566mappings so that we can correlate
ad73a2cc 1567user-space IPs to code.
f2b1d720
MK
1568They have the following structure:
1569
1570.in +4n
1571.nf
1572struct {
1573 struct perf_event_header header;
1574 u32 pid, tid;
1575 u64 addr;
1576 u64 len;
1577 u64 pgoff;
1578 char filename[];
1579};
1580.fi
1581.in
f2b1d720
MK
1582.TP
1583.B PERF_RECORD_LOST
1584This record indicates when events are lost.
1585
1586.in +4n
1587.nf
1588struct {
1589 struct perf_event_header header;
1590 u64 id;
1591 u64 lost;
7480dabb 1592 struct sample_id sample_id;
f2b1d720
MK
1593};
1594.fi
1595.in
f2b1d720
MK
1596.RS
1597.TP
1598.I id
1599is the unique event ID for the samples that were lost.
1600.TP
1601.I lost
1602is the number of events that were lost.
1603.RE
f2b1d720
MK
1604.TP
1605.B PERF_RECORD_COMM
1606This record indicates a change in the process name.
1607
1608.in +4n
1609.nf
1610struct {
1611 struct perf_event_header header;
1612 u32 pid, tid;
1613 char comm[];
7480dabb 1614 struct sample_id sample_id;
f2b1d720
MK
1615};
1616.fi
1617.in
f2b1d720
MK
1618.TP
1619.B PERF_RECORD_EXIT
1620This record indicates a process exit event.
1621
1622.in +4n
1623.nf
1624struct {
1625 struct perf_event_header header;
1626 u32 pid, ppid;
1627 u32 tid, ptid;
1628 u64 time;
7480dabb 1629 struct sample_id sample_id;
f2b1d720
MK
1630};
1631.fi
1632.in
f2b1d720
MK
1633.TP
1634.BR PERF_RECORD_THROTTLE ", " PERF_RECORD_UNTHROTTLE
1635This record indicates a throttle/unthrottle event.
1636
1637.in +4n
1638.nf
1639struct {
1640 struct perf_event_header header;
1641 u64 time;
1642 u64 id;
1643 u64 stream_id;
7480dabb 1644 struct sample_id sample_id;
f2b1d720
MK
1645};
1646.fi
1647.in
f2b1d720
MK
1648.TP
1649.B PERF_RECORD_FORK
1650This record indicates a fork event.
1651
1652.in +4n
1653.nf
1654struct {
1655 struct perf_event_header header;
1656 u32 pid, ppid;
1657 u32 tid, ptid;
1658 u64 time;
7480dabb 1659 struct sample_id sample_id;
f2b1d720
MK
1660};
1661.fi
1662.in
f2b1d720
MK
1663.TP
1664.B PERF_RECORD_READ
1665This record indicates a read event.
1666
1667.in +4n
1668.nf
1669struct {
1670 struct perf_event_header header;
1671 u32 pid, tid;
1672 struct read_format values;
7480dabb 1673 struct sample_id sample_id;
f2b1d720
MK
1674};
1675.fi
1676.in
f2b1d720
MK
1677.TP
1678.B PERF_RECORD_SAMPLE
1679This record indicates a sample.
1680
1681.in +4n
1682.nf
1683struct {
1684 struct perf_event_header header;
7480dabb 1685 u64 sample_id; /* if PERF_SAMPLE_IDENTIFIER */
7db515ef
MK
1686 u64 ip; /* if PERF_SAMPLE_IP */
1687 u32 pid, tid; /* if PERF_SAMPLE_TID */
1688 u64 time; /* if PERF_SAMPLE_TIME */
1689 u64 addr; /* if PERF_SAMPLE_ADDR */
1690 u64 id; /* if PERF_SAMPLE_ID */
1691 u64 stream_id; /* if PERF_SAMPLE_STREAM_ID */
1692 u32 cpu, res; /* if PERF_SAMPLE_CPU */
1693 u64 period; /* if PERF_SAMPLE_PERIOD */
f2b1d720 1694 struct read_format v; /* if PERF_SAMPLE_READ */
7db515ef
MK
1695 u64 nr; /* if PERF_SAMPLE_CALLCHAIN */
1696 u64 ips[nr]; /* if PERF_SAMPLE_CALLCHAIN */
1697 u32 size; /* if PERF_SAMPLE_RAW */
1698 char data[size]; /* if PERF_SAMPLE_RAW */
1699 u64 bnr; /* if PERF_SAMPLE_BRANCH_STACK */
1700 struct perf_branch_entry lbr[bnr];
1701 /* if PERF_SAMPLE_BRANCH_STACK */
1702 u64 abi; /* if PERF_SAMPLE_REGS_USER */
1703 u64 regs[weight(mask)];
1704 /* if PERF_SAMPLE_REGS_USER */
1705 u64 size; /* if PERF_SAMPLE_STACK_USER */
1706 char data[size]; /* if PERF_SAMPLE_STACK_USER */
1707 u64 dyn_size; /* if PERF_SAMPLE_STACK_USER */
d1007d14
VW
1708 u64 weight; /* if PERF_SAMPLE_WEIGHT */
1709 u64 data_src; /* if PERF_SAMPLE_DATA_SRC */
1e043959 1710 u64 transaction;/* if PERF_SAMPLE_TRANSACTION */
f2b1d720
MK
1711};
1712.fi
4047bc6c
MK
1713.RS 4
1714.TP 4
7480dabb
VW
1715.I sample_id
1716If
1717.B PERF_SAMPLE_IDENTIFIER
1718is enabled, a 64-bit unique ID is included.
e9bd9b2c 1719This is a duplication of the
7480dabb
VW
1720.B PERF_SAMPLE_ID
1721.I id
1722value, but included at the beginning of the sample
1723so parsers can easily obtain the value.
1724.TP
f2b1d720 1725.I ip
7db515ef
MK
1726If
1727.B PERF_SAMPLE_IP
1728is enabled, then a 64-bit instruction
f2b1d720 1729pointer value is included.
f2b1d720 1730.TP
7db515ef
MK
1731.IR pid ", " tid
1732If
1733.B PERF_SAMPLE_TID
1734is enabled, then a 32-bit process ID
1735and 32-bit thread ID are included.
f2b1d720
MK
1736.TP
1737.I time
7db515ef
MK
1738If
1739.B PERF_SAMPLE_TIME
1740is enabled, then a 64-bit timestamp
f2b1d720
MK
1741is included.
1742This is obtained via local_clock() which is a hardware timestamp
1743if available and the jiffies value if not.
f2b1d720
MK
1744.TP
1745.I addr
7db515ef
MK
1746If
1747.B PERF_SAMPLE_ADDR
1748is enabled, then a 64-bit address is included.
f2b1d720
MK
1749This is usually the address of a tracepoint,
1750breakpoint, or software event; otherwise the value is 0.
f2b1d720
MK
1751.TP
1752.I id
7db515ef
MK
1753If
1754.B PERF_SAMPLE_ID
1755is enabled, a 64-bit unique ID is included.
f2b1d720 1756If the event is a member of an event group, the group leader ID is returned.
7db515ef
MK
1757This ID is the same as the one returned by
1758.BR PERF_FORMAT_ID .
f2b1d720
MK
1759.TP
1760.I stream_id
7db515ef
MK
1761If
1762.B PERF_SAMPLE_STREAM_ID
1763is enabled, a 64-bit unique ID is included.
f2b1d720
MK
1764Unlike
1765.B PERF_SAMPLE_ID
1766the actual ID is returned, not the group leader.
7db515ef
MK
1767This ID is the same as the one returned by
1768.BR PERF_FORMAT_ID .
f2b1d720 1769.TP
7db515ef
MK
1770.IR cpu ", " res
1771If
1772.B PERF_SAMPLE_CPU
1773is enabled, this is a 32-bit value indicating
f2b1d720
MK
1774which CPU was being used, in addition to a reserved (unused)
177532-bit value.
f2b1d720
MK
1776.TP
1777.I period
7db515ef
MK
1778If
1779.B PERF_SAMPLE_PERIOD
1780is enabled, a 64-bit value indicating
f2b1d720 1781the current sampling period is written.
f2b1d720
MK
1782.TP
1783.I v
7db515ef
MK
1784If
1785.B PERF_SAMPLE_READ
1786is enabled, a structure of type read_format
f2b1d720
MK
1787is included which has values for all events in the event group.
1788The values included depend on the
1789.I read_format
7db515ef
MK
1790value used at
1791.BR perf_event_open ()
1792time.
f2b1d720 1793.TP
7db515ef
MK
1794.IR nr ", " ips[nr]
1795If
1796.B PERF_SAMPLE_CALLCHAIN
1797is enabled, then a 64-bit number is included
f2b1d720 1798which indicates how many 64-bit instruction pointers will
7db515ef
MK
1799follow.
1800This is the current callchain.
f2b1d720 1801.TP
7ede2f66 1802.IR size ", " data[size]
7db515ef
MK
1803If
1804.B PERF_SAMPLE_RAW
1805is enabled, then a 32-bit value indicating size
f2b1d720
MK
1806is included followed by an array of 8-bit values of length size.
1807The values are padded with 0 to have 64-bit alignment.
1808
1809This RAW record data is opaque with respect to the ABI.
1810The ABI doesn't make any promises with respect to the stability
1811of its content, it may vary depending
1812on event, hardware, and kernel version.
f2b1d720 1813.TP
7db515ef
MK
1814.IR bnr ", " lbr[bnr]
1815If
1816.B PERF_SAMPLE_BRANCH_STACK
1817is enabled, then a 64-bit value indicating
1818the number of records is included, followed by
1819.I bnr
1820.I perf_branch_entry
045bf4d3
VW
1821structures which each include the fields:
1822.RS
1823.TP
1824.I from
2b538c3e 1825This indicates the source instruction (may not be a branch).
045bf4d3
VW
1826.TP
1827.I to
2b538c3e 1828The branch target.
045bf4d3
VW
1829.TP
1830.I mispred
2b538c3e 1831The branch target was mispredicted.
045bf4d3
VW
1832.TP
1833.I predicted
2b538c3e 1834The branch target was predicted.
e3c9782b 1835.TP
31c1f2b0 1836.IR in_tx " (since Linux 3.11)"
2b538c3e 1837The branch was in a transactional memory transaction.
e3c9782b 1838.TP
31c1f2b0 1839.IR abort " (since Linux 3.11)"
2b538c3e 1840The branch was in an aborted transactional memory transaction.
e3c9782b
VW
1841
1842.P
045bf4d3
VW
1843The entries are from most to least recent, so the first entry
1844has the most recent branch.
1845
8a94e783
MK
1846Support for
1847.I mispred
1848and
1849.I predicted
baf7029b 1850is optional; if not supported, both
045bf4d3
VW
1851values will be 0.
1852
e3c9782b
VW
1853The type of branches recorded is specified by the
1854.I branch_sample_type
1855field.
1856.RE
1857
f2b1d720 1858.TP
7db515ef
MK
1859.IR abi ", " regs[weight(mask)]
1860If
1861.B PERF_SAMPLE_REGS_USER
d1007d14 1862is enabled, then the user CPU registers are recorded.
f2b1d720
MK
1863
1864The
1865.I abi
1866field is one of
1867.BR PERF_SAMPLE_REGS_ABI_NONE ", " PERF_SAMPLE_REGS_ABI_32 " or "
7db515ef 1868.BR PERF_SAMPLE_REGS_ABI_64 .
d1007d14
VW
1869
1870The
1871.I regs
1872field is an array of the CPU registers that were specified by
1873the
1874.I sample_regs_user
1875attr field.
1876The number of values is the number of bits set in the
51700fd7 1877.I sample_regs_user
4651e412 1878bit mask.
f2b1d720 1879.TP
7db515ef
MK
1880.IR size ", " data[size] ", " dyn_size
1881If
1882.B PERF_SAMPLE_STACK_USER
d1007d14
VW
1883is enabled, then record the user stack to enable backtracing.
1884.I size
1885is the size requested by the user in
1886.I sample_stack_user
1887or else the maximum record size.
1888.I data
1889is the stack data.
1890.I dyn_size
1891is the amount of data actually dumped (can be less than
460e3d7a 1892.IR size ).
d1007d14 1893.TP
51700fd7 1894.I weight
d1007d14
VW
1895If
1896.B PERF_SAMPLE_WEIGHT
7de4a1e3 1897is enabled, then a 64-bit value provided by the hardware
d1007d14
VW
1898is recorded that indicates how costly the event was.
1899This allows expensive events to stand out more clearly
1900in profiles.
1901.TP
1902.I data_src
51700fd7 1903If
d1007d14 1904.B PERF_SAMPLE_DATA_SRC
7de4a1e3 1905is enabled, then a 64-bit value is recorded that is made up of
d1007d14
VW
1906the following fields:
1907.RS
2b538c3e 1908.TP 4
d1007d14 1909.I mem_op
2b538c3e
MK
1910Type of opcode, a bitwise combination of:
1911
1912.PD 0
1913.RS
1914.TP 24
d1007d14 1915.B PERF_MEM_OP_NA
2b538c3e
MK
1916Not available
1917.TP
d1007d14 1918.B PERF_MEM_OP_LOAD
2b538c3e
MK
1919Load instruction
1920.TP
d1007d14 1921.B PERF_MEM_OP_STORE
2b538c3e
MK
1922Store instruction
1923.TP
d1007d14 1924.B PERF_MEM_OP_PFETCH
2b538c3e
MK
1925Prefetch
1926.TP
d1007d14 1927.B PERF_MEM_OP_EXEC
2b538c3e
MK
1928Executable code
1929.RE
1930.PD
d1007d14
VW
1931.TP
1932.I mem_lvl
2b538c3e
MK
1933Memory hierarchy level hit or miss, a bitwise combination of:
1934
1935.PD 0
1936.RS
1937.TP 24
d1007d14 1938.B PERF_MEM_LVL_NA
2b538c3e
MK
1939Not available
1940.TP
d1007d14 1941.B PERF_MEM_LVL_HIT
2b538c3e
MK
1942Hit
1943.TP
d1007d14 1944.B PERF_MEM_LVL_MISS
2b538c3e
MK
1945Miss
1946.TP
d1007d14 1947.B PERF_MEM_LVL_L1
2b538c3e
MK
1948Level 1 cache
1949.TP
d1007d14 1950.B PERF_MEM_LVL_LFB
2b538c3e
MK
1951Line fill buffer
1952.TP
d1007d14 1953.B PERF_MEM_LVL_L2
2b538c3e
MK
1954Level 2 cache
1955.TP
d1007d14 1956.B PERF_MEM_LVL_L3
2b538c3e
MK
1957Level 3 cache
1958.TP
d1007d14 1959.B PERF_MEM_LVL_LOC_RAM
2b538c3e
MK
1960Local DRAM
1961.TP
d1007d14 1962.B PERF_MEM_LVL_REM_RAM1
2b538c3e
MK
1963Remote DRAM 1 hop
1964.TP
d1007d14 1965.B PERF_MEM_LVL_REM_RAM2
2b538c3e
MK
1966Remote DRAM 2 hops
1967.TP
d1007d14 1968.B PERF_MEM_LVL_REM_CCE1
2b538c3e
MK
1969Remote cache 1 hop
1970.TP
d1007d14 1971.B PERF_MEM_LVL_REM_CCE2
2b538c3e
MK
1972Remote cache 2 hops
1973.TP
d1007d14 1974.B PERF_MEM_LVL_IO
2b538c3e
MK
1975I/O memory
1976.TP
d1007d14 1977.B PERF_MEM_LVL_UNC
2b538c3e
MK
1978Uncached memory
1979.RE
1980.PD
d1007d14
VW
1981.TP
1982.I mem_snoop
2b538c3e
MK
1983Snoop mode, a bitwise combination of:
1984
1985.PD 0
1986.RS
1987.TP 24
d1007d14 1988.B PERF_MEM_SNOOP_NA
2b538c3e
MK
1989Not available
1990.TP
d1007d14 1991.B PERF_MEM_SNOOP_NONE
2b538c3e
MK
1992No snoop
1993.TP
d1007d14 1994.B PERF_MEM_SNOOP_HIT
2b538c3e
MK
1995Snoop hit
1996.TP
d1007d14 1997.B PERF_MEM_SNOOP_MISS
2b538c3e
MK
1998Snoop miss
1999.TP
d1007d14 2000.B PERF_MEM_SNOOP_HITM
2b538c3e
MK
2001Snoop hit modified
2002.RE
2003.PD
d1007d14
VW
2004.TP
2005.I mem_lock
2b538c3e
MK
2006Lock instruction, a bitwise combination of:
2007
2008.PD 0
2009.RS
2010.TP 24
d1007d14 2011.B PERF_MEM_LOCK_NA
2b538c3e
MK
2012Not available
2013.TP
d1007d14 2014.B PERF_MEM_LOCK_LOCKED
2b538c3e
MK
2015Locked transaction
2016.RE
2017.PD
d1007d14
VW
2018.TP
2019.I mem_dtlb
2b538c3e
MK
2020TLB access hit or miss, a bitwise combination of:
2021
2022.PD 0
2023.RS
2024.TP 24
d1007d14 2025.B PERF_MEM_TLB_NA
2b538c3e
MK
2026Not available
2027.TP
d1007d14 2028.B PERF_MEM_TLB_HIT
2b538c3e
MK
2029Hit
2030.TP
d1007d14 2031.B PERF_MEM_TLB_MISS
2b538c3e
MK
2032Miss
2033.TP
d1007d14 2034.B PERF_MEM_TLB_L1
2b538c3e
MK
2035Level 1 TLB
2036.TP
d1007d14 2037.B PERF_MEM_TLB_L2
2b538c3e
MK
2038Level 2 TLB
2039.TP
d1007d14 2040.B PERF_MEM_TLB_WK
2b538c3e
MK
2041Hardware walker
2042.TP
d1007d14 2043.B PERF_MEM_TLB_OS
2b538c3e
MK
2044OS fault handler
2045.RE
2046.PD
d1007d14 2047.RE
1e043959
VW
2048.TP
2049.I transaction
2050If the
2051.B PERF_SAMPLE_TRANSACTION
37bee118 2052flag is set, then a 64-bit field is recorded describing
1e043959
VW
2053the sources of any transactional memory aborts.
2054
2055The field is a bitwise combination of the following values:
2056.RS
2057.TP
2058.B PERF_TXN_ELISION
b3f39642 2059Abort from an elision type transaction (Intel-CPU-specific).
1e043959
VW
2060.TP
2061.B PERF_TXN_TRANSACTION
b3f39642 2062Abort from a generic transaction.
1e043959
VW
2063.TP
2064.B PERF_TXN_SYNC
b3f39642 2065Synchronous abort (related to the reported instruction).
1e043959
VW
2066.TP
2067.B PERF_TXN_ASYNC
b3f39642 2068Asynchronous abort (not related to the reported instruction).
1e043959
VW
2069.TP
2070.B PERF_TXN_RETRY
053a3e08 2071Retryable abort (retrying the transaction may have succeeded).
1e043959
VW
2072.TP
2073.B PERF_TXN_CONFLICT
b3f39642 2074Abort due to memory conflicts with other threads.
1e043959
VW
2075.TP
2076.B PERF_TXN_CAPACITY_WRITE
b3f39642 2077Abort due to write capacity overflow.
1e043959
VW
2078.TP
2079.B PERF_TXN_CAPACITY_READ
b3f39642 2080Abort due to read capacity overflow.
1e043959 2081.RE
b3f39642
MK
2082.IP
2083In addition, a user-specified abort code can be obtained from
2084the high 32 bits of the field by shifting right by
1e043959
VW
2085.B PERF_TXN_ABORT_SHIFT
2086and masking with
2087.BR PERF_TXN_ABORT_MASK .
f2b1d720 2088.RE
f2b1d720 2089.RE
73d8cece 2090.SS Signal overflow
f2b1d720
MK
2091Events can be set to deliver a signal when a threshold is crossed.
2092The signal handler is set up using the
2093.BR poll (2),
2094.BR select (2),
2095.BR epoll (7)
2096and
2097.BR fcntl (2)
2098system calls.
2099
2100To generate signals, sampling must be enabled
2101.RI ( sample_period
7d182bb6 2102must have a nonzero value).
f2b1d720
MK
2103
2104There are two ways to generate signals.
2105
2106The first is to set a
2107.I wakeup_events
2108or
2109.I wakeup_watermark
2110value that will generate a signal if a certain number of samples
2111or bytes have been written to the mmap ring buffer.
31020de9 2112In this case, a signal of type
7db515ef
MK
2113.B POLL_IN
2114is sent.
f2b1d720
MK
2115
2116The other way is by use of the
7db515ef 2117.B PERF_EVENT_IOC_REFRESH
f2b1d720
MK
2118ioctl.
2119This ioctl adds to a counter that decrements each time the event overflows.
7d182bb6 2120When nonzero, a
7db515ef
MK
2121.B POLL_IN
2122signal is sent on overflow, but
2123once the value reaches 0, a signal is sent of type
2124.B POLL_HUP
2125and
f2b1d720
MK
2126the underlying event is disabled.
2127
2128Note: on newer kernels (definitely noticed with 3.2)
7db515ef 2129.\" FIXME(Vince) : Find out when this was introduced
f2b1d720
MK
2130a signal is provided for every overflow, even if
2131.I wakeup_events
2132is not set.
73d8cece 2133.SS rdpmc instruction
f2b1d720
MK
2134Starting with Linux 3.4 on x86, you can use the
2135.I rdpmc
2136instruction to get low-latency reads without having to enter the kernel.
2137Note that using
2138.I rdpmc
2139is not necessarily faster than other methods for reading event values.
2140
2141Support for this can be detected with the
2142.I cap_usr_rdpmc
2143field in the mmap page; documentation on how
2144to calculate event values can be found in that section.
73d8cece 2145.SS perf_event ioctl calls
f2b1d720
MK
2146.PP
2147Various ioctls act on
7db515ef 2148.BR perf_event_open ()
f2b1d720 2149file descriptors:
f2b1d720
MK
2150.TP
2151.B PERF_EVENT_IOC_ENABLE
36127c0e 2152Enables the individual event or event group specified by the
7db515ef 2153file descriptor argument.
f2b1d720 2154
51700fd7 2155If the
8cc8b90d 2156.B PERF_IOC_FLAG_GROUP
51700fd7 2157bit is set in the ioctl argument, then all events in a group are
dbc01ecd
VW
2158enabled, even if the event specified is not the group leader
2159(but see BUGS).
f2b1d720
MK
2160.TP
2161.B PERF_EVENT_IOC_DISABLE
36127c0e 2162Disables the individual counter or event group specified by the
7db515ef 2163file descriptor argument.
f2b1d720
MK
2164
2165Enabling or disabling the leader of a group enables or disables the
2166entire group; that is, while the group leader is disabled, none of the
2167counters in the group will count.
33a0ccb2
MK
2168Enabling or disabling a member of a group other than the leader
2169affects only that counter; disabling a non-leader
f2b1d720
MK
2170stops that counter from counting but doesn't affect any other counter.
2171
51700fd7 2172If the
8cc8b90d 2173.B PERF_IOC_FLAG_GROUP
51700fd7 2174bit is set in the ioctl argument, then all events in a group are
dbc01ecd
VW
2175disabled, even if the event specified is not the group leader
2176(but see BUGS).
f2b1d720
MK
2177.TP
2178.B PERF_EVENT_IOC_REFRESH
2179Non-inherited overflow counters can use this
2180to enable a counter for a number of overflows specified by the argument,
2181after which it is disabled.
2182Subsequent calls of this ioctl add the argument value to the current
2183count.
7db515ef
MK
2184A signal with
2185.B POLL_IN
2186set will happen on each overflow until the
2187count reaches 0; when that happens a signal with
2188.B POLL_HUP
2189set is sent and the event is disabled.
f2b1d720 2190Using an argument of 0 is considered undefined behavior.
f2b1d720
MK
2191.TP
2192.B PERF_EVENT_IOC_RESET
36127c0e 2193Reset the event count specified by the
6061d29f 2194file descriptor argument to zero.
33a0ccb2 2195This resets only the counts; there is no way to reset the
f2b1d720
MK
2196multiplexing
2197.I time_enabled
2198or
2199.I time_running
2200values.
f2b1d720 2201
51700fd7 2202If the
8cc8b90d 2203.B PERF_IOC_FLAG_GROUP
51700fd7 2204bit is set in the ioctl argument, then all events in a group are
dbc01ecd
VW
2205reset, even if the event specified is not the group leader
2206(but see BUGS).
f2b1d720
MK
2207.TP
2208.B PERF_EVENT_IOC_PERIOD
e6cf5694 2209This updates the overflow period for the event.
3f118a29 2210
ed81fdd9 2211Since Linux 3.7 (on ARM) and Linux 3.14 (all other architectures),
3f118a29 2212the new period takes effect immediately.
ed81fdd9 2213On older kernels, the new period did not take effect until
3f118a29 2214after the next overflow.
f2b1d720
MK
2215
2216The argument is a pointer to a 64-bit value containing the
2217desired new period.
e6cf5694
VW
2218
2219Prior to Linux 2.6.36 this ioctl always failed due to a bug
2220in the kernel.
2221
f2b1d720
MK
2222.TP
2223.B PERF_EVENT_IOC_SET_OUTPUT
2224This tells the kernel to report event notifications to the specified
2225file descriptor rather than the default one.
2226The file descriptors must all be on the same CPU.
2227
2228The argument specifies the desired file descriptor, or \-1 if
2229output should be ignored.
f2b1d720 2230.TP
31c1f2b0 2231.BR PERF_EVENT_IOC_SET_FILTER " (since Linux 2.6.33)"
f2b1d720
MK
2232This adds an ftrace filter to this event.
2233
2234The argument is a pointer to the desired ftrace filter.
a0dcc8dd 2235.TP
31c1f2b0 2236.BR PERF_EVENT_IOC_ID " (since Linux 3.12)"
a0dcc8dd
VW
2237Returns the event ID value for the given event fd.
2238
2239The argument is a pointer to a 64-bit unsigned integer
2240to hold the result.
73d8cece 2241.SS Using prctl
f2b1d720
MK
2242A process can enable or disable all the event groups that are
2243attached to it using the
2244.BR prctl (2)
2245.B PR_TASK_PERF_EVENTS_ENABLE
2246and
2247.B PR_TASK_PERF_EVENTS_DISABLE
2248operations.
2249This applies to all counters on the current process, whether created by
2250this process or by another, and does not affect any counters that this
2251process has created on other processes.
33a0ccb2 2252It enables or disables only
f2b1d720 2253the group leaders, not any other members in the groups.
f2b1d720 2254.SS perf_event related configuration files
7db515ef
MK
2255Files in
2256.I /proc/sys/kernel/
7db515ef 2257.RS 4
f2b1d720 2258.TP
7db515ef 2259.I /proc/sys/kernel/perf_event_paranoid
f2b1d720
MK
2260
2261The
2262.I perf_event_paranoid
2263file can be set to restrict access to the performance counters.
2b538c3e
MK
2264.RS
2265.IP 2 4
2266only allow user-space measurements.
2267.IP 1
2268allow both kernel and user measurements (default).
2269.IP 0
2270allow access to CPU-specific data but not raw tracepoint samples.
2271.IP \-1
2272no restrictions.
2273.RE
2274.IP
f2b1d720
MK
2275The existence of the
2276.I perf_event_paranoid
2277file is the official method for determining if a kernel supports
7db515ef 2278.BR perf_event_open ().
f2b1d720
MK
2279.TP
2280.I /proc/sys/kernel/perf_event_max_sample_rate
2281
7db515ef
MK
2282This sets the maximum sample rate.
2283Setting this too high can allow
f2b1d720 2284users to sample at a rate that impacts overall machine performance
7db515ef
MK
2285and potentially lock up the machine.
2286The default value is
f2b1d720 2287100000 (samples per second).
f2b1d720
MK
2288.TP
2289.I /proc/sys/kernel/perf_event_mlock_kb
2290
7db515ef 2291Maximum number of pages an unprivileged user can mlock(2).
f2b1d720 2292The default is 516 (kB).
e30dc77f 2293
f2b1d720 2294.RE
7db515ef
MK
2295Files in
2296.I /sys/bus/event_source/devices/
7db515ef 2297.RS 4
f2b1d720
MK
2298Since Linux 2.6.34 the kernel supports having multiple PMUs
2299available for monitoring.
2300Information on how to program these PMUs can be found under
2301.IR /sys/bus/event_source/devices/ .
2302Each subdirectory corresponds to a different PMU.
f2b1d720 2303.TP
31c1f2b0 2304.IR /sys/bus/event_source/devices/*/type " (since Linux 2.6.38)"
f2b1d720
MK
2305This contains an integer that can be used in the
2306.I type
2307field of perf_event_attr to indicate you wish to use this PMU.
f2b1d720 2308.TP
31c1f2b0 2309.IR /sys/bus/event_source/devices/*/rdpmc " (since Linux 3.4)"
8a94e783 2310If this file is 1, then direct user-space access to the
e30dc77f
VW
2311performance counter registers is allowed via the rdpmc instruction.
2312This can be disabled by echoing 0 to the file.
f2b1d720 2313.TP
31c1f2b0 2314.IR /sys/bus/event_source/devices/*/format/ " (since Linux 3.4)"
7d182bb6
MK
2315This subdirectory contains information on the architecture-specific
2316subfields available for programming the various
f2b1d720 2317.I config
e30dc77f
VW
2318fields in the perf_event_attr struct.
2319
2320The content of each file is the name of the config field, followed
2321by a colon, followed by a series of integer bit ranges separated by
2322commas.
8a94e783 2323For example, the file
e30dc77f
VW
2324.I event
2325may contain the value
2326.I config1:1,6-10,44
2327which indicates that event is an attribute that occupies bits 1, 6-10, and 44
2328of perf_event_attr::config1.
2329.TP
31c1f2b0 2330.IR /sys/bus/event_source/devices/*/events/ " (since Linux 3.4)"
7d182bb6 2331This subdirectory contains files with predefined events.
f2b1d720 2332The contents are strings describing the event settings
e30dc77f 2333expressed in terms of the fields found in the previously mentioned
f2b1d720
MK
2334.I ./format/
2335directory.
2336These are not necessarily complete lists of all events supported by
2337a PMU, but usually a subset of events deemed useful or interesting.
e30dc77f
VW
2338
2339The content of each file is a list of attribute names
8a94e783
MK
2340separated by commas.
2341Each entry has an optional value (either hex or decimal).
37bee118 2342If no value is specified, then it is assumed to be a single-bit
e30dc77f
VW
2343field with a value of 1.
2344An example entry may look like this:
699893d8 2345.IR event=0x2,inv,ldlat=3 .
f2b1d720
MK
2346.TP
2347.I /sys/bus/event_source/devices/*/uevent
e30dc77f
VW
2348This file is the standard kernel device interface
2349for injecting hotplug events.
2350.TP
31c1f2b0 2351.IR /sys/bus/event_source/devices/*/cpumask " (since Linux 3.7)"
699893d8
DP
2352The
2353.I cpumask
2354file contains a comma-separated list of integers that
2355indicate a representative CPU number for each socket (package)
e30dc77f
VW
2356on the motherboard.
2357This is needed when setting up uncore or northbridge events, as
2358those PMUs present socket-wide events.
f2b1d720 2359.RE
47297adb 2360.SH RETURN VALUE
f2b1d720
MK
2361.BR perf_event_open ()
2362returns the new file descriptor, or \-1 if an error occurred
2363(in which case,
2364.I errno
2365is set appropriately).
2366.SH ERRORS
d8b7d950
VW
2367The errors returned by
2368.BR perf_event_open ()
2369can be inconsistent, and may
2370vary across processor architectures and performance monitoring units.
f2b1d720 2371.TP
82b09254
VW
2372.B E2BIG
2373Returned if the perf_event_attr
2374.I size
2375value is too small
2376(smaller than
2377.BR PERF_ATTR_SIZE_VER0 ),
2378too big (larger than the page size),
2379or larger than the kernel supports and the extra bytes are not zero.
2380When
2381.B E2BIG
2382is returned, the perf_event_attr
e9bd9b2c 2383.I size
d6af98f8 2384field is overwritten by the kernel to be the size of the structure
82b09254
VW
2385it was expecting.
2386.TP
d8b7d950
VW
2387.B EACCES
2388Returned when the requested event requires root permissions
2389(or a more permissive perf_event paranoid setting).
2390Some common causes are attaching to a process owned by a different user,
accec051
MK
2391monitoring all processes on a given CPU,
2392or not setting
2393.I exclude_kernel
2394when the paranoid setting requires it.
d8b7d950
VW
2395.TP
2396.B EBADF
2397Returned if the
2398.I group_fd
accec051
MK
2399file descriptor is not valid, or, if
2400.B PERF_FLAG_PID_CGROUP
2401is set,
d8b7d950
VW
2402the cgroup file descriptor in
2403.I pid
2404is not valid.
2405.TP
2406.B EFAULT
2407Returned if the
2408.I attr
2409pointer points at an invalid memory address.
2410.TP
f2b1d720 2411.B EINVAL
d8b7d950
VW
2412Returned if the specified event is invalid.
2413There are many possible reasons for this.
2414A non-exhaustive list:
2415.I sample_freq
accec051 2416is higher than the maximum setting;
d8b7d950
VW
2417the
2418.I cpu
accec051 2419to monitor does not exist;
d8b7d950 2420.I read_format
accec051 2421is out of range;
d8b7d950 2422.I sample_type
accec051 2423is out of range;
d8b7d950
VW
2424the
2425.I flags
accec051 2426value is out of range;
d8b7d950
VW
2427.I exclusive
2428or
2429.I pinned
accec051 2430set and the event is not a group leader;
d8b7d950
VW
2431the event
2432.I config
accec051
MK
2433values are out of range or set reserved bits;
2434the generic event selected is not supported; or
d8b7d950
VW
2435there is not enough room to add the selected event.
2436.TP
2437.B EMFILE
2438Each opened event uses one file descriptor.
2439If a large number of events are opened, the per-process file
2440descriptor limit (often 1024) will be hit and no more
2441events can be created.
2442.TP
2443.B ENODEV
2444Returned when the event involves a feature not supported
accec051 2445by the current CPU.
d8b7d950
VW
2446.TP
2447.B ENOENT
2448Returned if the
2449.I type
2450setting is not valid.
accec051 2451This error is also returned for
d8b7d950 2452some unsupported generic events.
f2b1d720
MK
2453.TP
2454.B ENOSPC
2455Prior to Linux 3.3, if there was not enough room for the event,
2456.B ENOSPC
2457was returned.
accec051 2458In Linux 3.3, this was changed to
f2b1d720
MK
2459.BR EINVAL .
2460.B ENOSPC
d8b7d950 2461is still returned if you try to add more breakpoint events
accec051 2462than supported by the hardware.
d8b7d950
VW
2463.TP
2464.B ENOSYS
2465Returned if
2466.B PERF_SAMPLE_STACK_USER
2467is set in
2468.I sample_type
2469and it is not supported by hardware.
2470.TP
2471.B EOPNOTSUPP
2472Returned if an event requiring a specific hardware feature is
2473requested but there is no hardware support.
2474This includes requesting low-skid events if not supported,
2475branch tracing if it is not available, sampling if no PMU
2476interrupt is available, and branch stacks for software events.
2477.TP
2478.B EPERM
2479Returned if sufficient permissions are not available to create the event.
2480This includes attempting to set a breakpoint on a kernel address
2481and setting an ftrace function trace tracepoint.
2482.TP
2483.B ESRCH
2484Returned if attempting to attach to a process that does not exist.
f2b1d720 2485.SH VERSION
f2b1d720
MK
2486.BR perf_event_open ()
2487was introduced in Linux 2.6.31 but was called
ffd4dec0 2488.BR perf_counter_open ().
f2b1d720 2489It was renamed in Linux 2.6.32.
f2b1d720 2490.SH CONFORMING TO
7db515ef
MK
2491This
2492.BR perf_event_open ()
2493system call is Linux-specific
f2b1d720 2494and should not be used in programs intended to be portable.
f2b1d720
MK
2495.SH NOTES
2496Glibc does not provide a wrapper for this system call; call it using
2497.BR syscall (2).
7db515ef 2498See the example below.
f2b1d720
MK
2499
2500The official way of knowing if
7db515ef 2501.BR perf_event_open ()
f2b1d720
MK
2502support is enabled is checking
2503for the existence of the file
7db515ef 2504.IR /proc/sys/kernel/perf_event_paranoid .
f2b1d720 2505.SH BUGS
f2b1d720
MK
2506The
2507.B F_SETOWN_EX
2508option to
7db515ef 2509.BR fcntl (2)
f2b1d720
MK
2510is needed to properly get overflow signals in threads.
2511This was introduced in Linux 2.6.32.
2512
2513Prior to Linux 2.6.33 (at least for x86) the kernel did not check
2514if events could be scheduled together until read time.
2515The same happens on all known kernels if the NMI watchdog is enabled.
2516This means to see if a given set of events works you have to
2517.BR perf_event_open (),
2518start, then read before you know for sure you
2519can get valid measurements.
2520
2521Prior to Linux 2.6.34 event constraints were not enforced by the kernel.
2522In that case, some events would silently return "0" if the kernel
2523scheduled them in an improper counter slot.
2524
2525Prior to Linux 2.6.34 there was a bug when multiplexing where the
2526wrong results could be returned.
2527
2528From Linux 2.6.35 to Linux 2.6.39, the kernel can quickly crash if
2529"inherit" is enabled and many threads are started.
2530
2531Prior to Linux 2.6.35,
2532.B PERF_FORMAT_GROUP
2533did not work with attached processes.
2534
2535In older Linux 2.6 versions,
2536refreshing an event group leader refreshed all siblings,
2537and refreshing with a parameter of 0 enabled infinite refresh.
2538This behavior is unsupported and should not be relied on.
2539
2540There is a bug in the kernel code between
2541Linux 2.6.36 and Linux 3.0 that ignores the
2542"watermark" field and acts as if a wakeup_event
2543was chosen if the union has a
7d182bb6 2544nonzero value in it.
f2b1d720 2545
8a94e783 2546From Linux 2.6.31 to Linux 3.4, the
dbc01ecd
VW
2547.B PERF_IOC_FLAG_GROUP
2548ioctl argument was broken and would repeatedly operate
2549on the event specified rather than iterating across
2550all sibling events in a group.
2551
7205b8df 2552From Linux 3.4 to Linux 3.11, the mmap
135cba8b
VW
2553.I cap_usr_rdpmc
2554and
2555.I cap_usr_time
2556bits mapped to the same location.
2557Code should migrate to the new
2558.I cap_user_rdpmc
2559and
2560.I cap_user_time
2561fields instead.
2562
7db515ef
MK
2563Always double-check your results!
2564Various generalized events have had wrong values.
f2b1d720
MK
2565For example, retired branches measured
2566the wrong thing on AMD machines until Linux 2.6.35.
f2b1d720
MK
2567.SH EXAMPLE
2568The following is a short example that measures the total
7db515ef
MK
2569instruction count of a call to
2570.BR printf (3).
f2b1d720
MK
2571.nf
2572
2573#include <stdlib.h>
2574#include <stdio.h>
2575#include <unistd.h>
2576#include <string.h>
2577#include <sys/ioctl.h>
2578#include <linux/perf_event.h>
2579#include <asm/unistd.h>
2580
571767ca 2581static long
7db515ef
MK
2582perf_event_open(struct perf_event_attr *hw_event, pid_t pid,
2583 int cpu, int group_fd, unsigned long flags)
f2b1d720
MK
2584{
2585 int ret;
2586
7db515ef
MK
2587 ret = syscall(__NR_perf_event_open, hw_event, pid, cpu,
2588 group_fd, flags);
f2b1d720
MK
2589 return ret;
2590}
2591
f2b1d720
MK
2592int
2593main(int argc, char **argv)
2594{
f2b1d720
MK
2595 struct perf_event_attr pe;
2596 long long count;
2597 int fd;
2598
2599 memset(&pe, 0, sizeof(struct perf_event_attr));
2600 pe.type = PERF_TYPE_HARDWARE;
2601 pe.size = sizeof(struct perf_event_attr);
2602 pe.config = PERF_COUNT_HW_INSTRUCTIONS;
2603 pe.disabled = 1;
2604 pe.exclude_kernel = 1;
2605 pe.exclude_hv = 1;
2606
2607 fd = perf_event_open(&pe, 0, \-1, \-1, 0);
7db515ef 2608 if (fd == \-1) {
f2b1d720 2609 fprintf(stderr, "Error opening leader %llx\\n", pe.config);
7db515ef 2610 exit(EXIT_FAILURE);
f2b1d720
MK
2611 }
2612
2613 ioctl(fd, PERF_EVENT_IOC_RESET, 0);
2614 ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
2615
2616 printf("Measuring instruction count for this printf\\n");
2617
2618 ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);
2619 read(fd, &count, sizeof(long long));
2620
2621 printf("Used %lld instructions\\n", count);
2622
2623 close(fd);
2624}
2625.fi
47297adb 2626.SH SEE ALSO
f2b1d720
MK
2627.BR fcntl (2),
2628.BR mmap (2),
2629.BR open (2),
2630.BR prctl (2),
2631.BR read (2)